Initial commit

This commit is contained in:
2026-02-01 09:31:38 +01:00
commit e02db93960
4396 changed files with 1511612 additions and 0 deletions

View File

@@ -0,0 +1,551 @@
# Natural Language Toolkit: Corpus Readers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
# TODO this docstring isn't up-to-date!
"""
NLTK corpus readers. The modules in this package provide functions
that can be used to read corpus files in a variety of formats. These
functions can be used to read both the corpus files that are
distributed in the NLTK corpus package, and corpus files that are part
of external corpora.
Available Corpora
=================
Please see https://www.nltk.org/nltk_data/ for a complete list.
Install corpora using nltk.download().
Corpus Reader Functions
=======================
Each corpus module defines one or more "corpus reader functions",
which can be used to read documents from that corpus. These functions
take an argument, ``item``, which is used to indicate which document
should be read from the corpus:
- If ``item`` is one of the unique identifiers listed in the corpus
module's ``items`` variable, then the corresponding document will
be loaded from the NLTK corpus package.
- If ``item`` is a filename, then that file will be read.
Additionally, corpus reader functions can be given lists of item
names; in which case, they will return a concatenation of the
corresponding documents.
Corpus reader functions are named based on the type of information
they return. Some common examples, and their return types, are:
- words(): list of str
- sents(): list of (list of str)
- paras(): list of (list of (list of str))
- tagged_words(): list of (str,str) tuple
- tagged_sents(): list of (list of (str,str))
- tagged_paras(): list of (list of (list of (str,str)))
- chunked_sents(): list of (Tree w/ (str,str) leaves)
- parsed_sents(): list of (Tree with str leaves)
- parsed_paras(): list of (list of (Tree with str leaves))
- xml(): A single xml ElementTree
- raw(): unprocessed corpus contents
For example, to read a list of the words in the Brown Corpus, use
``nltk.corpus.brown.words()``:
>>> from nltk.corpus import brown
>>> print(", ".join(brown.words())) # doctest: +ELLIPSIS
The, Fulton, County, Grand, Jury, said, ...
"""
import re
from nltk.corpus.reader import *
from nltk.corpus.util import LazyCorpusLoader
from nltk.tokenize import RegexpTokenizer
# Module-level corpus objects.  Each is a LazyCorpusLoader proxy: the real
# reader is only constructed (and the corpus data only located/downloaded)
# on first attribute access.  The recurring fileid pattern r"(?!\.).*\.txt"
# matches *.txt while excluding dot-files.
abc: PlaintextCorpusReader = LazyCorpusLoader(
    "abc",
    PlaintextCorpusReader,
    r"(?!\.).*\.txt",
    # Per-file encodings: regexp over fileids -> codec name.
    encoding=[("science", "latin_1"), ("rural", "utf8")],
)
alpino: AlpinoCorpusReader = LazyCorpusLoader(
    "alpino", AlpinoCorpusReader, tagset="alpino"
)
bcp47: BCP47CorpusReader = LazyCorpusLoader(
    "bcp47", BCP47CorpusReader, r"(cldr|iana)/*"
)
brown: CategorizedTaggedCorpusReader = LazyCorpusLoader(
    "brown",
    CategorizedTaggedCorpusReader,
    r"c[a-z]\d\d",
    cat_file="cats.txt",
    tagset="brown",
    encoding="ascii",
)
cess_cat: BracketParseCorpusReader = LazyCorpusLoader(
    "cess_cat",
    BracketParseCorpusReader,
    r"(?!\.).*\.tbf",
    tagset="unknown",
    encoding="ISO-8859-15",
)
cess_esp: BracketParseCorpusReader = LazyCorpusLoader(
    "cess_esp",
    BracketParseCorpusReader,
    r"(?!\.).*\.tbf",
    tagset="unknown",
    encoding="ISO-8859-15",
)
cmudict: CMUDictCorpusReader = LazyCorpusLoader(
    "cmudict", CMUDictCorpusReader, ["cmudict"]
)
comtrans: AlignedCorpusReader = LazyCorpusLoader(
    "comtrans", AlignedCorpusReader, r"(?!\.).*\.txt"
)
comparative_sentences: ComparativeSentencesCorpusReader = LazyCorpusLoader(
    "comparative_sentences",
    ComparativeSentencesCorpusReader,
    r"labeledSentences\.txt",
    encoding="latin-1",
)
conll2000: ConllChunkCorpusReader = LazyCorpusLoader(
    "conll2000",
    ConllChunkCorpusReader,
    ["train.txt", "test.txt"],
    ("NP", "VP", "PP"),
    tagset="wsj",
    encoding="ascii",
)
conll2002: ConllChunkCorpusReader = LazyCorpusLoader(
    "conll2002",
    ConllChunkCorpusReader,
    r".*\.(test|train).*",
    ("LOC", "PER", "ORG", "MISC"),
    encoding="utf-8",
)
conll2007: DependencyCorpusReader = LazyCorpusLoader(
    "conll2007",
    DependencyCorpusReader,
    r".*\.(test|train).*",
    encoding=[("eus", "ISO-8859-2"), ("esp", "utf8")],
)
crubadan: CrubadanCorpusReader = LazyCorpusLoader(
    "crubadan", CrubadanCorpusReader, r".*\.txt"
)
dependency_treebank: DependencyCorpusReader = LazyCorpusLoader(
    "dependency_treebank", DependencyCorpusReader, r".*\.dp", encoding="ascii"
)
extended_omw: CorpusReader = LazyCorpusLoader(
    "extended_omw", CorpusReader, r".*/wn-[a-z\-]*\.tab", encoding="utf8"
)
floresta: BracketParseCorpusReader = LazyCorpusLoader(
    "floresta",
    BracketParseCorpusReader,
    r"(?!\.).*\.ptb",
    "#",
    tagset="unknown",
    encoding="ISO-8859-15",
)
framenet15: FramenetCorpusReader = LazyCorpusLoader(
    "framenet_v15",
    FramenetCorpusReader,
    [
        "frRelation.xml",
        "frameIndex.xml",
        "fulltextIndex.xml",
        "luIndex.xml",
        "semTypes.xml",
    ],
)
framenet: FramenetCorpusReader = LazyCorpusLoader(
    "framenet_v17",
    FramenetCorpusReader,
    [
        "frRelation.xml",
        "frameIndex.xml",
        "fulltextIndex.xml",
        "luIndex.xml",
        "semTypes.xml",
    ],
)
gazetteers: WordListCorpusReader = LazyCorpusLoader(
    "gazetteers", WordListCorpusReader, r"(?!LICENSE|\.).*\.txt", encoding="ISO-8859-2"
)
genesis: PlaintextCorpusReader = LazyCorpusLoader(
    "genesis",
    PlaintextCorpusReader,
    r"(?!\.).*\.txt",
    encoding=[
        ("finnish|french|german", "latin_1"),
        ("swedish", "cp865"),
        (".*", "utf_8"),
    ],
)
gutenberg: PlaintextCorpusReader = LazyCorpusLoader(
    "gutenberg", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
)
ieer: IEERCorpusReader = LazyCorpusLoader("ieer", IEERCorpusReader, r"(?!README|\.).*")
inaugural: PlaintextCorpusReader = LazyCorpusLoader(
    "inaugural", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
)
# [XX] This should probably just use TaggedCorpusReader:
indian: IndianCorpusReader = LazyCorpusLoader(
    "indian", IndianCorpusReader, r"(?!\.).*\.pos", tagset="unknown", encoding="utf8"
)
jeita: ChasenCorpusReader = LazyCorpusLoader(
    "jeita", ChasenCorpusReader, r".*\.chasen", encoding="utf-8"
)
knbc: KNBCorpusReader = LazyCorpusLoader(
    "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
)
lin_thesaurus: LinThesaurusCorpusReader = LazyCorpusLoader(
    "lin_thesaurus", LinThesaurusCorpusReader, r".*\.lsp"
)
mac_morpho: MacMorphoCorpusReader = LazyCorpusLoader(
    "mac_morpho",
    MacMorphoCorpusReader,
    r"(?!\.).*\.txt",
    tagset="unknown",
    encoding="latin-1",
)
machado: PortugueseCategorizedPlaintextCorpusReader = LazyCorpusLoader(
    "machado",
    PortugueseCategorizedPlaintextCorpusReader,
    r"(?!\.).*\.txt",
    cat_pattern=r"([a-z]*)/.*",
    encoding="latin-1",
)
masc_tagged: CategorizedTaggedCorpusReader = LazyCorpusLoader(
    "masc_tagged",
    CategorizedTaggedCorpusReader,
    r"(spoken|written)/.*\.txt",
    cat_file="categories.txt",
    tagset="wsj",
    encoding="utf-8",
    sep="_",
)
movie_reviews: CategorizedPlaintextCorpusReader = LazyCorpusLoader(
    "movie_reviews",
    CategorizedPlaintextCorpusReader,
    r"(?!\.).*\.txt",
    cat_pattern=r"(neg|pos)/.*",
    encoding="ascii",
)
multext_east: MTECorpusReader = LazyCorpusLoader(
    "mte_teip5", MTECorpusReader, r"(oana).*\.xml", encoding="utf-8"
)
names: WordListCorpusReader = LazyCorpusLoader(
    "names", WordListCorpusReader, r"(?!\.).*\.txt", encoding="ascii"
)
nps_chat: NPSChatCorpusReader = LazyCorpusLoader(
    "nps_chat", NPSChatCorpusReader, r"(?!README|\.).*\.xml", tagset="wsj"
)
opinion_lexicon: OpinionLexiconCorpusReader = LazyCorpusLoader(
    "opinion_lexicon",
    OpinionLexiconCorpusReader,
    r"(\w+)\-words\.txt",
    encoding="ISO-8859-2",
)
ppattach: PPAttachmentCorpusReader = LazyCorpusLoader(
    "ppattach", PPAttachmentCorpusReader, ["training", "test", "devset"]
)
product_reviews_1: ReviewsCorpusReader = LazyCorpusLoader(
    "product_reviews_1", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
)
product_reviews_2: ReviewsCorpusReader = LazyCorpusLoader(
    "product_reviews_2", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
)
pros_cons: ProsConsCorpusReader = LazyCorpusLoader(
    "pros_cons",
    ProsConsCorpusReader,
    r"Integrated(Cons|Pros)\.txt",
    cat_pattern=r"Integrated(Cons|Pros)\.txt",
    encoding="ISO-8859-2",
)
ptb: CategorizedBracketParseCorpusReader = (
    LazyCorpusLoader(  # Penn Treebank v3: WSJ and Brown portions
        "ptb",
        CategorizedBracketParseCorpusReader,
        r"(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG",
        cat_file="allcats.txt",
        tagset="wsj",
    )
)
qc: StringCategoryCorpusReader = LazyCorpusLoader(
    "qc", StringCategoryCorpusReader, ["train.txt", "test.txt"], encoding="ISO-8859-2"
)
reuters: CategorizedPlaintextCorpusReader = LazyCorpusLoader(
    "reuters",
    CategorizedPlaintextCorpusReader,
    "(training|test).*",
    cat_file="cats.txt",
    encoding="ISO-8859-2",
)
rte: RTECorpusReader = LazyCorpusLoader("rte", RTECorpusReader, r"(?!\.).*\.xml")
senseval: SensevalCorpusReader = LazyCorpusLoader(
    "senseval", SensevalCorpusReader, r"(?!\.).*\.pos"
)
sentence_polarity: CategorizedSentencesCorpusReader = LazyCorpusLoader(
    "sentence_polarity",
    CategorizedSentencesCorpusReader,
    r"rt-polarity\.(neg|pos)",
    cat_pattern=r"rt-polarity\.(neg|pos)",
    encoding="utf-8",
)
sentiwordnet: SentiWordNetCorpusReader = LazyCorpusLoader(
    "sentiwordnet", SentiWordNetCorpusReader, "SentiWordNet_3.0.0.txt", encoding="utf-8"
)
shakespeare: XMLCorpusReader = LazyCorpusLoader(
    "shakespeare", XMLCorpusReader, r"(?!\.).*\.xml"
)
sinica_treebank: SinicaTreebankCorpusReader = LazyCorpusLoader(
    "sinica_treebank",
    SinicaTreebankCorpusReader,
    ["parsed"],
    tagset="unknown",
    encoding="utf-8",
)
state_union: PlaintextCorpusReader = LazyCorpusLoader(
    "state_union", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="ISO-8859-2"
)
stopwords: WordListCorpusReader = LazyCorpusLoader(
    "stopwords", WordListCorpusReader, r"(?!README|\.).*", encoding="utf8"
)
subjectivity: CategorizedSentencesCorpusReader = LazyCorpusLoader(
    "subjectivity",
    CategorizedSentencesCorpusReader,
    r"(quote.tok.gt9|plot.tok.gt9)\.5000",
    cat_map={"quote.tok.gt9.5000": ["subj"], "plot.tok.gt9.5000": ["obj"]},
    encoding="latin-1",
)
swadesh: SwadeshCorpusReader = LazyCorpusLoader(
    "swadesh", SwadeshCorpusReader, r"(?!README|\.).*", encoding="utf8"
)
swadesh110: PanlexSwadeshCorpusReader = LazyCorpusLoader(
    "panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh110/.*\.txt", encoding="utf8"
)
swadesh207: PanlexSwadeshCorpusReader = LazyCorpusLoader(
    "panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh207/.*\.txt", encoding="utf8"
)
switchboard: SwitchboardCorpusReader = LazyCorpusLoader(
    "switchboard", SwitchboardCorpusReader, tagset="wsj"
)
timit: TimitCorpusReader = LazyCorpusLoader("timit", TimitCorpusReader)
timit_tagged: TimitTaggedCorpusReader = LazyCorpusLoader(
    "timit", TimitTaggedCorpusReader, r".+\.tags", tagset="wsj", encoding="ascii"
)
toolbox: ToolboxCorpusReader = LazyCorpusLoader(
    "toolbox", ToolboxCorpusReader, r"(?!.*(README|\.)).*\.(dic|txt)"
)
treebank: BracketParseCorpusReader = LazyCorpusLoader(
    "treebank/combined",
    BracketParseCorpusReader,
    r"wsj_.*\.mrg",
    tagset="wsj",
    encoding="ascii",
)
treebank_chunk: ChunkedCorpusReader = LazyCorpusLoader(
    "treebank/tagged",
    ChunkedCorpusReader,
    r"wsj_.*\.pos",
    # Sentence boundary: whitespace following a "/." tag, but not inside
    # a chunk bracket.
    sent_tokenizer=RegexpTokenizer(r"(?<=/\.)\s*(?![^\[]*\])", gaps=True),
    para_block_reader=tagged_treebank_para_block_reader,
    tagset="wsj",
    encoding="ascii",
)
treebank_raw: PlaintextCorpusReader = LazyCorpusLoader(
    "treebank/raw", PlaintextCorpusReader, r"wsj_.*", encoding="ISO-8859-2"
)
twitter_samples: TwitterCorpusReader = LazyCorpusLoader(
    "twitter_samples", TwitterCorpusReader, r".*\.json"
)
udhr: UdhrCorpusReader = LazyCorpusLoader("udhr", UdhrCorpusReader)
udhr2: PlaintextCorpusReader = LazyCorpusLoader(
    "udhr2", PlaintextCorpusReader, r".*\.txt", encoding="utf8"
)
universal_treebanks: ConllCorpusReader = LazyCorpusLoader(
    "universal_treebanks_v20",
    ConllCorpusReader,
    r".*\.conll",
    # Only the word and POS columns of the CoNLL rows are used.
    columntypes=(
        "ignore",
        "words",
        "ignore",
        "ignore",
        "pos",
        "ignore",
        "ignore",
        "ignore",
        "ignore",
        "ignore",
    ),
)
verbnet: VerbnetCorpusReader = LazyCorpusLoader(
    "verbnet", VerbnetCorpusReader, r"(?!\.).*\.xml"
)
webtext: PlaintextCorpusReader = LazyCorpusLoader(
    "webtext", PlaintextCorpusReader, r"(?!README|\.).*\.txt", encoding="ISO-8859-2"
)
# The wordnet readers take a nested LazyCorpusLoader for the Open
# Multilingual Wordnet (omw-1.4) translation tables.
wordnet: WordNetCorpusReader = LazyCorpusLoader(
    "wordnet",
    WordNetCorpusReader,
    LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
)
## Use the following template to add a custom Wordnet package.
## Just uncomment, and replace the identifier (my_wordnet) in two places:
##
# my_wordnet: WordNetCorpusReader = LazyCorpusLoader(
#     "my_wordnet",
#     WordNetCorpusReader,
#     LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
# )
wordnet31: WordNetCorpusReader = LazyCorpusLoader(
    "wordnet31",
    WordNetCorpusReader,
    LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
)
wordnet2021: WordNetCorpusReader = LazyCorpusLoader(
    # Obsolete, use english_wordnet instead.
    "wordnet2021",
    WordNetCorpusReader,
    LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
)
wordnet2022: WordNetCorpusReader = LazyCorpusLoader(
    # Obsolete, use english_wordnet instead.
    "wordnet2022",
    WordNetCorpusReader,
    LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
)
english_wordnet: WordNetCorpusReader = LazyCorpusLoader(
    # Latest Open English Wordnet
    "english_wordnet",
    WordNetCorpusReader,
    LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
)
wordnet_ic: WordNetICCorpusReader = LazyCorpusLoader(
    "wordnet_ic", WordNetICCorpusReader, r".*\.dat"
)
words: WordListCorpusReader = LazyCorpusLoader(
    "words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii"
)
# defined after treebank
propbank: PropbankCorpusReader = LazyCorpusLoader(
    "propbank",
    PropbankCorpusReader,
    "prop.txt",
    r"frames/.*\.xml",
    "verbs.txt",
    lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
    treebank,
)  # Must be defined *after* treebank corpus.
nombank: NombankCorpusReader = LazyCorpusLoader(
    "nombank.1.0",
    NombankCorpusReader,
    "nombank.1.0",
    r"frames/.*\.xml",
    "nombank.1.0.words",
    lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
    treebank,
)  # Must be defined *after* treebank corpus.
propbank_ptb: PropbankCorpusReader = LazyCorpusLoader(
    "propbank",
    PropbankCorpusReader,
    "prop.txt",
    r"frames/.*\.xml",
    "verbs.txt",
    lambda filename: filename.upper(),
    ptb,
)  # Must be defined *after* ptb corpus.
nombank_ptb: NombankCorpusReader = LazyCorpusLoader(
    "nombank.1.0",
    NombankCorpusReader,
    "nombank.1.0",
    r"frames/.*\.xml",
    "nombank.1.0.words",
    lambda filename: filename.upper(),
    ptb,
)  # Must be defined *after* ptb corpus.
semcor: SemcorCorpusReader = LazyCorpusLoader(
    "semcor", SemcorCorpusReader, r"brown./tagfiles/br-.*\.xml", wordnet
)  # Must be defined *after* wordnet corpus.
nonbreaking_prefixes: NonbreakingPrefixesCorpusReader = LazyCorpusLoader(
    "nonbreaking_prefixes",
    NonbreakingPrefixesCorpusReader,
    r"(?!README|\.).*",
    encoding="utf8",
)
perluniprops: UnicharsCorpusReader = LazyCorpusLoader(
    "perluniprops",
    UnicharsCorpusReader,
    r"(?!README|\.).*",
    nltk_data_subdir="misc",
    encoding="utf8",
)
# mwa_ppdb = LazyCorpusLoader(
#     'mwa_ppdb', MWAPPDBCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')
# See https://github.com/nltk/nltk/issues/1579
# and https://github.com/nltk/nltk/issues/1716
#
# pl196x = LazyCorpusLoader(
#     'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml',
#     cat_file='cats.txt', textid_file='textids.txt', encoding='utf8')
#
# ipipan = LazyCorpusLoader(
#     'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml')
#
# nkjp = LazyCorpusLoader(
#     'nkjp', NKJPCorpusReader, r'', encoding='utf8')
#
# panlex_lite = LazyCorpusLoader(
#     'panlex_lite', PanLexLiteCorpusReader)
#
# ycoe = LazyCorpusLoader(
#     'ycoe', YCOECorpusReader)
#
# corpus not available with NLTK; these lines caused help(nltk.corpus) to break
# hebrew_treebank = LazyCorpusLoader(
#     'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')
# FIXME: override any imported demo from various corpora, see https://github.com/nltk/nltk/issues/2116
def demo():
    # Out-of-date smoke test: invoke each corpus reader's own demo()
    # in turn.  Entries whose demos are unavailable stay commented out.
    demo_corpora = (
        abc,
        brown,
        # chat80,
        cmudict,
        conll2000,
        conll2002,
        genesis,
        gutenberg,
        ieer,
        inaugural,
        indian,
        names,
        ppattach,
        senseval,
        shakespeare,
        sinica_treebank,
        state_union,
        stopwords,
        timit,
        toolbox,
        treebank,
        udhr,
        webtext,
        words,
        # ycoe,
    )
    for corpus in demo_corpora:
        corpus.demo()
if __name__ == "__main__":
    # The demo is intentionally disabled by default: it requires many
    # corpora to be downloaded before it can run.
    # demo()
    pass

View File

@@ -0,0 +1,56 @@
# Natural Language Toolkit: Europarl Corpus Readers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Nitin Madnani <nmadnani@umiacs.umd.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re
from nltk.corpus.reader import *
from nltk.corpus.util import LazyCorpusLoader
# Create one lazy corpus reader per European language in the raw Europarl
# corpus.  Each fileid pattern selects the "ep-*" files carrying that
# language's ISO 639-1 suffix (e.g. ".da" for Danish).
danish: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/danish", EuroparlCorpusReader, r"ep-.*\.da", encoding="utf-8"
)
dutch: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/dutch", EuroparlCorpusReader, r"ep-.*\.nl", encoding="utf-8"
)
english: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/english", EuroparlCorpusReader, r"ep-.*\.en", encoding="utf-8"
)
finnish: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/finnish", EuroparlCorpusReader, r"ep-.*\.fi", encoding="utf-8"
)
french: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/french", EuroparlCorpusReader, r"ep-.*\.fr", encoding="utf-8"
)
german: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/german", EuroparlCorpusReader, r"ep-.*\.de", encoding="utf-8"
)
greek: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/greek", EuroparlCorpusReader, r"ep-.*\.el", encoding="utf-8"
)
italian: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/italian", EuroparlCorpusReader, r"ep-.*\.it", encoding="utf-8"
)
portuguese: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/portuguese", EuroparlCorpusReader, r"ep-.*\.pt", encoding="utf-8"
)
spanish: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/spanish", EuroparlCorpusReader, r"ep-.*\.es", encoding="utf-8"
)
swedish: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/swedish", EuroparlCorpusReader, r"ep-.*\.sv", encoding="utf-8"
)

View File

@@ -0,0 +1,186 @@
# Natural Language Toolkit: Corpus Readers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
NLTK corpus readers. The modules in this package provide functions
that can be used to read corpus fileids in a variety of formats. These
functions can be used to read both the corpus fileids that are
distributed in the NLTK corpus package, and corpus fileids that are part
of external corpora.
Corpus Reader Functions
=======================
Each corpus module defines one or more "corpus reader functions",
which can be used to read documents from that corpus. These functions
take an argument, ``item``, which is used to indicate which document
should be read from the corpus:
- If ``item`` is one of the unique identifiers listed in the corpus
module's ``items`` variable, then the corresponding document will
be loaded from the NLTK corpus package.
- If ``item`` is a fileid, then that file will be read.
Additionally, corpus reader functions can be given lists of item
names; in which case, they will return a concatenation of the
corresponding documents.
Corpus reader functions are named based on the type of information
they return. Some common examples, and their return types, are:
- words(): list of str
- sents(): list of (list of str)
- paras(): list of (list of (list of str))
- tagged_words(): list of (str,str) tuple
- tagged_sents(): list of (list of (str,str))
- tagged_paras(): list of (list of (list of (str,str)))
- chunked_sents(): list of (Tree w/ (str,str) leaves)
- parsed_sents(): list of (Tree with str leaves)
- parsed_paras(): list of (list of (Tree with str leaves))
- xml(): A single xml ElementTree
- raw(): unprocessed corpus contents
For example, to read a list of the words in the Brown Corpus, use
``nltk.corpus.brown.words()``:
>>> from nltk.corpus import brown
>>> print(", ".join(brown.words()[:6])) # only first 6 words
The, Fulton, County, Grand, Jury, said
isort:skip_file
"""
from nltk.corpus.reader.plaintext import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
from nltk.corpus.reader.tagged import *
from nltk.corpus.reader.cmudict import *
from nltk.corpus.reader.conll import *
from nltk.corpus.reader.chunked import *
from nltk.corpus.reader.wordlist import *
from nltk.corpus.reader.xmldocs import *
from nltk.corpus.reader.ppattach import *
from nltk.corpus.reader.senseval import *
from nltk.corpus.reader.ieer import *
from nltk.corpus.reader.sinica_treebank import *
from nltk.corpus.reader.bracket_parse import *
from nltk.corpus.reader.indian import *
from nltk.corpus.reader.toolbox import *
from nltk.corpus.reader.timit import *
from nltk.corpus.reader.ycoe import *
from nltk.corpus.reader.rte import *
from nltk.corpus.reader.string_category import *
from nltk.corpus.reader.propbank import *
from nltk.corpus.reader.verbnet import *
from nltk.corpus.reader.bnc import *
from nltk.corpus.reader.nps_chat import *
from nltk.corpus.reader.wordnet import *
from nltk.corpus.reader.switchboard import *
from nltk.corpus.reader.dependency import *
from nltk.corpus.reader.nombank import *
from nltk.corpus.reader.ipipan import *
from nltk.corpus.reader.pl196x import *
from nltk.corpus.reader.knbc import *
from nltk.corpus.reader.chasen import *
from nltk.corpus.reader.childes import *
from nltk.corpus.reader.aligned import *
from nltk.corpus.reader.lin import *
from nltk.corpus.reader.semcor import *
from nltk.corpus.reader.framenet import *
from nltk.corpus.reader.udhr import *
from nltk.corpus.reader.bnc import *
from nltk.corpus.reader.sentiwordnet import *
from nltk.corpus.reader.twitter import *
from nltk.corpus.reader.nkjp import *
from nltk.corpus.reader.crubadan import *
from nltk.corpus.reader.mte import *
from nltk.corpus.reader.reviews import *
from nltk.corpus.reader.opinion_lexicon import *
from nltk.corpus.reader.pros_cons import *
from nltk.corpus.reader.categorized_sents import *
from nltk.corpus.reader.comparative_sents import *
from nltk.corpus.reader.panlex_lite import *
from nltk.corpus.reader.panlex_swadesh import *
from nltk.corpus.reader.bcp47 import *
# Make sure that nltk.corpus.reader.bracket_parse gives the module, not
# the function bracket_parse() defined in nltk.tree:
from nltk.corpus.reader import bracket_parse
# Public re-export list for nltk.corpus.reader.  Fix: "BNCCorpusReader"
# was listed twice (mirroring the duplicated bnc import above); the second
# occurrence has been removed so every entry is unique.
__all__ = [
    "CorpusReader",
    "CategorizedCorpusReader",
    "PlaintextCorpusReader",
    "find_corpus_fileids",
    "TaggedCorpusReader",
    "CMUDictCorpusReader",
    "ConllChunkCorpusReader",
    "WordListCorpusReader",
    "PPAttachmentCorpusReader",
    "SensevalCorpusReader",
    "IEERCorpusReader",
    "ChunkedCorpusReader",
    "SinicaTreebankCorpusReader",
    "BracketParseCorpusReader",
    "IndianCorpusReader",
    "ToolboxCorpusReader",
    "TimitCorpusReader",
    "YCOECorpusReader",
    "MacMorphoCorpusReader",
    "SyntaxCorpusReader",
    "AlpinoCorpusReader",
    "RTECorpusReader",
    "StringCategoryCorpusReader",
    "EuroparlCorpusReader",
    "CategorizedBracketParseCorpusReader",
    "CategorizedTaggedCorpusReader",
    "CategorizedPlaintextCorpusReader",
    "PortugueseCategorizedPlaintextCorpusReader",
    "tagged_treebank_para_block_reader",
    "PropbankCorpusReader",
    "VerbnetCorpusReader",
    "BNCCorpusReader",
    "ConllCorpusReader",
    "XMLCorpusReader",
    "NPSChatCorpusReader",
    "SwadeshCorpusReader",
    "WordNetCorpusReader",
    "WordNetICCorpusReader",
    "SwitchboardCorpusReader",
    "DependencyCorpusReader",
    "NombankCorpusReader",
    "IPIPANCorpusReader",
    "Pl196xCorpusReader",
    "TEICorpusView",
    "KNBCorpusReader",
    "ChasenCorpusReader",
    "CHILDESCorpusReader",
    "AlignedCorpusReader",
    "TimitTaggedCorpusReader",
    "LinThesaurusCorpusReader",
    "SemcorCorpusReader",
    "FramenetCorpusReader",
    "UdhrCorpusReader",
    "SentiWordNetCorpusReader",
    "SentiSynset",
    "TwitterCorpusReader",
    "NKJPCorpusReader",
    "CrubadanCorpusReader",
    "MTECorpusReader",
    "ReviewsCorpusReader",
    "OpinionLexiconCorpusReader",
    "ProsConsCorpusReader",
    "CategorizedSentencesCorpusReader",
    "ComparativeSentencesCorpusReader",
    "PanLexLiteCorpusReader",
    "NonbreakingPrefixesCorpusReader",
    "UnicharsCorpusReader",
    "MWAPPDBCorpusReader",
    "PanlexSwadeshCorpusReader",
    "BCP47CorpusReader",
]

View File

@@ -0,0 +1,154 @@
# Natural Language Toolkit: Aligned Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# URL: <https://www.nltk.org/>
# Author: Steven Bird <stevenbird1@gmail.com>
# For license information, see LICENSE.TXT
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import (
StreamBackedCorpusView,
concat,
read_alignedsent_block,
)
from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer
from nltk.translate import AlignedSent, Alignment
class AlignedCorpusReader(CorpusReader):
    """
    Reader for corpora of word-aligned sentences.  Tokens are assumed
    to be separated by whitespace.  Sentences begin on separate lines.
    """

    def __init__(
        self,
        root,
        fileids,
        sep="/",
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        alignedsent_block_reader=read_alignedsent_block,
        encoding="latin1",
    ):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param sep: Word/tag separator; stored for API compatibility.
        :param word_tokenizer: Tokenizer used to split each sentence line
            into tokens (whitespace by default).
        :param sent_tokenizer: Tokenizer used to split a block into
            sentence lines (newline-separated by default).
        :param alignedsent_block_reader: Callable that reads one
            aligned-sentence block from a stream.
        :param encoding: Default character encoding for the corpus files.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader

    def _views(self, fileids, aligned, group_by_sent):
        """
        Build one ``AlignedSentCorpusView`` per requested file and lazily
        concatenate them.  ``aligned`` and ``group_by_sent`` select the
        shape of the items the view yields.  (Shared by ``words()``,
        ``sents()`` and ``aligned_sents()``, which previously triplicated
        this construction.)
        """
        return concat(
            [
                AlignedSentCorpusView(
                    fileid,
                    enc,
                    aligned,
                    group_by_sent,
                    self._word_tokenizer,
                    self._sent_tokenizer,
                    self._alignedsent_block_reader,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._views(fileids, aligned=False, group_by_sent=False)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._views(fileids, aligned=False, group_by_sent=True)

    def aligned_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of AlignedSent objects.
        :rtype: list(AlignedSent)
        """
        return self._views(fileids, aligned=True, group_by_sent=True)
class AlignedSentCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for aligned sentences.
    ``AlignedSentCorpusView`` objects are typically created by
    ``AlignedCorpusReader`` (not directly by nltk users).
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        aligned,
        group_by_sent,
        word_tokenizer,
        sent_tokenizer,
        alignedsent_block_reader,
    ):
        # Record the output shape and tokenization strategy first, then
        # hand the file off to the stream-backed base class.
        self._aligned = aligned
        self._group_by_sent = group_by_sent
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        # Split the next aligned-sentence block into lines, then tokenize
        # each line into words.
        tokenized = [
            self._word_tokenizer.tokenize(line)
            for raw_block in self._alignedsent_block_reader(stream)
            for line in self._sent_tokenizer.tokenize(raw_block)
        ]
        if self._aligned:
            # The third line is an alignment spec, not a sentence: undo its
            # tokenization and parse it as an Alignment (kludge inherited
            # from the original implementation).
            tokenized[2] = Alignment.fromstring(" ".join(tokenized[2]))
            return [AlignedSent(*tokenized)]
        if self._group_by_sent:
            return [tokenized[0]]
        return tokenized[0]

View File

@@ -0,0 +1,517 @@
# Natural Language Toolkit: API for Corpus Readers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
API for corpus readers.
"""
import os
import re
from collections import defaultdict
from itertools import chain
from nltk.corpus.reader.util import *
from nltk.data import FileSystemPathPointer, PathPointer, ZipFilePathPointer
class CorpusReader:
"""
A base class for "corpus reader" classes, each of which can be
used to read a specific corpus format. Each individual corpus
reader instance is used to read a specific corpus, consisting of
one or more files under a common root directory. Each file is
identified by its ``file identifier``, which is the relative path
to the file from the root directory.
A separate subclass is defined for each corpus format. These
subclasses define one or more methods that provide 'views' on the
corpus contents, such as ``words()`` (for a list of words) and
``parsed_sents()`` (for a list of parsed sentences). Called with
no arguments, these methods will return the contents of the entire
corpus. For most corpora, these methods define one or more
selection arguments, such as ``fileids`` or ``categories``, which can
be used to select which portion of the corpus should be returned.
"""
    def __init__(self, root, fileids, encoding="utf8", tagset=None):
        """
        :type root: PathPointer or str
        :param root: A path pointer identifying the root directory for
            this corpus.  If a string is specified, then it will be
            converted to a ``PathPointer`` automatically.
        :param fileids: A list of the files that make up this corpus.
            This list can either be specified explicitly, as a list of
            strings; or implicitly, as a regular expression over file
            paths.  The absolute path for each file will be constructed
            by joining the reader's root to each file name.
        :param encoding: The default unicode encoding for the files
            that make up the corpus.  The value of ``encoding`` can be any
            of the following:

            - A string: ``encoding`` is the encoding name for all files.
            - A dictionary: ``encoding[file_id]`` is the encoding
              name for the file whose identifier is ``file_id``.  If
              ``file_id`` is not in ``encoding``, then the file
              contents will be processed using non-unicode byte strings.
            - A list: ``encoding`` should be a list of ``(regexp, encoding)``
              tuples.  The encoding for a file whose identifier is ``file_id``
              will be the ``encoding`` value for the first tuple whose
              ``regexp`` matches the ``file_id``.  If no tuple's ``regexp``
              matches the ``file_id``, the file contents will be processed
              using non-unicode byte strings.
            - None: the file contents of all files will be
              processed using non-unicode byte strings.
        :param tagset: The name of the tagset used by this corpus, to be used
            for normalizing or converting the POS tags returned by the
            ``tagged_...()`` methods.
        """
        # Convert the root to a path pointer, if necessary.
        if isinstance(root, str) and not isinstance(root, PathPointer):
            # The trailing "|" (empty alternative) guarantees this match
            # always succeeds, so m is never None; the groups are simply
            # both None when `root` is not a zip path.
            m = re.match(r"(.*\.zip)/?(.*)$|", root)
            zipfile, zipentry = m.groups()
            if zipfile:
                root = ZipFilePathPointer(zipfile, zipentry)
            else:
                root = FileSystemPathPointer(root)
        elif not isinstance(root, PathPointer):
            raise TypeError("CorpusReader: expected a string or a PathPointer")
        # If `fileids` is a regexp, then expand it.
        if isinstance(fileids, str):
            fileids = find_corpus_fileids(root, fileids)
        self._fileids = fileids
        """A list of the relative paths for the fileids that make up
        this corpus."""
        self._root = root
        """The root directory for this corpus."""
        # Default locations of the metadata files surfaced by readme(),
        # license() and citation().
        self._readme = "README"
        self._license = "LICENSE"
        self._citation = "citation.bib"
        # If encoding was specified as a list of regexps, then convert
        # it to a dictionary.  First matching regexp wins; fileids that
        # match no regexp are left out (i.e. read as byte strings).
        if isinstance(encoding, list):
            encoding_dict = {}
            for fileid in self._fileids:
                for x in encoding:
                    (regexp, enc) = x
                    if re.match(regexp, fileid):
                        encoding_dict[fileid] = enc
                        break
            encoding = encoding_dict
        self._encoding = encoding
        """The default unicode encoding for the fileids that make up
        this corpus.  If ``encoding`` is None, then the file
        contents are processed using byte strings."""
        self._tagset = tagset
def __repr__(self):
if isinstance(self._root, ZipFilePathPointer):
path = f"{self._root.zipfile.filename}/{self._root.entry}"
else:
path = "%s" % self._root.path
return f"<{self.__class__.__name__} in {path!r}>"
def ensure_loaded(self):
"""
Load this corpus (if it has not already been loaded). This is
used by LazyCorpusLoader as a simple method that can be used to
make sure a corpus is loaded -- e.g., in case a user wants to
do help(some_corpus).
"""
pass # no need to actually do anything.
def readme(self):
"""
Return the contents of the corpus README file, if it exists.
"""
with self.open(self._readme) as f:
return f.read()
def license(self):
"""
Return the contents of the corpus LICENSE file, if it exists.
"""
with self.open(self._license) as f:
return f.read()
def citation(self):
"""
Return the contents of the corpus citation.bib file, if it exists.
"""
with self.open(self._citation) as f:
return f.read()
def fileids(self):
"""
Return a list of file identifiers for the fileids that make up
this corpus.
"""
return self._fileids
def abspath(self, fileid):
"""
Return the absolute path for the given file.
:type fileid: str
:param fileid: The file identifier for the file whose path
should be returned.
:rtype: PathPointer
"""
return self._root.join(fileid)
def abspaths(self, fileids=None, include_encoding=False, include_fileid=False):
"""
Return a list of the absolute paths for all fileids in this corpus;
or for the given list of fileids, if specified.
:type fileids: None or str or list
:param fileids: Specifies the set of fileids for which paths should
be returned. Can be None, for all fileids; a list of
file identifiers, for a specified set of fileids; or a single
file identifier, for a single file. Note that the return
value is always a list of paths, even if ``fileids`` is a
single file identifier.
:param include_encoding: If true, then return a list of
``(path_pointer, encoding)`` tuples.
:rtype: list(PathPointer)
"""
if fileids is None:
fileids = self._fileids
elif isinstance(fileids, str):
fileids = [fileids]
paths = [self._root.join(f) for f in fileids]
if include_encoding and include_fileid:
return list(zip(paths, [self.encoding(f) for f in fileids], fileids))
elif include_fileid:
return list(zip(paths, fileids))
elif include_encoding:
return list(zip(paths, [self.encoding(f) for f in fileids]))
else:
return paths
def raw(self, fileids=None):
"""
:param fileids: A list specifying the fileids that should be used.
:return: the given file(s) as a single string.
:rtype: str
"""
if fileids is None:
fileids = self._fileids
elif isinstance(fileids, str):
fileids = [fileids]
contents = []
for f in fileids:
with self.open(f) as fp:
contents.append(fp.read())
return concat(contents)
def open(self, file):
"""
Return an open stream that can be used to read the given file.
If the file's encoding is not None, then the stream will
automatically decode the file's contents into unicode.
:param file: The file identifier of the file to read.
"""
encoding = self.encoding(file)
stream = self._root.join(file).open(encoding)
return stream
def encoding(self, file):
"""
Return the unicode encoding for the given corpus file, if known.
If the encoding is unknown, or if the given file should be
processed using byte strings (str), then return None.
"""
if isinstance(self._encoding, dict):
return self._encoding.get(file)
else:
return self._encoding
    def _get_root(self):
        # Read accessor backing the read-only ``root`` property just below.
        return self._root
    root = property(
        _get_root,
        doc="""
        The directory where this corpus is stored.
        :type: PathPointer""",
    )
######################################################################
# { Corpora containing categorized items
######################################################################
class CategorizedCorpusReader:
    """
    A mixin class used to aid in the implementation of corpus readers
    for categorized corpora.  This class defines the method
    ``categories()``, which returns a list of the categories for the
    corpus or for a specified set of fileids; and overrides ``fileids()``
    to take a ``categories`` argument, restricting the set of fileids to
    be returned.

    Subclasses are expected to:

      - Call ``__init__()`` to set up the mapping.
      - Override all view methods to accept a ``categories`` parameter,
        which can be used *instead* of the ``fileids`` parameter, to
        select which fileids should be included in the returned view.

    NOTE(review): this mixin assumes the concrete reader also inherits
    from ``CorpusReader`` so that ``self._fileids``, ``self.open()`` and
    the base ``fileids()`` are available -- confirm for new subclasses.
    """
    def __init__(self, kwargs):
        """
        Initialize this mapping based on keyword arguments, as
        follows:

          - cat_pattern: A regular expression pattern used to find the
            category for each file identifier.  The pattern will be
            applied to each file identifier, and the first matching
            group will be used as the category label for that file.
          - cat_map: A dictionary, mapping from file identifiers to
            category labels.
          - cat_file: The name of a file that contains the mapping
            from file identifiers to categories.  The argument
            ``cat_delimiter`` can be used to specify a delimiter.

        The corresponding argument will be deleted from ``kwargs``.  If
        more than one argument is specified, an exception will be
        raised.

        Note: ``kwargs`` is the caller's own dict (not ``**kwargs``);
        the recognized keys are removed in place so the remaining
        arguments can be forwarded to the other base-class constructor.
        """
        # The f2c/c2f maps are built lazily by _init() on first use,
        # because building them may require reading corpus files.
        self._f2c = None  #: file-to-category mapping
        self._c2f = None  #: category-to-file mapping
        self._pattern = None  #: regexp specifying the mapping
        self._map = None  #: dict specifying the mapping
        self._file = None  #: fileid of file containing the mapping
        self._delimiter = None  #: delimiter for ``self._file``
        if "cat_pattern" in kwargs:
            self._pattern = kwargs["cat_pattern"]
            del kwargs["cat_pattern"]
        elif "cat_map" in kwargs:
            self._map = kwargs["cat_map"]
            del kwargs["cat_map"]
        elif "cat_file" in kwargs:
            self._file = kwargs["cat_file"]
            del kwargs["cat_file"]
            # cat_delimiter is only meaningful together with cat_file.
            if "cat_delimiter" in kwargs:
                self._delimiter = kwargs["cat_delimiter"]
                del kwargs["cat_delimiter"]
        else:
            raise ValueError(
                "Expected keyword argument cat_pattern or " "cat_map or cat_file."
            )
        # Only the first recognized key was consumed above, so any key
        # still present here means the caller passed more than one.
        if "cat_pattern" in kwargs or "cat_map" in kwargs or "cat_file" in kwargs:
            raise ValueError(
                "Specify exactly one of: cat_pattern, " "cat_map, cat_file."
            )
    def _init(self):
        # Build the file<->category maps from whichever source was
        # configured in __init__ (pattern, dict, or mapping file).
        self._f2c = defaultdict(set)
        self._c2f = defaultdict(set)
        if self._pattern is not None:
            for file_id in self._fileids:
                # First regexp group of the pattern is the category label.
                category = re.match(self._pattern, file_id).group(1)
                self._add(file_id, category)
        elif self._map is not None:
            for file_id, categories in self._map.items():
                for category in categories:
                    self._add(file_id, category)
        elif self._file is not None:
            # Mapping file format: one line per file, fields separated by
            # self._delimiter; the first field is the fileid, the rest are
            # its categories.
            with self.open(self._file) as f:
                for line in f.readlines():
                    line = line.strip()
                    file_id, categories = line.split(self._delimiter, 1)
                    if file_id not in self.fileids():
                        raise ValueError(
                            "In category mapping file %s: %s "
                            "not found" % (self._file, file_id)
                        )
                    for category in categories.split(self._delimiter):
                        self._add(file_id, category)
    def _add(self, file_id, category):
        # Record one (file, category) association in both directions.
        self._f2c[file_id].add(category)
        self._c2f[category].add(file_id)
    def categories(self, fileids=None):
        """
        Return a list of the categories that are defined for this corpus,
        or for the file(s) if it is given.
        """
        if self._f2c is None:
            self._init()
        if fileids is None:
            return sorted(self._c2f)
        if isinstance(fileids, str):
            fileids = [fileids]
        # NOTE(review): an empty ``fileids`` list reaches set.union() with
        # no arguments and raises TypeError -- confirm callers never pass [].
        return sorted(set.union(*(self._f2c[d] for d in fileids)))
    def fileids(self, categories=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that make up the given category(s) if specified.
        """
        if categories is None:
            return super().fileids()
        elif isinstance(categories, str):
            if self._f2c is None:
                self._init()
            if categories in self._c2f:
                return sorted(self._c2f[categories])
            else:
                raise ValueError("Category %s not found" % categories)
        else:
            if self._f2c is None:
                self._init()
            return sorted(set.union(*(self._c2f[c] for c in categories)))
    def _resolve(self, fileids, categories):
        # Translate a ``categories`` selection into the equivalent
        # ``fileids`` list; the two parameters are mutually exclusive.
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")
        if categories is not None:
            return self.fileids(categories)
        else:
            return fileids
    # The view methods below simply map a category selection to fileids
    # and delegate to the concrete reader's implementation.
    def raw(self, fileids=None, categories=None):
        return super().raw(self._resolve(fileids, categories))
    def words(self, fileids=None, categories=None):
        return super().words(self._resolve(fileids, categories))
    def sents(self, fileids=None, categories=None):
        return super().sents(self._resolve(fileids, categories))
    def paras(self, fileids=None, categories=None):
        return super().paras(self._resolve(fileids, categories))
######################################################################
# { Treebank readers
######################################################################
# [xx] is it worth it to factor this out?
class SyntaxCorpusReader(CorpusReader):
    """
    An abstract base class for reading corpora consisting of
    syntactically parsed text.  Subclasses should define:

      - ``__init__``, which specifies the location of the corpus
        and a method for detecting the sentence blocks in corpus files.
      - ``_read_block``, which reads a block from the input stream.
      - ``_word``, which takes a block and returns a list of list of words.
      - ``_tag``, which takes a block and returns a list of list of tagged
        words.
      - ``_parse``, which takes a block and returns a list of parsed
        sentences.
    """
    # --- hooks that concrete subclasses must provide -------------------
    def _parse(self, s):
        raise NotImplementedError()
    def _word(self, s):
        raise NotImplementedError()
    def _tag(self, s):
        raise NotImplementedError()
    def _read_block(self, stream):
        raise NotImplementedError()
    # --- public views --------------------------------------------------
    def _view_concat(self, reader, fileids):
        # Build one lazy StreamBackedCorpusView per corpus file, all
        # driven by the given block reader, and concatenate them.
        return concat(
            [
                StreamBackedCorpusView(fileid, reader, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )
    def parsed_sents(self, fileids=None):
        return self._view_concat(self._read_parsed_sent_block, fileids)
    def tagged_sents(self, fileids=None, tagset=None):
        def reader(stream):
            return self._read_tagged_sent_block(stream, tagset)
        return self._view_concat(reader, fileids)
    def sents(self, fileids=None):
        return self._view_concat(self._read_sent_block, fileids)
    def tagged_words(self, fileids=None, tagset=None):
        def reader(stream):
            return self._read_tagged_word_block(stream, tagset)
        return self._view_concat(reader, fileids)
    def words(self, fileids=None):
        return self._view_concat(self._read_word_block, fileids)
    # ------------------------------------------------------------
    # { Block Readers
    def _read_word_block(self, stream):
        # Flatten one block of sentences into a single word list.
        return list(chain.from_iterable(self._read_sent_block(stream)))
    def _read_tagged_word_block(self, stream, tagset=None):
        return list(chain.from_iterable(self._read_tagged_sent_block(stream, tagset)))
    def _read_sent_block(self, stream):
        # Empty sentences (falsy results of _word) are dropped.
        return [sent for sent in (self._word(b) for b in self._read_block(stream)) if sent]
    def _read_tagged_sent_block(self, stream, tagset=None):
        return [
            sent for sent in (self._tag(b, tagset) for b in self._read_block(stream)) if sent
        ]
    def _read_parsed_sent_block(self, stream):
        return [
            tree for tree in (self._parse(b) for b in self._read_block(stream)) if tree
        ]
# } End of Block Readers
# ------------------------------------------------------------

View File

@@ -0,0 +1,218 @@
# Natural Language Toolkit: BCP-47 language tags
#
# Copyright (C) 2022-2023 NLTK Project
# Author: Eric Kafe <kafe.eric@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re
from warnings import warn
from xml.etree import ElementTree as et
from nltk.corpus.reader import CorpusReader
class BCP47CorpusReader(CorpusReader):
    """
    Parse BCP-47 composite language tags

    Supports all the main subtags, and the 'u-sd' extension:

    >>> from nltk.corpus import bcp47
    >>> bcp47.name('oc-gascon-u-sd-fr64')
    'Occitan (post 1500): Gascon: Pyrénées-Atlantiques'

    Can load a conversion table to Wikidata Q-codes:

    >>> bcp47.load_wiki_q()
    >>> bcp47.wiki_q['en-GI-spanglis']
    'Q79388'
    """
    def __init__(self, root, fileids):
        """Read the BCP-47 database."""
        super().__init__(root, fileids)
        self.langcode = {}  # language description -> BCP-47 tag (filled by data_dict)
        with self.open("iana/language-subtag-registry.txt") as fp:
            self.db = self.data_dict(fp.read().split("%%\n"))
        with self.open("cldr/common-subdivisions-en.xml") as fp:
            self.subdiv = self.subdiv_dict(
                et.parse(fp).iterfind("localeDisplayNames/subdivisions/subdivision")
            )
        self.morphology()
    def load_wiki_q(self):
        """Load conversion table to Wikidata Q-codes (only if needed)."""
        with self.open("cldr/tools-cldr-rdf-external-entityToCode.tsv") as fp:
            self.wiki_q = self.wiki_dict(fp.read().strip().split("\n")[1:])
    def wiki_dict(self, lines):
        """Convert Wikidata list of Q-codes to a BCP-47 dictionary.

        Each line is "<entity-URL>\\t<tag>"; the Q-code is the final
        component of the URL.
        """
        return {
            pair[1]: pair[0].split("/")[-1]
            for pair in [line.strip().split("\t") for line in lines]
        }
    def subdiv_dict(self, subdivs):
        """Convert the CLDR subdivisions list to a dictionary."""
        return {sub.attrib["type"]: sub.text for sub in subdivs}
    def morphology(self):
        """Build the per-label casing and format tables used to
        normalize and validate subtags in ``parse_tag``."""
        self.casing = {
            "language": str.lower,
            "extlang": str.lower,
            "script": str.title,
            "region": str.upper,
            "variant": str.lower,
        }
        dig = "[0-9]"
        low = "[a-z]"
        up = "[A-Z]"
        alnum = "[a-zA-Z0-9]"
        # Each pattern is checked against a whole subtag with fullmatch().
        self.format = {
            "language": re.compile(f"{low*3}?"),
            "extlang": re.compile(f"{low*3}"),
            "script": re.compile(f"{up}{low*3}"),
            "region": re.compile(f"({up*2})|({dig*3})"),
            "variant": re.compile(f"{alnum*4}{(alnum+'?')*4}"),
            "singleton": re.compile(f"{low}"),
        }
    def data_dict(self, records):
        """Convert the BCP-47 language subtag registry to a dictionary.

        ``records`` are the "%%"-separated entries of the IANA registry;
        the first record carries only the File-Date header.  Deprecated
        entries are filed under ``dic["deprecated"][type]`` instead of
        ``dic[type]``.
        """
        self.version = records[0].replace("File-Date:", "").strip()
        dic = {}
        dic["deprecated"] = {}
        for label in [
            "language",
            "extlang",
            "script",
            "region",
            "variant",
            "redundant",
            "grandfathered",
        ]:
            dic["deprecated"][label] = {}
        for record in records[1:]:
            fields = [field.split(": ") for field in record.strip().split("\n")]
            typ = fields[0][1]  # record "Type:" value
            tag = fields[1][1]  # record "Subtag:"/"Tag:" value
            if typ not in dic:
                dic[typ] = {}
            subfields = {}
            for field in fields[2:]:
                if len(field) == 2:
                    [key, val] = field
                    if key not in subfields:
                        subfields[key] = [val]
                    else:  # multiple value
                        subfields[key].append(val)
                else:  # multiline field: continuation of the previous value
                    subfields[key][-1] += " " + field[0].strip()
                # Index current (non-deprecated) language names by description.
                if (
                    "Deprecated" not in record
                    and typ == "language"
                    and key == "Description"
                ):
                    self.langcode[subfields[key][-1]] = tag
            for key in subfields:
                if len(subfields[key]) == 1:  # single value
                    subfields[key] = subfields[key][0]
            if "Deprecated" in record:
                dic["deprecated"][typ][tag] = subfields
            else:
                dic[typ][tag] = subfields
        return dic
    def val2str(self, val):
        """Return only the first value of a possibly multi-valued field."""
        if isinstance(val, list):
            # val = "/".join(val) # Concatenate all values
            val = val[0]
        return val
    def lang2str(self, lg_record):
        """Concatenate subtag values into a colon-separated name."""
        name = f"{lg_record['language']}"
        for label in ["extlang", "script", "region", "variant", "extension"]:
            if label in lg_record:
                name += f": {lg_record[label]}"
        return name
    def parse_tag(self, tag):
        """Convert a BCP-47 tag to a dictionary of labelled subtags."""
        subtags = tag.split("-")
        lang = {}
        # "variant" may occur twice, hence its duplication in the label list.
        labels = ["language", "extlang", "script", "region", "variant", "variant"]
        while subtags and labels:
            subtag = subtags.pop(0)
            found = False
            while labels:
                label = labels.pop(0)
                subtag = self.casing[label](subtag)
                if self.format[label].fullmatch(subtag):
                    if subtag in self.db[label]:
                        found = True
                        valstr = self.val2str(self.db[label][subtag]["Description"])
                        if label == "variant" and label in lang:
                            lang[label] += ": " + valstr
                        else:
                            lang[label] = valstr
                        break
                    elif subtag in self.db["deprecated"][label]:
                        found = True
                        note = f"The {subtag!r} {label} code is deprecated"
                        if "Preferred-Value" in self.db["deprecated"][label][subtag]:
                            prefer = self.db["deprecated"][label][subtag][
                                "Preferred-Value"
                            ]
                            # BUGFIX: message previously contained unbalanced
                            # quotes ("...deprecated', prefer 'x'"); use the
                            # same format as name().
                            note += f", prefer {self.val2str(prefer)!r}"
                        lang[label] = self.val2str(
                            self.db["deprecated"][label][subtag]["Description"]
                        )
                        warn(note)
                        break
            if not found:
                # Guard against tags that end in "-u" or "-u-sd" with no
                # subdivision code (previously an IndexError).
                if subtag == "u" and len(subtags) >= 2 and subtags[0] == "sd":
                    # CLDR regional subdivisions
                    sd = subtags[1]
                    if sd in self.subdiv:
                        ext = self.subdiv[sd]
                    else:
                        # BUGFIX: previously formatted ``ext`` before it was
                        # assigned (UnboundLocalError); report the unknown
                        # subdivision code itself.
                        ext = f"<Unknown subdivision: {sd}>"
                else:  # other extension subtags are not supported yet
                    ext = f"{subtag}{''.join(['-'+ext for ext in subtags])}".lower()
                    if not self.format["singleton"].fullmatch(subtag):
                        ext = f"<Invalid extension: {ext}>"
                    warn(ext)
                lang["extension"] = ext
                subtags = []
        return lang
    def name(self, tag):
        """
        Convert a BCP-47 tag to a colon-separated string of subtag names

        >>> from nltk.corpus import bcp47
        >>> bcp47.name('ca-Latn-ES-valencia')
        'Catalan: Latin: Spain: Valencian'
        """
        # Whole-tag entries (redundant/grandfathered) take precedence over
        # subtag-by-subtag parsing.
        for label in ["redundant", "grandfathered"]:
            val = None
            if tag in self.db[label]:
                val = f"{self.db[label][tag]['Description']}"
                note = f"The {tag!r} code is {label}"
            elif tag in self.db["deprecated"][label]:
                val = f"{self.db['deprecated'][label][tag]['Description']}"
                note = f"The {tag!r} code is {label} and deprecated"
                if "Preferred-Value" in self.db["deprecated"][label][tag]:
                    prefer = self.db["deprecated"][label][tag]["Preferred-Value"]
                    note += f", prefer {self.val2str(prefer)!r}"
            if val:
                warn(note)
                return val
        try:
            return self.lang2str(self.parse_tag(tag))
        except Exception:  # was a bare except, which also caught SystemExit etc.
            warn(f"Tag {tag!r} was not recognized")
            return None

View File

@@ -0,0 +1,265 @@
# Natural Language Toolkit: British National Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""Corpus reader for the XML version of the British National Corpus."""
from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader, XMLCorpusView
class BNCCorpusReader(XMLCorpusReader):
    r"""Corpus reader for the XML version of the British National Corpus.

    For access to the complete XML data structure, use the ``xml()``
    method. For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.

    You can obtain the full version of the BNC corpus at
    https://www.ota.ox.ac.uk/desc/2554

    If you extracted the archive to a directory called `BNC`, then you can
    instantiate the reader as::

        BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
    """
    def __init__(self, root, fileids, lazy=True):
        XMLCorpusReader.__init__(self, root, fileids)
        # When lazy, views are backed by streaming BNCWordView objects;
        # otherwise each file is parsed eagerly into plain lists.
        self._lazy = lazy
    def words(self, fileids=None, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)

        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        return self._views(fileids, False, None, strip_space, stem)
    def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        return self._views(fileids, False, "c5" if c5 else "pos", strip_space, stem)
    def sents(self, fileids=None, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))

        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        return self._views(fileids, True, None, strip_space, stem)
    def tagged_sents(self, fileids=None, c5=False, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        return self._views(
            fileids, sent=True, tag="c5" if c5 else "pos", strip_space=strip_space, stem=stem
        )
    def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False):
        """Build one view (lazy BNCWordView, or eager word/sentence list)
        per file and concatenate them."""
        make_view = BNCWordView if self._lazy else self._words
        views = [
            make_view(path, sent, tag, strip_space, stem)
            for path in self.abspaths(fileids)
        ]
        return concat(views)
    def _words(self, fileid, bracket_sent, tag, strip_space, stem):
        """
        Helper used to implement the view methods -- returns a list of
        words or a list of sentences, optionally tagged.

        :param fileid: The name of the underlying file.
        :param bracket_sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        out = []
        doc_root = ElementTree.parse(fileid).getroot()
        for xmlsent in doc_root.findall(".//s"):
            tokens = []
            for xmlword in _all_xmlwords_in(xmlsent):
                token = xmlword.text or ""  # missing text -> "" (issue 337)
                if strip_space or stem:
                    token = token.strip()
                if stem:
                    token = xmlword.get("hw", token)
                if tag == "c5":
                    token = (token, xmlword.get("c5"))
                elif tag == "pos":
                    token = (token, xmlword.get("pos", xmlword.get("c5")))
                tokens.append(token)
            if bracket_sent:
                out.append(BNCSentence(xmlsent.attrib["n"], tokens))
            else:
                out.extend(tokens)
        assert None not in out
        return out
def _all_xmlwords_in(elt, result=None):
if result is None:
result = []
for child in elt:
if child.tag in ("c", "w"):
result.append(child)
else:
_all_xmlwords_in(child, result)
return result
class BNCSentence(list):
    """
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (the ``n`` attribute from the XML).
    """
    def __init__(self, num, items):
        super().__init__(items)
        self.num = num
class BNCWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the BNC corpus.

    Yields either sentences (``BNCSentence``) or individual tokens,
    depending on the ``sent`` flag given at construction; tokens may be
    plain strings or ``(word, tag)`` tuples depending on ``tag``.
    """

    tags_to_ignore = {
        "pb",
        "gap",
        "vocal",
        "event",
        "unclear",
        "shift",
        "pause",
        "align",
    }
    """These tags are ignored. For their description refer to the
    technical documentation, for example,
    http://www.natcorp.ox.ac.uk/docs/URG/ref-vocal.html
    """

    def __init__(self, fileid, sent, tag, strip_space, stem):
        """
        :param fileid: The name of the underlying file.
        :param sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        # Select which XML elements this view yields: whole <s> sentence
        # elements, or the individual <c>/<w> tokens inside them.
        if sent:
            tagspec = ".*/s"
        else:
            tagspec = ".*/s/(.*/)?(c|w)"
        self._sent = sent
        self._tag = tag
        self._strip_space = strip_space
        self._stem = stem
        self.title = None  #: Title of the document.
        self.author = None  #: Author of the document.
        self.editor = None  #: Editor
        self.resps = None  #: Statement of responsibility
        XMLCorpusView.__init__(self, fileid, tagspec)
        # Read in a tasty header: open the stream once just to extract the
        # teiHeader metadata, then close it again.
        self._open()
        self.read_block(self._stream, ".*/teiHeader$", self.handle_header)
        self.close()
        # Reset tag context, so iteration starts from the beginning of the
        # file rather than after the header block we just consumed.
        self._tag_context = {0: ()}

    def handle_header(self, elt, context):
        # Extract document metadata from the TEI header; multiple entries
        # of the same kind are joined with newlines.
        titles = elt.findall("titleStmt/title")
        if titles:
            self.title = "\n".join(title.text.strip() for title in titles)
        authors = elt.findall("titleStmt/author")
        if authors:
            self.author = "\n".join(author.text.strip() for author in authors)
        editors = elt.findall("titleStmt/editor")
        if editors:
            self.editor = "\n".join(editor.text.strip() for editor in editors)
        resps = elt.findall("titleStmt/respStmt")
        if resps:
            self.resps = "\n\n".join(
                "\n".join(resp_elt.text.strip() for resp_elt in resp) for resp in resps
            )

    def handle_elt(self, elt, context):
        # Dispatch per the mode chosen in __init__ (sentences vs tokens).
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        # Convert one <w>/<c> element into a token, applying the
        # stripping/stemming/tagging options from __init__.
        word = elt.text
        if not word:
            word = ""  # fixes issue 337?
        if self._strip_space or self._stem:
            word = word.strip()
        if self._stem:
            word = elt.get("hw", word)
        if self._tag == "c5":
            word = (word, elt.get("c5"))
        elif self._tag == "pos":
            # Fall back to the c5 tag when no simplified pos tag is present.
            word = (word, elt.get("pos", elt.get("c5")))
        return word

    def handle_sent(self, elt):
        sent = []
        for child in elt:
            # Wrapper elements (multiwords, highlighting, corrections,
            # truncations) contain the actual <w>/<c> children.
            if child.tag in ("mw", "hi", "corr", "trunc"):
                sent += [self.handle_word(w) for w in child]
            elif child.tag in ("w", "c"):
                sent.append(self.handle_word(child))
            elif child.tag not in self.tags_to_ignore:
                raise ValueError("Unexpected element %s" % child.tag)
        return BNCSentence(elt.attrib["n"], sent)

View File

@@ -0,0 +1,237 @@
# Natural Language Toolkit: Penn Treebank Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for corpora that consist of parenthesis-delineated parse trees.
"""
import sys
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tag import map_tag
from nltk.tree import Tree
# we use [^\s()]+ instead of \S+? to avoid matching ()
# (position pos word) triples produced by AlpinoCorpusReader._normalize(ordered=True)
SORTTAGWRD = re.compile(r"\((\d+) ([^\s()]+) ([^\s()]+)\)")
# (tag word) leaf pairs, e.g. "(NN dog)"
TAGWORD = re.compile(r"\(([^\s()]+) ([^\s()]+)\)")
# the word part of a (tag word) leaf pair
WORD = re.compile(r"\([^\s()]+ ([^\s()]+)\)")
# a tree string that opens with an extra, label-less bracket, e.g. "( (S ..."
EMPTY_BRACKETS = re.compile(r"\s*\(\s*\(")
class BracketParseCorpusReader(SyntaxCorpusReader):
    """
    Reader for corpora that consist of parenthesis-delineated parse trees,
    like those found in the "combined" section of the Penn Treebank,
    e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))".
    """
    def __init__(
        self,
        root,
        fileids,
        comment_char=None,
        detect_blocks="unindented_paren",
        encoding="utf8",
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param comment_char: The character which can appear at the start of
            a line to indicate that the rest of the line is a comment.
        :param detect_blocks: The method that is used to find blocks
            in the corpus; can be 'unindented_paren' (every unindented
            parenthesis starts a new parse), 'sexpr' (brackets are
            matched), or 'blankline' (blocks are separated by blank lines).
        :param tagset: The name of the tagset used by this corpus, to be used
            for normalizing or converting the POS tags returned by the
            ``tagged_...()`` methods.
        """
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)
        self._comment_char = comment_char
        self._detect_blocks = detect_blocks
        self._tagset = tagset
    def _read_block(self, stream):
        # Dispatch on the block-detection strategy chosen at construction.
        if self._detect_blocks == "sexpr":
            return read_sexpr_block(stream, comment_char=self._comment_char)
        elif self._detect_blocks == "blankline":
            return read_blankline_block(stream)
        elif self._detect_blocks == "unindented_paren":
            # Tokens start with unindented left parens.
            toks = read_regexp_block(stream, start_re=r"^\(")
            # Strip any comments out of the tokens.
            if self._comment_char:
                toks = [
                    re.sub("(?m)^%s.*" % re.escape(self._comment_char), "", tok)
                    for tok in toks
                ]
            return toks
        else:
            # Was ``assert 0`` -- an assert is stripped under -O and gives a
            # less informative error.
            raise ValueError("bad block type: %r" % (self._detect_blocks,))
    def _normalize(self, t):
        # Replace leaves of the form (!), (,), with (! !), (, ,)
        t = re.sub(r"\((.)\)", r"(\1 \1)", t)
        # Replace leaves of the form (tag word root) with (tag word)
        t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
        return t
    def _parse(self, t):
        """Parse one block into a Tree, attempting paren-recovery and
        finally falling back to a flat parse on malformed input."""
        try:
            tree = Tree.fromstring(self._normalize(t))
            # If there's an empty node at the top, strip it off
            if tree.label() == "" and len(tree) == 1:
                return tree[0]
            else:
                return tree
        except ValueError as e:
            sys.stderr.write("Bad tree detected; trying to recover...\n")
            # Try to recover, if we can:
            if e.args == ("mismatched parens",):
                for n in range(1, 5):
                    try:
                        # BUGFIX: must use Tree.fromstring here -- calling
                        # Tree(...) with a single string raises TypeError,
                        # which escapes the ValueError handler above, so the
                        # recovery path could never succeed.
                        v = Tree.fromstring(self._normalize(t + ")" * n))
                        sys.stderr.write(
                            " Recovered by adding %d close " "paren(s)\n" % n
                        )
                        return v
                    except ValueError:
                        pass
            # Try something else:
            sys.stderr.write(" Recovered by returning a flat parse.\n")
            # sys.stderr.write(' '.join(t.split())+'\n')
            return Tree("S", self._tag(t))
    def _tag(self, t, tagset=None):
        # Extract (word, tag) pairs; TAGWORD captures (tag word), so swap.
        tagged_sent = [(w, p) for (p, w) in TAGWORD.findall(self._normalize(t))]
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (w, p) in tagged_sent
            ]
        return tagged_sent
    def _word(self, t):
        return WORD.findall(self._normalize(t))
class CategorizedBracketParseCorpusReader(
    CategorizedCorpusReader, BracketParseCorpusReader
):
    """
    A reader for parsed corpora whose documents are
    divided into categories based on their file identifiers.

    @author: Nathan Schneider <nschneid@cs.cmu.edu>
    """
    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are consumed by
        the ``CategorizedCorpusReader`` constructor; the remaining
        arguments are passed on to the ``BracketParseCorpusReader``
        constructor.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        BracketParseCorpusReader.__init__(self, *args, **kwargs)
    # Each view method accepts ``categories`` as an alternative to
    # ``fileids``; ``_resolve`` (from CategorizedCorpusReader) translates
    # the category selection into the corresponding fileids.
    def tagged_words(self, fileids=None, categories=None, tagset=None):
        return super().tagged_words(self._resolve(fileids, categories), tagset)
    def tagged_sents(self, fileids=None, categories=None, tagset=None):
        return super().tagged_sents(self._resolve(fileids, categories), tagset)
    def tagged_paras(self, fileids=None, categories=None, tagset=None):
        return super().tagged_paras(self._resolve(fileids, categories), tagset)
    def parsed_words(self, fileids=None, categories=None):
        return super().parsed_words(self._resolve(fileids, categories))
    def parsed_sents(self, fileids=None, categories=None):
        return super().parsed_sents(self._resolve(fileids, categories))
    def parsed_paras(self, fileids=None, categories=None):
        return super().parsed_paras(self._resolve(fileids, categories))
class AlpinoCorpusReader(BracketParseCorpusReader):
"""
Reader for the Alpino Dutch Treebank.
This corpus has a lexical breakdown structure embedded, as read by `_parse`
Unfortunately this puts punctuation and some other words out of the sentence
order in the xml element tree. This is no good for `tag_` and `word_`
`_tag` and `_word` will be overridden to use a non-default new parameter 'ordered'
to the overridden _normalize function. The _parse function can then remain
untouched.
"""
def __init__(self, root, encoding="ISO-8859-1", tagset=None):
BracketParseCorpusReader.__init__(
self,
root,
r"alpino\.xml",
detect_blocks="blankline",
encoding=encoding,
tagset=tagset,
)
def _normalize(self, t, ordered=False):
"""Normalize the xml sentence element in t.
The sentence elements <alpino_ds>, although embedded in a few overall
xml elements, are separated by blank lines. That's how the reader can
deliver them one at a time.
Each sentence has a few category subnodes that are of no use to us.
The remaining word nodes may or may not appear in the proper order.
Each word node has attributes, among which:
- begin : the position of the word in the sentence
- pos : Part of Speech: the Tag
- word : the actual word
The return value is a string with all xml elementes replaced by
clauses: either a cat clause with nested clauses, or a word clause.
The order of the bracket clauses closely follows the xml.
If ordered == True, the word clauses include an order sequence number.
If ordered == False, the word clauses only have pos and word parts.
"""
if t[:10] != "<alpino_ds":
return ""
# convert XML to sexpr notation
t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t)
if ordered:
t = re.sub(
r' <node. *?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>',
r"(\1 \2 \3)",
t,
)
else:
t = re.sub(r' <node .*?pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2)", t)
t = re.sub(r" </node>", r")", t)
t = re.sub(r"<sentence>.*</sentence>", r"", t)
t = re.sub(r"</?alpino_ds.*>", r"", t)
return t
def _tag(self, t, tagset=None):
tagged_sent = [
(int(o), w, p)
for (o, p, w) in SORTTAGWRD.findall(self._normalize(t, ordered=True))
]
tagged_sent.sort()
if tagset and tagset != self._tagset:
tagged_sent = [
(w, map_tag(self._tagset, tagset, p)) for (o, w, p) in tagged_sent
]
else:
tagged_sent = [(w, p) for (o, w, p) in tagged_sent]
return tagged_sent
def _word(self, t):
    """Return a correctly ordered list of words from the sentence in *t*."""
    tagged_sent = self._tag(t)
    return [w for (w, p) in tagged_sent]

View File

@@ -0,0 +1,168 @@
# Natural Language Toolkit: Categorized Sentences Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader structured for corpora that contain one instance on each row.
This CorpusReader is specifically used for the Subjectivity Dataset and the
Sentence Polarity Dataset.
- Subjectivity Dataset information -
Authors: Bo Pang and Lillian Lee.
Url: https://www.cs.cornell.edu/people/pabo/movie-review-data
Distributed with permission.
Related papers:
- Bo Pang and Lillian Lee. "A Sentimental Education: Sentiment Analysis Using
Subjectivity Summarization Based on Minimum Cuts". Proceedings of the ACL,
2004.
- Sentence Polarity Dataset information -
Authors: Bo Pang and Lillian Lee.
Url: https://www.cs.cornell.edu/people/pabo/movie-review-data
Related papers:
- Bo Pang and Lillian Lee. "Seeing stars: Exploiting class relationships for
sentiment categorization with respect to rating scales". Proceedings of the
ACL, 2005.
"""
from nltk.corpus.reader.api import *
from nltk.tokenize import *
class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    A reader for corpora in which each row represents a single instance, mainly
    a sentence. Instances are divided into categories based on their file
    identifiers (see CategorizedCorpusReader).
    Since many corpora allow rows that contain more than one sentence, it is
    possible to specify a sentence tokenizer to retrieve all sentences instead
    of all rows.

    Examples using the Subjectivity Dataset:

    >>> from nltk.corpus import subjectivity
    >>> subjectivity.sents()[23] # doctest: +NORMALIZE_WHITESPACE
    ['television', 'made', 'him', 'famous', ',', 'but', 'his', 'biggest', 'hits',
    'happened', 'off', 'screen', '.']
    >>> subjectivity.categories()
    ['obj', 'subj']
    >>> subjectivity.words(categories='subj')
    ['smart', 'and', 'alert', ',', 'thirteen', ...]

    Examples using the Sentence Polarity Dataset:

    >>> from nltk.corpus import sentence_polarity
    >>> sentence_polarity.sents() # doctest: +NORMALIZE_WHITESPACE
    [['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish',
    'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find',
    'it', 'funny', '.'], ...]
    >>> sentence_polarity.categories()
    ['neg', 'pos']
    """

    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=None,
        encoding="utf8",
        **kwargs
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param sent_tokenizer: a tokenizer for breaking paragraphs into sentences.
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer

    def _resolve_fileids(self, fileids, categories):
        """Map *categories* onto fileids and normalize the result to a list."""
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            return self._fileids
        if isinstance(fileids, str):
            return [fileids]
        return fileids

    def sents(self, fileids=None, categories=None):
        """
        Return all sentences in the corpus or in the specified file(s).

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences have
            to be returned.
        :return: the given file(s) as a list of sentences.
            Each sentence is tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        """
        fileids = self._resolve_fileids(fileids, categories)
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None, categories=None):
        """
        Return all words and punctuation symbols in the corpus or in the
        specified file(s).

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have to
            be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        fileids = self._resolve_fileids(fileids, categories)
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_sent_block(self, stream):
        """Read up to 20 lines from *stream* and return them as tokenized
        sentences (splitting each line with the sentence tokenizer first,
        when one was configured)."""
        sents = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # End of file: stop instead of spinning through the
                # remaining iterations on empty reads.
                break
            if self._sent_tokenizer:
                sents.extend(
                    [
                        self._word_tokenizer.tokenize(sent)
                        for sent in self._sent_tokenizer.tokenize(line)
                    ]
                )
            else:
                sents.append(self._word_tokenizer.tokenize(line))
        return sents

    def _read_word_block(self, stream):
        """Read a block of sentences and flatten it into one word list."""
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words

View File

@@ -0,0 +1,154 @@
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Masato Hagiwara <hagisan@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import sys
from nltk.corpus.reader import util
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
class ChasenCorpusReader(CorpusReader):
    """Reader for corpora in ChaSen format (morphologically analysed
    Japanese text, one token per line with tab-separated analysis fields)."""

    def __init__(self, root, fileids, encoding="utf8", sent_splitter=None):
        self._sent_splitter = sent_splitter
        CorpusReader.__init__(self, root, fileids, encoding)

    def _views(self, fileids, tagged, group_by_sent, group_by_para):
        """Build one ChasenCorpusView per requested file and concatenate."""
        return concat(
            [
                ChasenCorpusView(
                    fileid,
                    enc,
                    tagged,
                    group_by_sent,
                    group_by_para,
                    self._sent_splitter,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        return self._views(fileids, False, False, False)

    def tagged_words(self, fileids=None):
        return self._views(fileids, True, False, False)

    def sents(self, fileids=None):
        return self._views(fileids, False, True, False)

    def tagged_sents(self, fileids=None):
        return self._views(fileids, True, True, False)

    def paras(self, fileids=None):
        return self._views(fileids, False, True, True)

    def tagged_paras(self, fileids=None):
        return self._views(fileids, True, True, True)
class ChasenCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for ChasenReader. Similar to ``TaggedCorpusView``,
    but this'll use fixed sets of word and sentence tokenizer.
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        sent_splitter=None,
    ):
        # Output-shape flags: keep tags, nest words into sentences,
        # nest sentences into paragraphs.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        # Optional predicate on a (word, tag) pair marking an extra
        # sentence boundary in addition to the explicit EOS lines.
        self._sent_splitter = sent_splitter
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Reads one paragraph at a time."""
        block = []
        # A paragraph block runs up to (and including) a line "EOS".
        for para_str in read_regexp_block(stream, r".", r"^EOS\n"):
            para = []
            sent = []
            for line in para_str.splitlines():
                _eos = line.strip() == "EOS"  # ChaSen end-of-sentence marker
                _cells = line.split("\t")
                # (surface form, remaining tab-separated analysis fields)
                w = (_cells[0], "\t".join(_cells[1:]))
                if not _eos:
                    sent.append(w)
                if _eos or (self._sent_splitter and self._sent_splitter(w)):
                    # Flush the finished sentence into the paragraph.
                    if not self._tagged:
                        sent = [w for (w, t) in sent]
                    if self._group_by_sent:
                        para.append(sent)
                    else:
                        para.extend(sent)
                    sent = []
            if len(sent) > 0:
                # Flush a trailing sentence that had no explicit terminator.
                if not self._tagged:
                    sent = [w for (w, t) in sent]
                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)
            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)
        return block
def demo():
    # Sample the JEITA corpus: print a slice of words, then a few tagged
    # sentences rendered as word/POS pairs (POS is the third ChaSen field).
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
    print("/".join(jeita.words()[22100:22140]))
    print(
        "\nEOS\n".join(
            "\n".join("{}/{}".format(w[0], w[1].split("\t")[2]) for w in sent)
            for sent in jeita.tagged_sents()[2170:2173]
        )
    )


def test():
    # Smoke test: tag payloads should be decoded text, not bytes.
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
    assert isinstance(jeita.tagged_words()[0][1], str)


if __name__ == "__main__":
    demo()
    test()

View File

@@ -0,0 +1,630 @@
# CHILDES XML Corpus Reader
# Copyright (C) 2001-2025 NLTK Project
# Author: Tomonori Nagano <tnagano@gc.cuny.edu>
# Alexis Dimitriadis <A.Dimitriadis@uu.nl>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for the XML version of the CHILDES corpus.
"""
__docformat__ = "epytext en"
import re
from collections import defaultdict
from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader
from nltk.util import LazyConcatenation, LazyMap, flatten
# to resolve the namespace issue
NS = "http://www.talkbank.org/ns/talkbank"
class CHILDESCorpusReader(XMLCorpusReader):
"""
Corpus reader for the XML version of the CHILDES corpus.
The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML
version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``.
Copy the needed parts of the CHILDES XML corpus into the NLTK data directory
(``nltk_data/corpora/CHILDES/``).
For access to the file text use the usual nltk functions,
``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``.
"""
def __init__(self, root, fileids, lazy=True):
    """
    :param root: the corpus root directory.
    :param fileids: a list or regexp selecting the XML files in the corpus.
    :param lazy: if true, accessors return lazy sequences (LazyMap /
        LazyConcatenation) instead of fully materialized lists.
    """
    XMLCorpusReader.__init__(self, root, fileids)
    self._lazy = lazy
def words(
self,
fileids=None,
speaker="ALL",
stem=False,
relation=False,
strip_space=True,
replace=False,
):
"""
:return: the given file(s) as a list of words
:rtype: list(str)
:param speaker: If specified, select specific speaker(s) defined
in the corpus. Default is 'ALL' (all participants). Common choices
are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
researchers)
:param stem: If true, then use word stems instead of word strings.
:param relation: If true, then return tuples of (stem, index,
dependent_index)
:param strip_space: If true, then strip trailing spaces from word
tokens. Otherwise, leave the spaces on the tokens.
:param replace: If true, then use the replaced (intended) word instead
of the original word (e.g., 'wat' will be replaced with 'watch')
"""
sent = None
pos = False
if not self._lazy:
return [
self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
for fileid in self.abspaths(fileids)
]
get_words = lambda fileid: self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
def tagged_words(
self,
fileids=None,
speaker="ALL",
stem=False,
relation=False,
strip_space=True,
replace=False,
):
"""
:return: the given file(s) as a list of tagged
words and punctuation symbols, encoded as tuples
``(word,tag)``.
:rtype: list(tuple(str,str))
:param speaker: If specified, select specific speaker(s) defined
in the corpus. Default is 'ALL' (all participants). Common choices
are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
researchers)
:param stem: If true, then use word stems instead of word strings.
:param relation: If true, then return tuples of (stem, index,
dependent_index)
:param strip_space: If true, then strip trailing spaces from word
tokens. Otherwise, leave the spaces on the tokens.
:param replace: If true, then use the replaced (intended) word instead
of the original word (e.g., 'wat' will be replaced with 'watch')
"""
sent = None
pos = True
if not self._lazy:
return [
self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
for fileid in self.abspaths(fileids)
]
get_words = lambda fileid: self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
def sents(
self,
fileids=None,
speaker="ALL",
stem=False,
relation=None,
strip_space=True,
replace=False,
):
"""
:return: the given file(s) as a list of sentences or utterances, each
encoded as a list of word strings.
:rtype: list(list(str))
:param speaker: If specified, select specific speaker(s) defined
in the corpus. Default is 'ALL' (all participants). Common choices
are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
researchers)
:param stem: If true, then use word stems instead of word strings.
:param relation: If true, then return tuples of ``(str,pos,relation_list)``.
If there is manually-annotated relation info, it will return
tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
:param strip_space: If true, then strip trailing spaces from word
tokens. Otherwise, leave the spaces on the tokens.
:param replace: If true, then use the replaced (intended) word instead
of the original word (e.g., 'wat' will be replaced with 'watch')
"""
sent = True
pos = False
if not self._lazy:
return [
self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
for fileid in self.abspaths(fileids)
]
get_words = lambda fileid: self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
def tagged_sents(
self,
fileids=None,
speaker="ALL",
stem=False,
relation=None,
strip_space=True,
replace=False,
):
"""
:return: the given file(s) as a list of
sentences, each encoded as a list of ``(word,tag)`` tuples.
:rtype: list(list(tuple(str,str)))
:param speaker: If specified, select specific speaker(s) defined
in the corpus. Default is 'ALL' (all participants). Common choices
are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
researchers)
:param stem: If true, then use word stems instead of word strings.
:param relation: If true, then return tuples of ``(str,pos,relation_list)``.
If there is manually-annotated relation info, it will return
tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
:param strip_space: If true, then strip trailing spaces from word
tokens. Otherwise, leave the spaces on the tokens.
:param replace: If true, then use the replaced (intended) word instead
of the original word (e.g., 'wat' will be replaced with 'watch')
"""
sent = True
pos = True
if not self._lazy:
return [
self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
for fileid in self.abspaths(fileids)
]
get_words = lambda fileid: self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
def corpus(self, fileids=None):
    """
    :return: the given file(s) as a dict of ``(corpus_property_key, value)``
    :rtype: list(dict)
    """
    if not self._lazy:
        return [self._get_corpus(fileid) for fileid in self.abspaths(fileids)]
    # Lazy path: parse each file only when its element is accessed.
    return LazyMap(self._get_corpus, self.abspaths(fileids))
def _get_corpus(self, fileid):
results = dict()
xmldoc = ElementTree.parse(fileid).getroot()
for key, value in xmldoc.items():
results[key] = value
return results
def participants(self, fileids=None):
    """
    :return: the given file(s) as a dict of
        ``(participant_property_key, value)``
    :rtype: list(dict)
    """
    if not self._lazy:
        return [self._get_participants(fileid) for fileid in self.abspaths(fileids)]
    return LazyMap(self._get_participants, self.abspaths(fileids))


def _get_participants(self, fileid):
    # multidimensional dicts: nested defaultdicts of arbitrary depth
    def dictOfDicts():
        return defaultdict(dictOfDicts)

    xmldoc = ElementTree.parse(fileid).getroot()
    # getting participants' data: one nested dict per participant id,
    # keyed by the <participant> element's XML attributes.
    pat = dictOfDicts()
    for participant in xmldoc.findall(
        f".//{{{NS}}}Participants/{{{NS}}}participant"
    ):
        for key, value in participant.items():
            pat[participant.get("id")][key] = value
    return pat
def age(self, fileids=None, speaker="CHI", month=False):
    """
    :return: the given file(s) as string or int
    :rtype: list or int

    :param speaker: participant id whose age to report (default 'CHI').
    :param month: If true, return months instead of year-month-date
    """
    if not self._lazy:
        return [
            self._get_age(fileid, speaker, month)
            for fileid in self.abspaths(fileids)
        ]
    get_age = lambda fileid: self._get_age(fileid, speaker, month)
    return LazyMap(get_age, self.abspaths(fileids))
def _get_age(self, fileid, speaker, month):
    # Scan the participant entries for *speaker* and return its "age"
    # attribute, converted to months when requested. Returns None (by
    # falling off the loop) when the speaker is not found.
    xmldoc = ElementTree.parse(fileid).getroot()
    for pat in xmldoc.findall(f".//{{{NS}}}Participants/{{{NS}}}participant"):
        try:
            if pat.get("id") == speaker:
                age = pat.get("age")
                if month:
                    age = self.convert_age(age)
                return age
        # some files don't have age data
        # NOTE(review): deliberate best-effort — a missing/malformed age
        # yields None rather than an exception.
        except (TypeError, AttributeError) as e:
            return None
def convert_age(self, age_year):
    """Calculate age in months from a string in CHILDES format.

    :param age_year: an ISO-8601-style CHILDES age string, e.g.
        ``"P2Y6M15D"`` (2 years, 6 months, 15 days).
    :return: the age in whole months, rounded up when more than 15 days
        into the next month, or None if the string cannot be parsed.
    """
    m = re.match(r"P(\d+)Y(\d+)M?(\d?\d?)D?", age_year)
    if m is None:
        # Unparsable age string (some corpora store ages in other
        # formats, or not at all): report "unknown" instead of raising.
        return None
    age_month = int(m.group(1)) * 12 + int(m.group(2))
    try:
        # more than half a month into the next month: round up
        if int(m.group(3)) > 15:
            age_month += 1
    # some corpora don't have age information?
    except ValueError as e:
        pass
    return age_month
def MLU(self, fileids=None, speaker="CHI"):
    """
    Mean length of utterance, in morphemes, for the given speaker.

    :return: the given file(s) as a floating number
    :rtype: list(float)
    """
    if not self._lazy:
        return [
            self._getMLU(fileid, speaker=speaker)
            for fileid in self.abspaths(fileids)
        ]
    get_MLU = lambda fileid: self._getMLU(fileid, speaker=speaker)
    return LazyMap(get_MLU, self.abspaths(fileids))
def _getMLU(self, fileid, speaker):
    """Compute the mean length of utterance (in morphemes) for *speaker*,
    skipping unintelligible, empty, and immediately-repeated sentences,
    and discounting filler tokens."""
    sents = self._get_words(
        fileid,
        speaker=speaker,
        sent=True,
        stem=True,
        relation=False,
        pos=True,
        strip_space=True,
        replace=True,
    )
    results = []
    lastSent = []
    numFillers = 0
    sentDiscount = 0
    for sent in sents:
        posList = [pos for (word, pos) in sent]
        # if any part of the sentence is unintelligible ("unk" tag), skip it
        if any(pos == "unk" for pos in posList):
            continue
        # if the sentence is null
        elif sent == []:
            continue
        # if the sentence is the same as the last sent
        elif sent == lastSent:
            continue
        else:
            results.append([word for (word, pos) in sent])
            # count number of fillers ("co" tag or missing tag)
            if len({"co", None}.intersection(posList)) > 0:
                numFillers += posList.count("co")
                numFillers += posList.count(None)
                sentDiscount += 1
        lastSent = sent
    try:
        thisWordList = flatten(results)
        # count number of morphemes
        # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
        numWords = (
            len(flatten([word.split("-") for word in thisWordList])) - numFillers
        )
        numSents = len(results) - sentDiscount
        mlu = numWords / numSents
    except ZeroDivisionError:
        # no countable sentences at all
        mlu = 0
    # return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents}
    return mlu
def _get_words(
self, fileid, speaker, sent, stem, relation, pos, strip_space, replace
):
if (
isinstance(speaker, str) and speaker != "ALL"
): # ensure we have a list of speakers
speaker = [speaker]
xmldoc = ElementTree.parse(fileid).getroot()
# processing each xml doc
results = []
for xmlsent in xmldoc.findall(".//{%s}u" % NS):
sents = []
# select speakers
if speaker == "ALL" or xmlsent.get("who") in speaker:
for xmlword in xmlsent.findall(".//{%s}w" % NS):
infl = None
suffixStem = None
suffixTag = None
# getting replaced words
if replace and xmlsent.find(f".//{{{NS}}}w/{{{NS}}}replacement"):
xmlword = xmlsent.find(
f".//{{{NS}}}w/{{{NS}}}replacement/{{{NS}}}w"
)
elif replace and xmlsent.find(f".//{{{NS}}}w/{{{NS}}}wk"):
xmlword = xmlsent.find(f".//{{{NS}}}w/{{{NS}}}wk")
# get text
if xmlword.text:
word = xmlword.text
else:
word = ""
# strip tailing space
if strip_space:
word = word.strip()
# stem
if relation or stem:
try:
xmlstem = xmlword.find(".//{%s}stem" % NS)
word = xmlstem.text
except AttributeError as e:
pass
# if there is an inflection
try:
xmlinfl = xmlword.find(
f".//{{{NS}}}mor/{{{NS}}}mw/{{{NS}}}mk"
)
word += "-" + xmlinfl.text
except:
pass
# if there is a suffix
try:
xmlsuffix = xmlword.find(
".//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem"
% (NS, NS, NS, NS)
)
suffixStem = xmlsuffix.text
except AttributeError:
suffixStem = ""
if suffixStem:
word += "~" + suffixStem
# pos
if relation or pos:
try:
xmlpos = xmlword.findall(".//{%s}c" % NS)
xmlpos2 = xmlword.findall(".//{%s}s" % NS)
if xmlpos2 != []:
tag = xmlpos[0].text + ":" + xmlpos2[0].text
else:
tag = xmlpos[0].text
except (AttributeError, IndexError) as e:
tag = ""
try:
xmlsuffixpos = xmlword.findall(
".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c"
% (NS, NS, NS, NS, NS)
)
xmlsuffixpos2 = xmlword.findall(
".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s"
% (NS, NS, NS, NS, NS)
)
if xmlsuffixpos2:
suffixTag = (
xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text
)
else:
suffixTag = xmlsuffixpos[0].text
except:
pass
if suffixTag:
tag += "~" + suffixTag
word = (word, tag)
# relational
# the gold standard is stored in
# <mor></mor><mor type="trn"><gra type="grt">
if relation == True:
for xmlstem_rel in xmlword.findall(
f".//{{{NS}}}mor/{{{NS}}}gra"
):
if not xmlstem_rel.get("type") == "grt":
word = (
word[0],
word[1],
xmlstem_rel.get("index")
+ "|"
+ xmlstem_rel.get("head")
+ "|"
+ xmlstem_rel.get("relation"),
)
else:
word = (
word[0],
word[1],
word[2],
word[0],
word[1],
xmlstem_rel.get("index")
+ "|"
+ xmlstem_rel.get("head")
+ "|"
+ xmlstem_rel.get("relation"),
)
try:
for xmlpost_rel in xmlword.findall(
f".//{{{NS}}}mor/{{{NS}}}mor-post/{{{NS}}}gra"
):
if not xmlpost_rel.get("type") == "grt":
suffixStem = (
suffixStem[0],
suffixStem[1],
xmlpost_rel.get("index")
+ "|"
+ xmlpost_rel.get("head")
+ "|"
+ xmlpost_rel.get("relation"),
)
else:
suffixStem = (
suffixStem[0],
suffixStem[1],
suffixStem[2],
suffixStem[0],
suffixStem[1],
xmlpost_rel.get("index")
+ "|"
+ xmlpost_rel.get("head")
+ "|"
+ xmlpost_rel.get("relation"),
)
except:
pass
sents.append(word)
if sent or relation:
results.append(sents)
else:
results.extend(sents)
return LazyMap(lambda x: x, results)
# Ready-to-use browser opener
"""
The base URL for viewing files on the childes website. This
shouldn't need to be changed, unless CHILDES changes the configuration
of their server or unless the user sets up their own corpus webserver.
"""
childes_url_base = r"https://childes.talkbank.org/browser/index.php?url="


def webview_file(self, fileid, urlbase=None):
    """Map a corpus file to its web version on the CHILDES website,
    and open it in a web browser.

    The complete URL to be used is:
    childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha')

    If no urlbase is passed, we try to calculate it. This
    requires that the childes corpus was set up to mirror the
    folder hierarchy under childes.psy.cmu.edu/data-xml/, e.g.:
    nltk_data/corpora/childes/Eng-USA/Cornell/??? or
    nltk_data/corpora/childes/Romance/Spanish/Aguirre/???

    The function first looks (as a special case) if "Eng-USA" is
    on the path consisting of <corpus root>+fileid; then if
    "childes", possibly followed by "data-xml", appears. If neither
    one is found, we use the unmodified fileid and hope for the best.
    If this is not right, specify urlbase explicitly, e.g., if the
    corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'.
    """
    import webbrowser

    if urlbase:
        path = urlbase + "/" + fileid
    else:
        full = self.root + "/" + fileid
        # normalize Windows path separators before matching
        full = re.sub(r"\\", "/", full)
        if "/childes/" in full.lower():
            # Discard /data-xml/ if present
            path = re.findall(r"(?i)/childes(?:/data-xml)?/(.*)\.xml", full)[0]
        elif "eng-usa" in full.lower():
            path = "Eng-USA/" + re.findall(r"/(?i)Eng-USA/(.*)\.xml", full)[0]
        else:
            path = fileid
    # Strip ".xml" and add ".cha", as necessary:
    if path.endswith(".xml"):
        path = path[:-4]
    if not path.endswith(".cha"):
        path = path + ".cha"
    url = self.childes_url_base + path
    webbrowser.open_new_tab(url)
    print("Opening in browser:", url)
    # Pausing is a good idea, but it's up to the user...
    # raw_input("Hit Return to continue")
def demo(corpus_root=None):
"""
The CHILDES corpus should be manually downloaded and saved
to ``[NLTK_Data_Dir]/corpora/childes/``
"""
if not corpus_root:
from nltk.data import find
corpus_root = find("corpora/childes/data-xml/Eng-USA/")
try:
childes = CHILDESCorpusReader(corpus_root, ".*.xml")
# describe all corpus
for file in childes.fileids()[:5]:
corpus = ""
corpus_id = ""
for key, value in childes.corpus(file)[0].items():
if key == "Corpus":
corpus = value
if key == "Id":
corpus_id = value
print("Reading", corpus, corpus_id, " .....")
print("words:", childes.words(file)[:7], "...")
print(
"words with replaced words:",
childes.words(file, replace=True)[:7],
" ...",
)
print("words with pos tags:", childes.tagged_words(file)[:7], " ...")
print("words (only MOT):", childes.words(file, speaker="MOT")[:7], "...")
print("words (only CHI):", childes.words(file, speaker="CHI")[:7], "...")
print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
print(
"words with relations and pos-tag:",
childes.words(file, relation=True)[:5],
" ...",
)
print("sentence:", childes.sents(file)[:2], " ...")
for participant, values in childes.participants(file)[0].items():
for key, value in values.items():
print("\tparticipant", participant, key, ":", value)
print("num of sent:", len(childes.sents(file)))
print("num of morphemes:", len(childes.words(file, stem=True)))
print("age:", childes.age(file))
print("age in month:", childes.age(file, month=True))
print("MLU:", childes.MLU(file))
print()
except LookupError as e:
print(
"""The CHILDES corpus, or the parts you need, should be manually
downloaded from https://childes.talkbank.org/data-xml/ and saved at
[NLTK_Data_Dir]/corpora/childes/
Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
demo('/path/to/childes/data-xml/Eng-USA/")
"""
)
# corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
# corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
##this fails
# childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())
if __name__ == "__main__":
demo()

View File

@@ -0,0 +1,273 @@
# Natural Language Toolkit: Chunked Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A reader for corpora that contain chunked (and optionally tagged)
documents.
"""
import codecs
import os.path
import nltk
from nltk.chunk import tagstr2tree
from nltk.corpus.reader.api import *
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
from nltk.corpus.reader.util import *
from nltk.tokenize import *
from nltk.tree import Tree
class ChunkedCorpusReader(CorpusReader):
    """
    Reader for chunked (and optionally tagged) corpora. Paragraphs
    are split using a block reader, tokenized into sentences with a
    sentence tokenizer, and finally parsed into chunk trees with a
    string-to-chunktree conversion function. Each step can use a
    default or a custom function. By default, paragraphs are split on
    blank lines; sentences are listed one per line; and sentences are
    parsed into chunk trees using ``nltk.chunk.tagstr2tree``.
    """

    def __init__(
        self,
        root,
        fileids,
        extension="",
        str2chunktree=tagstr2tree,
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
        encoding="utf8",
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        # Arguments for corpus views generated by this corpus: a tuple
        # (str2chunktree, sent_tokenizer, para_block_reader, source_tagset)
        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader, tagset)

    def _view(self, fileids, tagged, group_by_sent, group_by_para, chunked, tagset=None):
        """One ChunkedCorpusView per file, concatenated into one sequence."""
        return concat(
            [
                ChunkedCorpusView(
                    f,
                    enc,
                    tagged,
                    group_by_sent,
                    group_by_para,
                    chunked,
                    *self._cv_args,
                    target_tagset=tagset,
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return self._view(fileids, 0, 0, 0, 0)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences or utterances,
            each encoded as a list of word strings.
        :rtype: list(list(str))
        """
        return self._view(fileids, 0, 1, 0, 0)

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of paragraphs, each encoded as
            a list of sentences, which are in turn encoded as lists of
            word strings.
        :rtype: list(list(list(str)))
        """
        return self._view(fileids, 0, 1, 1, 0)

    def tagged_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged words and punctuation
            symbols, encoded as tuples ``(word,tag)``.
        :rtype: list(tuple(str,str))
        """
        return self._view(fileids, 1, 0, 0, 0, tagset)

    def tagged_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of sentences, each encoded as
            a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))
        """
        return self._view(fileids, 1, 1, 0, 0, tagset)

    def tagged_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of paragraphs, each encoded as
            a list of sentences, which are in turn encoded as lists of
            ``(word,tag)`` tuples.
        :rtype: list(list(list(tuple(str,str))))
        """
        return self._view(fileids, 1, 1, 1, 0, tagset)

    def chunked_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged words and chunks.
            Words are encoded as ``(word, tag)`` tuples (if the corpus has
            tags) or word strings (if the corpus has no tags). Chunks are
            encoded as depth-one trees over ``(word,tag)`` tuples or word
            strings.
        :rtype: list(tuple(str,str) and Tree)
        """
        return self._view(fileids, 1, 0, 0, 1, tagset)

    def chunked_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of sentences, each encoded as
            a shallow Tree. The leaves of these trees are encoded as
            ``(word, tag)`` tuples (if the corpus has tags) or word
            strings (if the corpus has no tags).
        :rtype: list(Tree)
        """
        return self._view(fileids, 1, 1, 0, 1, tagset)

    def chunked_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of paragraphs, each encoded as
            a list of sentences, which are in turn encoded as a shallow
            Tree. The leaves of these trees are encoded as ``(word, tag)``
            tuples (if the corpus has tags) or word strings (if the corpus
            has no tags).
        :rtype: list(list(Tree))
        """
        return self._view(fileids, 1, 1, 1, 1, tagset)

    def _read_block(self, stream):
        return [tagstr2tree(t) for t in read_blankline_block(stream)]
class ChunkedCorpusView(StreamBackedCorpusView):
    """
    A corpus view for chunked corpora: reads one paragraph block at a
    time and converts it into words, sentences, paragraphs, or chunk
    trees according to the constructor flags.
    """

    def __init__(
        self,
        fileid,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        chunked,
        str2chunktree,
        sent_tokenizer,
        para_block_reader,
        source_tagset=None,
        target_tagset=None,
    ):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
        # Output-shape flags: keep tags, nest into sentences/paragraphs,
        # and whether to keep chunk (Tree) structure.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._chunked = chunked
        # Pipeline pieces: paragraph block reader -> sentence tokenizer
        # -> string-to-chunktree parser (with optional tagset mapping).
        self._str2chunktree = str2chunktree
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._source_tagset = source_tagset
        self._target_tagset = target_tagset

    def read_block(self, stream):
        """Read one paragraph block and convert it to the requested shape."""
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                sent = self._str2chunktree(
                    sent_str,
                    source_tagset=self._source_tagset,
                    target_tagset=self._target_tagset,
                )
                # If requested, throw away the tags.
                if not self._tagged:
                    sent = self._untag(sent)
                # If requested, throw away the chunks.
                if not self._chunked:
                    sent = sent.leaves()
                # Add the sentence to `para`.
                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)
            # Add the paragraph to `block`.
            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)
        # Return the block
        return block

    def _untag(self, tree):
        """Replace every (word, tag) leaf of *tree* with the bare word,
        recursing into nested chunks; mutates and returns *tree*."""
        for i, child in enumerate(tree):
            if isinstance(child, Tree):
                self._untag(child)
            elif isinstance(child, tuple):
                tree[i] = child[0]
            else:
                raise ValueError("expected child to be Tree or tuple")
        return tree

View File

@@ -0,0 +1,88 @@
# Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6]
ftp://ftp.cs.cmu.edu/project/speech/dict/
Copyright 1998 Carnegie Mellon University
File Format: Each line consists of an uppercased word, a counter
(for alternative pronunciations), and a transcription. Vowels are
marked for stress (1=primary, 2=secondary, 0=no stress). E.g.:
NATURAL 1 N AE1 CH ER0 AH0 L
The dictionary contains 127069 entries. Of these, 119400 words are assigned
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
three or more pronunciations. Many of these are fast-speech variants.
Phonemes: There are 39 phonemes, as shown below:
Phoneme Example Translation Phoneme Example Translation
------- ------- ----------- ------- ------- -----------
AA odd AA D AE at AE T
AH hut HH AH T AO ought AO T
AW cow K AW AY hide HH AY D
B be B IY CH cheese CH IY Z
D dee D IY DH thee DH IY
EH Ed EH D ER hurt HH ER T
EY ate EY T F fee F IY
G green G R IY N HH he HH IY
IH it IH T IY eat IY T
JH gee JH IY K key K IY
L lee L IY M me M IY
N knee N IY NG ping P IH NG
OW oat OW T OY toy T OY
P pee P IY R read R IY D
S sea S IY SH she SH IY
T tea T IY TH theta TH EY T AH
UH hood HH UH D UW two T UW
V vee V IY W we W IY
Y yield Y IY L D Z zee Z IY
ZH seizure S IY ZH ER
"""
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.util import Index
class CMUDictCorpusReader(CorpusReader):
    def entries(self):
        """
        :return: the cmudict lexicon as a list of entries
            containing (word, transcriptions) tuples.
        """
        views = [
            StreamBackedCorpusView(path, read_cmudict_block, encoding=enc)
            for path, enc in self.abspaths(None, True)
        ]
        return concat(views)

    def words(self):
        """
        :return: a list of all words defined in the cmudict lexicon.
        """
        return [entry[0].lower() for entry in self.entries()]

    def dict(self):
        """
        :return: the cmudict lexicon as a dictionary, whose keys are
            lowercase words and whose values are lists of pronunciations.
        """
        index = Index(self.entries())
        return {word: prons for word, prons in index.items()}
def read_cmudict_block(stream):
    """
    Read up to 100 cmudict entries from *stream*, returning a list of
    ``(word, transcription)`` tuples.  The word is lowercased and the
    counter field (second column) is discarded.  Returns a short (or
    empty) list at end of file.
    """
    entries = []
    for _ in range(100):  # cap each block at 100 entries
        line = stream.readline()
        if not line:
            break  # end of file.
        fields = line.split()
        entries.append((fields[0].lower(), fields[2:]))
    return entries

View File

@@ -0,0 +1,309 @@
# Natural Language Toolkit: Comparative Sentence Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for the Comparative Sentence Dataset.
- Comparative Sentence Dataset information -
Annotated by: Nitin Jindal and Bing Liu, 2006.
Department of Computer Science
University of Illinois at Chicago
Contact: Nitin Jindal, njindal@cs.uic.edu
Bing Liu, liub@cs.uic.edu (https://www.cs.uic.edu/~liub)
Distributed with permission.
Related papers:
- Nitin Jindal and Bing Liu. "Identifying Comparative Sentences in Text Documents".
Proceedings of the ACM SIGIR International Conference on Information Retrieval
(SIGIR-06), 2006.
- Nitin Jindal and Bing Liu. "Mining Comparative Sentences and Relations".
Proceedings of Twenty First National Conference on Artificial Intelligence
(AAAI-2006), 2006.
- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
Proceedings of the 22nd International Conference on Computational Linguistics
(Coling-2008), Manchester, 18-22 August, 2008.
"""
import re
from nltk.corpus.reader.api import *
from nltk.tokenize import *
# Regular expressions for dataset components
STARS = re.compile(r"^\*+$")  # separator line made entirely of asterisks
COMPARISON = re.compile(r"<cs-[1234]>")  # opening tag for any comparison type (1-4)
CLOSE_COMPARISON = re.compile(r"</cs-[1234]>")  # matching closing tag
GRAD_COMPARISON = re.compile(r"<cs-[123]>")  # gradable comparison types (1-3)
NON_GRAD_COMPARISON = re.compile(r"<cs-4>")  # non-gradable comparison (type 4)
ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)")  # "<code>_<entity/feature text>" annotations
KEYWORD = re.compile(r"\(([^\(]*)\)$")  # trailing "(keyword)" at end of a relation line
class Comparison:
    """
    A Comparison represents a comparative sentence and its constituents.
    """

    def __init__(
        self,
        text=None,
        comp_type=None,
        entity_1=None,
        entity_2=None,
        feature=None,
        keyword=None,
    ):
        """
        :param text: a string (optionally tokenized) containing a comparison.
        :param comp_type: an integer defining the type of comparison expressed.
            Values can be: 1 (Non-equal gradable), 2 (Equative), 3 (Superlative),
            4 (Non-gradable).
        :param entity_1: the first entity considered in the comparison relation.
        :param entity_2: the second entity considered in the comparison relation.
        :param feature: the feature considered in the comparison relation.
        :param keyword: the word or phrase which is used for that comparative relation.
        """
        self.text = text
        self.comp_type = comp_type
        self.entity_1 = entity_1
        self.entity_2 = entity_2
        self.feature = feature
        self.keyword = keyword

    def __repr__(self):
        # Same rendering as str.format on the original template string.
        return (
            f'Comparison(text="{self.text}", comp_type={self.comp_type}, '
            f'entity_1="{self.entity_1}", entity_2="{self.entity_2}", '
            f'feature="{self.feature}", keyword="{self.keyword}")'
        )
class ComparativeSentencesCorpusReader(CorpusReader):
    """
    Reader for the Comparative Sentence Dataset by Jindal and Liu (2006).
    >>> from nltk.corpus import comparative_sentences
    >>> comparison = comparative_sentences.comparisons()[0]
    >>> comparison.text # doctest: +NORMALIZE_WHITESPACE
    ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly',
    'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve",
    'had', '.']
    >>> comparison.entity_2
    'models'
    >>> (comparison.feature, comparison.keyword)
    ('rewind', 'more')
    >>> len(comparative_sentences.comparisons())
    853
    """

    # Corpus-view class used to build lazy, stream-backed views.
    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: a list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param sent_tokenizer: tokenizer for breaking paragraphs into sentences.
        :param encoding: the encoding that should be used to read the corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._readme = "README.txt"

    def comparisons(self, fileids=None):
        """
        Return all comparisons in the corpus.
        :param fileids: a list or regexp specifying the ids of the files whose
            comparisons have to be returned.
        :return: the given file(s) as a list of Comparison objects.
        :rtype: list(Comparison)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_comparison_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def keywords(self, fileids=None):
        """
        Return a set of all keywords used in the corpus.
        :param fileids: a list or regexp specifying the ids of the files whose
            keywords have to be returned.
        :return: the set of keywords and comparative phrases used in the corpus.
        :rtype: set(str)
        """
        all_keywords = concat(
            [
                self.CorpusView(path, self._read_keyword_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )
        keywords_set = {keyword.lower() for keyword in all_keywords if keyword}
        return keywords_set

    def keywords_readme(self):
        """
        Return the list of words and constituents considered as clues of a
        comparison (from listOfkeywords.txt).
        """
        keywords = []
        with self.open("listOfkeywords.txt") as fp:
            raw_text = fp.read()
        for line in raw_text.split("\n"):
            # Skip blank lines and "//" comment lines in the keyword file.
            if not line or line.startswith("//"):
                continue
            keywords.append(line.strip())
        return keywords

    def sents(self, fileids=None):
        """
        Return all sentences in the corpus.
        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: all sentences of the corpus as lists of tokens (or as plain
            strings, if no word tokenizer is specified).
        :rtype: list(list(str)) or list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None):
        """
        Return all words and punctuation symbols in the corpus.
        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_comparison_block(self, stream):
        """
        Read one annotated comparison from *stream* and return its
        Comparison objects; returns an empty list at end of file.
        Annotation layout: a tag line, then the sentence line, then a
        closing-tag line, then one relation line per gradable tag.
        """
        while True:
            line = stream.readline()
            if not line:
                return []  # end of file.
            comparison_tags = re.findall(COMPARISON, line)
            if comparison_tags:
                grad_comparisons = re.findall(GRAD_COMPARISON, line)
                non_grad_comparisons = re.findall(NON_GRAD_COMPARISON, line)
                # Advance to the next line (it contains the comparative sentence)
                comparison_text = stream.readline().strip()
                if self._word_tokenizer:
                    comparison_text = self._word_tokenizer.tokenize(comparison_text)
                # Skip the next line (it contains closing comparison tags)
                stream.readline()
                # If gradable comparisons are found, create Comparison instances
                # and populate their fields
                comparison_bundle = []
                if grad_comparisons:
                    # Each comparison tag has its own relations on a separate line
                    for comp in grad_comparisons:
                        comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
                        comparison = Comparison(
                            text=comparison_text, comp_type=comp_type
                        )
                        line = stream.readline()
                        entities_feats = ENTITIES_FEATS.findall(line)
                        if entities_feats:
                            for code, entity_feat in entities_feats:
                                if code == "1":
                                    comparison.entity_1 = entity_feat.strip()
                                elif code == "2":
                                    comparison.entity_2 = entity_feat.strip()
                                elif code == "3":
                                    comparison.feature = entity_feat.strip()
                        keyword = KEYWORD.findall(line)
                        if keyword:
                            comparison.keyword = keyword[0]
                        comparison_bundle.append(comparison)
                # If non-gradable comparisons are found, create a simple Comparison
                # instance for each one
                if non_grad_comparisons:
                    for comp in non_grad_comparisons:
                        # comp_type in this case should always be 4.
                        comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
                        comparison = Comparison(
                            text=comparison_text, comp_type=comp_type
                        )
                        comparison_bundle.append(comparison)
                # Flatten the list of comparisons before returning them
                # return concat([comparison_bundle])
                return comparison_bundle

    def _read_keyword_block(self, stream):
        """Collect the keyword of each comparison in the next block."""
        keywords = []
        for comparison in self._read_comparison_block(stream):
            keywords.append(comparison.keyword)
        return keywords

    def _read_sent_block(self, stream):
        """
        Read and tokenize the next plain (non-annotation) sentence line,
        skipping "***" delimited sections and annotation lines.
        """
        while True:
            line = stream.readline()
            if re.match(STARS, line):
                while True:
                    line = stream.readline()
                    if re.match(STARS, line):
                        break
                continue
            # A line is a sentence only if it carries no comparison markup.
            if (
                not re.findall(COMPARISON, line)
                and not ENTITIES_FEATS.findall(line)
                and not re.findall(CLOSE_COMPARISON, line)
            ):
                if self._sent_tokenizer:
                    return [
                        self._word_tokenizer.tokenize(sent)
                        for sent in self._sent_tokenizer.tokenize(line)
                    ]
                else:
                    return [self._word_tokenizer.tokenize(line)]

    def _read_word_block(self, stream):
        """Flatten the next sentence block into a list of tokens."""
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words

View File

@@ -0,0 +1,579 @@
# Natural Language Toolkit: CONLL Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Read CoNLL-style chunk fileids.
"""
import textwrap
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tag import map_tag
from nltk.tree import Tree
from nltk.util import LazyConcatenation, LazyMap
class ConllCorpusReader(CorpusReader):
    """
    A corpus reader for CoNLL-style files. These files consist of a
    series of sentences, separated by blank lines. Each sentence is
    encoded using a table (or "grid") of values, where each line
    corresponds to a single word, and each column corresponds to an
    annotation type. The set of columns used by CoNLL-style files can
    vary from corpus to corpus; the ``ConllCorpusReader`` constructor
    therefore takes an argument, ``columntypes``, which is used to
    specify the columns that are used by a given corpus. By default
    columns are split by consecutive whitespaces, with the
    ``separator`` argument you can set a string to split by (e.g.
    ``\'\t\'``).
    @todo: Add support for reading from corpora where different
        parallel files contain different columns.
    @todo: Possibly add caching of the grid corpus view? This would
        allow the same grid view to be used by different data access
        methods (eg words() and parsed_sents() could both share the
        same grid corpus view object).
    @todo: Better support for -DOCSTART-. Currently, we just ignore
        it, but it could be used to define methods that retrieve a
        document at a time (eg parsed_documents()).
    """

    # /////////////////////////////////////////////////////////////////
    # Column Types
    # /////////////////////////////////////////////////////////////////

    WORDS = "words"  #: column type for words
    POS = "pos"  #: column type for part-of-speech tags
    TREE = "tree"  #: column type for parse trees
    CHUNK = "chunk"  #: column type for chunk structures
    NE = "ne"  #: column type for named entities
    SRL = "srl"  #: column type for semantic role labels
    IGNORE = "ignore"  #: column type for column that should be ignored

    #: A list of all column types supported by the conll corpus reader.
    COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE)

    # /////////////////////////////////////////////////////////////////
    # Constructor
    # /////////////////////////////////////////////////////////////////

    def __init__(
        self,
        root,
        fileids,
        columntypes,
        chunk_types=None,
        root_label="S",
        pos_in_tree=False,
        srl_includes_roleset=True,
        encoding="utf8",
        tree_class=Tree,
        tagset=None,
        separator=None,
    ):
        """
        :param root: the root directory of the corpus.
        :param fileids: a list or regexp specifying the fileids in this corpus.
        :param columntypes: an ordered sequence of values from ``COLUMN_TYPES``,
            naming the annotation carried by each grid column.
        :param chunk_types: the chunk labels to keep (others treated as "O");
            a single string is accepted and wrapped in a list.
        :param root_label: node label for the root of chunk trees.
        :param pos_in_tree: if true, leave POS tags as preterminal nodes in
            parse trees instead of folding them into ``(word, tag)`` leaves.
        :param srl_includes_roleset: if true, the SRL column is followed by a
            roleset-id column.
        :param tree_class: class used to construct parse trees.
        :param tagset: the corpus' native tagset (used by tag mapping).
        :param separator: column separator string (default: any whitespace).
        """
        for columntype in columntypes:
            if columntype not in self.COLUMN_TYPES:
                raise ValueError("Bad column type %r" % columntype)
        if isinstance(chunk_types, str):
            chunk_types = [chunk_types]
        self._chunk_types = chunk_types
        self._colmap = {c: i for (i, c) in enumerate(columntypes)}
        self._pos_in_tree = pos_in_tree
        self._root_label = root_label  # for chunks
        self._srl_includes_roleset = srl_includes_roleset
        self._tree_class = tree_class
        CorpusReader.__init__(self, root, fileids, encoding)
        self._tagset = tagset
        self.sep = separator

    # /////////////////////////////////////////////////////////////////
    # Data Access Methods
    # /////////////////////////////////////////////////////////////////

    def words(self, fileids=None):
        """Return a flat list of word strings."""
        self._require(self.WORDS)
        return LazyConcatenation(LazyMap(self._get_words, self._grids(fileids)))

    def sents(self, fileids=None):
        """Return a list of sentences, each a list of word strings."""
        self._require(self.WORDS)
        return LazyMap(self._get_words, self._grids(fileids))

    def tagged_words(self, fileids=None, tagset=None):
        """Return a flat list of ``(word, tag)`` tuples."""
        self._require(self.WORDS, self.POS)

        def get_tagged_words(grid):
            return self._get_tagged_words(grid, tagset)

        return LazyConcatenation(LazyMap(get_tagged_words, self._grids(fileids)))

    def tagged_sents(self, fileids=None, tagset=None):
        """Return a list of sentences, each a list of ``(word, tag)`` tuples."""
        self._require(self.WORDS, self.POS)

        def get_tagged_words(grid):
            return self._get_tagged_words(grid, tagset)

        return LazyMap(get_tagged_words, self._grids(fileids))

    def chunked_words(self, fileids=None, chunk_types=None, tagset=None):
        """Return a flat list of tagged words and chunk Trees."""
        self._require(self.WORDS, self.POS, self.CHUNK)
        if chunk_types is None:
            chunk_types = self._chunk_types

        def get_chunked_words(grid):  # capture chunk_types as local var
            return self._get_chunked_words(grid, chunk_types, tagset)

        return LazyConcatenation(LazyMap(get_chunked_words, self._grids(fileids)))

    def chunked_sents(self, fileids=None, chunk_types=None, tagset=None):
        """Return a list of sentences as shallow chunk Trees."""
        self._require(self.WORDS, self.POS, self.CHUNK)
        if chunk_types is None:
            chunk_types = self._chunk_types

        def get_chunked_words(grid):  # capture chunk_types as local var
            return self._get_chunked_words(grid, chunk_types, tagset)

        return LazyMap(get_chunked_words, self._grids(fileids))

    def parsed_sents(self, fileids=None, pos_in_tree=None, tagset=None):
        """Return a list of parse Trees, one per sentence."""
        self._require(self.WORDS, self.POS, self.TREE)
        if pos_in_tree is None:
            pos_in_tree = self._pos_in_tree

        def get_parsed_sent(grid):  # capture pos_in_tree as local var
            return self._get_parsed_sent(grid, pos_in_tree, tagset)

        return LazyMap(get_parsed_sent, self._grids(fileids))

    def srl_spans(self, fileids=None):
        """Return, per sentence, the list of SRL span lists."""
        self._require(self.SRL)
        return LazyMap(self._get_srl_spans, self._grids(fileids))

    def srl_instances(self, fileids=None, pos_in_tree=None, flatten=True):
        """Return SRL instances, flattened across sentences unless
        ``flatten`` is false."""
        self._require(self.WORDS, self.POS, self.TREE, self.SRL)
        if pos_in_tree is None:
            pos_in_tree = self._pos_in_tree

        def get_srl_instances(grid):  # capture pos_in_tree as local var
            return self._get_srl_instances(grid, pos_in_tree)

        result = LazyMap(get_srl_instances, self._grids(fileids))
        if flatten:
            result = LazyConcatenation(result)
        return result

    def iob_words(self, fileids=None, tagset=None):
        """
        :return: a list of word/tag/IOB tuples
        :rtype: list(tuple)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        """
        self._require(self.WORDS, self.POS, self.CHUNK)

        def get_iob_words(grid):
            return self._get_iob_words(grid, tagset)

        return LazyConcatenation(LazyMap(get_iob_words, self._grids(fileids)))

    def iob_sents(self, fileids=None, tagset=None):
        """
        :return: a list of lists of word/tag/IOB tuples
        :rtype: list(list)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        """
        self._require(self.WORDS, self.POS, self.CHUNK)

        def get_iob_words(grid):
            return self._get_iob_words(grid, tagset)

        return LazyMap(get_iob_words, self._grids(fileids))

    # /////////////////////////////////////////////////////////////////
    # Grid Reading
    # /////////////////////////////////////////////////////////////////

    def _grids(self, fileids=None):
        # n.b.: we could cache the object returned here (keyed on
        # fileids), which would let us reuse the same corpus view for
        # different things (eg srl and parse trees).
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_grid_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def _read_grid_block(self, stream):
        """Read one blank-line-delimited block and split it into word grids."""
        grids = []
        for block in read_blankline_block(stream):
            block = block.strip()
            if not block:
                continue
            grid = [line.split(self.sep) for line in block.split("\n")]
            # If there's a docstart row, then discard. ([xx] eventually it
            # would be good to actually use it)
            if grid[0][self._colmap.get("words", 0)] == "-DOCSTART-":
                del grid[0]
            # Check that the grid is consistent.
            for row in grid:
                if len(row) != len(grid[0]):
                    raise ValueError("Inconsistent number of columns:\n%s" % block)
            grids.append(grid)
        return grids

    # /////////////////////////////////////////////////////////////////
    # Transforms
    # /////////////////////////////////////////////////////////////////
    # given a grid, transform it into some representation (e.g.,
    # a list of words or a parse tree).

    def _get_words(self, grid):
        return self._get_column(grid, self._colmap["words"])

    def _get_tagged_words(self, grid, tagset=None):
        pos_tags = self._get_column(grid, self._colmap["pos"])
        # Map tags to the requested tagset when it differs from the native one.
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        return list(zip(self._get_column(grid, self._colmap["words"]), pos_tags))

    def _get_iob_words(self, grid, tagset=None):
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        return list(
            zip(
                self._get_column(grid, self._colmap["words"]),
                pos_tags,
                self._get_column(grid, self._colmap["chunk"]),
            )
        )

    def _get_chunked_words(self, grid, chunk_types, tagset=None):
        # n.b.: this method is very similar to conllstr2tree.
        words = self._get_column(grid, self._colmap["words"])
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        chunk_tags = self._get_column(grid, self._colmap["chunk"])

        # stack[0] is the root; stack[-1] is the currently open chunk (if any).
        stack = [Tree(self._root_label, [])]

        for word, pos_tag, chunk_tag in zip(words, pos_tags, chunk_tags):
            if chunk_tag == "O":
                state, chunk_type = "O", ""
            else:
                (state, chunk_type) = chunk_tag.split("-")
            # If it's a chunk we don't care about, treat it as O.
            if chunk_types is not None and chunk_type not in chunk_types:
                state = "O"
            # Treat a mismatching I like a B.
            if state == "I" and chunk_type != stack[-1].label():
                state = "B"
            # For B or I: close any open chunks
            if state in "BO" and len(stack) == 2:
                stack.pop()
            # For B: start a new chunk.
            if state == "B":
                new_chunk = Tree(chunk_type, [])
                stack[-1].append(new_chunk)
                stack.append(new_chunk)
            # Add the word token.
            stack[-1].append((word, pos_tag))

        return stack[0]

    def _get_parsed_sent(self, grid, pos_in_tree, tagset=None):
        words = self._get_column(grid, self._colmap["words"])
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        parse_tags = self._get_column(grid, self._colmap["tree"])

        # Rebuild the bracketed s-expression from the per-word tree column.
        treestr = ""
        for word, pos_tag, parse_tag in zip(words, pos_tags, parse_tags):
            # Escape parens so they don't break the bracketed parse string.
            if word == "(":
                word = "-LRB-"
            if word == ")":
                word = "-RRB-"
            if pos_tag == "(":
                pos_tag = "-LRB-"
            if pos_tag == ")":
                pos_tag = "-RRB-"
            (left, right) = parse_tag.split("*")
            right = right.count(")") * ")"  # only keep ')'.
            treestr += f"{left} ({pos_tag} {word}) {right}"
        try:
            tree = self._tree_class.fromstring(treestr)
        except (ValueError, IndexError):
            # Fall back to wrapping the fragment in an explicit root node.
            tree = self._tree_class.fromstring(f"({self._root_label} {treestr})")

        if not pos_in_tree:
            # Fold each preterminal (POS word) into a (word, tag) leaf tuple.
            for subtree in tree.subtrees():
                for i, child in enumerate(subtree):
                    if (
                        isinstance(child, Tree)
                        and len(child) == 1
                        and isinstance(child[0], str)
                    ):
                        subtree[i] = (child[0], child.label())

        return tree

    def _get_srl_spans(self, grid):
        """
        list of list of (start, end), tag) tuples
        """
        if self._srl_includes_roleset:
            predicates = self._get_column(grid, self._colmap["srl"] + 1)
            start_col = self._colmap["srl"] + 2
        else:
            predicates = self._get_column(grid, self._colmap["srl"])
            start_col = self._colmap["srl"] + 1

        # Count how many predicates there are. This tells us how many
        # columns to expect for SRL data.
        num_preds = len([p for p in predicates if p != "-"])

        spanlists = []
        for i in range(num_preds):
            col = self._get_column(grid, start_col + i)
            spanlist = []
            # stack holds (tag, start_wordnum) for spans not yet closed.
            stack = []
            for wordnum, srl_tag in enumerate(col):
                (left, right) = srl_tag.split("*")
                for tag in left.split("("):
                    if tag:
                        stack.append((tag, wordnum))
                for i in range(right.count(")")):
                    (tag, start) = stack.pop()
                    spanlist.append(((start, wordnum + 1), tag))
            spanlists.append(spanlist)

        return spanlists

    def _get_srl_instances(self, grid, pos_in_tree):
        tree = self._get_parsed_sent(grid, pos_in_tree)
        spanlists = self._get_srl_spans(grid)
        if self._srl_includes_roleset:
            predicates = self._get_column(grid, self._colmap["srl"] + 1)
            rolesets = self._get_column(grid, self._colmap["srl"])
        else:
            predicates = self._get_column(grid, self._colmap["srl"])
            rolesets = [None] * len(predicates)

        instances = ConllSRLInstanceList(tree)
        for wordnum, predicate in enumerate(predicates):
            if predicate == "-":
                continue
            # Decide which spanlist to use. Don't assume that they're
            # sorted in the same order as the predicates (even though
            # they usually are).
            for spanlist in spanlists:
                for (start, end), tag in spanlist:
                    if wordnum in range(start, end) and tag in ("V", "C-V"):
                        break
                else:
                    continue
                break
            else:
                raise ValueError("No srl column found for %r" % predicate)
            instances.append(
                ConllSRLInstance(tree, wordnum, predicate, rolesets[wordnum], spanlist)
            )

        return instances

    # /////////////////////////////////////////////////////////////////
    # Helper Methods
    # /////////////////////////////////////////////////////////////////

    def _require(self, *columntypes):
        """Raise ValueError unless all *columntypes* are in this corpus."""
        for columntype in columntypes:
            if columntype not in self._colmap:
                raise ValueError(
                    "This corpus does not contain a %s " "column." % columntype
                )

    @staticmethod
    def _get_column(grid, column_index):
        """Return one column of the grid as a list (one value per word)."""
        return [grid[i][column_index] for i in range(len(grid))]
class ConllSRLInstance:
    """
    An SRL instance from a CoNLL corpus, which identifies and
    provides labels for the arguments of a single verb.
    """

    # [xx] add inst.core_arguments, inst.argm_arguments?

    def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans):
        """
        :param tree: the parse tree for the sentence.
        :param verb_head: word index of the verb's head word.
        :param verb_stem: the verb's stem.
        :param roleset: the verb's roleset identifier (may be None).
        :param tagged_spans: list of ``((start, end), tag)`` tuples
            covering the verb pieces and the arguments.
        """
        self.verb = []
        """A list of the word indices of the words that compose the
           verb whose arguments are identified by this instance.
           This will contain multiple word indices when multi-word
           verbs are used (e.g. 'turn on')."""

        self.verb_head = verb_head
        """The word index of the head word of the verb whose arguments
           are identified by this instance. E.g., for a sentence that
           uses the verb 'turn on,' ``verb_head`` will be the word index
           of the word 'turn'."""

        self.verb_stem = verb_stem

        self.roleset = roleset

        self.arguments = []
        """A list of ``(argspan, argid)`` tuples, specifying the location
           and type for each of the arguments identified by this
           instance. ``argspan`` is a tuple ``start, end``, indicating
           that the argument consists of the ``words[start:end]``."""

        self.tagged_spans = tagged_spans
        """A list of ``(span, id)`` tuples, specifying the location and
           type for each of the arguments, as well as the verb pieces,
           that make up this instance."""

        self.tree = tree
        """The parse tree for the sentence containing this instance."""

        self.words = tree.leaves()
        """A list of the words in the sentence containing this
           instance."""

        # Fill in the self.verb and self.arguments values.
        # Spans tagged V or C-V are verb pieces; everything else is an argument.
        for (start, end), tag in tagged_spans:
            if tag in ("V", "C-V"):
                self.verb += list(range(start, end))
            else:
                self.arguments.append(((start, end), tag))

    def __repr__(self):
        # Originally, its:
        ##plural = 's' if len(self.arguments) != 1 else ''
        plural = "s" if len(self.arguments) != 1 else ""
        return "<ConllSRLInstance for %r with %d argument%s>" % (
            (self.verb_stem, len(self.arguments), plural)
        )

    def pprint(self):
        """Render the instance: arguments bracketed, verb in <<...>>."""
        verbstr = " ".join(self.words[i][0] for i in self.verb)
        hdr = f"SRL for {verbstr!r} (stem={self.verb_stem!r}):\n"
        s = ""
        for i, word in enumerate(self.words):
            if isinstance(word, tuple):
                word = word[0]
            for (start, end), argid in self.arguments:
                if i == start:
                    s += "[%s " % argid
                if i == end:
                    s += "] "
            if i in self.verb:
                word = "<<%s>>" % word
            s += word + " "
        return hdr + textwrap.fill(
            s.replace(" ]", "]"), initial_indent="    ", subsequent_indent="    "
        )
class ConllSRLInstanceList(list):
    """
    Set of SRL instances for a single sentence, all sharing one parse tree.
    """

    def __init__(self, tree, instances=()):
        """
        :param tree: the parse tree shared by every instance in the list.
        :param instances: initial ``ConllSRLInstance`` items.
        """
        self.tree = tree
        list.__init__(self, instances)

    def __str__(self):
        return self.pprint()

    def pprint(self, include_tree=False):
        """
        Render the instances as CoNLL-style columns, one row per word.

        :param include_tree: if true, prepend word, POS and syntax
            columns reconstructed from ``self.tree``.
        """
        # Sanity check: trees should be the same
        for inst in self:
            if inst.tree != self.tree:
                raise ValueError("Tree mismatch!")

        # Bug fix: `words` is needed for the row loop below even when
        # include_tree is false (previously it was only bound inside the
        # `if include_tree:` branch, so pprint() raised NameError).
        words = self.tree.leaves()

        # If desired, build the POS and syntax columns from the tree:
        if include_tree:
            pos = [None] * len(words)
            synt = ["*"] * len(words)
            self._tree2conll(self.tree, 0, words, pos, synt)

        s = ""
        for i in range(len(words)):
            # optional tree columns
            if include_tree:
                s += "%-20s " % words[i]
                s += "%-8s " % pos[i]
                s += "%15s*%-8s " % tuple(synt[i].split("*"))

            # verb head column
            for inst in self:
                if i == inst.verb_head:
                    s += "%-20s " % inst.verb_stem
                    break
            else:
                s += "%-20s " % "-"
            # Remaining columns: self
            for inst in self:
                argstr = "*"
                for (start, end), argid in inst.tagged_spans:
                    if i == start:
                        argstr = f"({argid}{argstr}"
                    if i == (end - 1):
                        argstr += ")"
                s += "%-12s " % argstr
            s += "\n"
        return s

    def _tree2conll(self, tree, wordnum, words, pos, synt):
        """
        Walk *tree* depth-first, filling the ``pos`` and ``synt`` columns
        (and, for tuple leaves, the ``words`` column) starting at index
        ``wordnum``; returns the next unused word index.
        """
        assert isinstance(tree, Tree)
        if len(tree) == 1 and isinstance(tree[0], str):
            pos[wordnum] = tree.label()
            assert words[wordnum] == tree[0]
            return wordnum + 1
        elif len(tree) == 1 and isinstance(tree[0], tuple):
            assert len(tree[0]) == 2
            # Bug fix: was `pos[wordnum], pos[wordnum] = tree[0]`, which
            # discarded the word and assigned pos[wordnum] twice.
            words[wordnum], pos[wordnum] = tree[0]
            return wordnum + 1
        else:
            synt[wordnum] = f"({tree.label()}{synt[wordnum]}"
            for child in tree:
                wordnum = self._tree2conll(child, wordnum, words, pos, synt)
            synt[wordnum - 1] += ")"
            return wordnum
class ConllChunkCorpusReader(ConllCorpusReader):
    """
    A ConllCorpusReader whose data file contains three columns: words,
    pos, and chunk.
    """

    def __init__(
        self, root, fileids, chunk_types, encoding="utf8", tagset=None, separator=None
    ):
        # Delegate to the generic reader with the fixed three-column layout.
        super().__init__(
            root,
            fileids,
            ("words", "pos", "chunk"),
            chunk_types=chunk_types,
            encoding=encoding,
            tagset=tagset,
            separator=separator,
        )

View File

@@ -0,0 +1,106 @@
# Natural Language Toolkit: An Crubadan N-grams Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
An NLTK interface for the n-gram statistics gathered from
the corpora for each language using An Crubadan.
There are multiple potential applications for the data but
this reader was created with the goal of using it in the
context of language identification.
For details about An Crubadan, this data, and its potential uses, see:
http://borel.slu.edu/crubadan/index.html
"""
import re
from os import path
from nltk.corpus.reader import CorpusReader
from nltk.data import ZipFilePathPointer
from nltk.probability import FreqDist
class CrubadanCorpusReader(CorpusReader):
    """
    A corpus reader used to access language An Crubadan n-gram files.
    """

    _LANG_MAPPER_FILE = "table.txt"
    # Kept for backward compatibility; each instance gets its own cache
    # in __init__ (a shared class-level dict would leak data between
    # independent reader instances).
    _all_lang_freq = {}

    def __init__(self, root, fileids, encoding="utf8", tagset=None):
        """
        :param root: the root directory of the corpus.
        :param fileids: a list or regexp specifying the fileids in this corpus.
        :param encoding: the encoding used to read the corpus files.
        :param tagset: accepted for API compatibility; not used by this reader.
        """
        # Bug fix: previously the encoding argument was ignored and
        # "utf8" was always passed to the base class.
        super().__init__(root, fileids, encoding=encoding)
        # Per-instance cache of ISO 639-3 code -> FreqDist of 3-grams.
        self._all_lang_freq = {}
        self._lang_mapping_data = []
        self._load_lang_mapping_data()

    def lang_freq(self, lang):
        """Return n-gram FreqDist for a specific language
        given ISO 639-3 language code"""
        if lang not in self._all_lang_freq:
            self._all_lang_freq[lang] = self._load_lang_ngrams(lang)
        return self._all_lang_freq[lang]

    def langs(self):
        """Return a list of supported languages as ISO 639-3 codes"""
        return [row[1] for row in self._lang_mapping_data]

    def iso_to_crubadan(self, lang):
        """Return internal Crubadan code based on ISO 639-3 code
        (None if the code is unknown)."""
        for i in self._lang_mapping_data:
            if i[1].lower() == lang.lower():
                return i[0]

    def crubadan_to_iso(self, lang):
        """Return ISO 639-3 code given internal Crubadan code
        (None if the code is unknown)."""
        for i in self._lang_mapping_data:
            if i[0].lower() == lang.lower():
                return i[1]

    def _load_lang_mapping_data(self):
        """Load language mappings between codes and description from table.txt"""
        if isinstance(self.root, ZipFilePathPointer):
            raise RuntimeError(
                "Please install the 'crubadan' corpus first, use nltk.download()"
            )

        mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
        if self._LANG_MAPPER_FILE not in self.fileids():
            raise RuntimeError("Could not find language mapper file: " + mapper_file)

        with open(mapper_file, encoding="utf-8") as raw:
            strip_raw = raw.read().strip()

        # One "<crubadan_code>\t<iso_code>" row per line.
        self._lang_mapping_data = [row.split("\t") for row in strip_raw.split("\n")]

    def _load_lang_ngrams(self, lang):
        """Load single n-gram language file given the ISO 639-3 language code
        and return its FreqDist"""
        if lang not in self.langs():
            raise RuntimeError("Unsupported language.")

        crubadan_code = self.iso_to_crubadan(lang)
        ngram_file = path.join(self.root, crubadan_code + "-3grams.txt")

        if not path.isfile(ngram_file):
            raise RuntimeError("No N-gram file found for requested language.")

        counts = FreqDist()
        with open(ngram_file, encoding="utf-8") as f:
            # Each line: "<frequency> <ngram>".
            for line in f:
                data = line.split(" ")
                ngram = data[1].strip("\n")
                freq = int(data[0])
                counts[ngram] = freq
        return counts

View File

@@ -0,0 +1,115 @@
# Natural Language Toolkit: Dependency Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Kepa Sarasola <kepa.sarasola@ehu.es>
# Iker Manterola <returntothehangar@hotmail.com>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.parse import DependencyGraph
from nltk.tokenize import *
class DependencyCorpusReader(SyntaxCorpusReader):
    """
    Reader for dependency-parsed corpora, offering plain, tagged and
    dependency-graph views of the data via DependencyCorpusView.
    """
    def __init__(
        self,
        root,
        fileids,
        encoding="utf8",
        word_tokenizer=TabTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
    ):
        # NOTE(review): the word_tokenizer, sent_tokenizer and
        # para_block_reader arguments are accepted but never stored or
        # forwarded -- confirm whether SyntaxCorpusReader is expected to
        # receive them.  Also note the tokenizer instances used as default
        # argument values are shared across all instances.
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)
    #########################################################
    def words(self, fileids=None):
        """Return a flat list of untagged words from the given files."""
        return concat(
            [
                DependencyCorpusView(fileid, False, False, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )
    def tagged_words(self, fileids=None):
        """Return a flat list of (word, tag) tuples from the given files."""
        return concat(
            [
                DependencyCorpusView(fileid, True, False, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )
    def sents(self, fileids=None):
        """Return a list of sentences, each a list of words."""
        return concat(
            [
                DependencyCorpusView(fileid, False, True, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )
    def tagged_sents(self, fileids=None):
        """Return a list of sentences, each a list of (word, tag) tuples."""
        return concat(
            [
                DependencyCorpusView(fileid, True, True, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )
    def parsed_sents(self, fileids=None):
        """Return a list of DependencyGraph objects, one per sentence."""
        sents = concat(
            [
                DependencyCorpusView(fileid, False, True, True, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )
        return [DependencyGraph(sent) for sent in sents]
class DependencyCorpusView(StreamBackedCorpusView):
    """Stream-backed view over a dependency corpus file, yielding one
    blankline-delimited sentence block at a time."""
    _DOCSTART = "-DOCSTART- -DOCSTART- O\n"  # marks the start of a document
    def __init__(
        self,
        corpus_file,
        tagged,
        group_by_sent,
        dependencies,
        chunk_types=None,
        encoding="utf8",
    ):
        # Flags controlling the shape of the values read_block() yields.
        self._tagged = tagged
        self._dependencies = dependencies
        self._group_by_sent = group_by_sent
        self._chunk_types = chunk_types
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
    def read_block(self, stream):
        """Read the next sentence block and convert it to the requested form."""
        # Read the next sentence.
        sent = read_blankline_block(stream)[0].strip()
        # Strip off the docstart marker, if present.
        if sent.startswith(self._DOCSTART):
            sent = sent[len(self._DOCSTART) :].lstrip()
        # extract word and tag from any of the formats
        if not self._dependencies:
            lines = [line.split("\t") for line in sent.split("\n")]
            if len(lines[0]) == 3 or len(lines[0]) == 4:
                # 3/4-column format: word and tag are the first two fields.
                sent = [(line[0], line[1]) for line in lines]
            elif len(lines[0]) == 10:
                # 10-column format (presumably CoNLL): word is field 2,
                # tag is field 5 -- confirm against the corpus files.
                sent = [(line[1], line[4]) for line in lines]
            else:
                raise ValueError("Unexpected number of fields in dependency tree file")
            # discard tags if they weren't requested
            if not self._tagged:
                sent = [word for (word, tag) in sent]
        # Return the result.
        if self._group_by_sent:
            return [sent]
        else:
            return list(sent)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,116 @@
# Natural Language Toolkit: IEER Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for the Information Extraction and Entity Recognition Corpus.
NIST 1999 Information Extraction: Entity Recognition Evaluation
https://www.itl.nist.gov/iad/894.01/tests/ie-er/er_99/er_99.htm
This corpus contains the NEWSWIRE development test data for the
NIST 1999 IE-ER Evaluation. The files were taken from the
subdirectory: ``/ie_er_99/english/devtest/newswire/*.ref.nwt``
and filenames were shortened.
The corpus contains the following files: APW_19980314, APW_19980424,
APW_19980429, NYT_19980315, NYT_19980403, and NYT_19980407.
"""
import nltk
from nltk.corpus.reader.api import *
#: A dictionary whose keys are the names of documents in this corpus;
#: and whose values are descriptions of those documents' contents.
titles = {
    "APW_19980314": "Associated Press Weekly, 14 March 1998",
    "APW_19980424": "Associated Press Weekly, 24 April 1998",
    "APW_19980429": "Associated Press Weekly, 29 April 1998",
    "NYT_19980315": "New York Times, 15 March 1998",
    "NYT_19980403": "New York Times, 3 April 1998",
    "NYT_19980407": "New York Times, 7 April 1998",
}
#: A list of all documents in this corpus.
# Sorted document ids, derived from the keys of ``titles``.
documents = sorted(titles)
class IEERDocument:
    """A single IEER document: its text plus optional metadata fields."""

    def __init__(self, text, docno=None, doctype=None, date_time=None, headline=""):
        self.text = text
        self.docno = docno
        self.doctype = doctype
        self.date_time = date_time
        self.headline = headline

    def __repr__(self):
        # Summarize with the headline when available, otherwise with the
        # first dozen non-markup words of the body text.
        if self.headline:
            summary = " ".join(self.headline.leaves())
        else:
            body_words = [w for w in self.text.leaves() if w[:1] != "<"]
            summary = " ".join(body_words[:12]) + "..."
        if self.docno is None:
            return "<IEERDocument: %r>" % summary
        return f"<IEERDocument {self.docno}: {summary!r}>"
class IEERCorpusReader(CorpusReader):
    """
    Corpus reader for the NIST IE-ER newswire documents, providing both
    raw (``docs``) and chunk-parsed (``parsed_docs``) views of each file.
    """

    def docs(self, fileids=None):
        """Return the raw text of each document in the given files."""
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def parsed_docs(self, fileids=None):
        """Return each document as an IEERDocument with a chunk tree."""
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_parsed_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def _read_parsed_block(self, stream):
        # TODO: figure out why empty documents are being returned
        # Fix: parse each raw document exactly once (the previous version
        # parsed every document twice -- once for the docno filter and
        # once more for the returned value).
        parsed = (self._parse(doc) for doc in self._read_block(stream))
        return [doc for doc in parsed if doc.docno is not None]

    def _parse(self, doc):
        """Chunk-parse a raw document string into an IEERDocument."""
        val = nltk.chunk.ieerstr2tree(doc, root_label="DOCUMENT")
        if isinstance(val, dict):
            # ieerstr2tree extracted structured fields (text, docno, ...).
            return IEERDocument(**val)
        else:
            return IEERDocument(val)

    def _read_block(self, stream):
        """Read one <DOC>...</DOC> block from the stream, returning it as a
        single string (any preamble lines are included)."""
        out = []
        # Skip any preamble.
        while True:
            line = stream.readline()
            if not line:
                break
            if line.strip() == "<DOC>":
                break
            out.append(line)
        # Read the document
        while True:
            line = stream.readline()
            if not line:
                break
            out.append(line)
            if line.strip() == "</DOC>":
                break
        # Return the document
        return ["\n".join(out)]

View File

@@ -0,0 +1,93 @@
# Natural Language Toolkit: Indian Language POS-Tagged Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Indian Language POS-Tagged Corpus
Collected by A Kumaran, Microsoft Research, India
Distributed with permission
Contents:
- Bangla: IIT Kharagpur
- Hindi: Microsoft Research India
- Marathi: IIT Bombay
- Telugu: IIIT Hyderabad
"""
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tag import map_tag, str2tuple
class IndianCorpusReader(CorpusReader):
    """
    List of words, one per line. Blank lines are ignored.
    """
    def words(self, fileids=None):
        """Return a flat list of untagged words from the given files."""
        return concat(
            [
                IndianCorpusView(fileid, enc, False, False)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )
    def tagged_words(self, fileids=None, tagset=None):
        """Return (word, tag) tuples, optionally mapped to another tagset.

        NOTE(review): ``self._tagset`` is not assigned in this class --
        presumably it is set by the corpus loader; verify against callers.
        """
        if tagset and tagset != self._tagset:
            tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
        else:
            tag_mapping_function = None
        return concat(
            [
                IndianCorpusView(fileid, enc, True, False, tag_mapping_function)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )
    def sents(self, fileids=None):
        """Return sentences, each a list of untagged words."""
        return concat(
            [
                IndianCorpusView(fileid, enc, False, True)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )
    def tagged_sents(self, fileids=None, tagset=None):
        """Return sentences, each a list of (word, tag) tuples."""
        if tagset and tagset != self._tagset:
            tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
        else:
            tag_mapping_function = None
        return concat(
            [
                IndianCorpusView(fileid, enc, True, True, tag_mapping_function)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )
class IndianCorpusView(StreamBackedCorpusView):
    """Stream-backed view: each line holds one sentence of "word_TAG" tokens."""
    def __init__(
        self, corpus_file, encoding, tagged, group_by_sent, tag_mapping_function=None
    ):
        # Flags controlling the shape of the values read_block() yields.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._tag_mapping_function = tag_mapping_function
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
    def read_block(self, stream):
        """Read one line and convert it to the requested representation."""
        line = stream.readline()
        # Lines starting with "<" are markup/header lines, not token data.
        if line.startswith("<"):
            return []
        # Tokens are "word_TAG"; str2tuple splits on the final separator.
        sent = [str2tuple(word, sep="_") for word in line.split()]
        if self._tag_mapping_function:
            sent = [(w, self._tag_mapping_function(t)) for (w, t) in sent]
        if not self._tagged:
            sent = [w for (w, t) in sent]
        if self._group_by_sent:
            return [sent]
        else:
            return sent

View File

@@ -0,0 +1,354 @@
# Natural Language Toolkit: IPI PAN Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Konrad Goluchowski <kodie@mimuw.edu.pl>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import functools
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import StreamBackedCorpusView, concat
def _parse_args(fun):
@functools.wraps(fun)
def decorator(self, fileids=None, **kwargs):
kwargs.pop("tags", None)
if not fileids:
fileids = self.fileids()
return fun(self, fileids, **kwargs)
return decorator
class IPIPANCorpusReader(CorpusReader):
    """
    Corpus reader designed to work with corpus created by IPI PAN.
    See http://korpus.pl/en/ for more details about IPI PAN corpus.

    The corpus includes information about text domain, channel and categories.
    You can access possible values using ``domains()``, ``channels()`` and
    ``categories()``. You can use also this metadata to filter files, e.g.:
    ``fileids(channel='prasa')``, ``fileids(categories='publicystyczny')``.

    The reader supports methods: words, sents, paras and their tagged versions.
    You can get part of speech instead of full tag by giving "simplify_tags=True"
    parameter, e.g.: ``tagged_sents(simplify_tags=True)``.

    Also you can get all tags disambiguated tags specifying parameter
    "one_tag=False", e.g.: ``tagged_paras(one_tag=False)``.

    You can get all tags that were assigned by a morphological analyzer specifying
    parameter "disamb_only=False", e.g. ``tagged_words(disamb_only=False)``.

    The IPIPAN Corpus contains tags indicating if there is a space between two
    tokens. To add special "no space" markers, you should specify parameter
    "append_no_space=True", e.g. ``tagged_words(append_no_space=True)``.
    As a result in place where there should be no space between two tokens new
    pair ('', 'no-space') will be inserted (for tagged data) and just '' for
    methods without tags.

    The corpus reader can also try to append spaces between words. To enable this
    option, specify parameter "append_space=True", e.g. ``words(append_space=True)``.
    As a result either ' ' or (' ', 'space') will be inserted between tokens.

    By default, xml entities like &quot; and &amp; are replaced by corresponding
    characters. You can turn off this feature, specifying parameter
    "replace_xmlentities=False", e.g. ``words(replace_xmlentities=False)``.
    """

    def __init__(self, root, fileids):
        CorpusReader.__init__(self, root, fileids, None, None)

    def channels(self, fileids=None):
        """Return all channel values found in the given files' headers."""
        if not fileids:
            fileids = self.fileids()
        return self._parse_header(fileids, "channel")

    def domains(self, fileids=None):
        """Return all domain values found in the given files' headers."""
        if not fileids:
            fileids = self.fileids()
        return self._parse_header(fileids, "domain")

    def categories(self, fileids=None):
        """Return all category (keyTerm) values from the files' headers."""
        if not fileids:
            fileids = self.fileids()
        return [
            self._map_category(cat) for cat in self._parse_header(fileids, "keyTerm")
        ]

    def fileids(self, channels=None, domains=None, categories=None):
        """Return fileids, optionally filtered by exactly one metadata facet.

        :raises ValueError: if more than one of ``channels``, ``domains``
            and ``categories`` is given.
        """
        # Fix: the original check only raised when *all three* filters were
        # supplied at once; the documented contract is "only one at once".
        filters_given = [
            f for f in (channels, domains, categories) if f is not None
        ]
        if len(filters_given) > 1:
            raise ValueError(
                "You can specify only one of channels, domains "
                "and categories parameter at once"
            )
        if not filters_given:
            return CorpusReader.fileids(self)
        # A single string is treated as a one-element list.
        if isinstance(channels, str):
            channels = [channels]
        if isinstance(domains, str):
            domains = [domains]
        if isinstance(categories, str):
            categories = [categories]
        if channels:
            return self._list_morph_files_by("channel", channels)
        elif domains:
            return self._list_morph_files_by("domain", domains)
        else:
            return self._list_morph_files_by(
                "keyTerm", categories, map=self._map_category
            )

    @_parse_args
    def sents(self, fileids=None, **kwargs):
        """Return sentences, each a list of untagged words."""
        return concat(
            [
                self._view(
                    fileid, mode=IPIPANCorpusView.SENTS_MODE, tags=False, **kwargs
                )
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def paras(self, fileids=None, **kwargs):
        """Return paragraphs, each a list of sentences of untagged words."""
        return concat(
            [
                self._view(
                    fileid, mode=IPIPANCorpusView.PARAS_MODE, tags=False, **kwargs
                )
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def words(self, fileids=None, **kwargs):
        """Return a flat list of untagged words."""
        return concat(
            [
                self._view(fileid, tags=False, **kwargs)
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def tagged_sents(self, fileids=None, **kwargs):
        """Return sentences, each a list of (word, tag) tuples."""
        return concat(
            [
                self._view(fileid, mode=IPIPANCorpusView.SENTS_MODE, **kwargs)
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def tagged_paras(self, fileids=None, **kwargs):
        """Return paragraphs, each a list of sentences of (word, tag) tuples."""
        return concat(
            [
                self._view(fileid, mode=IPIPANCorpusView.PARAS_MODE, **kwargs)
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def tagged_words(self, fileids=None, **kwargs):
        """Return a flat list of (word, tag) tuples."""
        return concat(
            [self._view(fileid, **kwargs) for fileid in self._list_morph_files(fileids)]
        )

    def _list_morph_files(self, fileids):
        """Absolute paths of the morph.xml files for the given fileids."""
        return [f for f in self.abspaths(fileids)]

    def _list_header_files(self, fileids):
        """Header file paths paired with the given morph files."""
        return [
            f.replace("morph.xml", "header.xml")
            for f in self._list_morph_files(fileids)
        ]

    def _parse_header(self, fileids, tag):
        """Collect the distinct values of *tag* across the files' headers."""
        values = set()
        for f in self._list_header_files(fileids):
            values_list = self._get_tag(f, tag)
            for v in values_list:
                values.add(v)
        return list(values)

    def _list_morph_files_by(self, tag, values, map=None):
        """Return fileids whose header *tag* value matches any of *values*."""
        fileids = self.fileids()
        ret_fileids = set()
        for f in fileids:
            fp = self.abspath(f).replace("morph.xml", "header.xml")
            values_list = self._get_tag(fp, tag)
            for value in values_list:
                if map is not None:
                    value = map(value)
                if value in values:
                    ret_fileids.add(f)
        return list(ret_fileids)

    def _get_tag(self, f, tag):
        """Extract the text of every <tag>...</tag> element in file *f*.

        NOTE(review): the file is opened with the platform default encoding
        and the slice assumes the opening tag has no attributes -- confirm
        both against the corpus header files.
        """
        tags = []
        with open(f) as infile:
            header = infile.read()
        tag_end = 0
        while True:
            tag_pos = header.find("<" + tag, tag_end)
            if tag_pos < 0:
                return tags
            tag_end = header.find("</" + tag + ">", tag_pos)
            # Skip "<" + tag + ">" (len(tag) + 2 characters).
            tags.append(header[tag_pos + len(tag) + 2 : tag_end])

    def _map_category(self, cat):
        """Strip a leading "...>" prefix from a keyTerm value, if present."""
        pos = cat.find(">")
        if pos == -1:
            return cat
        else:
            return cat[pos + 1 :]

    def _view(self, filename, **kwargs):
        """Build an IPIPANCorpusView after validating keyword arguments."""
        tags = kwargs.pop("tags", True)
        mode = kwargs.pop("mode", 0)
        simplify_tags = kwargs.pop("simplify_tags", False)
        one_tag = kwargs.pop("one_tag", True)
        disamb_only = kwargs.pop("disamb_only", True)
        append_no_space = kwargs.pop("append_no_space", False)
        append_space = kwargs.pop("append_space", False)
        replace_xmlentities = kwargs.pop("replace_xmlentities", True)
        if len(kwargs) > 0:
            raise ValueError("Unexpected arguments: %s" % kwargs.keys())
        if not one_tag and not disamb_only:
            raise ValueError(
                "You cannot specify both one_tag=False and " "disamb_only=False"
            )
        if not tags and (simplify_tags or not one_tag or not disamb_only):
            raise ValueError(
                "You cannot specify simplify_tags, one_tag or "
                "disamb_only with functions other than tagged_*"
            )
        return IPIPANCorpusView(
            filename,
            tags=tags,
            mode=mode,
            simplify_tags=simplify_tags,
            one_tag=one_tag,
            disamb_only=disamb_only,
            append_no_space=append_no_space,
            append_space=append_space,
            replace_xmlentities=replace_xmlentities,
        )
class IPIPANCorpusView(StreamBackedCorpusView):
    """Stream-backed view over an IPI PAN morph.xml file.

    Scans the XML line by line and yields words, sentences or paragraphs
    depending on ``mode``.
    """
    # Granularity of the values returned by read_block().
    WORDS_MODE = 0
    SENTS_MODE = 1
    PARAS_MODE = 2
    def __init__(self, filename, startpos=0, **kwargs):
        StreamBackedCorpusView.__init__(self, filename, None, startpos, None)
        # True while the scanner is inside a <chunk type="s"> element.
        self.in_sentence = False
        # Stream offset used by _seek() to rewind to the next unconsumed line.
        self.position = 0
        self.show_tags = kwargs.pop("tags", True)
        self.disamb_only = kwargs.pop("disamb_only", True)
        self.mode = kwargs.pop("mode", IPIPANCorpusView.WORDS_MODE)
        self.simplify_tags = kwargs.pop("simplify_tags", False)
        self.one_tag = kwargs.pop("one_tag", True)
        self.append_no_space = kwargs.pop("append_no_space", False)
        self.append_space = kwargs.pop("append_space", False)
        self.replace_xmlentities = kwargs.pop("replace_xmlentities", True)
    def read_block(self, stream):
        """Read and return the next block (word list, sentence or paragraph)."""
        sentence = []
        sentences = []
        space = False
        no_space = False
        tags = set()
        lines = self._read_data(stream)
        while True:
            # we may have only part of last line
            if len(lines) <= 1:
                self._seek(stream)
                lines = self._read_data(stream)
            if lines == [""]:
                # End of file reached.
                assert not sentences
                return []
            line = lines.pop()
            self.position += len(line) + 1
            if line.startswith('<chunk type="s"'):
                self.in_sentence = True
            elif line.startswith('<chunk type="p"'):
                pass
            elif line.startswith("<tok"):
                # New token: emit a pending separator unless suppressed by <ns/>.
                if self.append_space and space and not no_space:
                    self._append_space(sentence)
                space = True
                no_space = False
                orth = ""
                tags = set()
            elif line.startswith("</chunk"):
                if self.in_sentence:
                    self.in_sentence = False
                    self._seek(stream)
                    if self.mode == self.SENTS_MODE:
                        return [sentence]
                    elif self.mode == self.WORDS_MODE:
                        if self.append_space:
                            self._append_space(sentence)
                        return sentence
                    else:
                        # NOTE(review): ``sentence`` is not reset after being
                        # appended here -- confirm paragraphs are assembled
                        # correctly in PARAS_MODE.
                        sentences.append(sentence)
                elif self.mode == self.PARAS_MODE:
                    self._seek(stream)
                    return [sentences]
            elif line.startswith("<orth"):
                # Surface form: strip the "<orth>"/"</orth>" wrappers.
                orth = line[6:-7]
                if self.replace_xmlentities:
                    orth = orth.replace("&quot;", '"').replace("&amp;", "&")
            elif line.startswith("<lex"):
                # Collect ctag values, restricted to disamb readings by default.
                if not self.disamb_only or line.find("disamb=") != -1:
                    tag = line[line.index("<ctag") + 6 : line.index("</ctag")]
                    tags.add(tag)
            elif line.startswith("</tok"):
                if self.show_tags:
                    if self.simplify_tags:
                        # Keep only the part of speech (first ":" field).
                        tags = [t.split(":")[0] for t in tags]
                    if not self.one_tag or not self.disamb_only:
                        sentence.append((orth, tuple(tags)))
                    else:
                        sentence.append((orth, tags.pop()))
                else:
                    sentence.append(orth)
            elif line.startswith("<ns/>"):
                # "No space" marker between the surrounding tokens.
                if self.append_space:
                    no_space = True
                if self.append_no_space:
                    if self.show_tags:
                        sentence.append(("", "no-space"))
                    else:
                        sentence.append("")
            elif line.startswith("</cesAna"):
                pass
    def _read_data(self, stream):
        # Remember where this buffer began so _seek() can rewind precisely.
        self.position = stream.tell()
        buff = stream.read(4096)
        lines = buff.split("\n")
        lines.reverse()
        return lines
    def _seek(self, stream):
        # Rewind the stream to the first unconsumed position.
        stream.seek(self.position)
    def _append_space(self, sentence):
        if self.show_tags:
            sentence.append((" ", "space"))
        else:
            sentence.append(" ")

View File

@@ -0,0 +1,186 @@
#! /usr/bin/env python
# KNB Corpus reader
# Copyright (C) 2001-2025 NLTK Project
# Author: Masato Hagiwara <hagisan@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
import re
from nltk.corpus.reader.api import CorpusReader, SyntaxCorpusReader
from nltk.corpus.reader.util import (
FileSystemPathPointer,
find_corpus_fileids,
read_blankline_block,
)
from nltk.parse import DependencyGraph
# default function to convert morphlist to str for tree representation
_morphs2str_default = lambda morphs: "/".join(m[0] for m in morphs if m[0] != "EOS")
class KNBCorpusReader(SyntaxCorpusReader):
    """
    This class implements:
      - ``__init__``, which specifies the location of the corpus
        and a method for detecting the sentence blocks in corpus files.
      - ``_read_block``, which reads a block from the input stream.
      - ``_word``, which takes a block and returns a list of list of words.
      - ``_tag``, which takes a block and returns a list of list of tagged
        words.
      - ``_parse``, which takes a block and returns a list of parsed
        sentences.

    The structure of tagged words:
        tagged_word = (word(str), tags(tuple))
        tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...)

    Usage example

    >>> from nltk.corpus.util import LazyCorpusLoader
    >>> knbc = LazyCorpusLoader(
    ...     'knbc/corpus1',
    ...     KNBCorpusReader,
    ...     r'.*/KN.*',
    ...     encoding='euc-jp',
    ... )
    >>> len(knbc.sents()[0])
    9
    """
    def __init__(self, root, fileids, encoding="utf8", morphs2str=_morphs2str_default):
        """
        Initialize KNBCorpusReader
        morphs2str is a function to convert morphlist to str for tree representation
        for _parse()
        """
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)
        self.morphs2str = morphs2str
    def _read_block(self, stream):
        # blocks are split by blankline (or EOF) - default
        return read_blankline_block(stream)
    def _word(self, t):
        """Return the surface words of block *t*, skipping header lines."""
        res = []
        for line in t.splitlines():
            # ignore the Bunsets headers
            if not re.match(r"EOS|\*|\#|\+", line):
                cells = line.strip().split(" ")
                res.append(cells[0])
        return res
    # ignores tagset argument
    def _tag(self, t, tagset=None):
        """Return (word, feature-string) pairs for block *t*."""
        res = []
        for line in t.splitlines():
            # ignore the Bunsets headers
            if not re.match(r"EOS|\*|\#|\+", line):
                cells = line.strip().split(" ")
                # convert cells to morph tuples
                res.append((cells[0], " ".join(cells[1:])))
        return res
    def _parse(self, t):
        """Build a dependency tree for block *t*.

        Lines beginning with '*' or '+' open a new node whose second field
        encodes "<parent-index><relation>"; subsequent morph lines attach
        to the most recently opened node.
        """
        dg = DependencyGraph()
        i = 0
        for line in t.splitlines():
            if line[0] in "*+":
                # start of bunsetsu or tag
                cells = line.strip().split(" ", 3)
                m = re.match(r"([\-0-9]*)([ADIP])", cells[1])
                assert m is not None
                node = dg.nodes[i]
                node.update({"address": i, "rel": m.group(2), "word": []})
                dep_parent = int(m.group(1))
                # A parent index of -1 marks the root node.
                if dep_parent == -1:
                    dg.root = node
                else:
                    dg.nodes[dep_parent]["deps"].append(i)
                i += 1
            elif line[0] != "#":
                # normal morph
                cells = line.strip().split(" ")
                # convert cells to morph tuples
                morph = cells[0], " ".join(cells[1:])
                dg.nodes[i - 1]["word"].append(morph)
        if self.morphs2str:
            # Collapse each node's morph list into a display string.
            for node in dg.nodes.values():
                node["word"] = self.morphs2str(node["word"])
        return dg.tree()
######################################################################
# Demo
######################################################################
def demo():
    """Load the KNBC corpus and print sample words, trees and tagged sents."""
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    corpus_root = nltk.data.find("corpora/knbc/corpus1")
    matching = [
        fid
        for fid in find_corpus_fileids(FileSystemPathPointer(corpus_root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", fid)
    ]

    def sort_key(fileid):
        # Order by corpus id, then by the three numeric components.
        parts = fileid.split("-")
        return (parts[0], int(parts[1]), int(parts[2]), int(parts[3]))

    knbc = LazyCorpusLoader(
        "knbc/corpus1",
        KNBCorpusReader,
        sorted(matching, key=sort_key),
        encoding="euc-jp",
    )

    print(knbc.fileids()[:10])
    print("".join(knbc.words()[:100]))
    print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2]))

    # Switch to a morph formatter that also shows the part of speech.
    knbc.morphs2str = lambda morphs: "/".join(
        "{}({})".format(m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
    ).encode("utf-8")

    print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))
    print(
        "\n".join(
            " ".join("{}/{}".format(w[0], w[1].split(" ")[2]) for w in sent)
            for sent in knbc.tagged_sents()[0:2]
        )
    )
def test():
    """Smoke test: the KNBC reader yields str tokens and (word, tag) tuples."""
    from nltk.corpus.util import LazyCorpusLoader

    reader = LazyCorpusLoader(
        "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
    )
    assert isinstance(reader.words()[0], str)
    assert isinstance(reader.sents()[0][0], str)
    assert isinstance(reader.tagged_words()[0], tuple)
    assert isinstance(reader.tagged_sents()[0][0], tuple)
if __name__ == "__main__":
    # Run the interactive demo when this module is executed as a script.
    demo()

View File

@@ -0,0 +1,183 @@
# Natural Language Toolkit: Lin's Thesaurus
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Dan Blanchard <dblanchard@ets.org>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.txt
import re
from collections import defaultdict
from functools import reduce
from nltk.corpus.reader import CorpusReader
class LinThesaurusCorpusReader(CorpusReader):
    """Wrapper for the LISP-formatted thesauruses distributed by Dekang Lin."""
    # Compiled regular expression for extracting the key from the first line of each
    # thesaurus entry
    _key_re = re.compile(r'\("?([^"]+)"? \(desc [0-9.]+\).+')
    @staticmethod
    def __defaultdict_factory():
        """Factory for creating defaultdict of defaultdict(dict)s"""
        return defaultdict(dict)
    def __init__(self, root, badscore=0.0):
        """
        Initialize the thesaurus.

        :param root: root directory containing thesaurus LISP files
        :type root: C{string}
        :param badscore: the score to give to words which do not appear in each other's sets of synonyms
        :type badscore: C{float}
        """
        super().__init__(root, r"sim[A-Z]\.lsp")
        # _thesaurus[fileid][key][synonym] -> similarity score
        self._thesaurus = defaultdict(LinThesaurusCorpusReader.__defaultdict_factory)
        self._badscore = badscore
        # NOTE(review): files are opened with the platform default encoding --
        # confirm the distributed .lsp files are ASCII/UTF-8.
        for path, encoding, fileid in self.abspaths(
            include_encoding=True, include_fileid=True
        ):
            with open(path) as lin_file:
                first = True
                for line in lin_file:
                    line = line.strip()
                    # Start of entry
                    if first:
                        key = LinThesaurusCorpusReader._key_re.sub(r"\1", line)
                        first = False
                    # End of entry
                    elif line == "))":
                        first = True
                    # Lines with pairs of ngrams and scores
                    else:
                        split_line = line.split("\t")
                        if len(split_line) == 2:
                            ngram, score = split_line
                            self._thesaurus[fileid][key][ngram.strip('"')] = float(
                                score
                            )
    def similarity(self, ngram1, ngram2, fileid=None):
        """
        Returns the similarity score for two ngrams.

        :param ngram1: first ngram to compare
        :type ngram1: C{string}
        :param ngram2: second ngram to compare
        :type ngram2: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, just the score for the two ngrams; otherwise,
                 list of tuples of fileids and scores.
        """
        # Entries don't contain themselves, so make sure similarity between item and itself is 1.0
        if ngram1 == ngram2:
            if fileid:
                return 1.0
            else:
                return [(fid, 1.0) for fid in self._fileids]
        else:
            if fileid:
                # Missing pairs fall back to the configured bad score.
                return (
                    self._thesaurus[fileid][ngram1][ngram2]
                    if ngram2 in self._thesaurus[fileid][ngram1]
                    else self._badscore
                )
            else:
                return [
                    (
                        fid,
                        (
                            self._thesaurus[fid][ngram1][ngram2]
                            if ngram2 in self._thesaurus[fid][ngram1]
                            else self._badscore
                        ),
                    )
                    for fid in self._fileids
                ]
    def scored_synonyms(self, ngram, fileid=None):
        """
        Returns a list of scored synonyms (tuples of synonyms and scores) for the current ngram

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, list of tuples of scores and synonyms; otherwise,
                 list of tuples of fileids and lists, where inner lists consist of tuples of
                 scores and synonyms.
        """
        if fileid:
            return self._thesaurus[fileid][ngram].items()
        else:
            return [
                (fileid, self._thesaurus[fileid][ngram].items())
                for fileid in self._fileids
            ]
    def synonyms(self, ngram, fileid=None):
        """
        Returns a list of synonyms for the current ngram.

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, list of synonyms; otherwise, list of tuples of fileids and
                 lists, where inner lists contain synonyms.
        """
        if fileid:
            return self._thesaurus[fileid][ngram].keys()
        else:
            return [
                (fileid, self._thesaurus[fileid][ngram].keys())
                for fileid in self._fileids
            ]
    def __contains__(self, ngram):
        """
        Determines whether or not the given ngram is in the thesaurus.

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :return: whether the given ngram is in the thesaurus.
        """
        # True if the ngram appears as a key in any of the thesaurus files.
        return reduce(
            lambda accum, fileid: accum or (ngram in self._thesaurus[fileid]),
            self._fileids,
            False,
        )
######################################################################
# Demo
######################################################################
def demo():
    """Print synonym and similarity lookups for a sample word pair."""
    from nltk.corpus import lin_thesaurus as thes

    word1 = "business"
    word2 = "enterprise"
    print("Getting synonyms for " + word1)
    print(thes.synonyms(word1))
    print("Getting scored synonyms for " + word1)
    print(thes.scored_synonyms(word1))
    # Fix: this lookup was accidentally printed twice (copy-paste duplication).
    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))
    print(f"Similarity score for {word1} and {word2}:")
    print(thes.similarity(word1, word2))
if __name__ == "__main__":
    # Run the interactive demo when this module is executed as a script.
    demo()

View File

@@ -0,0 +1,344 @@
from collections import namedtuple
from functools import partial, wraps
from nltk.corpus.reader.api import CategorizedCorpusReader
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus.reader.util import concat, read_blankline_block
from nltk.tokenize import blankline_tokenize, sent_tokenize, word_tokenize
def comma_separated_string_args(func):
    """
    Decorator allowing a function to receive a single comma-separated
    string (or a list) where it expects a collection; the value is
    converted to a set before the call.
    """

    def _split(value):
        # A comma-separated string becomes a set of stripped parts.
        return {part.strip() for part in value.split(",")}

    @wraps(func)
    def wrapper(*args, **kwargs):
        converted = []
        for arg in args:
            if isinstance(arg, str):
                converted.append(_split(arg))
            elif isinstance(arg, list):
                converted.append(set(arg))
            else:
                converted.append(arg)
        # Keyword arguments are converted only when they are strings,
        # mirroring the original behavior (lists are left untouched).
        kwargs = {
            name: _split(value) if isinstance(value, str) else value
            for name, value in kwargs.items()
        }
        return func(*converted, **kwargs)

    return wrapper
def read_parse_blankline_block(stream, parser):
    """Read one blankline-delimited block from *stream* and render it
    to plain text with *parser*; returns [] at end of stream."""
    blocks = read_blankline_block(stream)
    if not blocks:
        return blocks
    return [parser.render(blocks[0])]
class MarkdownBlock:
    """Plain-text content of a markdown element, with tokenized views."""

    def __init__(self, content):
        self.content = content
        # Maximum characters shown by str()/repr() before truncation.
        self.truncate_at = 16

    def __repr__(self):
        return f"{self.__class__.__name__}(content={repr(str(self))})"

    def __str__(self):
        prefix = self.content[: self.truncate_at]
        suffix = "..." if len(self.content) > self.truncate_at else ""
        return f"{prefix}{suffix}"

    @property
    def raw(self):
        """The full, untruncated content string."""
        return self.content

    @property
    def words(self):
        """Word tokens of the content."""
        return word_tokenize(self.content)

    @property
    def sents(self):
        """Sentences, each a list of word tokens."""
        return [word_tokenize(sentence) for sentence in sent_tokenize(self.content)]

    @property
    def paras(self):
        """Paragraphs, each a list of tokenized sentences."""
        tokenized = []
        for para in blankline_tokenize(self.content):
            tokenized.append([word_tokenize(s) for s in sent_tokenize(para)])
        return tokenized
class CodeBlock(MarkdownBlock):
    """A fenced code block; tokenized line-by-line rather than by sentence."""

    def __init__(self, language, *args):
        self.language = language
        super().__init__(*args)

    @property
    def sents(self):
        """Each source line as a list of word tokens."""
        return [word_tokenize(src_line) for src_line in self.content.splitlines()]

    @property
    def lines(self):
        """The raw source lines of the code block."""
        return self.content.splitlines()

    @property
    def paras(self):
        """Blankline-separated chunks, each a list of tokenized lines."""
        return [
            [word_tokenize(src_line) for src_line in chunk.splitlines()]
            for chunk in blankline_tokenize(self.content)
        ]
class MarkdownSection(MarkdownBlock):
    """A document section: heading text, heading level, and body content."""

    def __init__(self, heading, level, *args):
        self.level = level
        self.heading = heading
        super().__init__(*args)
# Lightweight records for inline markdown constructs.
Image = namedtuple("Image", "label, src, title")
Link = namedtuple("Link", "label, href, title")
# is_ordered distinguishes numbered lists from bullet lists.
List = namedtuple("List", "is_ordered, items")
class MarkdownCorpusReader(PlaintextCorpusReader):
    """Plaintext-style corpus reader for markdown files; markup is stripped
    by rendering each paragraph through a markdown parser."""
    def __init__(self, *args, parser=None, **kwargs):
        from markdown_it import MarkdownIt
        from mdit_plain.renderer import RendererPlain
        from mdit_py_plugins.front_matter import front_matter_plugin
        # Default parser renders markdown to plain text and recognizes
        # YAML front-matter blocks.
        self.parser = parser
        if self.parser is None:
            self.parser = MarkdownIt("commonmark", renderer_cls=RendererPlain)
            self.parser.use(front_matter_plugin)
        kwargs.setdefault(
            "para_block_reader", partial(read_parse_blankline_block, parser=self.parser)
        )
        super().__init__(*args, **kwargs)
    # This override takes care of removing markup.
    def _read_word_block(self, stream):
        # Tokenize each rendered (markup-free) paragraph into words.
        words = list()
        for para in self._para_block_reader(stream):
            words.extend(self._word_tokenizer.tokenize(para))
        return words
class CategorizedMarkdownCorpusReader(CategorizedCorpusReader, MarkdownCorpusReader):
"""
A reader for markdown corpora whose documents are divided into
categories based on their file identifiers.
Based on nltk.corpus.reader.plaintext.CategorizedPlaintextCorpusReader:
https://www.nltk.org/_modules/nltk/corpus/reader/api.html#CategorizedCorpusReader
"""
    def __init__(self, *args, cat_field="tags", **kwargs):
        """
        Initialize the corpus reader. Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor. The remaining arguments
        are passed to the ``MarkdownCorpusReader`` constructor.

        :param cat_field: front-matter key whose value supplies a file's
            categories when no explicit categorization argument is given.
        """
        cat_args = ["cat_pattern", "cat_map", "cat_file"]
        if not any(arg in kwargs for arg in cat_args):
            # Initialize with a blank map now,
            # and try to build categories from document metadata later.
            kwargs["cat_map"] = dict()
        CategorizedCorpusReader.__init__(self, kwargs)
        MarkdownCorpusReader.__init__(self, *args, **kwargs)
        # Map file IDs to categories if self._map exists but is still empty:
        if self._map is not None and not self._map:
            for file_id in self._fileids:
                # The first front-matter block of each file supplies the
                # category field (default "tags").
                metadata = self.metadata(file_id)
                if metadata:
                    self._map[file_id] = metadata[0].get(cat_field, [])
### Begin CategorizedCorpusReader Overrides
@comma_separated_string_args
def categories(self, fileids=None):
return super().categories(fileids)
@comma_separated_string_args
def fileids(self, categories=None):
if categories is None:
return self._fileids
return super().fileids(categories)
### End CategorizedCorpusReader Overrides
### Begin MarkdownCorpusReader Overrides
@comma_separated_string_args
def raw(self, fileids=None, categories=None):
return super().raw(self._resolve(fileids, categories))
@comma_separated_string_args
def words(self, fileids=None, categories=None):
return super().words(self._resolve(fileids, categories))
@comma_separated_string_args
def sents(self, fileids=None, categories=None):
return super().sents(self._resolve(fileids, categories))
@comma_separated_string_args
def paras(self, fileids=None, categories=None):
return super().paras(self._resolve(fileids, categories))
### End MarkdownCorpusReader Overrides
def concatenated_view(self, reader, fileids, categories):
return concat(
[
self.CorpusView(path, reader, encoding=enc)
for (path, enc) in self.abspaths(
self._resolve(fileids, categories), include_encoding=True
)
]
)
def metadata_reader(self, stream):
from yaml import safe_load
return [
safe_load(t.content)
for t in self.parser.parse(stream.read())
if t.type == "front_matter"
]
@comma_separated_string_args
def metadata(self, fileids=None, categories=None):
return self.concatenated_view(self.metadata_reader, fileids, categories)
def blockquote_reader(self, stream):
tokens = self.parser.parse(stream.read())
opening_tokens = filter(
lambda t: t.level == 0 and t.type == "blockquote_open", tokens
)
closing_tokens = filter(
lambda t: t.level == 0 and t.type == "blockquote_close", tokens
)
blockquotes = list()
for o, c in zip(opening_tokens, closing_tokens):
opening_index = tokens.index(o)
closing_index = tokens.index(c, opening_index)
blockquotes.append(tokens[opening_index : closing_index + 1])
return [
MarkdownBlock(
self.parser.renderer.render(block, self.parser.options, env=None)
)
for block in blockquotes
]
@comma_separated_string_args
def blockquotes(self, fileids=None, categories=None):
return self.concatenated_view(self.blockquote_reader, fileids, categories)
def code_block_reader(self, stream):
return [
CodeBlock(
t.info,
t.content,
)
for t in self.parser.parse(stream.read())
if t.level == 0 and t.type in ("fence", "code_block")
]
@comma_separated_string_args
def code_blocks(self, fileids=None, categories=None):
return self.concatenated_view(self.code_block_reader, fileids, categories)
def image_reader(self, stream):
return [
Image(
child_token.content,
child_token.attrGet("src"),
child_token.attrGet("title"),
)
for inline_token in filter(
lambda t: t.type == "inline", self.parser.parse(stream.read())
)
for child_token in inline_token.children
if child_token.type == "image"
]
@comma_separated_string_args
def images(self, fileids=None, categories=None):
return self.concatenated_view(self.image_reader, fileids, categories)
def link_reader(self, stream):
return [
Link(
inline_token.children[i + 1].content,
child_token.attrGet("href"),
child_token.attrGet("title"),
)
for inline_token in filter(
lambda t: t.type == "inline", self.parser.parse(stream.read())
)
for i, child_token in enumerate(inline_token.children)
if child_token.type == "link_open"
]
@comma_separated_string_args
def links(self, fileids=None, categories=None):
return self.concatenated_view(self.link_reader, fileids, categories)
def list_reader(self, stream):
tokens = self.parser.parse(stream.read())
opening_types = ("bullet_list_open", "ordered_list_open")
opening_tokens = filter(
lambda t: t.level == 0 and t.type in opening_types, tokens
)
closing_types = ("bullet_list_close", "ordered_list_close")
closing_tokens = filter(
lambda t: t.level == 0 and t.type in closing_types, tokens
)
list_blocks = list()
for o, c in zip(opening_tokens, closing_tokens):
opening_index = tokens.index(o)
closing_index = tokens.index(c, opening_index)
list_blocks.append(tokens[opening_index : closing_index + 1])
return [
List(
tokens[0].type == "ordered_list_open",
[t.content for t in tokens if t.content],
)
for tokens in list_blocks
]
@comma_separated_string_args
def lists(self, fileids=None, categories=None):
return self.concatenated_view(self.list_reader, fileids, categories)
def section_reader(self, stream):
section_blocks, block = list(), list()
for t in self.parser.parse(stream.read()):
if t.level == 0 and t.type == "heading_open":
if not block:
block.append(t)
else:
section_blocks.append(block)
block = [t]
elif block:
block.append(t)
if block:
section_blocks.append(block)
return [
MarkdownSection(
block[1].content,
block[0].markup.count("#"),
self.parser.renderer.render(block, self.parser.options, env=None),
)
for block in section_blocks
]
@comma_separated_string_args
def sections(self, fileids=None, categories=None):
return self.concatenated_view(self.section_reader, fileids, categories)

View File

@@ -0,0 +1,398 @@
"""
A reader for corpora whose documents are in MTE format.
"""
import os
import re
from functools import reduce
from nltk.corpus.reader import TaggedCorpusReader, concat
from nltk.corpus.reader.xmldocs import XMLCorpusView
def xpath(root, path, ns):
    """Return all elements under *root* matching *path*, resolving
    namespace prefixes through the mapping *ns* (thin ``findall`` wrapper)."""
    matches = root.findall(path, ns)
    return matches
class MTECorpusView(XMLCorpusView):
    """
    A lazy, stream-backed view of an MTE corpus file.  Behaves exactly
    like ``XMLCorpusView`` except that entries for which the element
    handler returned ``None`` are dropped from each block.
    """
    def __init__(self, fileid, tagspec, elt_handler=None):
        XMLCorpusView.__init__(self, fileid, tagspec, elt_handler)
    def read_block(self, stream, tagspec=None, elt_handler=None):
        # Drop entries the handler rejected (returned None).
        block = XMLCorpusView.read_block(self, stream, tagspec, elt_handler)
        return [entry for entry in block if entry is not None]
class MTEFileReader:
    """
    Class for loading the content of the multext-east corpus. It
    parses the xml files and does some tag-filtering depending on the
    given method parameters.
    """

    # XML namespace prefixes used by the TEI-P5 documents.  These URIs are
    # namespace *identifiers* and must match the documents byte-for-byte:
    # both the TEI namespace and the W3C "xml" namespace are defined with
    # the http (not https) scheme, so using https here would silently
    # match nothing.
    ns = {
        "tei": "http://www.tei-c.org/ns/1.0",
        "xml": "http://www.w3.org/XML/1998/namespace",
    }
    tag_ns = "{http://www.tei-c.org/ns/1.0}"
    xml_ns = "{http://www.w3.org/XML/1998/namespace}"
    # Tagspecs (consumed by MTECorpusView) addressing words, sentences
    # and paragraphs inside a TEI document.
    word_path = "TEI/text/body/div/div/p/s/(w|c)"
    sent_path = "TEI/text/body/div/div/p/s"
    para_path = "TEI/text/body/div/div/p"

    def __init__(self, file_path):
        self.__file_path = file_path

    @classmethod
    def _word_elt(cls, elt, context):
        # A <w> or <c> element: the token is its text content.
        return elt.text

    @classmethod
    def _sent_elt(cls, elt, context):
        return [cls._word_elt(w, None) for w in xpath(elt, "*", cls.ns)]

    @classmethod
    def _para_elt(cls, elt, context):
        return [cls._sent_elt(s, None) for s in xpath(elt, "*", cls.ns)]

    @classmethod
    def _tagged_word_elt(cls, elt, context):
        """Return (word, tag) for one element, or None when the word is
        filtered out by the currently configured tag filter."""
        # Words with no morphosyntactic annotation get an empty tag.
        if "ana" not in elt.attrib:
            return (elt.text, "")
        if cls.__tags == "" and cls.__tagset == "msd":
            return (elt.text, elt.attrib["ana"])
        elif cls.__tags == "" and cls.__tagset == "universal":
            return (elt.text, MTETagConverter.msd_to_universal(elt.attrib["ana"]))
        else:
            # Build a regex from the requested tag prefix: "-" acts as a
            # wildcard position in an MSD tag, so map it to ".".
            tags = re.compile("^" + re.sub("-", ".", cls.__tags) + ".*$")
            if tags.match(elt.attrib["ana"]):
                if cls.__tagset == "msd":
                    return (elt.text, elt.attrib["ana"])
                else:
                    return (
                        elt.text,
                        MTETagConverter.msd_to_universal(elt.attrib["ana"]),
                    )
            else:
                # Filtered out; MTECorpusView drops None entries.
                return None

    @classmethod
    def _tagged_sent_elt(cls, elt, context):
        return list(
            filter(
                lambda x: x is not None,
                [cls._tagged_word_elt(w, None) for w in xpath(elt, "*", cls.ns)],
            )
        )

    @classmethod
    def _tagged_para_elt(cls, elt, context):
        return list(
            filter(
                lambda x: x is not None,
                [cls._tagged_sent_elt(s, None) for s in xpath(elt, "*", cls.ns)],
            )
        )

    @classmethod
    def _lemma_word_elt(cls, elt, context):
        # Pair each word with its lemma ("" when not annotated).
        if "lemma" not in elt.attrib:
            return (elt.text, "")
        else:
            return (elt.text, elt.attrib["lemma"])

    @classmethod
    def _lemma_sent_elt(cls, elt, context):
        return [cls._lemma_word_elt(w, None) for w in xpath(elt, "*", cls.ns)]

    @classmethod
    def _lemma_para_elt(cls, elt, context):
        return [cls._lemma_sent_elt(s, None) for s in xpath(elt, "*", cls.ns)]

    def words(self):
        """Lazy view of all words/punctuation tokens."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.word_path, MTEFileReader._word_elt
        )

    def sents(self):
        """Lazy view of sentences (lists of words)."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.sent_path, MTEFileReader._sent_elt
        )

    def paras(self):
        """Lazy view of paragraphs (lists of sentences)."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.para_path, MTEFileReader._para_elt
        )

    def lemma_words(self):
        """Lazy view of (word, lemma) tuples."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.word_path, MTEFileReader._lemma_word_elt
        )

    # NOTE: the tagged_* methods configure the filter on the *class*, so
    # concurrently consumed views with different tagset/tags settings would
    # interfere with each other.  Kept for backward compatibility.
    def tagged_words(self, tagset, tags):
        MTEFileReader.__tagset = tagset
        MTEFileReader.__tags = tags
        return MTECorpusView(
            self.__file_path, MTEFileReader.word_path, MTEFileReader._tagged_word_elt
        )

    def lemma_sents(self):
        """Lazy view of sentences of (word, lemma) tuples."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.sent_path, MTEFileReader._lemma_sent_elt
        )

    def tagged_sents(self, tagset, tags):
        MTEFileReader.__tagset = tagset
        MTEFileReader.__tags = tags
        return MTECorpusView(
            self.__file_path, MTEFileReader.sent_path, MTEFileReader._tagged_sent_elt
        )

    def lemma_paras(self):
        """Lazy view of paragraphs of sentences of (word, lemma) tuples."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.para_path, MTEFileReader._lemma_para_elt
        )

    def tagged_paras(self, tagset, tags):
        MTEFileReader.__tagset = tagset
        MTEFileReader.__tags = tags
        return MTECorpusView(
            self.__file_path, MTEFileReader.para_path, MTEFileReader._tagged_para_elt
        )
class MTETagConverter:
    """
    Class for converting msd tags to universal tags, more conversion
    options are currently not implemented.
    """

    # First character of an MSD annotation -> universal POS category.
    mapping_msd_universal = {
        "A": "ADJ",
        "S": "ADP",
        "R": "ADV",
        "C": "CONJ",
        "D": "DET",
        "N": "NOUN",
        "M": "NUM",
        "Q": "PRT",
        "P": "PRON",
        "V": "VERB",
        ".": ".",
        "-": "X",
    }

    @staticmethod
    def msd_to_universal(tag):
        """
        Convert a Multext-East MSD annotation to the universal tagset as
        described in Chapter 5 of the NLTK book.  The category is selected
        by the first character of the tag (skipping a leading ``#``);
        unknown categories are mapped to ``X``.
        """
        indicator = tag[0] if not tag[0] == "#" else tag[1]
        # `not in` idiom; unknown indicators fall back to "-" -> "X".
        if indicator not in MTETagConverter.mapping_msd_universal:
            indicator = "-"
        return MTETagConverter.mapping_msd_universal[indicator]
class MTECorpusReader(TaggedCorpusReader):
    """
    Reader for corpora following the TEI-p5 xml scheme, such as MULTEXT-East.
    MULTEXT-East contains part-of-speech-tagged words with a quite precise tagging
    scheme. These tags can be converted to the Universal tagset
    """

    def __init__(self, root=None, fileids=None, encoding="utf8"):
        """
        Construct a new MTECorpusreader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP

        :param root: The root directory for this corpus. (default points to location in multext config file)
        :param fileids: A list or regexp specifying the fileids in this corpus. (default is oana-en.xml)
        :param encoding: The encoding of the given files (default is utf8)
        """
        TaggedCorpusReader.__init__(self, root, fileids, encoding)
        self._readme = "00README.txt"

    def __fileids(self, fileids):
        """
        Normalize and validate *fileids*: default to the whole corpus,
        accept a single string, and drop ids that are unknown or not
        TEI-P5 compatible.  Returns a (possibly empty) list.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        # filter wrong userinput
        fileids = [f for f in fileids if f in self._fileids]
        # filter multext-east sourcefiles that are not compatible to the teip5 specification
        fileids = [f for f in fileids if f not in ("oana-bg.xml", "oana-mk.xml")]
        # Bug fix: the previous implementation tested ``if not fileids``
        # on lazy ``filter`` objects, which are always truthy, so this
        # warning could never fire.  Materializing to lists makes the
        # emptiness check work (and the result reusable).
        if not fileids:
            print("No valid multext-east file specified")
        return fileids

    def words(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).words()
                for f in self.__fileids(fileids)
            ]
        )

    def sents(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of sentences or utterances,
            each encoded as a list of word strings
        :rtype: list(list(str))
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).sents()
                for f in self.__fileids(fileids)
            ]
        )

    def paras(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of paragraphs, each encoded as a list
            of sentences, which are in turn encoded as lists of word string
        :rtype: list(list(list(str)))
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).paras()
                for f in self.__fileids(fileids)
            ]
        )

    def lemma_words(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of words, the corresponding lemmas
            and punctuation symbols, encoded as tuples (word, lemma)
        :rtype: list(tuple(str,str))
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).lemma_words()
                for f in self.__fileids(fileids)
            ]
        )

    def tagged_words(self, fileids=None, tagset="msd", tags=""):
        """
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
            either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
            that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of tagged words and punctuation symbols
            encoded as tuples (word, tag)
        :rtype: list(tuple(str, str))
        """
        if tagset == "universal" or tagset == "msd":
            return concat(
                [
                    MTEFileReader(os.path.join(self._root, f)).tagged_words(
                        tagset, tags
                    )
                    for f in self.__fileids(fileids)
                ]
            )
        else:
            # Historical behaviour: warn and return None for unknown tagsets.
            print("Unknown tagset specified.")

    def lemma_sents(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of sentences or utterances, each
            encoded as a list of tuples of the word and the corresponding
            lemma (word, lemma)
        :rtype: list(list(tuple(str, str)))
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).lemma_sents()
                for f in self.__fileids(fileids)
            ]
        )

    def tagged_sents(self, fileids=None, tagset="msd", tags=""):
        """
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
            either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
            that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of sentences or utterances, each
            each encoded as a list of (word,tag) tuples
        :rtype: list(list(tuple(str, str)))
        """
        if tagset == "universal" or tagset == "msd":
            return concat(
                [
                    MTEFileReader(os.path.join(self._root, f)).tagged_sents(
                        tagset, tags
                    )
                    for f in self.__fileids(fileids)
                ]
            )
        else:
            print("Unknown tagset specified.")

    def lemma_paras(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of paragraphs, each encoded as a
            list of sentences, which are in turn encoded as a list of
            tuples of the word and the corresponding lemma (word, lemma)
        :rtype: list(List(List(tuple(str, str))))
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).lemma_paras()
                for f in self.__fileids(fileids)
            ]
        )

    def tagged_paras(self, fileids=None, tagset="msd", tags=""):
        """
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
            either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
            that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of paragraphs, each encoded as a
            list of sentences, which are in turn encoded as a list
            of (word,tag) tuples
        :rtype: list(list(list(tuple(str, str))))
        """
        if tagset == "universal" or tagset == "msd":
            return concat(
                [
                    MTEFileReader(os.path.join(self._root, f)).tagged_paras(
                        tagset, tags
                    )
                    for f in self.__fileids(fileids)
                ]
            )
        else:
            print("Unknown tagset specified.")

View File

@@ -0,0 +1,486 @@
# Natural Language Toolkit: NKJP Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Gabriela Kaczka
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import functools
import os
import re
import tempfile
from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
def _parse_args(fun):
"""
Wraps function arguments:
if fileids not specified then function set NKJPCorpusReader paths.
"""
@functools.wraps(fun)
def decorator(self, fileids=None, **kwargs):
if not fileids:
fileids = self._paths
return fun(self, fileids, **kwargs)
return decorator
class NKJPCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the National Corpus of Polish (NKJP).  Each document
    is a directory containing ``header.xml``, ``text.xml``,
    ``ann_segmentation.xml`` and ``ann_morphosyntax.xml``; the mode
    constants below select which of those files a view reads.
    """
    WORDS_MODE = 0
    SENTS_MODE = 1
    HEADER_MODE = 2
    RAW_MODE = 3
    def __init__(self, root, fileids=".*"):
        """
        Corpus reader designed to work with National Corpus of Polish.
        See http://nkjp.pl/ for more details about NKJP.
        use example:
        import nltk
        import nkjp
        from nkjp import NKJPCorpusReader
        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='') # obtain the whole corpus
        x.header()
        x.raw()
        x.words()
        x.tagged_words(tags=['subst', 'comp']) #Link to find more tags: nkjp.pl/poliqarp/help/ense2.html
        x.sents()
        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='Wilk*') # obtain particular file(s)
        x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'])
        x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp'])
        """
        # Every NKJP document directory contains a header.xml; matching on
        # it yields exactly one fileid per document.
        if isinstance(fileids, str):
            XMLCorpusReader.__init__(self, root, fileids + ".*/header.xml")
        else:
            XMLCorpusReader.__init__(
                self, root, [fileid + "/header.xml" for fileid in fileids]
            )
        self._paths = self.get_paths()
    def get_paths(self):
        # Absolute directory path of each document (the fileid minus its
        # trailing "header.xml").
        return [
            os.path.join(str(self._root), f.split("header.xml")[0])
            for f in self._fileids
        ]
    def fileids(self):
        """
        Returns a list of file identifiers for the fileids that make up
        this corpus.
        """
        return [f.split("header.xml")[0] for f in self._fileids]
    def _view(self, filename, tags=None, **kwargs):
        """
        Returns a view specialised for use with particular corpus file.
        """
        # Dispatch on the requested mode; default is WORDS_MODE.
        mode = kwargs.pop("mode", NKJPCorpusReader.WORDS_MODE)
        if mode is NKJPCorpusReader.WORDS_MODE:
            return NKJPCorpus_Morph_View(filename, tags=tags)
        elif mode is NKJPCorpusReader.SENTS_MODE:
            return NKJPCorpus_Segmentation_View(filename, tags=tags)
        elif mode is NKJPCorpusReader.HEADER_MODE:
            return NKJPCorpus_Header_View(filename, tags=tags)
        elif mode is NKJPCorpusReader.RAW_MODE:
            return NKJPCorpus_Text_View(
                filename, tags=tags, mode=NKJPCorpus_Text_View.RAW_MODE
            )
        else:
            raise NameError("No such mode!")
    def add_root(self, fileid):
        """
        Add root if necessary to specified fileid.
        """
        # NOTE(review): assumes ``self.root`` supports ``in`` and ``+``
        # with a str fileid (i.e. behaves like a path string) — confirm
        # for non-filesystem corpus roots.
        if self.root in fileid:
            return fileid
        return self.root + fileid
    @_parse_args
    def header(self, fileids=None, **kwargs):
        """
        Returns header(s) of specified fileids.
        """
        return concat(
            [
                self._view(
                    self.add_root(fileid), mode=NKJPCorpusReader.HEADER_MODE, **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )
    @_parse_args
    def sents(self, fileids=None, **kwargs):
        """
        Returns sentences in specified fileids.
        """
        return concat(
            [
                self._view(
                    self.add_root(fileid), mode=NKJPCorpusReader.SENTS_MODE, **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )
    @_parse_args
    def words(self, fileids=None, **kwargs):
        """
        Returns words in specified fileids.
        """
        return concat(
            [
                self._view(
                    self.add_root(fileid), mode=NKJPCorpusReader.WORDS_MODE, **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )
    @_parse_args
    def tagged_words(self, fileids=None, **kwargs):
        """
        Call with specified tags as a list, e.g. tags=['subst', 'comp'].
        Returns tagged words in specified fileids.
        """
        tags = kwargs.pop("tags", [])
        return concat(
            [
                self._view(
                    self.add_root(fileid),
                    mode=NKJPCorpusReader.WORDS_MODE,
                    tags=tags,
                    **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )
    @_parse_args
    def raw(self, fileids=None, **kwargs):
        """
        Returns words in specified fileids.
        """
        return concat(
            [
                self._view(
                    self.add_root(fileid), mode=NKJPCorpusReader.RAW_MODE, **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )
class NKJPCorpus_Header_View(XMLCorpusView):
    """
    HEADER_MODE: a stream backed corpus view specialized for use with
    header.xml files in NKJP corpus.  Yields one dict of bibliographic
    fields per <sourceDesc> element.
    """

    def __init__(self, filename, **kwargs):
        """
        :param filename: path of the document directory; "header.xml" is
            appended to locate the actual file.
        """
        self.tagspec = ".*/sourceDesc$"
        XMLCorpusView.__init__(self, filename + "header.xml", self.tagspec)

    def handle_query(self):
        """Read the whole header file and return the accumulated entries."""
        self._open()
        header = []
        while True:
            segm = XMLCorpusView.read_block(self, self._stream)
            if len(segm) == 0:
                break
            header.extend(segm)
        self.close()
        return header

    @staticmethod
    def _bibl_field(elt, path):
        # Join the stripped text of all elements matching *path* with
        # newlines.  An empty *list* is returned when nothing matches,
        # preserving the historical return type for missing fields.
        found = elt.findall(path)
        if not found:
            return []
        return "\n".join(node.text.strip() for node in found)

    def handle_elt(self, elt, context):
        """Turn one <sourceDesc> element into a dict of <bibl> fields."""
        return {
            "title": self._bibl_field(elt, "bibl/title"),
            "author": self._bibl_field(elt, "bibl/author"),
            "date": self._bibl_field(elt, "bibl/date"),
            "publisher": self._bibl_field(elt, "bibl/publisher"),
            "idno": self._bibl_field(elt, "bibl/idno"),
            "note": self._bibl_field(elt, "bibl/note"),
        }
class XML_Tool:
    """
    Helper class creating xml file to one without references to nkjp: namespace.
    That's needed because the XMLCorpusView assumes that one can find short substrings
    of XML that are valid XML, which is not true if a namespace is declared at top level
    """

    # Literal markup stripped from ann_segmentation.xml.
    _LITERAL_TAGS = ("<nkjp:paren>", "</nkjp:paren>", "<choice>", "</choice>")

    def __init__(self, root, filename):
        self.read_file = os.path.join(root, filename)
        # Open the temp file in *text* mode: build_preprocessed_file writes
        # decoded str lines.  (The previous default binary mode made every
        # write fail with TypeError.)  delete=False so the file survives
        # until remove_preprocessed_file() is called.
        self.write_file = tempfile.NamedTemporaryFile("w", delete=False)

    def build_preprocessed_file(self):
        """
        Copy ``read_file`` into the temporary file with all nkjp: namespace
        references removed, and return the temporary file's name.  On any
        failure the temporary file is removed and a wrapped exception is
        raised.
        """
        try:
            with open(self.read_file) as src, self.write_file as dst:
                for line in src:
                    # nkjp:attr="..." attributes (present in all files);
                    # each match is replaced by a single space, exactly as
                    # the old re.split + " ".join pair did.
                    line = re.sub(r"nkjp:[^ ]* ", " ", line)
                    # Literal tags found in ann_segmentation.xml.
                    for tag in self._LITERAL_TAGS:
                        line = line.replace(tag, " ")
                    dst.write(line)
            return self.write_file.name
        except Exception as e:
            self.remove_preprocessed_file()
            raise Exception from e

    def remove_preprocessed_file(self):
        # Delete the temporary copy (delete=False means we own cleanup).
        os.remove(self.write_file.name)
class NKJPCorpus_Segmentation_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    ann_segmentation.xml files in NKJP corpus.
    """
    def __init__(self, filename, **kwargs):
        self.tagspec = ".*p/.*s"
        # intersperse NKJPCorpus_Text_View
        # (reads text.xml once so sentence offsets can be resolved below)
        self.text_view = NKJPCorpus_Text_View(
            filename, mode=NKJPCorpus_Text_View.SENTS_MODE
        )
        self.text_view.handle_query()
        # xml preprocessing
        self.xml_tool = XML_Tool(filename, "ann_segmentation.xml")
        # base class init
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )
    # The helpers below pick apart a <seg> ``corresp`` attribute, which
    # (judging from the parsing here) has the shape
    # ``target(segment_id,offset,length)`` — id of the text segment, the
    # start offset, and the length within that segment.
    def get_segm_id(self, example_word):
        # Text-segment id: first comma-field inside the parentheses.
        return example_word.split("(")[1].split(",")[0]
    def get_sent_beg(self, beg_word):
        # returns index of beginning letter in sentence
        return int(beg_word.split(",")[1])
    def get_sent_end(self, end_word):
        # returns index of end letter in sentence (offset + length)
        splitted = end_word.split(")")[0].split(",")
        return int(splitted[1]) + int(splitted[2])
    def get_sentences(self, sent_segm):
        # returns one sentence: slice the referenced text segment from the
        # first word's start offset to the last word's end offset
        id = self.get_segm_id(sent_segm[0])
        segm = self.text_view.segm_dict[id]  # text segment
        beg = self.get_sent_beg(sent_segm[0])
        end = self.get_sent_end(sent_segm[len(sent_segm) - 1])
        return segm[beg:end]
    def remove_choice(self, segm):
        # Keep only one alternative where the annotation offers a choice:
        # words whose start offset does not move forward (and that stay in
        # the same text segment) are overlapping alternatives and dropped.
        ret = []
        prev_txt_end = -1
        prev_txt_nr = -1
        for word in segm:
            txt_nr = self.get_segm_id(word)
            # get increasing sequence of ids: in case of choice get first possibility
            if self.get_sent_beg(word) > prev_txt_end - 1 or prev_txt_nr != txt_nr:
                ret.append(word)
                prev_txt_end = self.get_sent_end(word)
                prev_txt_nr = txt_nr
        return ret
    def handle_query(self):
        """Read all sentences, clean up the preprocessed temp file, and
        return the list of sentence strings."""
        try:
            self._open()
            sentences = []
            while True:
                sent_segm = XMLCorpusView.read_block(self, self._stream)
                if len(sent_segm) == 0:
                    break
                for segm in sent_segm:
                    segm = self.remove_choice(segm)
                    sentences.append(self.get_sentences(segm))
            self.close()
            self.xml_tool.remove_preprocessed_file()
            return sentences
        except Exception as e:
            # Ensure the temp file is removed even on failure.
            self.xml_tool.remove_preprocessed_file()
            raise Exception from e
    def handle_elt(self, elt, context):
        # One sentence element: collect the ``corresp`` pointer of each
        # contained <seg>.
        ret = []
        for seg in elt:
            ret.append(seg.get("corresp"))
        return ret
class NKJPCorpus_Text_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    text.xml files in NKJP corpus.

    ``mode`` selects between SENTS_MODE (additionally record each <ab>
    segment in ``segm_dict`` for later offset lookups by the segmentation
    view) and RAW_MODE (just return the raw text).
    """

    SENTS_MODE = 0
    RAW_MODE = 1

    def __init__(self, filename, **kwargs):
        # Use the named constant instead of a magic 0 for the default mode.
        self.mode = kwargs.pop("mode", NKJPCorpus_Text_View.SENTS_MODE)
        self.tagspec = ".*/div/ab"
        # Maps segment id -> segment text; filled by handle_elt in SENTS_MODE.
        self.segm_dict = dict()
        # xml preprocessing
        self.xml_tool = XML_Tool(filename, "text.xml")
        # base class init
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )

    def handle_query(self):
        """Read the whole document, remove the preprocessed temp file, and
        return the result of :meth:`read_block`."""
        try:
            self._open()
            text = self.read_block(self._stream)
            self.close()
            self.xml_tool.remove_preprocessed_file()
            return text
        except Exception as e:
            self.xml_tool.remove_preprocessed_file()
            raise Exception from e

    def read_block(self, stream, tagspec=None, elt_handler=None):
        """
        Returns text as a list of sentences.
        """
        txt = []
        while True:
            segm = XMLCorpusView.read_block(self, stream)
            if len(segm) == 0:
                break
            txt.extend(segm)
        # Single-element list: all segments joined into one string.
        # (Was ``" ".join([segm for segm in txt])`` — a redundant identity
        # comprehension.)
        return [" ".join(txt)]

    def get_segm_id(self, elt):
        # The id attribute is namespace-qualified, so match on its suffix.
        for attr in elt.attrib:
            if attr.endswith("id"):
                return elt.get(attr)

    def handle_elt(self, elt, context):
        # fill dictionary to use later in sents mode
        if self.mode is NKJPCorpus_Text_View.SENTS_MODE:
            self.segm_dict[self.get_segm_id(elt)] = elt.text
        return elt.text
class NKJPCorpus_Morph_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    ann_morphosyntax.xml files in NKJP corpus.
    """
    def __init__(self, filename, **kwargs):
        # ``tags``: optional list of ctag values (e.g. ['subst', 'comp'])
        # used to filter the returned words; None means "return all".
        self.tags = kwargs.pop("tags", None)
        self.tagspec = ".*/seg/fs"
        self.xml_tool = XML_Tool(filename, "ann_morphosyntax.xml")
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )
    def handle_query(self):
        """Read the whole file, remove the preprocessed temp file, and
        return the list of accepted words (None entries are dropped)."""
        try:
            self._open()
            words = []
            while True:
                segm = XMLCorpusView.read_block(self, self._stream)
                if len(segm) == 0:
                    break
                for part in segm:
                    if part is not None:
                        words.append(part)
            self.close()
            self.xml_tool.remove_preprocessed_file()
            return words
        except Exception as e:
            # Ensure the temp file is removed even on failure.
            self.xml_tool.remove_preprocessed_file()
            raise Exception from e
    def handle_elt(self, elt, context):
        """Extract the orthographic form from one <seg>/<fs> element.
        Returns the word when it passes the tag filter and is not an
        "interp" segment; otherwise returns None (implicitly)."""
        word = ""
        flag = False
        is_not_interp = True
        # if tags not specified, then always return word
        if self.tags is None:
            flag = True
        for child in elt:
            # get word
            # (the feature named "orth" carries the orthographic form)
            if "name" in child.keys() and child.attrib["name"] == "orth":
                for symbol in child:
                    if symbol.tag == "string":
                        word = symbol.text
            elif "name" in child.keys() and child.attrib["name"] == "interps":
                for symbol in child:
                    # Elements of type "lex" hold lexical interpretations.
                    if "type" in symbol.keys() and symbol.attrib["type"] == "lex":
                        for symbol2 in symbol:
                            # The feature named "ctag" holds the
                            # grammatical-class tag.
                            if (
                                "name" in symbol2.keys()
                                and symbol2.attrib["name"] == "ctag"
                            ):
                                for symbol3 in symbol2:
                                    # Accept the word when its ctag is one
                                    # of the requested tags...
                                    if (
                                        "value" in symbol3.keys()
                                        and self.tags is not None
                                        and symbol3.attrib["value"] in self.tags
                                    ):
                                        flag = True
                                    # ...but never "interp" segments
                                    # (presumably punctuation in the NKJP
                                    # tagset — verify).
                                    elif (
                                        "value" in symbol3.keys()
                                        and symbol3.attrib["value"] == "interp"
                                    ):
                                        is_not_interp = False
        if flag and is_not_interp:
            return word

View File

@@ -0,0 +1,465 @@
# Natural Language Toolkit: NomBank Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Paul Bedaride <paul.bedaride@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from functools import total_ordering
from xml.etree import ElementTree
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.internals import raise_unorderable_types
from nltk.tree import Tree
class NombankCorpusReader(CorpusReader):
    """
    Corpus reader for the nombank corpus, which augments the Penn
    Treebank with information about the predicate argument structure
    of every noun instance. The corpus consists of two parts: the
    predicate-argument annotations themselves, and a set of "frameset
    files" which define the argument labels used by the annotations,
    on a per-noun basis. Each "frameset file" contains one or more
    predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
    divided into coarse-grained word senses called "rolesets". For
    each "roleset", the frameset file provides descriptions of the
    argument roles, along with examples.
    """

    def __init__(
        self,
        root,
        nomfile,
        framefiles="",
        nounsfile=None,
        parse_fileid_xform=None,
        parse_corpus=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param nomfile: The name of the file containing the predicate-
            argument annotations (relative to ``root``).
        :param framefiles: A list or regexp specifying the frameset
            fileids for this corpus.
        :param parse_fileid_xform: A transform that should be applied
            to the fileids in this corpus. This should be a function
            of one argument (a fileid) that returns a string (the new
            fileid).
        :param parse_corpus: The corpus containing the parse trees
            corresponding to this corpus. These parse trees are
            necessary to resolve the tree pointers used by nombank.
        """
        # If framefiles is specified as a regexp, expand it.
        # (Bug fix: the expansion was previously assigned to
        # ``self._fileids`` and then unconditionally overwritten with
        # ``list(framefiles)``, which for a regexp string produced a
        # list of single characters.)
        if isinstance(framefiles, str):
            framefiles = find_corpus_fileids(root, framefiles)
        self._fileids = list(framefiles)
        # Initialize the corpus reader.
        CorpusReader.__init__(self, root, framefiles, encoding)
        # Record our nom file & nouns file.
        self._nomfile = nomfile
        self._nounsfile = nounsfile
        self._parse_fileid_xform = parse_fileid_xform
        self._parse_corpus = parse_corpus

    def instances(self, baseform=None):
        """
        :return: a corpus view that acts as a list of
            ``NombankInstance`` objects, one for each noun in the corpus.
        """
        kwargs = {}
        if baseform is not None:
            kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
        return StreamBackedCorpusView(
            self.abspath(self._nomfile),
            lambda stream: self._read_instance_block(stream, **kwargs),
            encoding=self.encoding(self._nomfile),
        )

    def lines(self):
        """
        :return: a corpus view that acts as a list of strings, one for
            each line in the predicate-argument annotation file.
        """
        return StreamBackedCorpusView(
            self.abspath(self._nomfile),
            read_line_block,
            encoding=self.encoding(self._nomfile),
        )

    def roleset(self, roleset_id):
        """
        :return: the xml description for the given roleset.
        """
        baseform = roleset_id.split(".")[0]
        baseform = baseform.replace("perc-sign", "%")
        # NOTE(review): the chained replace maps "oneslashonezero" (and
        # any literal "1/10") to "1-slash-10" — confirm against the frame
        # file naming scheme.
        baseform = baseform.replace("oneslashonezero", "1/10").replace(
            "1/10", "1-slash-10"
        )
        framefile = "frames/%s.xml" % baseform
        if framefile not in self.fileids():
            raise ValueError("Frameset file for %s not found" % roleset_id)
        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        with self.abspath(framefile).open() as fp:
            etree = ElementTree.parse(fp).getroot()
        for roleset in etree.findall("predicate/roleset"):
            if roleset.attrib["id"] == roleset_id:
                return roleset
        raise ValueError(f"Roleset {roleset_id} not found in {framefile}")

    def rolesets(self, baseform=None):
        """
        :return: list of xml descriptions for rolesets.
        """
        if baseform is not None:
            framefile = "frames/%s.xml" % baseform
            if framefile not in self.fileids():
                raise ValueError("Frameset file for %s not found" % baseform)
            framefiles = [framefile]
        else:
            framefiles = self.fileids()
        rsets = []
        for framefile in framefiles:
            # n.b.: The encoding for XML fileids is specified by the file
            # itself; so we ignore self._encoding here.
            with self.abspath(framefile).open() as fp:
                etree = ElementTree.parse(fp).getroot()
            rsets.append(etree.findall("predicate/roleset"))
        return LazyConcatenation(rsets)

    def nouns(self):
        """
        :return: a corpus view that acts as a list of all noun lemmas
            in this corpus (from the nombank.1.0.words file).
        """
        return StreamBackedCorpusView(
            self.abspath(self._nounsfile),
            read_line_block,
            encoding=self.encoding(self._nounsfile),
        )

    def _read_instance_block(self, stream, instance_filter=lambda inst: True):
        """Read up to 100 annotation lines from *stream*, parsing each
        non-blank line into a ``NombankInstance`` and keeping those that
        pass *instance_filter*."""
        block = []
        # Read 100 at a time.
        for i in range(100):
            line = stream.readline().strip()
            if line:
                inst = NombankInstance.parse(
                    line, self._parse_fileid_xform, self._parse_corpus
                )
                if instance_filter(inst):
                    block.append(inst)
        return block
######################################################################
# { Nombank Instance & related datatypes
######################################################################
class NombankInstance:
    """
    A single annotated instance from the nombank corpus: the location
    of a noun predicate in a treebank parse tree, together with the
    locations and labels of its arguments.
    """

    def __init__(
        self,
        fileid,
        sentnum,
        wordnum,
        baseform,
        sensenumber,
        predicate,
        predid,
        arguments,
        parse_corpus=None,
    ):
        # The name of the file containing the parse tree for this
        # instance's sentence.
        self.fileid = fileid
        # The sentence number of this sentence within ``fileid``
        # (indexing starts from zero).
        self.sentnum = sentnum
        # The word number of this instance's predicate within its
        # containing sentence.  Word numbers are indexed starting from
        # zero, and include traces and other empty parse elements.
        self.wordnum = wordnum
        # The baseform of the predicate.
        self.baseform = baseform
        # The sense number of the predicate.
        self.sensenumber = sensenumber
        # A ``NombankTreePointer`` indicating the position of this
        # instance's predicate within its containing sentence.
        self.predicate = predicate
        # Identifier of the predicate.
        self.predid = predid
        # A tuple of (argloc, argid) pairs, specifying the location and
        # identifier for each of the predicate's arguments.  Argument
        # identifiers are strings such as ``'ARG0'`` or ``'ARGM-TMP'``.
        # The predicate itself is *not* included.
        self.arguments = tuple(arguments)
        # A corpus reader for the parse trees corresponding to the
        # instances in this nombank corpus (may be None).
        self.parse_corpus = parse_corpus

    @property
    def roleset(self):
        """The name of the roleset used by this instance's predicate.
        Use ``nombank.roleset() <NombankCorpusReader.roleset>`` to
        look up information about the roleset."""
        # Escape characters that cannot appear in frame-file names;
        # this mirrors the inverse substitution performed when locating
        # the frame file for a roleset id.
        r = self.baseform.replace("%", "perc-sign")
        r = r.replace("1/10", "1-slash-10").replace("1-slash-10", "oneslashonezero")
        return f"{r}.{self.sensenumber}"

    def __repr__(self):
        return "<NombankInstance: {}, sent {}, word {}>".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
        )

    def __str__(self):
        # Reproduce the one-line nombank propositions format: header
        # fields followed by the sorted argument (and 'rel') pointers.
        s = "{} {} {} {} {}".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
            self.baseform,
            self.sensenumber,
        )
        items = self.arguments + ((self.predicate, "rel"),)
        for argloc, argid in sorted(items):
            s += f" {argloc}-{argid}"
        return s

    def _get_tree(self):
        # Look the sentence up in the attached parse corpus, if any.
        if self.parse_corpus is None:
            return None
        if self.fileid not in self.parse_corpus.fileids():
            return None
        return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]

    tree = property(
        _get_tree,
        doc="""
        The parse tree corresponding to this instance, or None if
        the corresponding tree is not available.""",
    )

    @staticmethod
    def parse(s, parse_fileid_xform=None, parse_corpus=None):
        """
        Parse one line of a nombank propositions file and return the
        corresponding ``NombankInstance``.

        :param s: the line to parse.
        :param parse_fileid_xform: optional function applied to the
            fileid field (e.g. to normalize paths).
        :param parse_corpus: parse-tree corpus reader to attach to the
            returned instance.
        :raises ValueError: if the line is not well formed.
        """
        pieces = s.split()
        if len(pieces) < 6:
            raise ValueError("Badly formatted nombank line: %r" % s)

        # Divide the line into its basic pieces.
        (fileid, sentnum, wordnum, baseform, sensenumber) = pieces[:5]
        args = pieces[5:]

        # Separate the predicate ('-rel') entry from the arguments.
        # Bug fix: the previous version popped elements from ``args``
        # while enumerating it, which skips the element that follows
        # each removal; filter into two lists instead.
        rel = [p for p in args if "-rel" in p]
        args = [p for p in args if "-rel" not in p]
        if len(rel) != 1:
            raise ValueError("Badly formatted nombank line: %r" % s)

        # Apply the fileid selector, if any.
        if parse_fileid_xform is not None:
            fileid = parse_fileid_xform(fileid)

        # Convert sentence & word numbers to ints.
        sentnum = int(sentnum)
        wordnum = int(wordnum)

        # Parse the predicate location.
        predloc, predid = rel[0].split("-", 1)
        predicate = NombankTreePointer.parse(predloc)

        # Parse the arguments.
        arguments = []
        for arg in args:
            argloc, argid = arg.split("-", 1)
            arguments.append((NombankTreePointer.parse(argloc), argid))

        # Put it all together.
        return NombankInstance(
            fileid,
            sentnum,
            wordnum,
            baseform,
            sensenumber,
            predicate,
            predid,
            arguments,
            parse_corpus,
        )
class NombankPointer:
    """
    Abstract base class for pointers that identify one or more
    constituents in a parse tree.  Three concrete subclasses exist:

    - ``NombankTreePointer``: a single constituent.
    - ``NombankSplitTreePointer``: a 'split' constituent made up of a
      sequence of two or more ``NombankTreePointer`` pointers.
    - ``NombankChainTreePointer``: an entire trace chain in a tree,
      whose pieces may be ``NombankTreePointer`` or
      ``NombankSplitTreePointer`` pointers.
    """

    def __init__(self):
        # The abstract base may not be instantiated directly.
        if type(self) is NombankPointer:
            raise NotImplementedError()
class NombankChainTreePointer(NombankPointer):
    """Pointer to an entire trace chain in a parse tree."""

    def __init__(self, pieces):
        # The pieces making up this chain; elements may be either
        # ``NombankSplitTreePointer`` or ``NombankTreePointer``
        # pointers.
        self.pieces = pieces

    def __str__(self):
        return "*".join("%s" % piece for piece in self.pieces)

    def __repr__(self):
        return "<NombankChainTreePointer: %s>" % self

    def select(self, tree):
        """Return a ``*CHAIN*`` tree combining each piece's selection."""
        if tree is None:
            raise ValueError("Parse tree not available")
        selected = [piece.select(tree) for piece in self.pieces]
        return Tree("*CHAIN*", selected)
class NombankSplitTreePointer(NombankPointer):
    """Pointer to a 'split' constituent: a sequence of tree pointers
    that together make up a single argument."""

    def __init__(self, pieces):
        # The pieces making up this split constituent; all elements
        # are ``NombankTreePointer`` pointers.
        self.pieces = pieces

    def __str__(self):
        return ",".join("%s" % piece for piece in self.pieces)

    def __repr__(self):
        return "<NombankSplitTreePointer: %s>" % self

    def select(self, tree):
        """Return a ``*SPLIT*`` tree combining each piece's selection."""
        if tree is None:
            raise ValueError("Parse tree not available")
        selected = [piece.select(tree) for piece in self.pieces]
        return Tree("*SPLIT*", selected)
@total_ordering
class NombankTreePointer(NombankPointer):
    """
    A pointer to a single constituent in a parse tree, written in
    nombank as ``wordnum:height``.  ``parse()`` also accepts the
    combined forms::

        wordnum:height*wordnum:height*...   (trace chain)
        wordnum:height,wordnum:height,...   (split constituent)

    which it dispatches to ``NombankChainTreePointer`` and
    ``NombankSplitTreePointer`` respectively.
    """

    def __init__(self, wordnum, height):
        # wordnum: index of the word in the sentence (zero-based,
        # counting traces and other empty elements).
        # height: number of levels to climb from the word's leaf node;
        # 0 selects the leaf's immediate parent (see treepos()).
        self.wordnum = wordnum
        self.height = height

    @staticmethod
    def parse(s):
        """Parse a nombank pointer string into the appropriate
        pointer object; raise ValueError for malformed input."""
        # Deal with chains (xx*yy*zz)
        pieces = s.split("*")
        if len(pieces) > 1:
            return NombankChainTreePointer(
                [NombankTreePointer.parse(elt) for elt in pieces]
            )
        # Deal with split args (xx,yy,zz)
        pieces = s.split(",")
        if len(pieces) > 1:
            return NombankSplitTreePointer(
                [NombankTreePointer.parse(elt) for elt in pieces]
            )
        # Deal with normal pointers.
        pieces = s.split(":")
        if len(pieces) != 2:
            raise ValueError("bad nombank pointer %r" % s)
        return NombankTreePointer(int(pieces[0]), int(pieces[1]))

    def __str__(self):
        return f"{self.wordnum}:{self.height}"

    def __repr__(self):
        return "NombankTreePointer(%d, %d)" % (self.wordnum, self.height)

    def __eq__(self, other):
        # A chain or split pointer is compared via its first piece.
        while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)):
            other = other.pieces[0]
        # Fall back to identity for non-pointer operands.
        if not isinstance(other, NombankTreePointer):
            return self is other
        return self.wordnum == other.wordnum and self.height == other.height

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        # Order by word number, breaking ties by *descending* height
        # (higher constituents sort first at the same word).
        while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)):
            other = other.pieces[0]
        if not isinstance(other, NombankTreePointer):
            return id(self) < id(other)
        return (self.wordnum, -self.height) < (other.wordnum, -other.height)

    def select(self, tree):
        """Return the constituent of ``tree`` that this pointer
        identifies."""
        if tree is None:
            raise ValueError("Parse tree not available")
        return tree[self.treepos(tree)]

    def treepos(self, tree):
        """
        Convert this pointer to a standard 'tree position' pointer,
        given that it points to the given tree.
        """
        if tree is None:
            raise ValueError("Parse tree not available")
        # Depth-first traversal: ``stack`` holds the path of nodes from
        # the root to the current node, and ``treepos`` the matching
        # child indices.
        stack = [tree]
        treepos = []
        wordnum = 0
        while True:
            # tree node:
            if isinstance(stack[-1], Tree):
                # Select the next child.
                if len(treepos) < len(stack):
                    treepos.append(0)
                else:
                    treepos[-1] += 1
                # Update the stack.
                if treepos[-1] < len(stack[-1]):
                    stack.append(stack[-1][treepos[-1]])
                else:
                    # End of node's child list: pop up a level.
                    stack.pop()
                    treepos.pop()
            # word node:
            else:
                if wordnum == self.wordnum:
                    # ``treepos`` currently addresses the leaf; dropping
                    # the last height+1 indices climbs up to the target
                    # constituent.
                    return tuple(treepos[: len(treepos) - self.height - 1])
                else:
                    wordnum += 1
                    stack.pop()

View File

@@ -0,0 +1,90 @@
# Natural Language Toolkit: NPS Chat Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re
import textwrap
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.xmldocs import *
from nltk.internals import ElementWrapper
from nltk.tag import map_tag
from nltk.util import LazyConcatenation
class NPSChatCorpusReader(XMLCorpusReader):
    """
    Reader for the NPS Chat corpus: XML files whose posts are located
    at ``Session/Posts/Post``, each carrying tagged terminal tokens
    under its ``terminals`` element.
    """

    def __init__(self, root, fileids, wrap_etree=False, tagset=None):
        """
        :param tagset: name of the tagset used by the corpus files;
            consulted when mapping tags via the ``tagset`` argument of
            the tagged_* methods.
        """
        XMLCorpusReader.__init__(self, root, fileids, wrap_etree)
        self._tagset = tagset

    def xml_posts(self, fileids=None):
        """
        Return the given file(s) as a list of post XML elements
        (wrapped in ``ElementWrapper`` objects if ``wrap_etree`` was
        set at construction time).
        """
        if self._wrap_etree:
            return concat(
                [
                    XMLCorpusView(fileid, "Session/Posts/Post", self._wrap_elt)
                    for fileid in self.abspaths(fileids)
                ]
            )
        else:
            return concat(
                [
                    XMLCorpusView(fileid, "Session/Posts/Post")
                    for fileid in self.abspaths(fileids)
                ]
            )

    def posts(self, fileids=None):
        """
        Return the given file(s) as a list of posts, each encoded as a
        list of word strings.
        """
        return concat(
            [
                XMLCorpusView(
                    fileid, "Session/Posts/Post/terminals", self._elt_to_words
                )
                for fileid in self.abspaths(fileids)
            ]
        )

    def tagged_posts(self, fileids=None, tagset=None):
        """
        Return the given file(s) as a list of posts, each encoded as a
        list of ``(word, tag)`` tuples, optionally mapped to ``tagset``.
        """

        def reader(elt, handler):
            return self._elt_to_tagged_words(elt, handler, tagset)

        return concat(
            [
                XMLCorpusView(fileid, "Session/Posts/Post/terminals", reader)
                for fileid in self.abspaths(fileids)
            ]
        )

    def words(self, fileids=None):
        """Return the given file(s) as a flat list of words."""
        return LazyConcatenation(self.posts(fileids))

    def tagged_words(self, fileids=None, tagset=None):
        """Return the given file(s) as a flat list of (word, tag) tuples."""
        return LazyConcatenation(self.tagged_posts(fileids, tagset))

    def _wrap_elt(self, elt, handler):
        return ElementWrapper(elt)

    def _elt_to_words(self, elt, handler):
        # Each <t> terminal carries its token in the 'word' attribute.
        return [self._simplify_username(t.attrib["word"]) for t in elt.findall("t")]

    def _elt_to_tagged_words(self, elt, handler, tagset=None):
        tagged_post = [
            (self._simplify_username(t.attrib["word"]), t.attrib["pos"])
            for t in elt.findall("t")
        ]
        # Only remap tags when a different target tagset is requested.
        if tagset and tagset != self._tagset:
            tagged_post = [
                (w, map_tag(self._tagset, tagset, t)) for (w, t) in tagged_post
            ]
        return tagged_post

    @staticmethod
    def _simplify_username(word):
        # Bug fix: decode legacy byte strings *before* the substring
        # test -- on Python 3, ``"User" in word`` raises TypeError when
        # ``word`` is bytes, which made the decode branch unreachable.
        if isinstance(word, bytes):
            word = word.decode("ascii")
        # Shorten tokens containing 'User': keep only the suffix after
        # the first occurrence, prefixed with 'U'.
        if "User" in word:
            word = "U" + word.split("User", 1)[1]
        return word

View File

@@ -0,0 +1,125 @@
# Natural Language Toolkit: Opinion Lexicon Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for the Opinion Lexicon.
Opinion Lexicon information
===========================
Authors: Minqing Hu and Bing Liu, 2004.
Department of Computer Science
University of Illinois at Chicago
Contact: Bing Liu, liub@cs.uic.edu
https://www.cs.uic.edu/~liub
Distributed with permission.
Related papers:
- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery
& Data Mining (KDD-04), Aug 22-25, 2004, Seattle, Washington, USA.
- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and
Comparing Opinions on the Web". Proceedings of the 14th International World
Wide Web conference (WWW-2005), May 10-14, 2005, Chiba, Japan.
"""
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus.reader.api import *
class IgnoreReadmeCorpusView(StreamBackedCorpusView):
    """
    A corpus view that skips the readme block at the top of a corpus
    file, so that iteration starts at the first data line.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Open the underlying stream now, consume the leading readme
        # (everything up to the first blank-line boundary), and anchor
        # the view's first block position right after it.
        self._open()
        read_blankline_block(self._stream)
        self._filepos = [self._stream.tell()]
class OpinionLexiconCorpusReader(WordListCorpusReader):
    """
    Reader for Liu and Hu opinion lexicon. Blank lines and readme are ignored.

        >>> from nltk.corpus import opinion_lexicon
        >>> opinion_lexicon.words()
        ['2-faced', '2-faces', 'abnormal', 'abolish', ...]

    The OpinionLexiconCorpusReader provides shortcuts to retrieve positive/negative
    words:

        >>> opinion_lexicon.negative()
        ['2-faced', '2-faces', 'abnormal', 'abolish', ...]

    Note that words from `words()` method are sorted by file id, not alphabetically:

        >>> opinion_lexicon.words()[0:10] # doctest: +NORMALIZE_WHITESPACE
        ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably',
        'abominate', 'abomination', 'abort', 'aborted']
        >>> sorted(opinion_lexicon.words())[0:10] # doctest: +NORMALIZE_WHITESPACE
        ['2-faced', '2-faces', 'a+', 'abnormal', 'abolish', 'abominable', 'abominably',
        'abominate', 'abomination', 'abort']
    """

    # View class that skips the readme header of each lexicon file.
    CorpusView = IgnoreReadmeCorpusView

    def words(self, fileids=None):
        """
        Return all words in the opinion lexicon. Note that these words are not
        sorted in alphabetical order.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def positive(self):
        """
        Return all positive words in alphabetical order.

        :return: a list of positive words.
        :rtype: list(str)
        """
        return self.words("positive-words.txt")

    def negative(self):
        """
        Return all negative words in alphabetical order.

        :return: a list of negative words.
        :rtype: list(str)
        """
        return self.words("negative-words.txt")

    def _read_word_block(self, stream):
        """
        Read one block (up to 20 lines) of words from ``stream``.

        Bug fix: the previous version appended the empty string for
        blank lines (a blank line reads as ``"\\n"``, which is truthy,
        so ``line.strip()`` -- i.e. ``""`` -- was appended), which
        contradicted the documented behavior that blank lines are
        ignored.  It also kept issuing reads after end of file.
        """
        words = []
        for _ in range(20):  # Read (up to) 20 lines at a time.
            line = stream.readline()
            if not line:  # End of file.
                break
            word = line.strip()
            if word:  # Ignore blank lines.
                words.append(word)
        return words

View File

@@ -0,0 +1,174 @@
# Natural Language Toolkit: PanLex Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: David Kamholz <kamholz@panlex.org>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for PanLex Lite, a stripped down version of PanLex distributed
as an SQLite database. See the README.txt in the panlex_lite corpus directory
for more information on PanLex Lite.
"""
import os
import sqlite3
from nltk.corpus.reader.api import CorpusReader
class PanLexLiteCorpusReader(CorpusReader):
    """
    Reader for PanLex Lite, a stripped-down distribution of the PanLex
    database shipped as a single SQLite file (``db.sqlite`` under the
    corpus root).
    """

    # Rows: (meaning id, quality, source id, source group, translated
    # text, translated language variety id) for every other expression
    # sharing a meaning with the given (text, language variety id),
    # best quality first.
    MEANING_Q = """
        SELECT dnx2.mn, dnx2.uq, dnx2.ap, dnx2.ui, ex2.tt, ex2.lv
        FROM dnx
        JOIN ex ON (ex.ex = dnx.ex)
        JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
        JOIN ex ex2 ON (ex2.ex = dnx2.ex)
        WHERE dnx.ex != dnx2.ex AND ex.tt = ? AND ex.lv = ?
        ORDER BY dnx2.uq DESC
    """

    # Rows: (translated text, translation quality) for translating the
    # given (source language variety id, text) into the given target
    # language variety id, best quality first.
    TRANSLATION_Q = """
        SELECT s.tt, sum(s.uq) AS trq FROM (
            SELECT ex2.tt, max(dnx.uq) AS uq
            FROM dnx
            JOIN ex ON (ex.ex = dnx.ex)
            JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
            JOIN ex ex2 ON (ex2.ex = dnx2.ex)
            WHERE dnx.ex != dnx2.ex AND ex.lv = ? AND ex.tt = ? AND ex2.lv = ?
            GROUP BY ex2.tt, dnx.ui
        ) s
        GROUP BY s.tt
        ORDER BY trq DESC, s.tt
    """

    def __init__(self, root):
        # n.b.: CorpusReader.__init__ is not invoked here; this reader
        # operates directly on the SQLite database rather than fileids.
        self._c = sqlite3.connect(os.path.join(root, "db.sqlite")).cursor()

        # Two-way maps between seven-character uniform identifiers
        # (uid) and integer language-variety ids (lv).
        self._uid_lv = {}
        self._lv_uid = {}

        for row in self._c.execute("SELECT uid, lv FROM lv"):
            self._uid_lv[row[0]] = row[1]
            self._lv_uid[row[1]] = row[0]

    def language_varieties(self, lc=None):
        """
        Return a list of PanLex language varieties.

        :param lc: ISO 639 alpha-3 code. If specified, filters returned varieties
            by this code. If unspecified, all varieties are returned.
        :return: the specified language varieties as a list of tuples. The first
            element is the language variety's seven-character uniform identifier,
            and the second element is its default name.
        :rtype: list(tuple)
        """
        if lc is None:
            return self._c.execute("SELECT uid, tt FROM lv ORDER BY uid").fetchall()
        else:
            return self._c.execute(
                "SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid", (lc,)
            ).fetchall()

    def meanings(self, expr_uid, expr_tt):
        """
        Return a list of meanings for an expression.

        :param expr_uid: the expression's language variety, as a seven-character
            uniform identifier.
        :param expr_tt: the expression's text.
        :return: a list of Meaning objects.
        :rtype: list(Meaning)
        """
        expr_lv = self._uid_lv[expr_uid]

        # Accumulate each meaning's attributes and, keyed by uid, the
        # expression texts attested for it.
        mn_info = {}

        for i in self._c.execute(self.MEANING_Q, (expr_tt, expr_lv)):
            mn = i[0]
            uid = self._lv_uid[i[5]]

            if mn not in mn_info:
                mn_info[mn] = {
                    "uq": i[1],
                    "ap": i[2],
                    "ui": i[3],
                    "ex": {expr_uid: [expr_tt]},
                }

            if uid not in mn_info[mn]["ex"]:
                mn_info[mn]["ex"][uid] = []

            mn_info[mn]["ex"][uid].append(i[4])

        return [Meaning(mn, mn_info[mn]) for mn in mn_info]

    def translations(self, from_uid, from_tt, to_uid):
        """
        Return a list of translations for an expression into a single language
        variety.

        :param from_uid: the source expression's language variety, as a
            seven-character uniform identifier.
        :param from_tt: the source expression's text.
        :param to_uid: the target language variety, as a seven-character
            uniform identifier.
        :return: a list of translation tuples. The first element is the expression
            text and the second element is the translation quality.
        :rtype: list(tuple)
        """
        from_lv = self._uid_lv[from_uid]
        to_lv = self._uid_lv[to_uid]

        return self._c.execute(self.TRANSLATION_Q, (from_lv, from_tt, to_lv)).fetchall()
class Meaning(dict):
    """
    Represents a single PanLex meaning: a translation set derived from
    a single source.  Behaves as a dict of the meaning's attributes,
    with the meaning id stored under the ``"mn"`` key.
    """

    def __init__(self, mn, attr):
        # Copy the attribute mapping, then record the meaning id.
        super().__init__(attr)
        self["mn"] = mn

    def id(self):
        """
        :return: the meaning's id.
        :rtype: int
        """
        return self["mn"]

    def quality(self):
        """
        :return: the meaning's source's quality (0=worst, 9=best).
        :rtype: int
        """
        return self["uq"]

    def source(self):
        """
        :return: the meaning's source id.
        :rtype: int
        """
        return self["ap"]

    def source_group(self):
        """
        :return: the meaning's source group id.
        :rtype: int
        """
        return self["ui"]

    def expressions(self):
        """
        :return: the meaning's expressions as a dictionary whose keys are language
            variety uniform identifiers and whose values are lists of expression
            texts.
        :rtype: dict
        """
        return self["ex"]

View File

@@ -0,0 +1,95 @@
# Natural Language Toolkit: Word List Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re
from collections import defaultdict, namedtuple
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.wordlist import WordListCorpusReader
from nltk.tokenize import line_tokenize
# One record per language variety listed in the corpus' ``langs*.txt``
# table; the fields appear in the order of that file's tab-separated
# columns (see PanlexSwadeshCorpusReader.get_languages).
PanlexLanguage = namedtuple(
    "PanlexLanguage",
    [
        "panlex_uid",  # (1) PanLex UID
        "iso639",  # (2) ISO 639 language code
        "iso639_type",  # (3) ISO 639 language type, see README
        "script",  # (4) normal scripts of expressions
        "name",  # (5) PanLex default name
        "langvar_uid",  # (6) UID of the language variety in which the default name is an expression
    ],
)
class PanlexSwadeshCorpusReader(WordListCorpusReader):
    """
    Reader for the PanLex Swadesh lists from

    David Kamholz, Jonathan Pool, and Susan M. Colowick (2014).
    PanLex: Building a Resource for Panlingual Lexical Translation.
    In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf

    License: CC0 1.0 Universal
    https://creativecommons.org/publicdomain/zero/1.0/legalcode
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Infer the Swadesh list size (e.g. "110") from the directory
        # component of the first fileid ("swadesh<size>/...").
        first_fileid = self.fileids()[0]
        self.swadesh_size = re.match(r"swadesh([0-9].*)\/", first_fileid).group(1)
        self._languages = {lang.panlex_uid: lang for lang in self.get_languages()}
        self._macro_langauges = self.get_macrolanguages()

    def license(self):
        """Return the license string for this corpus."""
        return "CC0 1.0 Universal"

    def language_codes(self):
        """Return the PanLex UIDs of all languages in the corpus."""
        return self._languages.keys()

    def get_languages(self):
        """Yield one ``PanlexLanguage`` per row of the langs table."""
        table = self.raw(f"langs{self.swadesh_size}.txt")
        for row in table.split("\n"):
            if not row.strip():  # Skip empty lines.
                continue
            yield PanlexLanguage(*row.strip().split("\t"))

    def get_macrolanguages(self):
        """Group PanLex UIDs by their ISO 639 language code."""
        grouped = defaultdict(list)
        for lang in self._languages.values():
            grouped[lang.iso639].append(lang.panlex_uid)
        return grouped

    def words_by_lang(self, lang_code):
        """
        :return: a list of list(str)
        """
        fileid = f"swadesh{self.swadesh_size}/{lang_code}.txt"
        return [concept.split("\t") for concept in self.words(fileid)]

    def words_by_iso639(self, iso63_code):
        """
        :return: a list of list(str)
        """
        result = []
        for lang_code in self._macro_langauges[iso63_code]:
            fileid = f"swadesh{self.swadesh_size}/{lang_code}.txt"
            result.extend(concept.split("\t") for concept in self.words(fileid))
        return result

    def entries(self, fileids=None):
        """
        :return: a tuple of words for the specified fileids.
        """
        if not fileids:
            fileids = self.fileids()
        word_lists = [self.words(fileid) for fileid in fileids]
        return list(zip(*word_lists))

View File

@@ -0,0 +1,373 @@
# Natural Language Toolkit:
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Piotr Kasprzyk <p.j.kasprzyk@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import XMLCorpusReader
# Regular expressions used to pull apart the corpus' TEI-style markup
# (see TEICorpusView.read_block and TEICorpusView._parse_tag).
PARA = re.compile(r"<p(?: [^>]*){0,1}>(.*?)</p>")  # paragraph contents
SENT = re.compile(r"<s(?: [^>]*){0,1}>(.*?)</s>")  # sentence contents
TAGGEDWORD = re.compile(r"<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>")  # (tag-part, word)
WORD = re.compile(r"<[wc](?: [^>]*){0,1}>(.*?)</[wc]>")  # word/char contents
TYPE = re.compile(r'type="(.*?)"')  # 'type' attribute value (on <c> elements)
ANA = re.compile(r'ana="(.*?)"')  # 'ana' attribute value (on <w> elements)
TEXTID = re.compile(r'text id="(.*?)"')  # text ids occurring in a block
class TEICorpusView(StreamBackedCorpusView):
    """
    A corpus view for the TEI-style XML files of the pl196x corpus.

    Blocks are extracted with regular expressions rather than an XML
    parser; each call to ``read_block`` yields words, sentences or
    paragraphs depending on the ``tagged``/``group_by_sent``/
    ``group_by_para`` flags given at construction time.
    """

    def __init__(
        self,
        corpus_file,
        tagged,
        group_by_sent,
        group_by_para,
        tagset=None,
        head_len=0,
        textids=None,
    ):
        # tagged: if true, yield (word, tag) tuples instead of words.
        self._tagged = tagged
        # textids: if given, only texts with these ids are kept.
        self._textids = textids

        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        # WARNING -- skip header: start reading head_len characters in.
        StreamBackedCorpusView.__init__(self, corpus_file, startpos=head_len)

    # Number of characters requested per readlines() call.
    _pagesize = 4096

    def read_block(self, stream):
        block = stream.readlines(self._pagesize)
        block = concat(block)
        # Keep reading until the block contains at least one complete
        # <text id=...>...</text> element (or the stream is exhausted).
        while (block.count("<text id") > block.count("</text>")) or block.count(
            "<text id"
        ) == 0:
            tmp = stream.readline()
            if len(tmp) <= 0:
                break
            block += tmp

        block = block.replace("\n", "")

        textids = TEXTID.findall(block)
        if self._textids:
            for tid in textids:
                if tid not in self._textids:
                    # Cut the whole unwanted <text>...</text> span out
                    # of the block; beg/end are computed relative to the
                    # position of the text id string.
                    beg = block.find(tid) - 1
                    end = block[beg:].find("</text>") + len("</text>")
                    block = block[:beg] + block[beg + end :]

        output = []
        # Regex-split the block into paragraphs, then sentences, then
        # words, flattening levels that were not requested.
        for para_str in PARA.findall(block):
            para = []
            for sent_str in SENT.findall(para_str):
                if not self._tagged:
                    sent = WORD.findall(sent_str)
                else:
                    sent = list(map(self._parse_tag, TAGGEDWORD.findall(sent_str)))
                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)
            if self._group_by_para:
                output.append(para)
            else:
                output.extend(para)

        return output

    def _parse_tag(self, tag_word_tuple):
        # <w ana="TAG">word</w> carries its tag in the 'ana' attribute;
        # <c type="TAG">char</c> carries it in 'type'.
        (tag, word) = tag_word_tuple
        if tag.startswith("w"):
            tag = ANA.search(tag).group(1)
        else:  # tag.startswith('c')
            tag = TYPE.search(tag).group(1)
        return word, tag
class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
    """
    Reader for the pl196x corpus: categorized TEI-style XML files that
    can additionally be filtered by text id (see ``textids()``).
    """

    # Length (in characters) of the fixed file header that
    # TEICorpusView skips at the start of each file.
    head_len = 2770

    def __init__(self, *args, **kwargs):
        # Optional path to a file mapping fileids to text ids.
        if "textid_file" in kwargs:
            self._textids = kwargs["textid_file"]
        else:
            self._textids = None

        XMLCorpusReader.__init__(self, *args)
        CategorizedCorpusReader.__init__(self, kwargs)

        self._init_textids()

    def _init_textids(self):
        # Build the fileid <-> textid maps from the optional mapping
        # file; each line is "<fileid> <textid-list>".
        self._f2t = defaultdict(list)
        self._t2f = defaultdict(list)
        if self._textids is not None:
            with open(self._textids) as fp:
                for line in fp:
                    line = line.strip()
                    file_id, text_ids = line.split(" ", 1)
                    if file_id not in self.fileids():
                        raise ValueError(
                            "In text_id mapping file %s: %s not found"
                            % (self._textids, file_id)
                        )
                    # NOTE(review): ``self._delimiter`` is not assigned
                    # anywhere in this file -- confirm where it is set
                    # before relying on textid mapping files.
                    for text_id in text_ids.split(self._delimiter):
                        self._add_textids(file_id, text_id)

    def _add_textids(self, file_id, text_id):
        # Record the two-way association between a file and a text id.
        self._f2t[file_id].append(text_id)
        self._t2f[text_id].append(file_id)

    def _resolve(self, fileids, categories, textids=None):
        """
        Resolve the (fileids, categories, textids) selectors into a
        ``(fileids, textid-map)`` pair, where the map (when not None)
        assigns to each fileid the set of requested text ids.

        NOTE(review): the check below requires *exactly one* of the
        three selectors to be None (i.e. two supplied), which looks
        inverted relative to the error message; confirm the intended
        semantics before changing it.
        """
        tmp = None
        if (
            len(
                list(
                    filter(
                        lambda accessor: accessor is None,
                        (fileids, categories, textids),
                    )
                )
            )
            != 1
        ):
            raise ValueError(
                "Specify exactly one of: fileids, " "categories or textids"
            )

        if fileids is not None:
            return fileids, None

        if categories is not None:
            return self.fileids(categories), None

        if textids is not None:
            if isinstance(textids, str):
                textids = [textids]
            # All files containing any of the requested text ids ...
            files = sum((self._t2f[t] for t in textids), [])
            tdict = dict()
            # ... and, per file, which of those text ids it holds.
            for f in files:
                tdict[f] = set(self._f2t[f]) & set(textids)
            return files, tdict

    def decode_tag(self, tag):
        # to be implemented
        return tag

    def textids(self, fileids=None, categories=None):
        """
        In the pl196x corpus each category is stored in single
        file and thus both methods provide identical functionality. In order
        to accommodate finer granularity, a non-standard textids() method was
        implemented. All the main functions can be supplied with a list
        of required chunks---giving much more control to the user.
        """
        fileids, _ = self._resolve(fileids, categories)
        if fileids is None:
            return sorted(self._t2f)

        if isinstance(fileids, str):
            fileids = [fileids]
        return sorted(sum((self._f2t[d] for d in fileids), []))

    def words(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as a flat list of words.

        The positional flags passed to TEICorpusView are
        (tagged, group_by_sent, group_by_para)."""
        fileids, textids = self._resolve(fileids, categories, textids)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]

        if textids:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid),
                        False,
                        False,
                        False,
                        head_len=self.head_len,
                        textids=textids[fileid],
                    )
                    for fileid in fileids
                ]
            )
        else:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid),
                        False,
                        False,
                        False,
                        head_len=self.head_len,
                    )
                    for fileid in fileids
                ]
            )

    def sents(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as a list of sentences, each a
        list of words."""
        fileids, textids = self._resolve(fileids, categories, textids)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]

        if textids:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid),
                        False,
                        True,
                        False,
                        head_len=self.head_len,
                        textids=textids[fileid],
                    )
                    for fileid in fileids
                ]
            )
        else:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid), False, True, False, head_len=self.head_len
                    )
                    for fileid in fileids
                ]
            )

    def paras(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as a list of paragraphs, each a
        list of sentences, each a list of words."""
        fileids, textids = self._resolve(fileids, categories, textids)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]

        if textids:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid),
                        False,
                        True,
                        True,
                        head_len=self.head_len,
                        textids=textids[fileid],
                    )
                    for fileid in fileids
                ]
            )
        else:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid), False, True, True, head_len=self.head_len
                    )
                    for fileid in fileids
                ]
            )

    def tagged_words(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as a flat list of (word, tag)
        tuples."""
        fileids, textids = self._resolve(fileids, categories, textids)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]

        if textids:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid),
                        True,
                        False,
                        False,
                        head_len=self.head_len,
                        textids=textids[fileid],
                    )
                    for fileid in fileids
                ]
            )
        else:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid), True, False, False, head_len=self.head_len
                    )
                    for fileid in fileids
                ]
            )

    def tagged_sents(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as a list of sentences of
        (word, tag) tuples."""
        fileids, textids = self._resolve(fileids, categories, textids)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]

        if textids:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid),
                        True,
                        True,
                        False,
                        head_len=self.head_len,
                        textids=textids[fileid],
                    )
                    for fileid in fileids
                ]
            )
        else:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid), True, True, False, head_len=self.head_len
                    )
                    for fileid in fileids
                ]
            )

    def tagged_paras(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as a list of paragraphs of
        sentences of (word, tag) tuples."""
        fileids, textids = self._resolve(fileids, categories, textids)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]

        if textids:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid),
                        True,
                        True,
                        True,
                        head_len=self.head_len,
                        textids=textids[fileid],
                    )
                    for fileid in fileids
                ]
            )
        else:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid), True, True, True, head_len=self.head_len
                    )
                    for fileid in fileids
                ]
            )

    def xml(self, fileids=None, categories=None):
        """Return the XML tree for the single selected file."""
        fileids, _ = self._resolve(fileids, categories)
        if len(fileids) == 1:
            return XMLCorpusReader.xml(self, fileids[0])
        else:
            raise TypeError("Expected a single file")

View File

@@ -0,0 +1,237 @@
# Natural Language Toolkit: Plaintext Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# Nitin Madnani <nmadnani@umiacs.umd.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A reader for corpora that consist of plaintext documents.
"""
import nltk.data
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tokenize import *
class PlaintextCorpusReader(CorpusReader):
    """
    Reader for corpora that consist of plaintext documents. Paragraphs
    are assumed to be split using blank lines. Sentences and words can
    be tokenized using the default tokenizers, or by custom tokenizers
    specified as parameters to the constructor.

    This corpus reader can be customized (e.g., to skip preface
    sections of specific document formats) by creating a subclass and
    overriding the ``CorpusView`` class variable.
    """

    CorpusView = StreamBackedCorpusView
    """The corpus view class used by this reader. Subclasses of
    ``PlaintextCorpusReader`` may specify alternative corpus view
    classes (e.g., to skip the preface sections of documents.)"""

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WordPunctTokenizer(),
        sent_tokenizer=None,
        para_block_reader=read_blankline_block,
        encoding="utf8",
    ):
        r"""
        Construct a new plaintext corpus reader for a set of documents
        located at the given root directory. Example usage:

            >>> root = '/usr/local/share/nltk_data/corpora/webtext/'
            >>> reader = PlaintextCorpusReader(root, '.*\.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking sentences or
            paragraphs into words.
        :param sent_tokenizer: Tokenizer for breaking paragraphs
            into sentences.  If None, a default Punkt tokenizer is
            created lazily on first use.
        :param para_block_reader: The block reader used to divide the
            corpus into paragraph blocks.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader

    def _ensure_sent_tokenizer(self):
        """Create the default sentence tokenizer on first use.

        Raises ValueError (chained to the underlying error) when no
        tokenizer is available.  Replaces two duplicated bare
        ``except:`` clauses that discarded the original failure and
        would also have swallowed KeyboardInterrupt/SystemExit.
        """
        if self._sent_tokenizer is None:
            try:
                self._sent_tokenizer = PunktTokenizer()
            except Exception as e:
                raise ValueError("No sentence tokenizer for this corpus") from e

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        self._ensure_sent_tokenizer()
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        self._ensure_sent_tokenizer()
        return concat(
            [
                self.CorpusView(path, self._read_para_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_word_block(self, stream):
        # Tokenize (up to) 20 lines at a time; readline() returns ""
        # at EOF, which tokenizes to nothing.
        words = []
        for i in range(20):
            words.extend(self._word_tokenizer.tokenize(stream.readline()))
        return words

    def _read_sent_block(self, stream):
        # Sentence- then word-tokenize each paragraph block, flattening
        # the paragraph level away.
        sents = []
        for para in self._para_block_reader(stream):
            sents.extend(
                [
                    self._word_tokenizer.tokenize(sent)
                    for sent in self._sent_tokenizer.tokenize(para)
                ]
            )
        return sents

    def _read_para_block(self, stream):
        # Like _read_sent_block, but keeps the paragraph level.
        paras = []
        for para in self._para_block_reader(stream):
            paras.append(
                [
                    self._word_tokenizer.tokenize(sent)
                    for sent in self._sent_tokenizer.tokenize(para)
                ]
            )
        return paras
class CategorizedPlaintextCorpusReader(CategorizedCorpusReader, PlaintextCorpusReader):
    """
    A reader for plaintext corpora whose documents are divided into
    categories based on their file identifiers.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader. Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor. The remaining arguments
        are passed to the ``PlaintextCorpusReader`` constructor.
        """
        # NOTE(review): CategorizedCorpusReader receives the kwargs
        # dict itself (not **kwargs); presumably it extracts its
        # categorization options before PlaintextCorpusReader sees the
        # remainder -- confirm in the api module.
        CategorizedCorpusReader.__init__(self, kwargs)
        PlaintextCorpusReader.__init__(self, *args, **kwargs)
class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader):
    """
    This class is identical with CategorizedPlaintextCorpusReader,
    except that it initializes a Portuguese PunktTokenizer:
    >>> from nltk.corpus import machado
    >>> print(machado._sent_tokenizer._lang)
    portuguese
    """
    def __init__(self, *args, **kwargs):
        CategorizedPlaintextCorpusReader.__init__(self, *args, **kwargs)
        # Fixed (@ekaf 2025), new way to invoke Punkt: replace the default
        # sentence tokenizer set up by the base class with a Portuguese one.
        self._sent_tokenizer = PunktTokenizer("portuguese")
class EuroparlCorpusReader(PlaintextCorpusReader):
    """
    Reader for Europarl corpora that consist of plaintext documents.
    Documents are divided into chapters instead of paragraphs as
    for regular plaintext documents. Chapters are separated using blank
    lines. Everything is inherited from ``PlaintextCorpusReader`` except
    that:
    - Since the corpus is pre-processed and pre-tokenized, the
      word tokenizer should just split the line at whitespaces.
    - For the same reason, the sentence tokenizer should just
      split the paragraph at line breaks.
    - There is a new 'chapters()' method that returns chapters
      instead of paragraphs.
    - The 'paras()' method inherited from PlaintextCorpusReader is
      made non-functional to remove any confusion between chapters
      and paragraphs for Europarl.
    """
    def _read_word_block(self, stream):
        # Pre-tokenized corpus: plain whitespace splitting is sufficient.
        words = []
        for i in range(20):  # Read 20 lines at a time.
            words.extend(stream.readline().split())
        return words
    def _read_sent_block(self, stream):
        # Each line of a paragraph block is one pre-tokenized sentence.
        sents = []
        for para in self._para_block_reader(stream):
            sents.extend([sent.split() for sent in para.splitlines()])
        return sents
    def _read_para_block(self, stream):
        # A "paragraph" block here corresponds to a Europarl chapter.
        paras = []
        for para in self._para_block_reader(stream):
            paras.append([sent.split() for sent in para.splitlines()])
        return paras
    def chapters(self, fileids=None):
        """
        :return: the given file(s) as a list of
            chapters, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        return concat(
            [
                self.CorpusView(fileid, self._read_para_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )
    def paras(self, fileids=None):
        """
        :raises NotImplementedError: always; use :meth:`chapters` instead.
        """
        raise NotImplementedError(
            "The Europarl corpus reader does not support paragraphs. Please use chapters() instead."
        )

View File

@@ -0,0 +1,95 @@
# Natural Language Toolkit: PP Attachment Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Read lines from the Prepositional Phrase Attachment Corpus.
The PP Attachment Corpus contains several files having the format:
sentence_id verb noun1 preposition noun2 attachment
For example:
42960 gives authority to administration V
46742 gives inventors of microchip N
The PP attachment is to the verb phrase (V) or noun phrase (N), i.e.:
(VP gives (NP authority) (PP to administration))
(VP gives (NP inventors (PP of microchip)))
The corpus contains the following files:
training: training set
devset: development test set, used for algorithm development.
test: test set, used to report results
bitstrings: word classes derived from Mutual Information Clustering for the Wall Street Journal.
Ratnaparkhi, Adwait (1994). A Maximum Entropy Model for Prepositional
Phrase Attachment. Proceedings of the ARPA Human Language Technology
Conference. [http://www.cis.upenn.edu/~adwait/papers/hlt94.ps]
The PP Attachment Corpus is distributed with NLTK with the permission
of the author.
"""
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
class PPAttachment:
    """
    A single PP-attachment instance: one annotated
    (sentence id, verb, noun1, preposition, noun2, attachment) record.
    """
    def __init__(self, sent, verb, noun1, prep, noun2, attachment):
        self.sent = sent
        self.verb = verb
        self.noun1 = noun1
        self.prep = prep
        self.noun2 = noun2
        self.attachment = attachment
    def __repr__(self):
        return (
            f"PPAttachment(sent={self.sent!r}, verb={self.verb!r}, "
            f"noun1={self.noun1!r}, prep={self.prep!r}, "
            f"noun2={self.noun2!r}, attachment={self.attachment!r})"
        )
class PPAttachmentCorpusReader(CorpusReader):
    """
    Reader for corpus files whose lines have the format::
        sentence_id verb noun1 preposition noun2 attachment
    """
    def attachments(self, fileids):
        """Return a corpus view of ``PPAttachment`` objects, one per line."""
        views = [
            StreamBackedCorpusView(fileid, self._read_obj_block, encoding=enc)
            for (fileid, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)
    def tuples(self, fileids):
        """Return a corpus view of raw field tuples, one per line."""
        views = [
            StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc)
            for (fileid, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)
    def _read_tuple_block(self, stream):
        # One line per block; an empty string from readline() means EOF.
        line = stream.readline()
        return [tuple(line.split())] if line else []
    def _read_obj_block(self, stream):
        line = stream.readline()
        return [PPAttachment(*line.split())] if line else []

View File

@@ -0,0 +1,519 @@
# Natural Language Toolkit: PropBank Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re
from functools import total_ordering
from xml.etree import ElementTree
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.internals import raise_unorderable_types
from nltk.tree import Tree
class PropbankCorpusReader(CorpusReader):
    """
    Corpus reader for the propbank corpus, which augments the Penn
    Treebank with information about the predicate argument structure
    of every verb instance. The corpus consists of two parts: the
    predicate-argument annotations themselves, and a set of "frameset
    files" which define the argument labels used by the annotations,
    on a per-verb basis. Each "frameset file" contains one or more
    predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
    divided into coarse-grained word senses called "rolesets". For
    each "roleset", the frameset file provides descriptions of the
    argument roles, along with examples.
    """
    def __init__(
        self,
        root,
        propfile,
        framefiles="",
        verbsfile=None,
        parse_fileid_xform=None,
        parse_corpus=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param propfile: The name of the file containing the predicate-
            argument annotations (relative to ``root``).
        :param framefiles: A list or regexp specifying the frameset
            fileids for this corpus.
        :param verbsfile: The name of the file listing the verb lemmas
            (relative to ``root``).
        :param parse_fileid_xform: A transform that should be applied
            to the fileids in this corpus. This should be a function
            of one argument (a fileid) that returns a string (the new
            fileid).
        :param parse_corpus: The corpus containing the parse trees
            corresponding to this corpus. These parse trees are
            necessary to resolve the tree pointers used by propbank.
        :param encoding: the encoding used to read the corpus files.
        """
        # If framefiles is specified as a regexp, expand it.
        if isinstance(framefiles, str):
            framefiles = find_corpus_fileids(root, framefiles)
        framefiles = list(framefiles)
        # Initialize the corpus reader.
        CorpusReader.__init__(self, root, [propfile, verbsfile] + framefiles, encoding)
        # Record our frame fileids & prop file.
        self._propfile = propfile
        self._framefiles = framefiles
        self._verbsfile = verbsfile
        self._parse_fileid_xform = parse_fileid_xform
        self._parse_corpus = parse_corpus
    def instances(self, baseform=None):
        """
        :param baseform: if given, restrict the view to instances whose
            predicate has this base form.
        :return: a corpus view that acts as a list of
            ``PropBankInstance`` objects, one for each verb instance in
            the corpus.
        """
        # Only pass an instance_filter when a baseform was requested, so the
        # default filter in _read_instance_block applies otherwise.
        kwargs = {}
        if baseform is not None:
            kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
        return StreamBackedCorpusView(
            self.abspath(self._propfile),
            lambda stream: self._read_instance_block(stream, **kwargs),
            encoding=self.encoding(self._propfile),
        )
    def lines(self):
        """
        :return: a corpus view that acts as a list of strings, one for
            each line in the predicate-argument annotation file.
        """
        return StreamBackedCorpusView(
            self.abspath(self._propfile),
            read_line_block,
            encoding=self.encoding(self._propfile),
        )
    def roleset(self, roleset_id):
        """
        :return: the xml description for the given roleset.
        :raises ValueError: if the frameset file or the roleset itself
            cannot be found.
        """
        # Roleset ids look like "<baseform>.<sensenumber>"; the frameset
        # file is named after the baseform.
        baseform = roleset_id.split(".")[0]
        framefile = "frames/%s.xml" % baseform
        if framefile not in self._framefiles:
            raise ValueError("Frameset file for %s not found" % roleset_id)
        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        with self.abspath(framefile).open() as fp:
            etree = ElementTree.parse(fp).getroot()
        # Linear scan over the file's rolesets for a matching id.
        for roleset in etree.findall("predicate/roleset"):
            if roleset.attrib["id"] == roleset_id:
                return roleset
        raise ValueError(f"Roleset {roleset_id} not found in {framefile}")
    def rolesets(self, baseform=None):
        """
        :param baseform: if given, only return rolesets from the frameset
            file for this base form.
        :return: list of xml descriptions for rolesets.
        """
        if baseform is not None:
            framefile = "frames/%s.xml" % baseform
            if framefile not in self._framefiles:
                raise ValueError("Frameset file for %s not found" % baseform)
            framefiles = [framefile]
        else:
            framefiles = self._framefiles
        rsets = []
        for framefile in framefiles:
            # n.b.: The encoding for XML fileids is specified by the file
            # itself; so we ignore self._encoding here.
            with self.abspath(framefile).open() as fp:
                etree = ElementTree.parse(fp).getroot()
            rsets.append(etree.findall("predicate/roleset"))
        return LazyConcatenation(rsets)
    def verbs(self):
        """
        :return: a corpus view that acts as a list of all verb lemmas
            in this corpus (from the verbs.txt file).
        """
        return StreamBackedCorpusView(
            self.abspath(self._verbsfile),
            read_line_block,
            encoding=self.encoding(self._verbsfile),
        )
    def _read_instance_block(self, stream, instance_filter=lambda inst: True):
        # Parse up to 100 annotation lines from the stream; blank lines are
        # skipped (but still consume one of the 100 iterations).
        block = []
        # Read 100 at a time.
        for i in range(100):
            line = stream.readline().strip()
            if line:
                inst = PropbankInstance.parse(
                    line, self._parse_fileid_xform, self._parse_corpus
                )
                if instance_filter(inst):
                    block.append(inst)
        return block
######################################################################
# { Propbank Instance & related datatypes
######################################################################
class PropbankInstance:
    """
    A single predicate-argument annotation: one line of the propbank
    annotation file, parsed into its component fields.
    """
    def __init__(
        self,
        fileid,
        sentnum,
        wordnum,
        tagger,
        roleset,
        inflection,
        predicate,
        arguments,
        parse_corpus=None,
    ):
        self.fileid = fileid
        """The name of the file containing the parse tree for this
        instance's sentence."""
        self.sentnum = sentnum
        """The sentence number of this sentence within ``fileid``.
        Indexing starts from zero."""
        self.wordnum = wordnum
        """The word number of this instance's predicate within its
        containing sentence.  Word numbers are indexed starting from
        zero, and include traces and other empty parse elements."""
        self.tagger = tagger
        """An identifier for the tagger who tagged this instance; or
        ``'gold'`` if this is an adjuticated instance."""
        self.roleset = roleset
        """The name of the roleset used by this instance's predicate.
        Use ``propbank.roleset() <PropbankCorpusReader.roleset>`` to
        look up information about the roleset."""
        self.inflection = inflection
        """A ``PropbankInflection`` object describing the inflection of
        this instance's predicate."""
        self.predicate = predicate
        """A ``PropbankTreePointer`` indicating the position of this
        instance's predicate within its containing sentence."""
        self.arguments = tuple(arguments)
        """A list of tuples (argloc, argid), specifying the location
        and identifier for each of the predicate's argument in the
        containing sentence.  Argument identifiers are strings such as
        ``'ARG0'`` or ``'ARGM-TMP'``.  This list does *not* contain
        the predicate."""
        self.parse_corpus = parse_corpus
        """A corpus reader for the parse trees corresponding to the
        instances in this propbank corpus."""
    @property
    def baseform(self):
        """The baseform of the predicate."""
        return self.roleset.split(".")[0]
    @property
    def sensenumber(self):
        """The sense number of the predicate."""
        return self.roleset.split(".")[1]
    @property
    def predid(self):
        """Identifier of the predicate."""
        return "rel"
    def __repr__(self):
        return "<PropbankInstance: {}, sent {}, word {}>".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
        )
    def __str__(self):
        # Serialize back into the propbank file format; the predicate is
        # re-inserted among the arguments as a "rel" item and the whole
        # list is emitted in sorted pointer order.
        s = "{} {} {} {} {} {}".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
            self.tagger,
            self.roleset,
            self.inflection,
        )
        items = self.arguments + ((self.predicate, "rel"),)
        for argloc, argid in sorted(items):
            s += f" {argloc}-{argid}"
        return s
    def _get_tree(self):
        # Resolve the parse tree lazily; None when no parse corpus was
        # supplied or the fileid is not part of it.
        if self.parse_corpus is None:
            return None
        if self.fileid not in self.parse_corpus.fileids():
            return None
        return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]
    tree = property(
        _get_tree,
        doc="""
        The parse tree corresponding to this instance, or None if
        the corresponding tree is not available.""",
    )
    @staticmethod
    def parse(s, parse_fileid_xform=None, parse_corpus=None):
        """Parse one annotation line into a ``PropbankInstance``.

        :raises ValueError: if the line does not have at least seven
            fields or does not contain exactly one ``-rel`` item.
        """
        pieces = s.split()
        if len(pieces) < 7:
            raise ValueError("Badly formatted propbank line: %r" % s)
        # Divide the line into its basic pieces.
        (fileid, sentnum, wordnum, tagger, roleset, inflection) = pieces[:6]
        rel = [p for p in pieces[6:] if p.endswith("-rel")]
        args = [p for p in pieces[6:] if not p.endswith("-rel")]
        if len(rel) != 1:
            raise ValueError("Badly formatted propbank line: %r" % s)
        # Apply the fileid selector, if any.
        if parse_fileid_xform is not None:
            fileid = parse_fileid_xform(fileid)
        # Convert sentence & word numbers to ints.
        sentnum = int(sentnum)
        wordnum = int(wordnum)
        # Parse the inflection
        inflection = PropbankInflection.parse(inflection)
        # Parse the predicate location.
        predicate = PropbankTreePointer.parse(rel[0][:-4])
        # Parse the arguments.
        arguments = []
        for arg in args:
            argloc, argid = arg.split("-", 1)
            arguments.append((PropbankTreePointer.parse(argloc), argid))
        # Put it all together.
        return PropbankInstance(
            fileid,
            sentnum,
            wordnum,
            tagger,
            roleset,
            inflection,
            predicate,
            arguments,
            parse_corpus,
        )
class PropbankPointer:
    """
    A pointer used by propbank to identify one or more constituents in
    a parse tree. ``PropbankPointer`` is an abstract base class with
    three concrete subclasses:
    - ``PropbankTreePointer`` points to a single constituent.
    - ``PropbankSplitTreePointer`` points to a 'split' constituent,
      which consists of a sequence of two or more
      ``PropbankTreePointer`` pointers.
    - ``PropbankChainTreePointer`` points to an entire trace chain in
      a tree; it consists of a sequence of pieces, which can be
      ``PropbankTreePointer`` or ``PropbankSplitTreePointer`` pointers.
    """
    def __init__(self):
        # Abstract base class: refuse direct instantiation, but let
        # subclass initializers run normally.
        if type(self) is PropbankPointer:
            raise NotImplementedError()
class PropbankChainTreePointer(PropbankPointer):
    """Pointer to an entire trace chain in a parse tree."""
    def __init__(self, pieces):
        # Elements may be PropbankSplitTreePointer or PropbankTreePointer.
        self.pieces = pieces
    def __str__(self):
        return "*".join(str(piece) for piece in self.pieces)
    def __repr__(self):
        return "<PropbankChainTreePointer: %s>" % self
    def select(self, tree):
        """Return a synthetic ``*CHAIN*`` node covering every piece's subtree."""
        if tree is None:
            raise ValueError("Parse tree not available")
        return Tree("*CHAIN*", [piece.select(tree) for piece in self.pieces])
class PropbankSplitTreePointer(PropbankPointer):
    """Pointer to a 'split' constituent made of several tree pointers."""
    def __init__(self, pieces):
        # Elements are all PropbankTreePointer instances.
        self.pieces = pieces
    def __str__(self):
        return ",".join(str(piece) for piece in self.pieces)
    def __repr__(self):
        return "<PropbankSplitTreePointer: %s>" % self
    def select(self, tree):
        """Return a synthetic ``*SPLIT*`` node covering every piece's subtree."""
        if tree is None:
            raise ValueError("Parse tree not available")
        return Tree("*SPLIT*", [piece.select(tree) for piece in self.pieces])
@total_ordering
class PropbankTreePointer(PropbankPointer):
    """
    A pointer to a single constituent, encoded as ``wordnum:height``.
    Chains are written ``wordnum:height*wordnum:height*...`` and split
    constituents ``wordnum:height,wordnum:height,...``; :meth:`parse`
    returns the appropriate pointer subclass for each form.
    """
    def __init__(self, wordnum, height):
        self.wordnum = wordnum
        self.height = height
    @staticmethod
    def parse(s):
        """Parse a pointer string, returning a ``PropbankTreePointer``,
        ``PropbankChainTreePointer`` or ``PropbankSplitTreePointer``."""
        # Deal with chains (xx*yy*zz)
        pieces = s.split("*")
        if len(pieces) > 1:
            return PropbankChainTreePointer(
                [PropbankTreePointer.parse(elt) for elt in pieces]
            )
        # Deal with split args (xx,yy,zz)
        pieces = s.split(",")
        if len(pieces) > 1:
            return PropbankSplitTreePointer(
                [PropbankTreePointer.parse(elt) for elt in pieces]
            )
        # Deal with normal pointers.
        pieces = s.split(":")
        if len(pieces) != 2:
            raise ValueError("bad propbank pointer %r" % s)
        return PropbankTreePointer(int(pieces[0]), int(pieces[1]))
    def __str__(self):
        return f"{self.wordnum}:{self.height}"
    def __repr__(self):
        return "PropbankTreePointer(%d, %d)" % (self.wordnum, self.height)
    def __eq__(self, other):
        # Chain/split pointers are compared via their first piece.
        while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
            other = other.pieces[0]
        if not isinstance(other, PropbankTreePointer):
            return self is other
        return self.wordnum == other.wordnum and self.height == other.height
    def __ne__(self, other):
        return not self == other
    def __lt__(self, other):
        while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
            other = other.pieces[0]
        if not isinstance(other, PropbankTreePointer):
            # Arbitrary but stable ordering against unrelated types.
            return id(self) < id(other)
        # Order by word number; for equal words, greater height sorts first.
        return (self.wordnum, -self.height) < (other.wordnum, -other.height)
    def select(self, tree):
        """Return the constituent of *tree* that this pointer identifies."""
        if tree is None:
            raise ValueError("Parse tree not available")
        return tree[self.treepos(tree)]
    def treepos(self, tree):
        """
        Convert this pointer to a standard 'tree position' pointer,
        given that it points to the given tree.
        """
        if tree is None:
            raise ValueError("Parse tree not available")
        # Depth-first traversal: count leaves until we reach leaf number
        # ``self.wordnum``; ``treepos`` tracks the path taken so far.
        stack = [tree]
        treepos = []
        wordnum = 0
        while True:
            # tree node:
            if isinstance(stack[-1], Tree):
                # Select the next child.
                if len(treepos) < len(stack):
                    treepos.append(0)
                else:
                    treepos[-1] += 1
                # Update the stack.
                if treepos[-1] < len(stack[-1]):
                    stack.append(stack[-1][treepos[-1]])
                else:
                    # End of node's child list: pop up a level.
                    stack.pop()
                    treepos.pop()
            # word node:
            else:
                if wordnum == self.wordnum:
                    # Found the leaf: drop ``height`` + 1 trailing steps to
                    # address the ancestor this pointer refers to.
                    return tuple(treepos[: len(treepos) - self.height - 1])
                else:
                    wordnum += 1
                    stack.pop()
class PropbankInflection:
    """
    A five-character inflection code describing a predicate's form,
    tense, aspect, person and voice; ``'-'`` marks an unspecified field.
    """
    # Form codes:
    INFINITIVE = "i"
    GERUND = "g"
    PARTICIPLE = "p"
    FINITE = "v"
    # Tense codes:
    FUTURE = "f"
    PAST = "p"
    PRESENT = "n"
    # Aspect codes:
    PERFECT = "p"
    PROGRESSIVE = "o"
    PERFECT_AND_PROGRESSIVE = "b"
    # Person codes:
    THIRD_PERSON = "3"
    # Voice codes:
    ACTIVE = "a"
    PASSIVE = "p"
    # Unspecified field:
    NONE = "-"
    def __init__(self, form="-", tense="-", aspect="-", person="-", voice="-"):
        self.form = form
        self.tense = tense
        self.aspect = aspect
        self.person = person
        self.voice = voice
    def __str__(self):
        return "".join((self.form, self.tense, self.aspect, self.person, self.voice))
    def __repr__(self):
        return "<PropbankInflection: %s>" % self
    _VALIDATE = re.compile(r"[igpv\-][fpn\-][pob\-][3\-][ap\-]$")
    @staticmethod
    def parse(s):
        """Parse a five-character inflection string.

        :raises TypeError: if *s* is not a string.
        :raises ValueError: if *s* is not a valid inflection code.
        """
        if not isinstance(s, str):
            raise TypeError("expected a string")
        if len(s) != 5 or not PropbankInflection._VALIDATE.match(s):
            raise ValueError("Bad propbank inflection string %r" % s)
        return PropbankInflection(*s)

View File

@@ -0,0 +1,133 @@
# Natural Language Toolkit: Pros and Cons Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for the Pros and Cons dataset.
- Pros and Cons dataset information -
Contact: Bing Liu, liub@cs.uic.edu
https://www.cs.uic.edu/~liub
Distributed with permission.
Related papers:
- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
Proceedings of the 22nd International Conference on Computational Linguistics
(Coling-2008), Manchester, 18-22 August, 2008.
- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing
Opinions on the Web". Proceedings of the 14th international World Wide Web
conference (WWW-2005), May 10-14, 2005, in Chiba, Japan.
"""
import re
from nltk.corpus.reader.api import *
from nltk.tokenize import *
class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    Reader for the Pros and Cons sentence dataset.
    >>> from nltk.corpus import pros_cons
    >>> pros_cons.sents(categories='Cons') # doctest: +NORMALIZE_WHITESPACE
    [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy',
    'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'],
    ...]
    >>> pros_cons.words('IntegratedPros.txt')
    ['Easy', 'to', 'use', ',', 'economical', '!', ...]
    """
    CorpusView = StreamBackedCorpusView
    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WordPunctTokenizer(),
        encoding="utf8",
        **kwargs
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
        self._word_tokenizer = word_tokenizer
    def sents(self, fileids=None, categories=None):
        """
        Return all sentences in the corpus or in the specified files/categories.
        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences
            have to be returned.
        :return: the given file(s) as a list of sentences. Each sentence is
            tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        """
        # Map categories to their fileids, then normalize to a list.
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )
    def words(self, fileids=None, categories=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        files/categories.
        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have
            to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )
    def _read_sent_block(self, stream):
        sents = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            # At EOF readline() returns ''; the loop is bounded, so this
            # just burns through the remaining iterations.
            if not line:
                continue
            # Data lines look like "<Pros>...</Pros>" (or Cons); group(2)
            # captures the text between the tags.
            sent = re.match(r"^(?!\n)\s*<(Pros|Cons)>(.*)</(?:Pros|Cons)>", line)
            if sent:
                sents.append(self._word_tokenizer.tokenize(sent.group(2).strip()))
        return sents
    def _read_word_block(self, stream):
        # Flatten one block of sentences into a single word list.
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words

View File

@@ -0,0 +1,331 @@
# Natural Language Toolkit: Product Reviews Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for reviews corpora (syntax based on Customer Review Corpus).
Customer Review Corpus information
==================================
Annotated by: Minqing Hu and Bing Liu, 2004.
Department of Computer Science
University of Illinois at Chicago
Contact: Bing Liu, liub@cs.uic.edu
https://www.cs.uic.edu/~liub
Distributed with permission.
The "product_reviews_1" and "product_reviews_2" datasets respectively contain
annotated customer reviews of 5 and 9 products from amazon.com.
Related papers:
- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
Proceedings of the ACM SIGKDD International Conference on Knowledge
Discovery & Data Mining (KDD-04), 2004.
- Minqing Hu and Bing Liu. "Mining Opinion Features in Customer Reviews".
Proceedings of Nineteeth National Conference on Artificial Intelligence
(AAAI-2004), 2004.
- Xiaowen Ding, Bing Liu and Philip S. Yu. "A Holistic Lexicon-Based Appraoch to
Opinion Mining." Proceedings of First ACM International Conference on Web
Search and Data Mining (WSDM-2008), Feb 11-12, 2008, Stanford University,
Stanford, California, USA.
Symbols used in the annotated reviews:
:[t]: the title of the review: Each [t] tag starts a review.
:xxxx[+|-n]: xxxx is a product feature.
:[+n]: Positive opinion, n is the opinion strength: 3 strongest, and 1 weakest.
Note that the strength is quite subjective.
You may want ignore it, but only considering + and -
:[-n]: Negative opinion
:##: start of each sentence. Each line is a sentence.
:[u]: feature not appeared in the sentence.
:[p]: feature not appeared in the sentence. Pronoun resolution is needed.
:[s]: suggestion or recommendation.
:[cc]: comparison with a competing product from a different brand.
:[cs]: comparison with a competing product from the same brand.
Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not
provide separation between different reviews. This is due to the fact that
the dataset was specifically designed for aspect/feature-based sentiment
analysis, for which sentence-level annotation is sufficient. For document-
level classification and analysis, this peculiarity should be taken into
consideration.
"""
import re
from nltk.corpus.reader.api import *
from nltk.tokenize import *
# Regular expressions for the annotation markup used by the Customer Review
# datasets (see the module docstring for the full symbol legend).
TITLE = re.compile(r"^\[t\](.*)$")  # [t] Title
FEATURES = re.compile(
    r"((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]"
)  # find 'feature' in feature[+3]
NOTES = re.compile(r"\[(?!t)(p|u|s|cc|cs)\]")  # find 'p' in camera[+2][p]
SENT = re.compile(r"##(.*)$")  # find tokenized sentence
class Review:
    """
    A Review is the main block of a ReviewsCorpusReader.
    """
    def __init__(self, title=None, review_lines=None):
        """
        :param title: the title of the review.
        :param review_lines: the list of the ReviewLines that belong to the Review.
        """
        self.title = title
        # Avoid a shared mutable default: each review owns its own list.
        self.review_lines = [] if review_lines is None else review_lines
    def add_line(self, review_line):
        """
        Add a line (ReviewLine) to the review.
        :param review_line: a ReviewLine instance that belongs to the Review.
        """
        assert isinstance(review_line, ReviewLine)
        self.review_lines.append(review_line)
    def features(self):
        """
        Return a list of features in the review. Each feature is a tuple made of
        the specific item feature and the opinion strength about that feature.
        :return: all features of the review as a list of tuples (feat, score).
        :rtype: list(tuple)
        """
        collected = []
        for line in self.review_lines:
            collected.extend(line.features)
        return collected
    def sents(self):
        """
        Return all tokenized sentences in the review.
        :return: all sentences of the review as lists of tokens.
        :rtype: list(list(str))
        """
        return [line.sent for line in self.review_lines]
    def __repr__(self):
        return f'Review(title="{self.title}", review_lines={self.review_lines})'
class ReviewLine:
    """
    A ReviewLine represents a sentence of the review, together with (optional)
    annotations of its features and notes about the reviewed item.
    """
    def __init__(self, sent, features=None, notes=None):
        self.sent = sent
        # None defaults keep each instance's lists independent.
        self.features = [] if features is None else features
        self.notes = [] if notes is None else notes
    def __repr__(self):
        return f"ReviewLine(features={self.features}, notes={self.notes}, sent={self.sent})"
class ReviewsCorpusReader(CorpusReader):
    """
    Reader for the Customer Review Data dataset by Hu, Liu (2004).
    Note: we are not applying any sentence tokenization at the moment, just word
    tokenization.
    >>> from nltk.corpus import product_reviews_1
    >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt')
    >>> review = camera_reviews[0]
    >>> review.sents()[0] # doctest: +NORMALIZE_WHITESPACE
    ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am',
    'extremely', 'satisfied', 'with', 'the', 'purchase', '.']
    >>> review.features() # doctest: +NORMALIZE_WHITESPACE
    [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'),
    ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'),
    ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'),
    ('option', '+1')]
    We can also reach the same information directly from the stream:
    >>> product_reviews_1.features('Canon_G3.txt')
    [('canon powershot g3', '+3'), ('use', '+2'), ...]
    We can compute stats for specific product features:
    >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
    >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
    >>> mean = tot / n_reviews
    >>> print(n_reviews, tot, mean)
    15 24 1.6
    """
    CorpusView = StreamBackedCorpusView
    def __init__(
        self, root, fileids, word_tokenizer=WordPunctTokenizer(), encoding="utf8"
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._readme = "README.txt"
    def features(self, fileids=None):
        """
        Return a list of features. Each feature is a tuple made of the specific
        item feature and the opinion strength about that feature.
        :param fileids: a list or regexp specifying the ids of the files whose
            features have to be returned.
        :return: all features for the item(s) in the given file(s).
        :rtype: list(tuple)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(fileid, self._read_features, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )
    def reviews(self, fileids=None):
        """
        Return all the reviews as a list of Review objects. If `fileids` is
        specified, return all the reviews from each of the specified files.
        :param fileids: a list or regexp specifying the ids of the files whose
            reviews have to be returned.
        :return: the given file(s) as a list of reviews.
        :rtype: list(Review)
        """
        if fileids is None:
            fileids = self._fileids
        return concat(
            [
                self.CorpusView(fileid, self._read_review_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )
    def sents(self, fileids=None):
        """
        Return all sentences in the corpus or in the specified files.
        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: the given file(s) as a list of sentences, each encoded as a
            list of word strings.
        :rtype: list(list(str))
        """
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )
    def words(self, fileids=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        files.
        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )
    def _read_features(self, stream):
        # Collect every FEATURES match from the next 20 lines.
        features = []
        for i in range(20):
            line = stream.readline()
            if not line:
                return features
            features.extend(re.findall(FEATURES, line))
        return features
    def _read_review_block(self, stream):
        # Skip ahead to the next "[t]" title line, which starts a review.
        while True:
            line = stream.readline()
            if not line:
                return []  # end of file.
            title_match = re.match(TITLE, line)
            if title_match:
                review = Review(
                    title=title_match.group(1).strip()
                )  # We create a new review
                break
        # Scan until we find another line matching the regexp, or EOF.
        while True:
            oldpos = stream.tell()
            line = stream.readline()
            # End of file:
            if not line:
                return [review]
            # Start of a new review: backup to just before it starts, and
            # return the review we've already collected.
            if re.match(TITLE, line):
                stream.seek(oldpos)
                return [review]
            # Anything else is part of the review line.
            feats = re.findall(FEATURES, line)
            notes = re.findall(NOTES, line)
            sent = re.findall(SENT, line)
            if sent:
                sent = self._word_tokenizer.tokenize(sent[0])
            review_line = ReviewLine(sent=sent, features=feats, notes=notes)
            review.add_line(review_line)
    def _read_sent_block(self, stream):
        # One review per block; flatten its sentences.
        sents = []
        for review in self._read_review_block(stream):
            sents.extend([sent for sent in review.sents()])
        return sents
    def _read_word_block(self, stream):
        # Tokenize the "##"-marked sentence text of the next 20 lines.
        words = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            sent = re.findall(SENT, line)
            if sent:
                words.extend(self._word_tokenizer.tokenize(sent[0]))
        return words

View File

@@ -0,0 +1,146 @@
# Natural Language Toolkit: RTE Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for the Recognizing Textual Entailment (RTE) Challenge Corpora.
The files were taken from the RTE1, RTE2 and RTE3 datasets and the files
were regularized.
Filenames are of the form rte*_dev.xml and rte*_test.xml. The latter are the
gold standard annotated files.
Each entailment corpus is a list of 'text'/'hypothesis' pairs. The following
example is taken from RTE3::
<pair id="1" entailment="YES" task="IE" length="short" >
<t>The sale was made to pay Yukos' US$ 27.5 billion tax bill,
Yuganskneftegaz was originally sold for US$ 9.4 billion to a little known
company Baikalfinansgroup which was later bought by the Russian
state-owned oil company Rosneft .</t>
<h>Baikalfinansgroup was sold to Rosneft.</h>
</pair>
In order to provide globally unique IDs for each pair, a new attribute
``challenge`` has been added to the root element ``entailment-corpus`` of each
file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the
challenge number and 'n' is the pair ID.
"""
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.xmldocs import *
def norm(value_string):
    """
    Normalize the string value in an RTE pair's ``value`` or ``entailment``
    attribute as an integer (1, 0).

    :param value_string: the label used to classify a text/hypothesis pair
    :type value_string: str
    :rtype: int
    """
    # Case-insensitive lookup; unknown labels raise KeyError, as before.
    return {"TRUE": 1, "FALSE": 0, "YES": 1, "NO": 0}[value_string.upper()]
class RTEPair:
    """
    Container for RTE text-hypothesis pairs.

    The entailment relation is signalled by the ``value`` attribute in RTE1,
    and by ``entailment`` in RTE2 and RTE3.  Both are mapped onto the
    normalized ``value`` attribute of this class.
    """

    def __init__(
        self,
        pair,
        challenge=None,
        id=None,
        text=None,
        hyp=None,
        value=None,
        task=None,
        length=None,
    ):
        """
        :param challenge: version of the RTE challenge (i.e., RTE1, RTE2 or RTE3)
        :param id: identifier for the pair
        :param text: the text component of the pair
        :param hyp: the hypothesis component of the pair
        :param value: classification label for the pair
        :param task: attribute for the particular NLP task that the data was drawn from
        :param length: attribute for the length of the text of the pair
        """
        attribs = pair.attrib
        self.challenge = challenge
        self.id = attribs["id"]
        # Globally unique id: "<challenge>-<pair id>".
        self.gid = f"{self.challenge}-{self.id}"
        # First child is <t> (text), second is <h> (hypothesis).
        self.text = pair[0].text
        self.hyp = pair[1].text
        if "value" in attribs:
            self.value = norm(attribs["value"])
        elif "entailment" in attribs:
            self.value = norm(attribs["entailment"])
        else:
            self.value = value
        self.task = attribs.get("task", task)
        self.length = attribs.get("length", length)

    def __repr__(self):
        if self.challenge:
            return f"<RTEPair: gid={self.challenge}-{self.id}>"
        return "<RTEPair: id=%s>" % self.id
class RTECorpusReader(XMLCorpusReader):
    """
    Corpus reader for corpora in RTE challenges.

    This is just a wrapper around the XMLCorpusReader. See module docstring above for the expected
    structure of input documents.
    """

    def _read_etree(self, doc):
        """
        Map the XML input into a list of RTEPairs.

        This uses the ``iter()`` method from the ElementTree package to
        find all the ``<pair>`` elements.  (The docstring previously named
        the deprecated ``getiterator()``, which the code does not use.)

        :param doc: a parsed XML document
        :rtype: list(RTEPair)
        """
        try:
            challenge = doc.attrib["challenge"]
        except KeyError:
            # Files lacking the "challenge" attribute yield pairs whose GIDs
            # are formed with a None prefix.
            challenge = None
        pairiter = doc.iter("pair")
        return [RTEPair(pair, challenge=challenge) for pair in pairiter]

    def pairs(self, fileids):
        """
        Build a list of RTEPairs from a RTE corpus.

        :param fileids: a list of RTE corpus fileids
        :type fileids: list
        :rtype: list(RTEPair)
        """
        if isinstance(fileids, str):
            fileids = [fileids]
        return concat([self._read_etree(self.xml(fileid)) for fileid in fileids])

View File

@@ -0,0 +1,296 @@
# Natural Language Toolkit: SemCor Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Nathan Schneider <nschneid@cs.cmu.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for the SemCor Corpus.
"""
__docformat__ = "epytext en"
from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
from nltk.tree import Tree
class SemcorCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the SemCor Corpus.
    For access to the complete XML data structure, use the ``xml()``
    method. For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
    """

    def __init__(self, root, fileids, wordnet, lazy=True):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param wordnet: The WordNet corpus reader used to resolve sense keys.
        :param lazy: If true (default), use stream-backed corpus views;
            otherwise parse each file eagerly.
        """
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy
        self._wordnet = wordnet

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return self._items(fileids, "word", False, False, False)

    def chunks(self, fileids=None):
        """
        :return: the given file(s) as a list of chunks,
            each of which is a list of words and punctuation symbols
            that form a unit.
        :rtype: list(list(str))
        """
        return self._items(fileids, "chunk", False, False, False)

    def tagged_chunks(self, fileids=None, tag="pos"):
        """
        :return: the given file(s) as a list of tagged chunks, represented
            in tree form.
        :rtype: list(Tree)

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include. Semantic tags consist of
            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
            without a specific entry in WordNet. (Named entities of type 'other'
            have no lemma. Other chunks not in WordNet have no semantic tag.
            Punctuation tokens have `None` for their part of speech tag.)

        .. note:: The former default ``("pos" or "sem" or "both")`` always
            evaluated to just ``"pos"``; it is now written explicitly, with
            identical behavior.
        """
        return self._items(fileids, "chunk", False, tag != "sem", tag != "pos")

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of word strings.
        :rtype: list(list(str))
        """
        return self._items(fileids, "word", True, False, False)

    def chunk_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of chunks.
        :rtype: list(list(list(str)))
        """
        return self._items(fileids, "chunk", True, False, False)

    def tagged_sents(self, fileids=None, tag="pos"):
        """
        :return: the given file(s) as a list of sentences. Each sentence
            is represented as a list of tagged chunks (in tree form).
        :rtype: list(list(Tree))

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include. Semantic tags consist of
            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
            without a specific entry in WordNet. (Named entities of type 'other'
            have no lemma. Other chunks not in WordNet have no semantic tag.
            Punctuation tokens have `None` for their part of speech tag.)

        .. note:: The former default ``("pos" or "sem" or "both")`` always
            evaluated to just ``"pos"``; it is now written explicitly, with
            identical behavior.
        """
        return self._items(fileids, "chunk", True, tag != "sem", tag != "pos")

    def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag):
        # Choose the lazy (stream-backed) or eager reader implementation.
        if unit == "word" and not bracket_sent:
            # the result of the SemcorWordView may be a multiword unit, so the
            # LazyConcatenation will make sure the sentence is flattened
            _ = lambda *args: LazyConcatenation(
                (SemcorWordView if self._lazy else self._words)(*args)
            )
        else:
            _ = SemcorWordView if self._lazy else self._words
        return concat(
            [
                _(fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet)
                for fileid in self.abspaths(fileids)
            ]
        )

    def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
        """
        Helper used to implement the view methods -- returns a list of
        tokens, (segmented) words, chunks, or sentences. The tokens
        and chunks may optionally be tagged (with POS and sense
        information).

        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        assert unit in ("token", "word", "chunk")
        result = []
        xmldoc = ElementTree.parse(fileid).getroot()
        for xmlsent in xmldoc.findall(".//s"):
            sent = []
            for xmlword in _all_xmlwords_in(xmlsent):
                itm = SemcorCorpusReader._word(
                    xmlword, unit, pos_tag, sem_tag, self._wordnet
                )
                if unit == "word":
                    # Multiword tokens were split into several words; flatten.
                    sent.extend(itm)
                else:
                    sent.append(itm)
            if bracket_sent:
                result.append(SemcorSentence(xmlsent.attrib["snum"], sent))
            else:
                result.extend(sent)
        assert None not in result
        return result

    @staticmethod
    def _word(xmlword, unit, pos_tag, sem_tag, wordnet):
        """Convert one <wf>/<punc> element into a token, word list, or chunk."""
        tkn = xmlword.text
        if not tkn:
            tkn = ""  # fixes issue 337?
        lemma = xmlword.get("lemma", tkn)  # lemma or NE class
        lexsn = xmlword.get("lexsn")  # lex_sense (locator for the lemma's sense)
        if lexsn is not None:
            sense_key = lemma + "%" + lexsn
            wnpos = ("n", "v", "a", "r", "s")[
                int(lexsn.split(":")[0]) - 1
            ]  # see http://wordnet.princeton.edu/man/senseidx.5WN.html
        else:
            sense_key = wnpos = None
        redef = xmlword.get(
            "rdf", tkn
        )  # redefinition--this indicates the lookup string
        # does not exactly match the enclosed string, e.g. due to typographical adjustments
        # or discontinuity of a multiword expression. If a redefinition has occurred,
        # the "rdf" attribute holds its inflected form and "lemma" holds its lemma.
        # For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class).
        sensenum = xmlword.get("wnsn")  # WordNet sense number
        isOOVEntity = "pn" in xmlword.keys()  # a "personal name" (NE) not in WordNet
        pos = xmlword.get(
            "pos"
        )  # part of speech for the whole chunk (None for punctuation)
        if unit == "token":
            if not pos_tag and not sem_tag:
                itm = tkn
            else:
                itm = (
                    (tkn,)
                    + ((pos,) if pos_tag else ())
                    + ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ())
                )
            return itm
        else:
            ww = tkn.split("_")  # TODO: case where punctuation intervenes in MWE
            if unit == "word":
                return ww
            else:
                if sensenum is not None:
                    try:
                        sense = wordnet.lemma_from_key(sense_key)  # Lemma object
                    except Exception:
                        # cannot retrieve the wordnet.Lemma object. possible reasons:
                        # (a) the wordnet corpus is not downloaded;
                        # (b) a nonexistent sense is annotated: e.g., such.s.00 triggers:
                        # nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00'
                        # solution: just use the lemma name as a string
                        try:
                            sense = "%s.%s.%02d" % (
                                lemma,
                                wnpos,
                                int(sensenum),
                            )  # e.g.: reach.v.02
                        except ValueError:
                            sense = (
                                lemma + "." + wnpos + "." + sensenum
                            )  # e.g. the sense number may be "2;1"
                bottom = [Tree(pos, ww)] if pos_tag else ww
                if sem_tag and isOOVEntity:
                    if sensenum is not None:
                        return Tree(sense, [Tree("NE", bottom)])
                    else:  # 'other' NE
                        return Tree("NE", bottom)
                elif sem_tag and sensenum is not None:
                    return Tree(sense, bottom)
                elif pos_tag:
                    return bottom[0]
                else:
                    return bottom  # chunk as a list
def _all_xmlwords_in(elt, result=None):
    """
    Recursively collect every ``<wf>`` and ``<punc>`` element under *elt*,
    in document order, appending them to *result* (a fresh list by default).
    """
    result = [] if result is None else result
    for node in elt:
        if node.tag == "wf" or node.tag == "punc":
            result.append(node)
        else:
            # Descend into wrapper elements to find nested word elements.
            _all_xmlwords_in(node, result)
    return result
class SemcorSentence(list):
    """
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (the ``n`` attribute from the XML).
    """

    def __init__(self, num, items):
        super().__init__(items)
        # Sentence identifier taken from the XML "snum" attribute.
        self.num = num
class SemcorWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the SemCor corpus.
    (The docstring previously said "BNC corpus" -- a copy-paste error.)
    """

    def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet):
        """
        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        :param wordnet: The WordNet corpus reader used to resolve sense keys.
        """
        # Select whole sentences, or only the word/punctuation elements,
        # depending on whether sentence bracketing was requested.
        if bracket_sent:
            tagspec = ".*/s"
        else:
            tagspec = ".*/s/(punc|wf)"
        self._unit = unit
        self._sent = bracket_sent
        self._pos_tag = pos_tag
        self._sem_tag = sem_tag
        self._wordnet = wordnet
        XMLCorpusView.__init__(self, fileid, tagspec)

    def handle_elt(self, elt, context):
        # Dispatch based on the tagspec chosen in __init__.
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        return SemcorCorpusReader._word(
            elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet
        )

    def handle_sent(self, elt):
        sent = []
        for child in elt:
            if child.tag in ("wf", "punc"):
                itm = self.handle_word(child)
                if self._unit == "word":
                    # Multiword tokens were split; flatten into the sentence.
                    sent.extend(itm)
                else:
                    sent.append(itm)
            else:
                raise ValueError("Unexpected element %s" % child.tag)
        return SemcorSentence(elt.attrib["snum"], sent)

View File

@@ -0,0 +1,196 @@
# Natural Language Toolkit: Senseval 2 Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Steven Bird <stevenbird1@gmail.com> (modifications)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Read from the Senseval 2 Corpus.
SENSEVAL [http://www.senseval.org/]
Evaluation exercises for Word Sense Disambiguation.
Organized by ACL-SIGLEX [https://www.siglex.org/]
Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,
https://www.d.umn.edu/~tpederse/data.html
Distributed with permission.
The NLTK version of the Senseval 2 files uses well-formed XML.
Each instance of the ambiguous words "hard", "interest", "line", and "serve"
is tagged with a sense identifier, and supplied with context.
"""
import re
from xml.etree import ElementTree
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tokenize import *
class SensevalInstance:
    """A single tagged occurrence of an ambiguous word, with its context."""

    def __init__(self, word, position, context, senses):
        # The ambiguous lexical item (lexelt), e.g. "hard-a".
        self.word = word
        # Sense identifiers, stored as an immutable tuple.
        self.senses = tuple(senses)
        # Index of the head word within ``context``.
        self.position = position
        self.context = context

    def __repr__(self):
        return (
            "SensevalInstance(word=%r, position=%r, "
            "context=%r, senses=%r)"
            % (self.word, self.position, self.context, self.senses)
        )
class SensevalCorpusReader(CorpusReader):
    """Corpus reader for the Senseval 2 corpus files."""

    def instances(self, fileids=None):
        """Return a concatenated view of the SensevalInstances in *fileids*."""
        views = [
            SensevalCorpusView(path, enc)
            for (path, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)

    def _entry(self, tree):
        """Extract (sense, tagged-context) pairs from a parsed lexelt tree."""
        entries = []
        for lexelt in tree.findall("lexelt"):
            for inst in lexelt.findall("instance"):
                sense = inst[0].attrib["senseid"]
                context = [(w.text, w.attrib["pos"]) for w in inst[1]]
                entries.append((sense, context))
        return entries
class SensevalCorpusView(StreamBackedCorpusView):
    """
    Stream-backed view that yields one SensevalInstance per block.

    The underlying file interleaves ``<lexelt>`` headers with ``<instance>``
    elements; the view records the stream position where each lexelt starts
    so that a block read from any position can recover which lexical element
    it belongs to.
    """

    def __init__(self, fileid, encoding):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
        self._word_tokenizer = WhitespaceTokenizer()
        self._lexelt_starts = [0]  # list of streampos
        self._lexelts = [None]  # list of lexelt names

    def read_block(self, stream):
        """Read one <instance>...</instance> element and parse it."""
        # Decide which lexical element we're in.
        lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell()) - 1
        lexelt = self._lexelts[lexelt_num]
        instance_lines = []
        in_instance = False
        while True:
            line = stream.readline()
            if line == "":
                # EOF: a partial instance here would be a malformed file.
                assert instance_lines == []
                return []
            # Start of a lexical element?
            if line.lstrip().startswith("<lexelt"):
                lexelt_num += 1
                m = re.search("item=(\"[^\"]+\"|'[^']+')", line)
                assert m is not None  # <lexelt> has no 'item=...'
                lexelt = m.group(1)[1:-1]
                if lexelt_num < len(self._lexelts):
                    # Re-visiting a known lexelt: sanity-check the cached name.
                    assert lexelt == self._lexelts[lexelt_num]
                else:
                    self._lexelts.append(lexelt)
                    self._lexelt_starts.append(stream.tell())
            # Start of an instance?
            if line.lstrip().startswith("<instance"):
                assert instance_lines == []
                in_instance = True
            # Body of an instance?
            if in_instance:
                instance_lines.append(line)
            # End of an instance?
            if line.lstrip().startswith("</instance"):
                xml_block = "\n".join(instance_lines)
                # Repair the pseudo-XML before handing it to ElementTree.
                xml_block = _fixXML(xml_block)
                inst = ElementTree.fromstring(xml_block)
                return [self._parse_instance(inst, lexelt)]

    def _parse_instance(self, instance, lexelt):
        """Convert a parsed <instance> element into a SensevalInstance."""
        senses = []
        context = []
        position = None
        for child in instance:
            if child.tag == "answer":
                senses.append(child.attrib["senseid"])
            elif child.tag == "context":
                context += self._word_tokenizer.tokenize(child.text)
                for cword in child:
                    if cword.tag == "compound":
                        cword = cword[0]  # is this ok to do?
                    if cword.tag == "head":
                        # Some sanity checks:
                        assert position is None, "head specified twice"
                        assert cword.text.strip() or len(cword) == 1
                        assert not (cword.text.strip() and len(cword) == 1)
                        # Record the position of the head:
                        position = len(context)
                        # Add on the head word itself:
                        if cword.text.strip():
                            context.append(cword.text.strip())
                        elif cword[0].tag == "wf":
                            context.append((cword[0].text, cword[0].attrib["pos"]))
                            if cword[0].tail:
                                context += self._word_tokenizer.tokenize(cword[0].tail)
                        else:
                            assert False, "expected CDATA or wf in <head>"
                    elif cword.tag == "wf":
                        context.append((cword.text, cword.attrib["pos"]))
                    elif cword.tag == "s":
                        pass  # Sentence boundary marker.
                    else:
                        # NOTE(review): debug print leftover; consider removing
                        # it and raising a ValueError instead of asserting.
                        print("ACK", cword.tag)
                        assert False, "expected CDATA or <wf> or <head>"
                    if cword.tail:
                        context += self._word_tokenizer.tokenize(cword.tail)
            else:
                assert False, "unexpected tag %s" % child.tag
        return SensevalInstance(lexelt, position, context, senses)
def _fixXML(text):
    """
    Fix the various issues with Senseval pseudo-XML.
    """
    # Each (pattern, replacement) pair repairs one known defect in the
    # pseudo-XML; the pairs are applied in order.
    repairs = [
        # <~> or <^> => ~ or ^
        (r"<([~\^])>", r"\1"),
        # fix lone &
        (r"(\s+)\&(\s+)", r"\1&amp;\2"),
        # fix """
        (r'"""', "'\"'"),
        # fix <s snum=dd> => <s snum="dd"/>
        (r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>'),
        # fix foreign word tag
        (r"<\&frasl>\s*<p[^>]*>", "FRASL"),
        # remove <&I .>
        (r"<\&I[^>]*>", ""),
        # fix <{word}>
        (r"<{([^}]+)}>", r"\1"),
        # remove <@>, <p>, </p>
        (r"<(@|/?p)>", r""),
        # remove <&M .> and <&T .> and <&Ms .>
        (r"<&\w+ \.>", r""),
        # remove <!DOCTYPE... > lines
        (r"<!DOCTYPE[^>]*>", r""),
        # remove <[hi]> and <[/p]> etc
        (r"<\[\/?[^>]+\]*>", r""),
        # take the thing out of the brackets: <&hellip;>
        (r"<(\&\w+;)>", r"\1"),
        # and remove the & for those patterns that aren't regular XML
        (r"&(?!amp|gt|lt|apos|quot)", r""),
        # fix 'abc <p="foo"/>' style tags - now <wf pos="foo">abc</wf>
        (r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>', r' <wf pos="\2">\1</wf>'),
        (r'\s*"\s*<p=\'"\'/>', " <wf pos='\"'>\"</wf>"),
    ]
    for pattern, replacement in repairs:
        text = re.sub(pattern, replacement, text)
    return text

View File

@@ -0,0 +1,136 @@
# Natural Language Toolkit: SentiWordNet
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Christopher Potts <cgpotts@stanford.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
An NLTK interface for SentiWordNet
SentiWordNet is a lexical resource for opinion mining.
SentiWordNet assigns to each synset of WordNet three
sentiment scores: positivity, negativity, and objectivity.
For details about SentiWordNet see:
http://sentiwordnet.isti.cnr.it/
>>> from nltk.corpus import sentiwordnet as swn
>>> print(swn.senti_synset('breakdown.n.03'))
<breakdown.n.03: PosScore=0.0 NegScore=0.25>
>>> list(swn.senti_synsets('slow'))
[SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'),\
SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),\
SentiSynset('slow.a.02'), SentiSynset('dense.s.04'),\
SentiSynset('slow.a.04'), SentiSynset('boring.s.01'),\
SentiSynset('dull.s.05'), SentiSynset('slowly.r.01'),\
SentiSynset('behind.r.03')]
>>> happy = swn.senti_synsets('happy', 'a')
>>> happy0 = list(happy)[0]
>>> happy0.pos_score()
0.875
>>> happy0.neg_score()
0.0
>>> happy0.obj_score()
0.125
"""
import re
from nltk.corpus.reader import CorpusReader
class SentiWordNetCorpusReader(CorpusReader):
    """
    Corpus reader for the SentiWordNet sentiment lexicon.

    The lexicon maps each WordNet ``(pos, offset)`` pair to a positivity
    and a negativity score; objectivity is ``1 - (pos + neg)``.
    """

    def __init__(self, root, fileids, encoding="utf-8"):
        """
        Construct a new SentiWordNet Corpus Reader, using data from
        the specified file.
        """
        super().__init__(root, fileids, encoding=encoding)
        if len(self._fileids) != 1:
            raise ValueError("Exactly one file must be specified")
        self._db = {}
        self._parse_src_file()

    def _parse_src_file(self):
        """Parse the SentiWordNet source file into the ``_db`` index."""
        fp = self.open(self._fileids[0])
        try:
            lines = fp.read().splitlines()
        finally:
            # Close the stream explicitly instead of leaking it.
            fp.close()
        lines = filter((lambda x: not re.search(r"^\s*#", x)), lines)
        for i, line in enumerate(lines):
            fields = [field.strip() for field in re.split(r"\t+", line)]
            try:
                pos, offset, pos_score, neg_score, synset_terms, gloss = fields
            except ValueError as e:
                # Narrowed from BaseException: only a wrong field count can
                # make the unpacking above fail.
                raise ValueError(f"Line {i} formatted incorrectly: {line}\n") from e
            if pos and offset:
                offset = int(offset)
                self._db[(pos, offset)] = (float(pos_score), float(neg_score))

    def senti_synset(self, *vals):
        """
        Look up a SentiSynset either by ``(pos, offset)`` pair or by a
        synset name such as ``'breakdown.n.03'``.

        :return: a SentiSynset, or None if the synset has no entry.
        """
        from nltk.corpus import wordnet as wn

        if tuple(vals) in self._db:
            pos_score, neg_score = self._db[tuple(vals)]
            pos, offset = vals
            if pos == "s":
                # Satellite adjectives are looked up under "a" in WordNet.
                pos = "a"
            synset = wn.synset_from_pos_and_offset(pos, offset)
            return SentiSynset(pos_score, neg_score, synset)
        else:
            synset = wn.synset(vals[0])
            pos = synset.pos()
            if pos == "s":
                pos = "a"
            offset = synset.offset()
            if (pos, offset) in self._db:
                pos_score, neg_score = self._db[(pos, offset)]
                return SentiSynset(pos_score, neg_score, synset)
            else:
                return None

    def senti_synsets(self, string, pos=None):
        """
        Return an iterator of SentiSynsets for all senses of *string*,
        optionally restricted to part of speech *pos*.
        """
        from nltk.corpus import wordnet as wn

        sentis = []
        synset_list = wn.synsets(string, pos)
        for synset in synset_list:
            sentis.append(self.senti_synset(synset.name()))
        # Drop senses with no SentiWordNet entry (senti_synset returned None).
        sentis = filter(lambda x: x, sentis)
        return sentis

    def all_senti_synsets(self):
        """Iterate over every SentiSynset in the database."""
        from nltk.corpus import wordnet as wn

        for key, fields in self._db.items():
            pos, offset = key
            pos_score, neg_score = fields
            synset = wn.synset_from_pos_and_offset(pos, offset)
            yield SentiSynset(pos_score, neg_score, synset)
class SentiSynset:
    """A WordNet synset decorated with SentiWordNet sentiment scores."""

    def __init__(self, pos_score, neg_score, synset):
        self._pos_score = pos_score
        self._neg_score = neg_score
        # Objectivity is whatever probability mass remains.
        self._obj_score = 1.0 - (self._pos_score + self._neg_score)
        self.synset = synset

    def pos_score(self):
        """Return the positivity score."""
        return self._pos_score

    def neg_score(self):
        """Return the negativity score."""
        return self._neg_score

    def obj_score(self):
        """Return the objectivity score."""
        return self._obj_score

    def __str__(self):
        """Prints just the Pos/Neg scores for now."""
        return "<{}: PosScore={} NegScore={}>".format(
            self.synset.name(), self._pos_score, self._neg_score
        )

    def __repr__(self):
        return "Senti" + repr(self.synset)

View File

@@ -0,0 +1,75 @@
# Natural Language Toolkit: Sinica Treebank Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Sinica Treebank Corpus Sample
http://rocling.iis.sinica.edu.tw/CKIP/engversion/treebank.htm
10,000 parsed sentences, drawn from the Academia Sinica Balanced
Corpus of Modern Chinese. Parse tree notation is based on
Information-based Case Grammar. Tagset documentation is available
at https://www.sinica.edu.tw/SinicaCorpus/modern_e_wordtype.html
Language and Knowledge Processing Group, Institute of Information
Science, Academia Sinica
The data is distributed with the Natural Language Toolkit under the terms of
the Creative Commons Attribution-NonCommercial-ShareAlike License
[https://creativecommons.org/licenses/by-nc-sa/2.5/].
References:
Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999)
The Construction of Sinica Treebank. Computational Linguistics and
Chinese Language Processing, 4, pp 87-104.
Huang Chu-Ren, Keh-Jiann Chen, Feng-Yi Chen, Keh-Jiann Chen, Zhao-Ming
Gao, and Kuang-Yu Chen. 2000. Sinica Treebank: Design Criteria,
Annotation Guidelines, and On-line Interface. Proceedings of 2nd
Chinese Language Processing Workshop, Association for Computational
Linguistics.
Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar
Extraction, Proceedings of IJCNLP-04, pp560-565.
"""
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tag import map_tag
from nltk.tree import sinica_parse
# Regexps used to dismantle one raw treebank line:
IDENTIFIER = re.compile(r"^#\S+\s")  # leading "#<id> " sentence identifier
APPENDIX = re.compile(r"(?<=\))#.*$")  # trailing "#..." text after the final ")"
TAGWORD = re.compile(r":([^:()|]+):([^:()|]+)")  # captures (tag, word) pairs
WORD = re.compile(r":[^:()|]+:([^:()|]+)")  # captures just the word
class SinicaTreebankCorpusReader(SyntaxCorpusReader):
    """
    Reader for the sinica treebank.
    """

    def _read_block(self, stream):
        # One parse per line: strip the identifier prefix and any trailing
        # appendix before handing the line on.
        line = stream.readline()
        line = APPENDIX.sub("", IDENTIFIER.sub("", line))
        return [line]

    def _parse(self, sent):
        # Delegate to the dedicated Sinica bracket parser.
        return sinica_parse(sent)

    def _tag(self, sent, tagset=None):
        # TAGWORD captures (tag, word); flip each pair to (word, tag).
        pairs = [(word, tag) for (tag, word) in TAGWORD.findall(sent)]
        if tagset and tagset != self._tagset:
            pairs = [
                (word, map_tag(self._tagset, tagset, tag)) for (word, tag) in pairs
            ]
        return pairs

    def _word(self, sent):
        return WORD.findall(sent)

View File

@@ -0,0 +1,56 @@
# Natural Language Toolkit: String Category Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Read tuples from a corpus consisting of categorized strings.
For example, from the question classification corpus:
NUM:dist How far is it from Denver to Aspen ?
LOC:city What county is Modesto , California in ?
HUM:desc Who was Galileo ?
DESC:def What is an atom ?
NUM:date When did Hawaii become a state ?
"""
from nltk.corpus.reader.api import *
# based on PPAttachmentCorpusReader
from nltk.corpus.reader.util import *
# [xx] Should the order of the tuple be reversed -- in most other places
# in nltk, we use the form (data, tag) -- e.g., tagged words and
# labeled texts for classifiers.
class StringCategoryCorpusReader(CorpusReader):
    """
    Reader for corpora of category-prefixed strings, e.g. the question
    classification corpus (one ``CATEGORY<delimiter>text`` entry per line).
    """

    def __init__(self, root, fileids, delimiter=" ", encoding="utf8"):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param delimiter: Field delimiter
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._delimiter = delimiter

    def tuples(self, fileids=None):
        """Return the given file(s) as a list of (category, text) tuples."""
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        views = [
            StreamBackedCorpusView(path, self._read_tuple_block, encoding=enc)
            for (path, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)

    def _read_tuple_block(self, stream):
        # One tuple per non-blank line, split on the first delimiter only.
        raw = stream.readline().strip()
        if not raw:
            return []
        return [tuple(raw.split(self._delimiter, 1))]

View File

@@ -0,0 +1,125 @@
# Natural Language Toolkit: Switchboard Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tag import map_tag, str2tuple
class SwitchboardTurn(list):
    """
    A specialized list object used to encode switchboard utterances.
    The elements of the list are the words in the utterance; and two
    attributes, ``speaker`` and ``id``, are provided to retrieve the
    speaker identifier and utterance id.  Note that utterance ids
    are only unique within a given discourse.  (Fixed docstring typo
    "spearker" -> "speaker".)
    """

    def __init__(self, words, speaker, id):
        list.__init__(self, words)
        self.speaker = speaker
        # Utterance ids arrive as strings from the source file; store as int.
        self.id = int(id)

    def __repr__(self):
        if len(self) == 0:
            text = ""
        elif isinstance(self[0], tuple):
            # Tagged turn: render each element as word/tag.
            text = " ".join("%s/%s" % w for w in self)
        else:
            text = " ".join(self)
        return f"<{self.speaker}.{self.id}: {text!r}>"
class SwitchboardCorpusReader(CorpusReader):
    """
    Reader for the switchboard corpus.  Every accessor is backed by the
    single "tagged" file -- it is used even for the untagged methods,
    since it is already tokenized.
    """

    _FILES = ["tagged"]

    def __init__(self, root, tagset=None):
        """
        :param root: The root directory for this corpus.
        :param tagset: The native tagset of the corpus, used when mapping
            tags to another tagset on request.
        """
        CorpusReader.__init__(self, root, self._FILES)
        self._tagset = tagset

    def words(self):
        """:return: a corpus view of all words, untagged."""
        return StreamBackedCorpusView(self.abspath("tagged"), self._words_block_reader)

    def tagged_words(self, tagset=None):
        """:return: a corpus view of all words as (word, tag) tuples."""
        def tagged_words_block_reader(stream):
            return self._tagged_words_block_reader(stream, tagset)

        return StreamBackedCorpusView(self.abspath("tagged"), tagged_words_block_reader)

    def turns(self):
        """:return: a corpus view of untagged SwitchboardTurns."""
        return StreamBackedCorpusView(self.abspath("tagged"), self._turns_block_reader)

    def tagged_turns(self, tagset=None):
        """:return: a corpus view of tagged SwitchboardTurns."""
        def tagged_turns_block_reader(stream):
            return self._tagged_turns_block_reader(stream, tagset)

        return StreamBackedCorpusView(self.abspath("tagged"), tagged_turns_block_reader)

    def discourses(self):
        """:return: a corpus view of discourses (lists of untagged turns)."""
        return StreamBackedCorpusView(
            self.abspath("tagged"), self._discourses_block_reader
        )

    def tagged_discourses(self, tagset=None):
        """:return: a corpus view of discourses (lists of tagged turns).

        The ``tagset`` default was ``False``, inconsistent with every sibling
        method; it is now ``None``, which behaves identically (both falsy).
        """
        def tagged_discourses_block_reader(stream):
            return self._tagged_discourses_block_reader(stream, tagset)

        return StreamBackedCorpusView(
            self.abspath("tagged"), tagged_discourses_block_reader
        )

    def _discourses_block_reader(self, stream):
        # returns at most 1 discourse. (The other methods depend on this.)
        return [
            [
                self._parse_utterance(u, include_tag=False)
                for b in read_blankline_block(stream)
                for u in b.split("\n")
                if u.strip()
            ]
        ]

    def _tagged_discourses_block_reader(self, stream, tagset=None):
        # returns at most 1 discourse. (The other methods depend on this.)
        return [
            [
                self._parse_utterance(u, include_tag=True, tagset=tagset)
                for b in read_blankline_block(stream)
                for u in b.split("\n")
                if u.strip()
            ]
        ]

    def _turns_block_reader(self, stream):
        return self._discourses_block_reader(stream)[0]

    def _tagged_turns_block_reader(self, stream, tagset=None):
        return self._tagged_discourses_block_reader(stream, tagset)[0]

    def _words_block_reader(self, stream):
        # Flatten the single discourse into one list of words.
        return sum(self._discourses_block_reader(stream)[0], [])

    def _tagged_words_block_reader(self, stream, tagset=None):
        return sum(self._tagged_discourses_block_reader(stream, tagset)[0], [])

    # Utterance lines look like "A.1: word/TAG word/TAG ...".
    _UTTERANCE_RE = re.compile(r"(\w+)\.(\d+)\:\s*(.*)")
    _SEP = "/"

    def _parse_utterance(self, utterance, include_tag, tagset=None):
        """Parse one utterance line into a SwitchboardTurn."""
        m = self._UTTERANCE_RE.match(utterance)
        if m is None:
            raise ValueError("Bad utterance %r" % utterance)
        speaker, id, text = m.groups()
        words = [str2tuple(s, self._SEP) for s in text.split()]
        if not include_tag:
            words = [w for (w, t) in words]
        elif tagset and tagset != self._tagset:
            words = [(w, map_tag(self._tagset, tagset, t)) for (w, t) in words]
        return SwitchboardTurn(words, speaker, id)

View File

@@ -0,0 +1,354 @@
# Natural Language Toolkit: Tagged Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# Jacob Perkins <japerk@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A reader for corpora whose documents contain part-of-speech-tagged words.
"""
import os
from nltk.corpus.reader.api import *
from nltk.corpus.reader.timit import read_timit_block
from nltk.corpus.reader.util import *
from nltk.tag import map_tag, str2tuple
from nltk.tokenize import *
class TaggedCorpusReader(CorpusReader):
"""
Reader for simple part-of-speech tagged corpora. Paragraphs are
assumed to be split using blank lines. Sentences and words can be
tokenized using the default tokenizers, or by custom tokenizers
specified as parameters to the constructor. Words are parsed
using ``nltk.tag.str2tuple``. By default, ``'/'`` is used as the
separator. I.e., words should have the form::
word1/tag1 word2/tag2 word3/tag3 ...
But custom separators may be specified as parameters to the
constructor. Part of speech tags are case-normalized to upper
case.
"""
    def __init__(
        self,
        root,
        fileids,
        sep="/",
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
        encoding="utf8",
        tagset=None,
    ):
        """
        Construct a new Tagged Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = TaggedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param sep: Separator between a word and its tag (default ``'/'``).
        :param word_tokenizer: Tokenizer used to split sentences into word
            tokens.  The default instance is shared across readers; this is
            safe as long as the tokenizer is stateless.
        :param sent_tokenizer: Tokenizer used to split paragraph blocks into
            sentences (by default, one sentence per line).
        :param para_block_reader: Function that reads one paragraph block at
            a time from a stream (by default, blank-line separated).
        :param encoding: The file encoding (defaults to utf8).
        :param tagset: The native tagset of the corpus, used for mapping to
            other tagsets on request.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._tagset = tagset
def words(self, fileids=None):
"""
:return: the given file(s) as a list of words
and punctuation symbols.
:rtype: list(str)
"""
return concat(
[
TaggedCorpusView(
fileid,
enc,
False,
False,
False,
self._sep,
self._word_tokenizer,
self._sent_tokenizer,
self._para_block_reader,
None,
)
for (fileid, enc) in self.abspaths(fileids, True)
]
)
def sents(self, fileids=None):
"""
:return: the given file(s) as a list of
sentences or utterances, each encoded as a list of word
strings.
:rtype: list(list(str))
"""
return concat(
[
TaggedCorpusView(
fileid,
enc,
False,
True,
False,
self._sep,
self._word_tokenizer,
self._sent_tokenizer,
self._para_block_reader,
None,
)
for (fileid, enc) in self.abspaths(fileids, True)
]
)
def paras(self, fileids=None):
"""
:return: the given file(s) as a list of
paragraphs, each encoded as a list of sentences, which are
in turn encoded as lists of word strings.
:rtype: list(list(list(str)))
"""
return concat(
[
TaggedCorpusView(
fileid,
enc,
False,
True,
True,
self._sep,
self._word_tokenizer,
self._sent_tokenizer,
self._para_block_reader,
None,
)
for (fileid, enc) in self.abspaths(fileids, True)
]
)
def tagged_words(self, fileids=None, tagset=None):
"""
:return: the given file(s) as a list of tagged
words and punctuation symbols, encoded as tuples
``(word,tag)``.
:rtype: list(tuple(str,str))
"""
if tagset and tagset != self._tagset:
tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
else:
tag_mapping_function = None
return concat(
[
TaggedCorpusView(
fileid,
enc,
True,
False,
False,
self._sep,
self._word_tokenizer,
self._sent_tokenizer,
self._para_block_reader,
tag_mapping_function,
)
for (fileid, enc) in self.abspaths(fileids, True)
]
)
def tagged_sents(self, fileids=None, tagset=None):
"""
:return: the given file(s) as a list of
sentences, each encoded as a list of ``(word,tag)`` tuples.
:rtype: list(list(tuple(str,str)))
"""
if tagset and tagset != self._tagset:
tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
else:
tag_mapping_function = None
return concat(
[
TaggedCorpusView(
fileid,
enc,
True,
True,
False,
self._sep,
self._word_tokenizer,
self._sent_tokenizer,
self._para_block_reader,
tag_mapping_function,
)
for (fileid, enc) in self.abspaths(fileids, True)
]
)
def tagged_paras(self, fileids=None, tagset=None):
"""
:return: the given file(s) as a list of
paragraphs, each encoded as a list of sentences, which are
in turn encoded as lists of ``(word,tag)`` tuples.
:rtype: list(list(list(tuple(str,str))))
"""
if tagset and tagset != self._tagset:
tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
else:
tag_mapping_function = None
return concat(
[
TaggedCorpusView(
fileid,
enc,
True,
True,
True,
self._sep,
self._word_tokenizer,
self._sent_tokenizer,
self._para_block_reader,
tag_mapping_function,
)
for (fileid, enc) in self.abspaths(fileids, True)
]
)
class CategorizedTaggedCorpusReader(CategorizedCorpusReader, TaggedCorpusReader):
    """
    A reader for part-of-speech tagged corpora whose documents are
    divided into categories based on their file identifiers.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``TaggedCorpusReader``.
        """
        # CategorizedCorpusReader consumes the categorization keywords
        # from ``kwargs``; whatever is left configures the tagged reader.
        CategorizedCorpusReader.__init__(self, kwargs)
        TaggedCorpusReader.__init__(self, *args, **kwargs)

    def tagged_words(self, fileids=None, categories=None, tagset=None):
        """Tagged words, optionally restricted to the given categories."""
        resolved = self._resolve(fileids, categories)
        return super().tagged_words(resolved, tagset)

    def tagged_sents(self, fileids=None, categories=None, tagset=None):
        """Tagged sentences, optionally restricted to the given categories."""
        resolved = self._resolve(fileids, categories)
        return super().tagged_sents(resolved, tagset)

    def tagged_paras(self, fileids=None, categories=None, tagset=None):
        """Tagged paragraphs, optionally restricted to the given categories."""
        resolved = self._resolve(fileids, categories)
        return super().tagged_paras(resolved, tagset)
class TaggedCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for tagged documents.  It can be
    customized via flags to divide the tagged corpus documents up by
    sentence or paragraph, and to include or omit part of speech tags.
    ``TaggedCorpusView`` objects are typically created by
    ``TaggedCorpusReader`` (not directly by nltk users).
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        sep,
        word_tokenizer,
        sent_tokenizer,
        para_block_reader,
        tag_mapping_function=None,
    ):
        # Flags controlling whether tags are kept and how tokens are grouped.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        # Parsing machinery: word/tag separator, tokenizers, paragraph reader,
        # and an optional function for converting tags to another tagset.
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._tag_mapping_function = tag_mapping_function
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Reads one paragraph at a time."""
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                tokens = self._word_tokenizer.tokenize(sent_str)
                sent = [str2tuple(tok, self._sep) for tok in tokens]
                if self._tag_mapping_function:
                    sent = [(word, self._tag_mapping_function(tag)) for (word, tag) in sent]
                if not self._tagged:
                    sent = [word for (word, tag) in sent]
                # Keep sentences as sub-lists, or flatten them into the paragraph.
                (para.append if self._group_by_sent else para.extend)(sent)
            # Keep paragraphs as sub-lists, or flatten them into the block.
            (block.append if self._group_by_para else block.extend)(para)
        return block
# needs to implement simplified tags
class MacMorphoCorpusReader(TaggedCorpusReader):
    """
    A corpus reader for the MAC_MORPHO corpus.  Each line contains a
    single tagged word, using '_' as a separator.  Sentence boundaries
    are based on the end-sentence tag ('_.').  Paragraph information
    is not included in the corpus, so each paragraph returned by
    ``self.paras()`` and ``self.tagged_paras()`` contains a single
    sentence.
    """

    def __init__(self, root, fileids, encoding="utf8", tagset=None):
        # Each word is on its own line, so lines are "words" and a
        # sentence runs up to (and including) its terminating newline.
        super().__init__(
            root,
            fileids,
            sep="_",
            word_tokenizer=LineTokenizer(),
            sent_tokenizer=RegexpTokenizer(".*\n"),
            para_block_reader=self._read_block,
            encoding=encoding,
            tagset=tagset,
        )

    def _read_block(self, stream):
        # One "paragraph" per sentence: consume lines until the
        # end-of-sentence tag ('_.') is seen.
        return read_regexp_block(stream, r".*", r".*_\.")
class TimitTaggedCorpusReader(TaggedCorpusReader):
    """
    A corpus reader for tagged sentences that are included in the TIMIT corpus.
    """

    def __init__(self, *args, **kwargs):
        # TIMIT files carry one numbered sentence per block, so the
        # block reader is fixed; everything else is caller-supplied.
        super().__init__(para_block_reader=read_timit_block, *args, **kwargs)

    def paras(self):
        """Paragraph structure is not available in TIMIT."""
        raise NotImplementedError("use sents() instead")

    def tagged_paras(self):
        """Paragraph structure is not available in TIMIT."""
        raise NotImplementedError("use tagged_sents() instead")

View File

@@ -0,0 +1,510 @@
# Natural Language Toolkit: TIMIT Corpus Reader
#
# Copyright (C) 2001-2007 NLTK Project
# Author: Haejoong Lee <haejoong@ldc.upenn.edu>
# Steven Bird <stevenbird1@gmail.com>
# Jacob Perkins <japerk@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
# [xx] this docstring is out-of-date:
"""
Read tokens, phonemes and audio data from the NLTK TIMIT Corpus.
This corpus contains a selected portion of the TIMIT corpus.
- 16 speakers from 8 dialect regions
- 1 male and 1 female from each dialect region
- total 130 sentences (10 sentences per speaker. Note that some
sentences are shared among other speakers, especially sa1 and sa2
are spoken by all speakers.)
- total 160 recording of sentences (10 recordings per speaker)
- audio format: NIST Sphere, single channel, 16kHz sampling,
16 bit sample, PCM encoding
Module contents
===============
The timit corpus reader provides 4 functions and 4 data items.
- utterances
List of utterances in the corpus. There are total 160 utterances,
each of which corresponds to a unique utterance of a speaker.
Here's an example of an utterance identifier in the list::
dr1-fvmh0/sx206
- _---- _---
| | | | |
| | | | |
| | | | `--- sentence number
| | | `----- sentence type (a:all, i:shared, x:exclusive)
| | `--------- speaker ID
| `------------ sex (m:male, f:female)
`-------------- dialect region (1..8)
- speakers
List of speaker IDs. An example of speaker ID::
dr1-fvmh0
  Note that if you split an item ID with a slash ('/') and take the first
  element of the result, you will get a speaker ID.
>>> itemid = 'dr1-fvmh0/sx206'
>>> spkrid , sentid = itemid.split('/')
>>> spkrid
'dr1-fvmh0'
The second element of the result is a sentence ID.
- dictionary()
Phonetic dictionary of words contained in this corpus. This is a Python
dictionary from words to phoneme lists.
- spkrinfo()
Speaker information table. It's a Python dictionary from speaker IDs to
  records of 10 fields.  Speaker IDs are the same as the ones in timit.speakers.
Each record is a dictionary from field names to values, and the fields are
as follows::
id speaker ID as defined in the original TIMIT speaker info table
sex speaker gender (M:male, F:female)
dr speaker dialect region (1:new england, 2:northern,
3:north midland, 4:south midland, 5:southern, 6:new york city,
7:western, 8:army brat (moved around))
use corpus type (TRN:training, TST:test)
in this sample corpus only TRN is available
recdate recording date
birthdate speaker birth date
ht speaker height
race speaker race (WHT:white, BLK:black, AMR:american indian,
SPN:spanish-american, ORN:oriental,???:unknown)
edu speaker education level (HS:high school, AS:associate degree,
BS:bachelor's degree (BS or BA), MS:master's degree (MS or MA),
PHD:doctorate degree (PhD,JD,MD), ??:unknown)
comments comments by the recorder
The 4 functions are as follows.
- tokenized(sentences=items, offset=False)
Given a list of items, returns an iterator of a list of word lists,
each of which corresponds to an item (sentence). If offset is set to True,
each element of the word list is a tuple of word(string), start offset and
end offset, where offset is represented as a number of 16kHz samples.
- phonetic(sentences=items, offset=False)
Given a list of items, returns an iterator of a list of phoneme lists,
each of which corresponds to an item (sentence). If offset is set to True,
each element of the phoneme list is a tuple of word(string), start offset
and end offset, where offset is represented as a number of 16kHz samples.
- audiodata(item, start=0, end=None)
Given an item, returns a chunk of audio samples formatted into a string.
When the function is called, if start and end are omitted, the entire
samples of the recording will be returned. If only end is omitted,
samples from the start offset to the end of the recording will be returned.
- play(data)
Play the given audio samples. The audio samples can be obtained from the
timit.audiodata function.
"""
import sys
import time
from nltk.corpus.reader.api import *
from nltk.internals import import_from_stdlib
from nltk.tree import Tree
class TimitCorpusReader(CorpusReader):
    """
    Reader for the TIMIT corpus (or any other corpus with the same
    file layout and use of file formats).  The corpus root directory
    should contain the following files:

    - timitdic.txt: dictionary of standard transcriptions
    - spkrinfo.txt: table of speaker information

    In addition, the root directory should contain one subdirectory
    for each speaker, containing three files for each utterance:

    - <utterance-id>.txt: text content of utterances
    - <utterance-id>.wrd: tokenized text content of utterances
    - <utterance-id>.phn: phonetic transcription of utterances
    - <utterance-id>.wav: utterance sound file
    """

    _FILE_RE = r"(\w+-\w+/\w+\.(phn|txt|wav|wrd))|" + r"timitdic\.txt|spkrinfo\.txt"
    """A regexp matching fileids that are used by this corpus reader."""
    _UTTERANCE_RE = r"\w+-\w+/\w+\.txt"

    def __init__(self, root, encoding="utf8"):
        """
        Construct a new TIMIT corpus reader in the given directory.

        :param root: The root directory for this corpus.
        :param encoding: The encoding used for the text files.  Audio
            (``.wav``) files are always read as raw bytes.
        """
        # Ensure that wave files don't get treated as unicode data:
        if isinstance(encoding, str):
            encoding = [(r".*\.wav", None), (".*", encoding)]

        CorpusReader.__init__(
            self, root, find_corpus_fileids(root, self._FILE_RE), encoding=encoding
        )

        self._utterances = [
            name[:-4] for name in find_corpus_fileids(root, self._UTTERANCE_RE)
        ]
        """A list of the utterance identifiers for all utterances in
        this corpus."""

        self._speakerinfo = None  # lazily populated by spkrinfo()
        self._root = root
        self.speakers = sorted({u.split("/")[0] for u in self._utterances})

    def fileids(self, filetype=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus.

        :param filetype: If specified, then ``filetype`` indicates that
            only the files that have the given type should be
            returned.  Accepted values are: ``txt``, ``wrd``, ``phn``,
            ``wav``, or ``metadata``,
        """
        if filetype is None:
            return CorpusReader.fileids(self)
        elif filetype in ("txt", "wrd", "phn", "wav"):
            return [f"{u}.{filetype}" for u in self._utterances]
        elif filetype == "metadata":
            return ["timitdic.txt", "spkrinfo.txt"]
        else:
            raise ValueError("Bad value for filetype: %r" % filetype)

    def utteranceids(
        self, dialect=None, sex=None, spkrid=None, sent_type=None, sentid=None
    ):
        """
        :return: A list of the utterance identifiers for all
            utterances in this corpus, or for the given speaker, dialect
            region, gender, sentence type, or sentence number, if
            specified.
        """
        # Normalize every filter to a list so membership tests work
        # uniformly for single values and collections.
        if isinstance(dialect, str):
            dialect = [dialect]
        if isinstance(sex, str):
            sex = [sex]
        if isinstance(spkrid, str):
            spkrid = [spkrid]
        if isinstance(sent_type, str):
            sent_type = [sent_type]
        if isinstance(sentid, str):
            sentid = [sentid]

        # Utterance ids look like 'dr1-fvmh0/sx206':
        #   u[2]    dialect region digit
        #   u[4]    sex character ('m' or 'f')
        #   u[:9]   speaker id ('dr1-fvmh0')
        #   u[11]   sentence type ('a', 'i' or 'x')
        #   u[10:]  sentence id ('sx206')
        utterances = self._utterances[:]
        if dialect is not None:
            utterances = [u for u in utterances if u[2] in dialect]
        if sex is not None:
            utterances = [u for u in utterances if u[4] in sex]
        if spkrid is not None:
            utterances = [u for u in utterances if u[:9] in spkrid]
        if sent_type is not None:
            utterances = [u for u in utterances if u[11] in sent_type]
        if sentid is not None:
            # Bug fix: this previously filtered against ``spkrid``, so
            # the ``sentid`` argument was silently ignored (and crashed
            # when ``spkrid`` was None).
            utterances = [u for u in utterances if u[10:] in sentid]
        return utterances

    def transcription_dict(self):
        """
        :return: A dictionary giving the 'standard' transcription for
            each word.
        """
        _transcriptions = {}
        with self.open("timitdic.txt") as fp:
            for line in fp:
                if not line.strip() or line[0] == ";":
                    continue  # skip blank and comment lines
                m = re.match(r"\s*(\S+)\s+/(.*)/\s*$", line)
                if not m:
                    raise ValueError("Bad line: %r" % line)
                _transcriptions[m.group(1)] = m.group(2).split()
        return _transcriptions

    def spkrid(self, utterance):
        """Return the speaker-id portion of an utterance id."""
        return utterance.split("/")[0]

    def sentid(self, utterance):
        """Return the sentence-id portion of an utterance id."""
        return utterance.split("/")[1]

    def utterance(self, spkrid, sentid):
        """Combine a speaker id and a sentence id into an utterance id."""
        return f"{spkrid}/{sentid}"

    def spkrutteranceids(self, speaker):
        """
        :return: A list of all utterances associated with a given
            speaker.
        """
        return [
            utterance
            for utterance in self._utterances
            if utterance.startswith(speaker + "/")
        ]

    def spkrinfo(self, speaker):
        """
        :return: A ``SpeakerInfo`` record for the given speaker id
            (an utterance id is also accepted).
        """
        if speaker in self._utterances:
            speaker = self.spkrid(speaker)

        # Parse spkrinfo.txt once and cache the result.
        if self._speakerinfo is None:
            self._speakerinfo = {}
            with self.open("spkrinfo.txt") as fp:
                for line in fp:
                    if not line.strip() or line[0] == ";":
                        continue  # skip blank and comment lines
                    rec = line.strip().split(None, 9)
                    key = f"dr{rec[2]}-{rec[1].lower()}{rec[0].lower()}"
                    self._speakerinfo[key] = SpeakerInfo(*rec)

        return self._speakerinfo[speaker]

    def phones(self, utterances=None):
        """
        :return: A list of the phones in the given utterance(s).
        """
        results = []
        for fileid in self._utterance_fileids(utterances, ".phn"):
            with self.open(fileid) as fp:
                for line in fp:
                    if line.strip():
                        results.append(line.split()[-1])
        return results

    def phone_times(self, utterances=None):
        """
        :return: A list of ``(phone, start, end)`` tuples for the given
            utterance(s).  Offsets are represented as numbers of 16kHz
            samples.
        """
        results = []
        for fileid in self._utterance_fileids(utterances, ".phn"):
            with self.open(fileid) as fp:
                for line in fp:
                    if line.strip():
                        # Each line is "<start> <end> <phone>".
                        fields = line.split()
                        results.append((fields[2], int(fields[0]), int(fields[1])))
        return results

    def words(self, utterances=None):
        """
        :return: A list of the words in the given utterance(s).
        """
        results = []
        for fileid in self._utterance_fileids(utterances, ".wrd"):
            with self.open(fileid) as fp:
                for line in fp:
                    if line.strip():
                        results.append(line.split()[-1])
        return results

    def word_times(self, utterances=None):
        """
        :return: A list of ``(word, start, end)`` tuples for the given
            utterance(s), with offsets in 16kHz samples.
        """
        results = []
        for fileid in self._utterance_fileids(utterances, ".wrd"):
            with self.open(fileid) as fp:
                for line in fp:
                    if line.strip():
                        # Each line is "<start> <end> <word>".
                        fields = line.split()
                        results.append((fields[2], int(fields[0]), int(fields[1])))
        return results

    def sents(self, utterances=None):
        """
        :return: The given utterance(s), each as a list of words.
        """
        results = []
        for fileid in self._utterance_fileids(utterances, ".wrd"):
            with self.open(fileid) as fp:
                results.append([line.split()[-1] for line in fp if line.strip()])
        return results

    def sent_times(self, utterances=None):
        """
        :return: A list of ``(sentence, start, end)`` tuples for the
            given utterance(s).
        """
        # TODO: Check this
        return [
            (
                line.split(None, 2)[-1].strip(),
                int(line.split()[0]),
                int(line.split()[1]),
            )
            for fileid in self._utterance_fileids(utterances, ".txt")
            for line in self.open(fileid)
            if line.strip()
        ]

    def phone_trees(self, utterances=None):
        """
        :return: A list of sentence trees whose children are word
            subtrees containing the phones of each word.
        """
        if utterances is None:
            utterances = self._utterances
        if isinstance(utterances, str):
            utterances = [utterances]

        trees = []
        for utterance in utterances:
            word_times = self.word_times(utterance)
            phone_times = self.phone_times(utterance)
            sent_times = self.sent_times(utterance)

            while sent_times:
                (sent, sent_start, sent_end) = sent_times.pop(0)
                trees.append(Tree("S", []))
                # Phones that precede the first word (e.g. leading silence)
                # attach directly to the sentence node.
                while (
                    word_times and phone_times and phone_times[0][2] <= word_times[0][1]
                ):
                    trees[-1].append(phone_times.pop(0)[0])
                while word_times and word_times[0][2] <= sent_end:
                    (word, word_start, word_end) = word_times.pop(0)
                    trees[-1].append(Tree(word, []))
                    while phone_times and phone_times[0][2] <= word_end:
                        trees[-1][-1].append(phone_times.pop(0)[0])
                # Trailing phones (e.g. final silence) attach to the sentence.
                while phone_times and phone_times[0][2] <= sent_end:
                    trees[-1].append(phone_times.pop(0)[0])
        return trees

    # [xx] NOTE: This is currently broken -- we're assuming that the
    # fileids are WAV fileids (aka RIFF), but they're actually NIST SPHERE
    # fileids.
    def wav(self, utterance, start=0, end=None):
        """
        :return: The pickled audio samples (RIFF format) for the given
            utterance, optionally restricted to ``[start:end]`` frames.
        """
        # nltk.chunk conflicts with the stdlib module 'chunk'
        wave = import_from_stdlib("wave")
        # Bug fix: ``tempfile`` was used below but never imported in
        # this module, causing a NameError.
        import tempfile

        w = wave.open(self.open(utterance + ".wav"), "rb")

        if end is None:
            end = w.getnframes()

        # Skip past frames before start, then read the frames we want
        w.readframes(start)
        frames = w.readframes(end - start)

        # Open a new temporary file -- the wave module requires
        # an actual file, and won't work w/ stringio. :(
        tf = tempfile.TemporaryFile()
        out = wave.open(tf, "w")

        # Write the parameters & data to the new file.
        out.setparams(w.getparams())
        out.writeframes(frames)
        out.close()

        # Read the data back from the file, and return it.  The
        # file will automatically be deleted when we return.
        tf.seek(0)
        return tf.read()

    def audiodata(self, utterance, start=0, end=None):
        """
        :return: The raw audio samples for the given utterance, skipping
            the file header.  Offsets are in 16kHz 16-bit samples, hence
            the factor of 2 when converting to byte positions.
        """
        assert end is None or end > start
        headersize = 44  # standard RIFF/WAV header size in bytes
        with self.open(utterance + ".wav") as fp:
            if end is None:
                data = fp.read()
            else:
                data = fp.read(headersize + end * 2)
        return data[headersize + start * 2 :]

    def _utterance_fileids(self, utterances, extension):
        """Map utterance id(s) to fileids with the given extension."""
        if utterances is None:
            utterances = self._utterances
        if isinstance(utterances, str):
            utterances = [utterances]
        return [f"{u}{extension}" for u in utterances]

    def play(self, utterance, start=0, end=None):
        """
        Play the given audio sample.

        :param utterance: The utterance id of the sample to play
        """
        # Method 1: os audio dev.
        try:
            import ossaudiodev

            try:
                dsp = ossaudiodev.open("w")
                dsp.setfmt(ossaudiodev.AFMT_S16_LE)
                dsp.channels(1)
                dsp.speed(16000)
                dsp.write(self.audiodata(utterance, start, end))
                dsp.close()
            except OSError as e:
                print(
                    (
                        "can't acquire the audio device; please "
                        "activate your audio device."
                    ),
                    file=sys.stderr,
                )
                print("system error message:", str(e), file=sys.stderr)
            return
        except ImportError:
            pass

        # Method 2: pygame
        try:
            # FIXME: this won't work under python 3
            import pygame.mixer

            import StringIO

            pygame.mixer.init(16000)
            f = StringIO.StringIO(self.wav(utterance, start, end))
            pygame.mixer.Sound(f).play()
            while pygame.mixer.get_busy():
                time.sleep(0.01)
            return
        except ImportError:
            pass

        # Method 3: complain. :)
        print(
            ("you must install pygame or ossaudiodev " "for audio playback."),
            file=sys.stderr,
        )
class SpeakerInfo:
    """A record of metadata about a single TIMIT speaker, holding the
    fields of one row of the ``spkrinfo.txt`` table verbatim."""

    # Field names, in the order they appear in the speaker table.
    _FIELDS = (
        "id",
        "sex",
        "dr",
        "use",
        "recdate",
        "birthdate",
        "ht",
        "race",
        "edu",
        "comments",
    )

    def __init__(
        self, id, sex, dr, use, recdate, birthdate, ht, race, edu, comments=None
    ):
        values = (id, sex, dr, use, recdate, birthdate, ht, race, edu, comments)
        for name, value in zip(self._FIELDS, values):
            setattr(self, name, value)

    def __repr__(self):
        rendered = ", ".join(f"{name}={getattr(self, name)!r}" for name in self._FIELDS)
        return f"SpeakerInfo({rendered})"
def read_timit_block(stream):
    """
    Block reader for timit tagged sentences, which are preceded by a sentence
    number that will be ignored.
    """
    raw = stream.readline()
    if not raw:
        # End of file: no more blocks.
        return []
    # Drop the leading sentence number; keep everything after the first space.
    _, tagged_sent = raw.split(" ", 1)
    return [tagged_sent]

View File

@@ -0,0 +1,76 @@
# Natural Language Toolkit: Toolbox Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Greg Aumann <greg_aumann@sil.org>
# Stuart Robinson <Stuart.Robinson@mpi.nl>
# Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Module for reading, writing and manipulating
Toolbox databases and settings fileids.
"""
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.toolbox import ToolboxData
class ToolboxCorpusReader(CorpusReader):
    """Corpus reader for Toolbox (Shoebox) databases and settings files."""

    def xml(self, fileids, key=None):
        """
        :return: the given file(s) parsed into (a concatenation of)
            ElementTree structures via ``ToolboxData.parse``.
        """
        return concat(
            [
                ToolboxData(path, enc).parse(key=key)
                for (path, enc) in self.abspaths(fileids, True)
            ]
        )

    def fields(
        self,
        fileids,
        strip=True,
        unwrap=True,
        encoding="utf8",
        errors="strict",
        unicode_fields=None,
    ):
        """
        :return: the given file(s) as a list of ``(marker, contents)``
            field pairs.
        :param strip: strip trailing whitespace from field contents.
        :param unwrap: join wrapped lines of a field into one line.
        """
        return concat(
            [
                list(
                    ToolboxData(fileid, enc).fields(
                        strip, unwrap, encoding, errors, unicode_fields
                    )
                )
                for (fileid, enc) in self.abspaths(fileids, include_encoding=True)
            ]
        )

    # should probably be done lazily:
    def entries(self, fileids, **kwargs):
        """
        :return: the given file(s) as a list of
            ``(entry_contents, [(marker, contents), ...])`` pairs, where a
            new entry starts at each field whose marker equals ``key``
            (keyword argument; defaults to ``'lx'``, the MDF lexeme marker).
        """
        key = kwargs.pop("key", "lx")  # the default key in MDF
        entries = []
        for marker, contents in self.fields(fileids, **kwargs):
            if marker == key:
                entries.append((contents, []))
            elif entries:
                # Attach the field to the most recent entry; fields seen
                # before the first key marker are discarded (as before).
                entries[-1][-1].append((marker, contents))
        return entries

    def words(self, fileids, key="lx"):
        """
        :return: the contents of every field whose marker equals ``key``.
        """
        return [contents for marker, contents in self.fields(fileids) if marker == key]
def demo():
    """Placeholder demo; no Toolbox demonstration is implemented yet."""
    pass
if __name__ == "__main__":
    demo()

View File

@@ -0,0 +1,136 @@
# Natural Language Toolkit: Twitter Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A reader for corpora that consist of Tweets. It is assumed that the Tweets
have been serialised into line-delimited JSON.
"""
import json
import os
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import StreamBackedCorpusView, ZipFilePathPointer, concat
from nltk.tokenize import TweetTokenizer
class TwitterCorpusReader(CorpusReader):
    r"""
    Reader for corpora that consist of Tweets represented as a list of line-delimited JSON.

    Individual Tweets can be tokenized using the default tokenizer, or by a
    custom tokenizer specified as a parameter to the constructor.

    Construct a new Tweet corpus reader for a set of documents
    located at the given root directory.

    If you made your own tweet collection in a directory called
    `twitter-files`, then you can initialise the reader as::

        from nltk.corpus import TwitterCorpusReader
        reader = TwitterCorpusReader(root='/path/to/twitter-files', '.*\.json')

    However, the recommended approach is to set the relevant directory as the
    value of the environmental variable `TWITTER`, and then invoke the reader
    as follows::

        root = os.environ['TWITTER']
        reader = TwitterCorpusReader(root, '.*\.json')

    If you want to work directly with the raw Tweets, the `json` library can
    be used::

        import json
        for tweet in reader.docs():
            print(json.dumps(tweet, indent=1, sort_keys=True))
    """

    CorpusView = StreamBackedCorpusView
    """
    The corpus view class used by this reader.
    """

    def __init__(
        self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding="utf8"
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking the text of Tweets into
            smaller units, including but not limited to words.
        """
        CorpusReader.__init__(self, root, fileids, encoding)

        # Check that all user-created corpus files are non-empty.  (This
        # was previously a stray no-op string statement after the loop.)
        for path in self.abspaths(self._fileids):
            if isinstance(path, ZipFilePathPointer):
                pass
            elif os.path.getsize(path) == 0:
                raise ValueError(f"File {path} is empty")

        self._word_tokenizer = word_tokenizer

    def docs(self, fileids=None):
        """
        Returns the full Tweet objects, as specified by `Twitter
        documentation on Tweets
        <https://dev.twitter.com/docs/platform-objects/tweets>`_

        :return: the given file(s) as a list of dictionaries deserialised
            from JSON.
        :rtype: list(dict)
        """
        return concat(
            [
                self.CorpusView(path, self._read_tweets, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def strings(self, fileids=None):
        """
        Returns only the text content of Tweets in the file(s)

        :return: the given file(s) as a list of Tweets.
        :rtype: list(str)
        """
        fulltweets = self.docs(fileids)
        tweets = []
        for jsono in fulltweets:
            try:
                text = jsono["text"]
                if isinstance(text, bytes):
                    # NOTE(review): CorpusReader exposes encoding(fileid) as a
                    # method; whether ``self.encoding`` resolves usefully here
                    # should be confirmed.
                    text = text.decode(self.encoding)
                tweets.append(text)
            except KeyError:
                # Tweets without a 'text' field (e.g. deletion notices)
                # are skipped.
                pass
        return tweets

    def tokenized(self, fileids=None):
        """
        :return: the given file(s) as a list of the text content of Tweets as
            a list of words, screen names, hashtags, URLs and punctuation symbols.
        :rtype: list(list(str))
        """
        tweets = self.strings(fileids)
        tokenizer = self._word_tokenizer
        return [tokenizer.tokenize(t) for t in tweets]

    def _read_tweets(self, stream):
        """
        Assumes that each line in ``stream`` is a JSON-serialised object.
        Reads at most 10 Tweets per block.
        """
        tweets = []
        for _ in range(10):
            line = stream.readline()
            if not line:
                return tweets
            tweets.append(json.loads(line))
        return tweets

View File

@@ -0,0 +1,74 @@
"""
UDHR corpus reader. It mostly deals with encodings.
"""
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus.reader.util import find_corpus_fileids
class UdhrCorpusReader(PlaintextCorpusReader):
    """
    Plaintext reader for the UDHR corpus, whose files use many different
    (and sometimes mislabeled) character encodings.  This subclass mainly
    supplies a per-fileid encoding table and a skip-list of files that
    cannot be decoded.
    """

    # (fileid regexp, encoding) pairs used to decode each file; entries
    # whose names contradict their actual encoding are mapped explicitly.
    ENCODINGS = [
        (".*-Latin1$", "latin-1"),
        (".*-Hebrew$", "hebrew"),
        (".*-Arabic$", "cp1256"),
        ("Czech_Cesky-UTF8", "cp1250"),  # yeah  (sic: not UTF-8 despite the name)
        ("Polish-Latin2", "cp1250"),
        ("Polish_Polski-Latin2", "cp1250"),
        (".*-Cyrillic$", "cyrillic"),
        (".*-SJIS$", "SJIS"),
        (".*-GB2312$", "GB2312"),
        (".*-Latin2$", "ISO-8859-2"),
        (".*-Greek$", "greek"),
        (".*-UTF8$", "utf-8"),
        ("Hungarian_Magyar-Unicode", "utf-16-le"),
        ("Amahuaca", "latin1"),
        ("Turkish_Turkce-Turkish", "latin5"),
        ("Lithuanian_Lietuviskai-Baltic", "latin4"),
        ("Japanese_Nihongo-EUC", "EUC-JP"),
        ("Japanese_Nihongo-JIS", "iso2022_jp"),
        ("Chinese_Mandarin-HZ", "hz"),
        (r"Abkhaz\-Cyrillic\+Abkh", "cp1251"),
    ]

    # Fileids excluded from the corpus entirely; the reasons are grouped
    # in the inline comments below.
    SKIP = {
        # The following files are not fully decodable because they
        # were truncated at wrong bytes:
        "Burmese_Myanmar-UTF8",
        "Japanese_Nihongo-JIS",
        "Chinese_Mandarin-HZ",
        "Chinese_Mandarin-UTF8",
        "Gujarati-UTF8",
        "Hungarian_Magyar-Unicode",
        "Lao-UTF8",
        "Magahi-UTF8",
        "Marathi-UTF8",
        "Tamil-UTF8",
        # Unfortunately, encodings required for reading
        # the following files are not supported by Python:
        "Vietnamese-VPS",
        "Vietnamese-VIQR",
        "Vietnamese-TCVN",
        "Magahi-Agra",
        "Bhojpuri-Agra",
        "Esperanto-T61",  # latin3 raises an exception
        # The following files are encoded for specific fonts:
        "Burmese_Myanmar-WinResearcher",
        "Armenian-DallakHelv",
        "Tigrinya_Tigrigna-VG2Main",
        "Amharic-Afenegus6..60375",  # ?
        "Navaho_Dine-Navajo-Navaho-font",
        # What are these?
        "Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117",
        "Azeri_Azerbaijani_Latin-Az.Times.Lat0117",
        # The following files are unintended:
        "Czech-Latin2-err",
        "Russian_Russky-UTF8~",
    }

    def __init__(self, root="udhr"):
        # Match everything except README and hidden files, then drop
        # the files listed in SKIP.
        fileids = find_corpus_fileids(root, r"(?!README|\.).*")
        super().__init__(
            root,
            [fileid for fileid in fileids if fileid not in self.SKIP],
            encoding=self.ENCODINGS,
        )

View File

@@ -0,0 +1,780 @@
# Natural Language Toolkit: Corpus Reader Utilities
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import bisect
import os
import pickle
import re
import tempfile
from functools import reduce
from xml.etree import ElementTree
from nltk.data import (
FileSystemPathPointer,
PathPointer,
SeekableUnicodeStreamReader,
ZipFilePathPointer,
)
from nltk.internals import slice_bounds
from nltk.tokenize import wordpunct_tokenize
from nltk.util import AbstractLazySequence, LazyConcatenation, LazySubsequence
######################################################################
# { Corpus View
######################################################################
class StreamBackedCorpusView(AbstractLazySequence):
"""
A 'view' of a corpus file, which acts like a sequence of tokens:
it can be accessed by index, iterated over, etc. However, the
tokens are only constructed as-needed -- the entire corpus is
never stored in memory at once.
The constructor to ``StreamBackedCorpusView`` takes two arguments:
a corpus fileid (specified as a string or as a ``PathPointer``);
and a block reader. A "block reader" is a function that reads
zero or more tokens from a stream, and returns them as a list. A
very simple example of a block reader is:
>>> def simple_block_reader(stream):
... return stream.readline().split()
This simple block reader reads a single line at a time, and
returns a single token (consisting of a string) for each
whitespace-separated substring on the line.
When deciding how to define the block reader for a given
corpus, careful consideration should be given to the size of
blocks handled by the block reader. Smaller block sizes will
increase the memory requirements of the corpus view's internal
data structures (by 2 integers per block). On the other hand,
larger block sizes may decrease performance for random access to
the corpus. (But note that larger block sizes will *not*
decrease performance for iteration.)
Internally, ``CorpusView`` maintains a partial mapping from token
index to file position, with one entry per block. When a token
with a given index *i* is requested, the ``CorpusView`` constructs
it as follows:
1. First, it searches the toknum/filepos mapping for the token
index closest to (but less than or equal to) *i*.
2. Then, starting at the file position corresponding to that
index, it reads one block at a time using the block reader
until it reaches the requested token.
The toknum/filepos mapping is created lazily: it is initially
empty, but every time a new block is read, the block's
initial token is added to the mapping. (Thus, the toknum/filepos
map has one entry per block.)
In order to increase efficiency for random access patterns that
have high degrees of locality, the corpus view may cache one or
more blocks.
:note: Each ``CorpusView`` object internally maintains an open file
object for its underlying corpus file. This file should be
automatically closed when the ``CorpusView`` is garbage collected,
but if you wish to close it manually, use the ``close()``
method. If you access a ``CorpusView``'s items after it has been
closed, the file object will be automatically re-opened.
:warning: If the contents of the file are modified during the
lifetime of the ``CorpusView``, then the ``CorpusView``'s behavior
is undefined.
:warning: If a unicode encoding is specified when constructing a
``CorpusView``, then the block reader may only call
``stream.seek()`` with offsets that have been returned by
``stream.tell()``; in particular, calling ``stream.seek()`` with
relative offsets, or with offsets based on string lengths, may
lead to incorrect behavior.
:ivar _block_reader: The function used to read
a single block from the underlying file stream.
:ivar _toknum: A list containing the token index of each block
that has been processed. In particular, ``_toknum[i]`` is the
token index of the first token in block ``i``. Together
with ``_filepos``, this forms a partial mapping between token
indices and file positions.
:ivar _filepos: A list containing the file position of each block
that has been processed. In particular, ``_toknum[i]`` is the
file position of the first character in block ``i``. Together
with ``_toknum``, this forms a partial mapping between token
indices and file positions.
:ivar _stream: The stream used to access the underlying corpus file.
:ivar _len: The total number of tokens in the corpus, if known;
or None, if the number of tokens is not yet known.
:ivar _eofpos: The character position of the last character in the
file. This is calculated when the corpus view is initialized,
and is used to decide when the end of file has been reached.
:ivar _cache: A cache of the most recently read block. It
is encoded as a tuple (start_toknum, end_toknum, tokens), where
start_toknum is the token index of the first token in the block;
end_toknum is the token index of the first token not in the
block; and tokens is a list of the tokens in the block.
"""
def __init__(self, fileid, block_reader=None, startpos=0, encoding="utf8"):
    """
    Create a new corpus view, based on the file ``fileid``, and
    read with ``block_reader``.  See the class documentation
    for more information.

    :param fileid: The path to the file that is read by this
        corpus view.  ``fileid`` can either be a string or a
        ``PathPointer``.

    :param startpos: The file position at which the view will
        start reading.  This can be used to skip over preface
        sections.

    :param encoding: The unicode encoding that should be used to
        read the file's contents.  If no encoding is specified,
        then the file's contents will be read as a non-unicode
        string (i.e., a str).

    :raise ValueError: If the file cannot be opened or stat'd
        to determine its size.
    """
    if block_reader:
        self.read_block = block_reader
    # Initialize our toknum/filepos mapping.
    self._toknum = [0]
    self._filepos = [startpos]
    self._encoding = encoding
    # We don't know our length (number of tokens) yet; computed
    # lazily by iterate_from() when it reaches end-of-file.
    self._len = None
    self._fileid = fileid
    self._stream = None

    self._current_toknum = None
    """This variable is set to the index of the next token that
    will be read, immediately before ``self.read_block()`` is
    called.  This is provided for the benefit of the block
    reader, which under rare circumstances may need to know
    the current token number."""

    self._current_blocknum = None
    """This variable is set to the index of the next block that
    will be read, immediately before ``self.read_block()`` is
    called.  This is provided for the benefit of the block
    reader, which under rare circumstances may need to know
    the current block number."""

    # Find the length of the file.  Doing this eagerly makes a bad
    # path fail fast, at construction time.
    try:
        if isinstance(self._fileid, PathPointer):
            self._eofpos = self._fileid.file_size()
        else:
            self._eofpos = os.stat(self._fileid).st_size
    except Exception as exc:
        raise ValueError(f"Unable to open or access {fileid!r} -- {exc}") from exc

    # Maintain a cache of the most recently read block, to
    # increase efficiency of random access.
    self._cache = (-1, -1, None)
fileid = property(
lambda self: self._fileid,
doc="""
The fileid of the file that is accessed by this view.
:type: str or PathPointer""",
)
def read_block(self, stream):
    """
    Read a block from the input stream.  This is an abstract stub:
    subclasses must override it, or a ``block_reader`` callable must
    be supplied to the constructor (which replaces this attribute).

    :return: a block of tokens from the input stream
    :rtype: list(any)
    :param stream: an input stream
    :type stream: stream
    :raise NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError("Abstract Method")
def _open(self):
    """
    Open the file stream associated with this corpus view.  Called
    automatically whenever a value is read from the view while its
    file stream is closed.
    """
    # PathPointers know how to open themselves (zip entries, etc.).
    if isinstance(self._fileid, PathPointer):
        self._stream = self._fileid.open(self._encoding)
        return
    # Plain path: wrap the raw byte stream in a seekable unicode
    # reader when an encoding was requested; otherwise read bytes.
    if self._encoding:
        self._stream = SeekableUnicodeStreamReader(
            open(self._fileid, "rb"), self._encoding
        )
    else:
        self._stream = open(self._fileid, "rb")
def close(self):
    """
    Close the file stream associated with this corpus view.  This
    can be useful if you are worried about running out of file
    handles (the stream is also closed automatically when the view
    is garbage collected).  Accessing the view after closing it
    transparently re-opens the stream.
    """
    if self._stream is not None:
        self._stream.close()
        self._stream = None
def __enter__(self):
    # Support ``with view: ...`` -- the view itself is the context value.
    return self

def __exit__(self, type, value, traceback):
    # Release the underlying file handle on context exit.
    self.close()
def __len__(self):
    # The length is computed lazily: exhausting the iterator forces
    # the whole file to be read once.
    if self._len is None:
        # iterate_from() sets self._len when it reaches the end
        # of the file:
        for tok in self.iterate_from(self._toknum[-1]):
            pass
    return self._len
def __getitem__(self, i):
    """
    Return the token at index ``i``, or a ``LazySubsequence`` for a
    slice.  Single-token lookups and fully-cached slices are served
    from the most-recently-read block when possible.
    """
    if isinstance(i, slice):
        start, stop = slice_bounds(self, i)
        cache_start, cache_stop, cached_toks = self._cache
        # Serve the slice from the cached block when it fits entirely.
        if cache_start <= start and stop <= cache_stop:
            return cached_toks[start - cache_start : stop - cache_start]
        # Otherwise defer evaluation.
        return LazySubsequence(self, start, stop)

    # Normalize negative indices relative to the full length.
    if i < 0:
        i += len(self)
    if i < 0:
        raise IndexError("index out of range")
    cache_start, cache_stop, cached_toks = self._cache
    if cache_start <= i < cache_stop:
        return cached_toks[i - cache_start]
    # Fall back to sequential iteration from index i.
    try:
        return next(self.iterate_from(i))
    except StopIteration as err:
        raise IndexError("index out of range") from err
# If we wanted to be thread-safe, then this method would need to
# do some locking.
def iterate_from(self, start_tok):
    """
    Yield tokens starting at token index ``start_tok``, reading
    blocks from the underlying file on demand and memoizing the
    token-index -> file-position mapping as blocks are discovered.
    Sets ``self._len`` once end-of-file is reached.
    """
    # Start by feeding from the cache, if possible.
    if self._cache[0] <= start_tok < self._cache[1]:
        for tok in self._cache[2][start_tok - self._cache[0] :]:
            yield tok
            start_tok += 1

    # Decide where in the file we should start.  If `start` is in
    # our mapping, then we can jump straight to the correct block;
    # otherwise, start at the last block we've processed.
    if start_tok < self._toknum[-1]:
        block_index = bisect.bisect_right(self._toknum, start_tok) - 1
        toknum = self._toknum[block_index]
        filepos = self._filepos[block_index]
    else:
        block_index = len(self._toknum) - 1
        toknum = self._toknum[-1]
        filepos = self._filepos[-1]

    # Open the stream, if it's not open already.
    if self._stream is None:
        self._open()

    # If the file is empty, the while loop will never run.
    # This *seems* to be all the state we need to set:
    if self._eofpos == 0:
        self._len = 0

    # Each iteration through this loop, we read a single block
    # from the stream.
    while filepos < self._eofpos:
        # Read the next block.  _current_toknum/_current_blocknum are
        # published for the benefit of the block reader.
        self._stream.seek(filepos)
        self._current_toknum = toknum
        self._current_blocknum = block_index
        tokens = self.read_block(self._stream)
        assert isinstance(tokens, (tuple, list, AbstractLazySequence)), (
            "block reader %s() should return list or tuple."
            % self.read_block.__name__
        )
        num_toks = len(tokens)
        new_filepos = self._stream.tell()
        assert (
            new_filepos > filepos
        ), "block reader %s() should consume at least 1 byte (filepos=%d)" % (
            self.read_block.__name__,
            filepos,
        )

        # Update our cache.
        self._cache = (toknum, toknum + num_toks, list(tokens))

        # Update our mapping.  Empty blocks are not recorded -- only
        # blocks that produced tokens extend the toknum/filepos tables.
        assert toknum <= self._toknum[-1]
        if num_toks > 0:
            block_index += 1
            if toknum == self._toknum[-1]:
                assert new_filepos > self._filepos[-1]  # monotonic!
                self._filepos.append(new_filepos)
                self._toknum.append(toknum + num_toks)
            else:
                # A re-read of an already-mapped block: check for
                # consistency with what we recorded the first time.
                assert (
                    new_filepos == self._filepos[block_index]
                ), "inconsistent block reader (num chars read)"
                assert (
                    toknum + num_toks == self._toknum[block_index]
                ), "inconsistent block reader (num tokens returned)"

        # If we reached the end of the file, then update self._len
        if new_filepos == self._eofpos:
            self._len = toknum + num_toks

        # Generate the tokens in this block (but skip any tokens
        # before start_tok).  Note that between yields, our state
        # may be modified.
        for tok in tokens[max(0, start_tok - toknum) :]:
            yield tok

        # If we're at the end of the file, then we're done.
        assert new_filepos <= self._eofpos
        if new_filepos == self._eofpos:
            break

        # Update our indices
        toknum += num_toks
        filepos = new_filepos

    # If we reach this point, then we should know our length.
    assert self._len is not None
    # Enforce closing of stream once we reached end of file
    # We should have reached EOF once we're out of the while loop.
    self.close()
# Use concat for these, so we can use a ConcatenatedCorpusView
# when possible.
def __add__(self, other):
    # view + other -> lazy concatenation (no data is copied).
    return concat([self, other])

def __radd__(self, other):
    # other + view -> lazy concatenation, preserving operand order.
    return concat([other, self])

def __mul__(self, count):
    # view * count -> this view repeated ``count`` times, lazily.
    return concat([self] * count)

def __rmul__(self, count):
    # count * view -> same as view * count.
    return concat([self] * count)
class ConcatenatedCorpusView(AbstractLazySequence):
    """
    A 'view' of a corpus file that joins together one or more
    ``StreamBackedCorpusViews<StreamBackedCorpusView>``.  At most
    one file handle is left open at any time.
    """

    def __init__(self, corpus_views):
        self._pieces = corpus_views
        """A list of the corpus subviews that make up this
        concatenation."""

        self._offsets = [0]
        """A list of offsets, indicating the index at which each
        subview begins.  In particular::
            offsets[i] = sum([len(p) for p in pieces[:i]])"""

        self._open_piece = None
        """The most recently accessed corpus subview (or None).
        Before a new subview is accessed, this subview will be closed."""

    def __len__(self):
        # The offset table is extended lazily, one entry per piece, as
        # pieces are iterated; once all pieces have been measured the
        # final offset is the total length.
        if len(self._offsets) <= len(self._pieces):
            # Iterate to the end of the corpus.
            for tok in self.iterate_from(self._offsets[-1]):
                pass

        return self._offsets[-1]

    def close(self):
        # Close every subview's file handle.
        for piece in self._pieces:
            piece.close()

    def iterate_from(self, start_tok):
        # Locate the piece containing start_tok via the offset table.
        piecenum = bisect.bisect_right(self._offsets, start_tok) - 1

        while piecenum < len(self._pieces):
            offset = self._offsets[piecenum]
            piece = self._pieces[piecenum]

            # If we've got another piece open, close it first.
            if self._open_piece is not piece:
                if self._open_piece is not None:
                    self._open_piece.close()
                self._open_piece = piece

            # Get everything we can from this piece.
            yield from piece.iterate_from(max(0, start_tok - offset))

            # Update the offset table (measuring the piece just consumed).
            if piecenum + 1 == len(self._offsets):
                self._offsets.append(self._offsets[-1] + len(piece))

            # Move on to the next piece.
            piecenum += 1
def concat(docs):
    """
    Concatenate together the contents of multiple documents from a
    single corpus, choosing a concatenation strategy appropriate to
    the documents' common type.  This utility function is used by
    corpus readers when the user requests more than one document at
    a time.

    :raise ValueError: If ``docs`` is empty, or if no concatenation
        strategy is known for the documents' types.
    """
    if len(docs) == 0:
        raise ValueError("concat() expects at least one object!")
    if len(docs) == 1:
        return docs[0]

    types = {d.__class__ for d in docs}

    # All strings: plain string concatenation.
    if all(isinstance(doc, str) for doc in docs):
        return "".join(docs)

    # All corpus views: join them with a ConcatenatedCorpusView.
    if all(
        issubclass(typ, (StreamBackedCorpusView, ConcatenatedCorpusView))
        for typ in types
    ):
        return ConcatenatedCorpusView(docs)

    # All lazy sequences: concatenate lazily.
    if all(issubclass(typ, AbstractLazySequence) for typ in types):
        return LazyConcatenation(docs)

    # Homogeneous lists / tuples / XML elements:
    if len(types) == 1:
        typ = list(types)[0]
        if issubclass(typ, list):
            return reduce((lambda a, b: a + b), docs, [])
        if issubclass(typ, tuple):
            return reduce((lambda a, b: a + b), docs, ())
        if ElementTree.iselement(typ):
            xmltree = ElementTree.Element("documents")
            for doc in docs:
                xmltree.append(doc)
            return xmltree

    # No method found!
    raise ValueError("Don't know how to concatenate types: %r" % types)
######################################################################
# { Block Readers
######################################################################


def read_whitespace_block(stream):
    """Read up to 20 lines from ``stream`` and return their
    whitespace-separated tokens as one flat list."""
    tokens = []
    for _ in range(20):  # Read 20 lines at a time.
        tokens.extend(stream.readline().split())
    return tokens
def read_wordpunct_block(stream):
    """Read up to 20 lines from ``stream`` and return their
    wordpunct-tokenized tokens as one flat list."""
    tokens = []
    for _ in range(20):  # Read 20 lines at a time.
        tokens.extend(wordpunct_tokenize(stream.readline()))
    return tokens
def read_line_block(stream):
    """Read up to 20 lines from ``stream``, returning each line
    (with its trailing newline removed) as a separate token."""
    lines = []
    for _ in range(20):
        line = stream.readline()
        if not line:  # end of file
            break
        lines.append(line.rstrip("\n"))
    return lines
def read_blankline_block(stream):
    """Read one blank-line-delimited paragraph from ``stream`` and
    return it as a singleton list, or [] at end of file.  Leading
    blank lines are skipped."""
    para = ""
    while True:
        line = stream.readline()
        if not line:
            # End of file: return whatever was accumulated.
            return [para] if para else []
        if not line.strip():
            # Blank line: ends the paragraph (unless we have not
            # collected anything yet, in which case keep scanning).
            if para:
                return [para]
        else:
            # Content line: accumulate it.
            para += line
def read_alignedsent_block(stream):
    """
    Read one aligned-sentence block from ``stream``.

    Content lines are accumulated until an alignment line of the
    form ``<n>-<m> ...`` is seen, which terminates the block.
    Separator lines (starting with ``=``) and blank lines are
    skipped.

    :param stream: an input stream
    :return: a singleton list containing the block's text, or an
        empty list at end of file.
    """
    s = ""
    while True:
        line = stream.readline()
        # End of file.  This check MUST come before indexing
        # ``line[0]`` below: at EOF readline() returns "", and
        # ``""[0]`` would raise IndexError.
        if not line:
            if s:
                return [s]
            else:
                return []
        # Separator / blank lines:
        if line[0] == "=" or line[0] == "\n" or line[:2] == "\r\n":
            continue
        # Other line:
        s += line
        # An alignment line ("<i>-<j> ...") ends the block.
        if re.match(r"^\d+-\d+", line) is not None:
            return [s]
def read_regexp_block(stream, start_re, end_re=None):
    """
    Read a sequence of tokens from a stream, where tokens begin with
    lines that match ``start_re``.  If ``end_re`` is specified, then
    tokens end with lines that match ``end_re``; otherwise, tokens end
    whenever the next line matching ``start_re`` or EOF is found.
    """
    # Skip ahead until a line opens a token (or EOF).
    line = stream.readline()
    while line and not re.match(start_re, line):
        line = stream.readline()
    if not line:
        return []  # end of file.

    collected = [line]
    while True:
        # Remember where this line starts so we can rewind to it.
        oldpos = stream.tell()
        line = stream.readline()
        # End of file:
        if not line:
            return ["".join(collected)]
        # Explicit terminator line ends the token:
        if end_re is not None and re.match(end_re, line):
            return ["".join(collected)]
        # Without an explicit terminator, a new start line ends the
        # token: back up to just before it and return what we have.
        if end_re is None and re.match(start_re, line):
            stream.seek(oldpos)
            return ["".join(collected)]
        # Anything else is part of the token.
        collected.append(line)
def read_sexpr_block(stream, block_size=16384, comment_char=None):
    """
    Read a sequence of s-expressions from the stream, and leave the
    stream's file position at the end the last complete s-expression
    read.  This function will always return at least one s-expression,
    unless there are no more s-expressions in the file.

    If the file ends in in the middle of an s-expression, then that
    incomplete s-expression is returned when the end of the file is
    reached.

    :param block_size: The default block size for reading.  If an
        s-expression is longer than one block, then more than one
        block will be read.
    :param comment_char: A character that marks comments.  Any lines
        that begin with this character will be stripped out.
        (If spaces or tabs precede the comment character, then the
        line will not be stripped.)
    """
    start = stream.tell()
    block = stream.read(block_size)
    encoding = getattr(stream, "encoding", None)
    assert encoding is not None or isinstance(block, str)
    if encoding not in (None, "utf-8"):
        import warnings

        warnings.warn(
            "Parsing may fail, depending on the properties "
            "of the %s encoding!" % encoding
        )
        # (e.g., the utf-16 encoding does not work because it insists
        # on adding BOMs to the beginning of encoded strings.)

    if comment_char:
        COMMENT = re.compile("(?m)^%s.*$" % re.escape(comment_char))
    while True:
        try:
            # If we're stripping comments, then make sure our block ends
            # on a line boundary; and then replace any comments with
            # space characters.  (We can't just strip them out -- that
            # would make our offset wrong.)
            if comment_char:
                block += stream.readline()
                block = re.sub(COMMENT, _sub_space, block)
            # Read the block.
            tokens, offset = _parse_sexpr_block(block)
            # Skip whitespace
            offset = re.compile(r"\s*").search(block, offset).end()

            # Move to the end position.  With a unicode stream, the
            # char offset must be converted back to a byte offset.
            if encoding is None:
                stream.seek(start + offset)
            else:
                stream.seek(start + len(block[:offset].encode(encoding)))

            # Return the list of tokens we processed
            return tokens
        except ValueError as e:
            if e.args[0] == "Block too small":
                # An s-expression was truncated: read more and retry.
                next_block = stream.read(block_size)
                if next_block:
                    block += next_block
                    continue
                else:
                    # The file ended mid-sexpr -- return what we got.
                    return [block.strip()]
            else:
                raise
def _sub_space(m):
    """Helper function: given a regexp match, return a string of
    spaces with the same length as the matched string."""
    return " " * len(m.group())
def _parse_sexpr_block(block):
    """
    Parse as many complete s-expressions as possible from ``block``.

    :return: a tuple ``(tokens, end)`` where ``tokens`` is the list of
        s-expression strings found and ``end`` is the character offset
        just past the last complete s-expression.
    :raise ValueError: ``"Block too small"`` when the block holds no
        complete s-expression at all.
    """
    nonspace = re.compile(r"\S")
    atom_end = re.compile(r"[\s(]")
    paren = re.compile(r"[()]")

    tokens = []
    pos = 0
    while pos < len(block):
        m = nonspace.search(block, pos)
        if not m:
            return tokens, pos
        begin = m.start()

        if m.group() != "(":
            # Case 1: a bare atom -- runs to the next whitespace or '('.
            m2 = atom_end.search(block, begin)
            if m2 is None:
                # Atom may be truncated at the block boundary.
                if tokens:
                    return tokens, pos
                raise ValueError("Block too small")
            pos = m2.start()
        else:
            # Case 2: a parenthesized s-expression -- scan to the
            # matching close paren.
            depth = 0
            for pm in paren.finditer(block, begin):
                depth += 1 if pm.group() == "(" else -1
                if depth == 0:
                    pos = pm.end()
                    break
            else:
                # Unbalanced: the s-expression is truncated.
                if tokens:
                    return tokens, pos
                raise ValueError("Block too small")

        tokens.append(block[begin:pos])

    return tokens, pos
######################################################################
# { Finding Corpus Items
######################################################################


def find_corpus_fileids(root, regexp):
    """Return the sorted list of file identifiers under ``root``
    (a ``PathPointer``) whose relative path fully matches ``regexp``."""
    if not isinstance(root, PathPointer):
        raise TypeError("find_corpus_fileids: expected a PathPointer")
    regexp += "$"

    # Find fileids in a zipfile: scan the zipfile's namelist.  Filter
    # out entries that end in '/' -- they're directories.
    if isinstance(root, ZipFilePathPointer):
        entry_len = len(root.entry)
        candidates = (
            name[entry_len:]
            for name in root.zipfile.namelist()
            if not name.endswith("/")
        )
        return sorted(name for name in candidates if re.match(regexp, name))

    # Find fileids in a directory: use os.walk to search all (proper
    # or symlinked) subdirectories, and match paths against the regexp.
    elif isinstance(root, FileSystemPathPointer):
        matches = []
        for dirname, subdirs, filenames in os.walk(root.path):
            prefix = "".join("%s/" % p for p in _path_from(root.path, dirname))
            matches.extend(
                prefix + fname
                for fname in filenames
                if re.match(regexp, prefix + fname)
            )
            # Don't visit svn directories:
            if ".svn" in subdirs:
                subdirs.remove(".svn")
        return sorted(matches)

    else:
        raise AssertionError("Don't know how to handle %r" % root)
def _path_from(parent, child):
    """Return the list of path components leading from directory
    ``parent`` down to ``child`` (``child`` must lie under ``parent``)."""
    # Normalize away a trailing separator on the parent.
    if os.path.split(parent)[1] == "":
        parent = os.path.split(parent)[0]
    components = []
    while parent != child:
        child, tail = os.path.split(child)
        components.insert(0, tail)
        # Guard against an infinite loop if child is not under parent.
        assert os.path.split(child)[0] != child
    return components
######################################################################
# { Paragraph structure in Treebank files
######################################################################


def tagged_treebank_para_block_reader(stream):
    """Read one paragraph from a tagged Treebank file, where
    paragraphs are delimited by ``======`` separator lines."""
    para = ""
    while True:
        line = stream.readline()
        if re.match(r"======+\s*$", line):
            # Separator line: ends the paragraph if we have content;
            # otherwise keep scanning (e.g. a leading separator).
            if para.strip():
                return [para]
        elif line == "":
            # End of file: return any accumulated content.
            if para.strip():
                return [para]
            else:
                return []
        else:
            # Content line.
            para += line

View File

@@ -0,0 +1,629 @@
# Natural Language Toolkit: Verbnet Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
An NLTK interface to the VerbNet verb lexicon
For details about VerbNet see:
https://verbs.colorado.edu/~mpalmer/projects/verbnet.html
"""
import re
import textwrap
from collections import defaultdict
from nltk.corpus.reader.xmldocs import XMLCorpusReader
class VerbnetCorpusReader(XMLCorpusReader):
"""
An NLTK interface to the VerbNet verb lexicon.
From the VerbNet site: "VerbNet (VN) (Kipper-Schuler 2006) is the largest
on-line verb lexicon currently available for English. It is a hierarchical
domain-independent, broad-coverage verb lexicon with mappings to other
lexical resources such as WordNet (Miller, 1990; Fellbaum, 1998), XTAG
(XTAG Research Group, 2001), and FrameNet (Baker et al., 1998)."
For details about VerbNet see:
https://verbs.colorado.edu/~mpalmer/projects/verbnet.html
"""
# No unicode encoding param, since the data files are all XML.
def __init__(self, root, fileids, wrap_etree=False):
    XMLCorpusReader.__init__(self, root, fileids, wrap_etree)

    self._lemma_to_class = defaultdict(list)
    """A dictionary mapping from verb lemma strings to lists of
    VerbNet class identifiers."""

    self._wordnet_to_class = defaultdict(list)
    """A dictionary mapping from wordnet identifier strings to
    lists of VerbNet class identifiers."""

    self._class_to_fileid = {}
    """A dictionary mapping from class identifiers to
    corresponding file identifiers.  The keys of this dictionary
    provide a complete list of all classes and subclasses."""

    # Maps short class ids (e.g. '9.1') to long ids (e.g. 'put-9.1').
    self._shortid_to_longid = {}

    # Initialize the dictionaries.  Use the quick (regexp-based)
    # method instead of the slow (xml-based) method, because it
    # runs 2-30 times faster.
    self._quick_index()
# Class identifiers come in two flavors: long ('put-9.1') and
# short ('9.1').  These patterns recognize (and split) each form.
_LONGID_RE = re.compile(r"([^\-\.]*)-([\d+.\-]+)$")
"""Regular expression that matches (and decomposes) longids"""

_SHORTID_RE = re.compile(r"[\d+.\-]+$")
"""Regular expression that matches shortids"""

_INDEX_RE = re.compile(
    r'<MEMBER name="\??([^"]+)" wn="([^"]*)"[^>]+>|' r'<VNSUBCLASS ID="([^"]+)"/?>'
)
"""Regular expression used by ``_index()`` to quickly scan the corpus
for basic information."""
def lemmas(self, vnclass=None):
    """
    Return a list of all verb lemmas that appear in any class, or
    in the ``classid`` if specified.
    """
    if vnclass is None:
        return sorted(self._lemma_to_class.keys())
    # [xx] should this include subclass members?
    if isinstance(vnclass, str):
        vnclass = self.vnclass(vnclass)
    return [member.get("name") for member in vnclass.findall("MEMBERS/MEMBER")]
def wordnetids(self, vnclass=None):
    """
    Return a list of all wordnet identifiers that appear in any
    class, or in ``classid`` if specified.
    """
    if vnclass is None:
        return sorted(self._wordnet_to_class.keys())
    # [xx] should this include subclass members?
    if isinstance(vnclass, str):
        vnclass = self.vnclass(vnclass)
    ids = []
    for member in vnclass.findall("MEMBERS/MEMBER"):
        # The "wn" attribute holds zero or more space-separated ids.
        ids.extend(member.get("wn", "").split())
    return ids
def classids(self, lemma=None, wordnetid=None, fileid=None, classid=None):
    """
    Return a list of the VerbNet class identifiers.  If a file
    identifier is specified, then return only the VerbNet class
    identifiers for classes (and subclasses) defined by that file.
    If a lemma is specified, then return only VerbNet class
    identifiers for classes that contain that lemma as a member.
    If a wordnetid is specified, then return only identifiers for
    classes that contain that wordnetid as a member.  If a classid
    is specified, then return only identifiers for subclasses of
    the specified VerbNet class.
    If nothing is specified, return all classids within VerbNet
    """
    if fileid is not None:
        return [c for c, f in self._class_to_fileid.items() if f == fileid]
    if lemma is not None:
        return self._lemma_to_class[lemma]
    if wordnetid is not None:
        return self._wordnet_to_class[wordnetid]
    if classid is not None:
        tree = self.vnclass(classid)
        return [
            subclass.get("ID")
            for subclass in tree.findall("SUBCLASSES/VNSUBCLASS")
        ]
    return sorted(self._class_to_fileid.keys())
def vnclass(self, fileid_or_classid):
    """Returns VerbNet class ElementTree

    Return an ElementTree containing the xml for the specified
    VerbNet class.

    :param fileid_or_classid: An identifier specifying which class
        should be returned.  Can be a file identifier (such as
        ``'put-9.1.xml'``), or a VerbNet class identifier (such as
        ``'put-9.1'``) or a short VerbNet class identifier (such as
        ``'9.1'``).

    :raise ValueError: If the identifier is not a known fileid,
        class identifier, or short class identifier.
    """
    # File identifier: just return the xml.
    if fileid_or_classid in self._fileids:
        return self.xml(fileid_or_classid)

    # Class identifier: get the xml, and find the right elt.
    classid = self.longid(fileid_or_classid)
    if classid in self._class_to_fileid:
        fileid = self._class_to_fileid[self.longid(classid)]
        tree = self.xml(fileid)
        if classid == tree.get("ID"):
            return tree
        else:
            # The class is a subclass nested somewhere inside the file.
            for subclass in tree.findall(".//VNSUBCLASS"):
                if classid == subclass.get("ID"):
                    return subclass
            else:
                assert False  # we saw it during _index()!
    else:
        raise ValueError(f"Unknown identifier {fileid_or_classid}")
def fileids(self, vnclass_ids=None):
    """
    Return a list of fileids that make up this corpus.  If
    ``vnclass_ids`` is specified, then return the fileids that make
    up the specified VerbNet class(es).
    """
    if vnclass_ids is None:
        return self._fileids
    # Accept either one class id or a sequence of them.
    if isinstance(vnclass_ids, str):
        vnclass_ids = [vnclass_ids]
    return [
        self._class_to_fileid[self.longid(vnclass_id)]
        for vnclass_id in vnclass_ids
    ]
def frames(self, vnclass):
    """Given a VerbNet class, this method returns VerbNet frames

    The members returned for each frame are:

    1) Example
    2) Description
    3) Syntax
    4) Semantics

    :param vnclass: A VerbNet class identifier; or an ElementTree
        containing the xml contents of a VerbNet class.
    :return: frames - a list of frame dictionaries
    """
    if isinstance(vnclass, str):
        vnclass = self.vnclass(vnclass)
    return [
        {
            "example": self._get_example_within_frame(vnframe),
            "description": self._get_description_within_frame(vnframe),
            "syntax": self._get_syntactic_list_within_frame(vnframe),
            "semantics": self._get_semantics_within_frame(vnframe),
        }
        for vnframe in vnclass.findall("FRAMES/FRAME")
    ]
def subclasses(self, vnclass):
    """Returns subclass ids, if any exist

    Given a VerbNet class, this method returns subclass ids (if they
    exist) in a list of strings.

    :param vnclass: A VerbNet class identifier; or an ElementTree
        containing the xml contents of a VerbNet class.
    :return: list of subclasses
    """
    if isinstance(vnclass, str):
        vnclass = self.vnclass(vnclass)
    return [
        subclass.get("ID")
        for subclass in vnclass.findall("SUBCLASSES/VNSUBCLASS")
    ]
def themroles(self, vnclass):
    """Returns thematic roles participating in a VerbNet class

    Members returned as part of each role are:

    1) Type
    2) Modifiers

    :param vnclass: A VerbNet class identifier; or an ElementTree
        containing the xml contents of a VerbNet class.
    :return: themroles: A list of thematic roles in the VerbNet class
    """
    if isinstance(vnclass, str):
        vnclass = self.vnclass(vnclass)
    roles = []
    for trole in vnclass.findall("THEMROLES/THEMROLE"):
        modifiers = [
            {"value": restr.get("Value"), "type": restr.get("type")}
            for restr in trole.findall("SELRESTRS/SELRESTR")
        ]
        roles.append({"type": trole.get("type"), "modifiers": modifiers})
    return roles
######################################################################
# { Index Initialization
######################################################################

def _index(self):
    """
    Initialize the indexes ``_lemma_to_class``,
    ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning
    through the corpus fileids.  This is fast if ElementTree
    uses the C implementation (<0.1 secs), but quite slow (>10 secs)
    if only the python implementation is available.
    """
    # _index_helper() recurses into each class's subclasses.
    for fileid in self._fileids:
        self._index_helper(self.xml(fileid), fileid)
def _index_helper(self, xmltree, fileid):
    """Helper for ``_index()``: record one class element (and,
    recursively, its subclasses) in the index dictionaries."""
    class_id = xmltree.get("ID")
    self._class_to_fileid[class_id] = fileid
    self._shortid_to_longid[self.shortid(class_id)] = class_id
    for member in xmltree.findall("MEMBERS/MEMBER"):
        self._lemma_to_class[member.get("name")].append(class_id)
        # The "wn" attribute holds zero or more space-separated ids.
        for wn_id in member.get("wn", "").split():
            self._wordnet_to_class[wn_id].append(class_id)
    for subclass in xmltree.findall("SUBCLASSES/VNSUBCLASS"):
        self._index_helper(subclass, fileid)
def _quick_index(self):
    """
    Initialize the indexes ``_lemma_to_class``,
    ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning
    through the corpus fileids.  This doesn't do proper xml parsing,
    but is good enough to find everything in the standard VerbNet
    corpus -- and it runs about 30 times faster than xml parsing
    (with the python ElementTree; only 2-3 times faster
    if ElementTree uses the C implementation).
    """
    # nb: if we got rid of wordnet_to_class, this would run 2-3
    # times faster.
    for fileid in self._fileids:
        vnclass = fileid[:-4]  # strip the '.xml'
        self._class_to_fileid[vnclass] = fileid
        self._shortid_to_longid[self.shortid(vnclass)] = vnclass
        with self.open(fileid) as fp:
            for m in self._INDEX_RE.finditer(fp.read()):
                groups = m.groups()
                if groups[0] is not None:
                    # <MEMBER> match: groups = (name, wn-ids, None).
                    self._lemma_to_class[groups[0]].append(vnclass)
                    for wn in groups[1].split():
                        self._wordnet_to_class[wn].append(vnclass)
                elif groups[2] is not None:
                    # <VNSUBCLASS> match: rebind vnclass so that any
                    # following <MEMBER> elts attach to the subclass.
                    self._class_to_fileid[groups[2]] = fileid
                    vnclass = groups[2]  # for <MEMBER> elts.
                    self._shortid_to_longid[self.shortid(vnclass)] = vnclass
                else:
                    assert False, "unexpected match condition"
######################################################################
# { Identifier conversion
######################################################################

def longid(self, shortid):
    """Returns longid of a VerbNet class

    Given a short VerbNet class identifier (eg '37.10'), map it
    to a long id (eg 'confess-37.10').  If ``shortid`` is already a
    long id, then return it as-is"""
    if self._LONGID_RE.match(shortid):
        # Already a longid -- nothing to do.
        return shortid
    if not self._SHORTID_RE.match(shortid):
        raise ValueError("vnclass identifier %r not found" % shortid)
    try:
        return self._shortid_to_longid[shortid]
    except KeyError as e:
        raise ValueError("vnclass identifier %r not found" % shortid) from e
def shortid(self, longid):
    """Returns shortid of a VerbNet class

    Given a long VerbNet class identifier (eg 'confess-37.10'),
    map it to a short id (eg '37.10').  If ``longid`` is already a
    short id, then return it as-is."""
    if self._SHORTID_RE.match(longid):
        # Already a shortid -- nothing to do.
        return longid
    m = self._LONGID_RE.match(longid)
    if not m:
        raise ValueError("vnclass identifier %r not found" % longid)
    # Group 2 of the longid pattern is the numeric (short) part.
    return m.group(2)
######################################################################
# { Frame access utility functions
######################################################################

def _get_semantics_within_frame(self, vnframe):
    """Returns semantics within a single frame

    A utility function to retrieve semantics within a frame in
    VerbNet.  Members of each predicate dictionary:

    1) Predicate value
    2) Arguments
    3) Negated flag

    :param vnframe: An ElementTree containing the xml contents of
        a VerbNet frame.
    :return: semantics: list of predicate dictionaries
    """
    predicates = []
    for pred in vnframe.findall("SEMANTICS/PRED"):
        arguments = [
            {"type": arg.get("type"), "value": arg.get("value")}
            for arg in pred.findall("ARGS/ARG")
        ]
        predicates.append(
            {
                "predicate_value": pred.get("value"),
                "arguments": arguments,
                # bool="!" marks a negated predicate in the XML.
                "negated": pred.get("bool") == "!",
            }
        )
    return predicates
def _get_example_within_frame(self, vnframe):
    """Returns example within a frame

    A utility function to retrieve an example within a frame in
    VerbNet.

    :param vnframe: An ElementTree containing the xml contents of
        a VerbNet frame.
    :return: example_text: The example sentence for this particular
        frame, or ``""`` if the frame has no EXAMPLE element or the
        element is empty.
    """
    example_element = vnframe.find("EXAMPLES/EXAMPLE")
    if example_element is None:
        return ""
    # An empty <EXAMPLE/> element has text=None; normalize to ""
    # so callers always receive a string.
    return example_element.text or ""
def _get_description_within_frame(self, vnframe):
    """Returns member description within frame

    A utility function to retrieve a description of participating
    members within a frame in VerbNet.

    :param vnframe: An ElementTree containing the xml contents of
        a VerbNet frame.
    :return: description: a description dictionary with members -
        primary and secondary
    :raise KeyError: If the DESCRIPTION element lacks the required
        "primary" attribute.
    """
    description_element = vnframe.find("DESCRIPTION")
    # NOTE(review): assumes every FRAME has a DESCRIPTION element --
    # find() returning None would raise AttributeError here.
    return {
        "primary": description_element.attrib["primary"],
        "secondary": description_element.get("secondary", ""),
    }
def _get_syntactic_list_within_frame(self, vnframe):
    """Returns syntax within a frame

    A utility function to retrieve the syntactic role sequence within
    a frame in VerbNet.  Members of each syntactic dictionary:

    1) POS Tag
    2) Modifiers

    :param vnframe: An ElementTree containing the xml contents of
        a VerbNet frame.
    :return: syntax_within_single_frame
    """
    roles = []
    # Each child of SYNTAX is one syntactic slot; its tag is the POS.
    for elt in vnframe.find("SYNTAX"):
        modifiers = {
            "value": elt.get("value") if "value" in elt.attrib else "",
            "selrestrs": [
                {"value": restr.get("Value"), "type": restr.get("type")}
                for restr in elt.findall("SELRESTRS/SELRESTR")
            ],
            "synrestrs": [
                {"value": restr.get("Value"), "type": restr.get("type")}
                for restr in elt.findall("SYNRESTRS/SYNRESTR")
            ],
        }
        roles.append({"pos_tag": elt.tag, "modifiers": modifiers})
    return roles
######################################################################
# { Pretty Printing
######################################################################

def pprint(self, vnclass):
    """Returns pretty printed version of a VerbNet class

    Return a string containing a pretty-printed representation of
    the given VerbNet class.

    :param vnclass: A VerbNet class identifier; or an ElementTree
        containing the xml contents of a VerbNet class.
    """
    if isinstance(vnclass, str):
        vnclass = self.vnclass(vnclass)
    # Assemble the sections line by line.
    sections = [
        vnclass.get("ID"),
        self.pprint_subclasses(vnclass, indent=" "),
        self.pprint_members(vnclass, indent=" "),
        " Thematic roles:",
        self.pprint_themroles(vnclass, indent=" "),
        " Frames:",
        self.pprint_frames(vnclass, indent=" "),
    ]
    return "\n".join(sections)
def pprint_subclasses(self, vnclass, indent=""):
    """Returns pretty printed version of subclasses of VerbNet class

    Return a string containing a pretty-printed representation of
    the given VerbNet class's subclasses.

    :param vnclass: A VerbNet class identifier; or an ElementTree
        containing the xml contents of a VerbNet class.
    """
    if isinstance(vnclass, str):
        vnclass = self.vnclass(vnclass)
    names = self.subclasses(vnclass) or ["(none)"]
    text = "Subclasses: " + " ".join(names)
    # Wrap long lists, indenting continuation lines a bit deeper.
    return textwrap.fill(
        text, 70, initial_indent=indent, subsequent_indent=indent + " "
    )
def pprint_members(self, vnclass, indent=""):
"""Returns pretty printed version of members in a VerbNet class
Return a string containing a pretty-printed representation of
the given VerbNet class's member verbs.
:param vnclass: A VerbNet class identifier; or an ElementTree
containing the xml contents of a VerbNet class.
"""
if isinstance(vnclass, str):
vnclass = self.vnclass(vnclass)
members = self.lemmas(vnclass)
if not members:
members = ["(none)"]
s = "Members: " + " ".join(members)
return textwrap.fill(
s, 70, initial_indent=indent, subsequent_indent=indent + " "
)
def pprint_themroles(self, vnclass, indent=""):
"""Returns pretty printed version of thematic roles in a VerbNet class
Return a string containing a pretty-printed representation of
the given VerbNet class's thematic roles.
:param vnclass: A VerbNet class identifier; or an ElementTree
containing the xml contents of a VerbNet class.
"""
if isinstance(vnclass, str):
vnclass = self.vnclass(vnclass)
pieces = []
for themrole in self.themroles(vnclass):
piece = indent + "* " + themrole.get("type")
modifiers = [
modifier["value"] + modifier["type"]
for modifier in themrole["modifiers"]
]
if modifiers:
piece += "[{}]".format(" ".join(modifiers))
pieces.append(piece)
return "\n".join(pieces)
def pprint_frames(self, vnclass, indent=""):
"""Returns pretty version of all frames in a VerbNet class
Return a string containing a pretty-printed representation of
the list of frames within the VerbNet class.
:param vnclass: A VerbNet class identifier; or an ElementTree
containing the xml contents of a VerbNet class.
"""
if isinstance(vnclass, str):
vnclass = self.vnclass(vnclass)
pieces = []
for vnframe in self.frames(vnclass):
pieces.append(self._pprint_single_frame(vnframe, indent))
return "\n".join(pieces)
def _pprint_single_frame(self, vnframe, indent=""):
"""Returns pretty printed version of a single frame in a VerbNet class
Returns a string containing a pretty-printed representation of
the given frame.
:param vnframe: An ElementTree containing the xml contents of
a VerbNet frame.
"""
frame_string = self._pprint_description_within_frame(vnframe, indent) + "\n"
frame_string += self._pprint_example_within_frame(vnframe, indent + " ") + "\n"
frame_string += (
self._pprint_syntax_within_frame(vnframe, indent + " Syntax: ") + "\n"
)
frame_string += indent + " Semantics:\n"
frame_string += self._pprint_semantics_within_frame(vnframe, indent + " ")
return frame_string
def _pprint_example_within_frame(self, vnframe, indent=""):
"""Returns pretty printed version of example within frame in a VerbNet class
Return a string containing a pretty-printed representation of
the given VerbNet frame example.
:param vnframe: An ElementTree containing the xml contents of
a Verbnet frame.
"""
if vnframe["example"]:
return indent + " Example: " + vnframe["example"]
def _pprint_description_within_frame(self, vnframe, indent=""):
"""Returns pretty printed version of a VerbNet frame description
Return a string containing a pretty-printed representation of
the given VerbNet frame description.
:param vnframe: An ElementTree containing the xml contents of
a VerbNet frame.
"""
description = indent + vnframe["description"]["primary"]
if vnframe["description"]["secondary"]:
description += " ({})".format(vnframe["description"]["secondary"])
return description
def _pprint_syntax_within_frame(self, vnframe, indent=""):
"""Returns pretty printed version of syntax within a frame in a VerbNet class
Return a string containing a pretty-printed representation of
the given VerbNet frame syntax.
:param vnframe: An ElementTree containing the xml contents of
a VerbNet frame.
"""
pieces = []
for element in vnframe["syntax"]:
piece = element["pos_tag"]
modifier_list = []
if "value" in element["modifiers"] and element["modifiers"]["value"]:
modifier_list.append(element["modifiers"]["value"])
modifier_list += [
"{}{}".format(restr["value"], restr["type"])
for restr in (
element["modifiers"]["selrestrs"]
+ element["modifiers"]["synrestrs"]
)
]
if modifier_list:
piece += "[{}]".format(" ".join(modifier_list))
pieces.append(piece)
return indent + " ".join(pieces)
def _pprint_semantics_within_frame(self, vnframe, indent=""):
"""Returns a pretty printed version of semantics within frame in a VerbNet class
Return a string containing a pretty-printed representation of
the given VerbNet frame semantics.
:param vnframe: An ElementTree containing the xml contents of
a VerbNet frame.
"""
pieces = []
for predicate in vnframe["semantics"]:
arguments = [argument["value"] for argument in predicate["arguments"]]
pieces.append(
f"{'¬' if predicate['negated'] else ''}{predicate['predicate_value']}({', '.join(arguments)})"
)
return "\n".join(f"{indent}* {piece}" for piece in pieces)

View File

@@ -0,0 +1,166 @@
# Natural Language Toolkit: Word List Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tokenize import line_tokenize
class WordListCorpusReader(CorpusReader):
    """
    Corpus reader for word lists stored one word per line.  Blank
    lines are ignored.
    """

    def words(self, fileids=None, ignore_lines_startswith="\n"):
        """Return the words in the given files, skipping any line that
        starts with ``ignore_lines_startswith`` (by default, blank
        lines)."""
        tokens = line_tokenize(self.raw(fileids))
        return [tok for tok in tokens if not tok.startswith(ignore_lines_startswith)]
class SwadeshCorpusReader(WordListCorpusReader):
    """Reader for Swadesh word lists: parallel wordlists, one file per
    language, aligned line-by-line."""

    def entries(self, fileids=None):
        """
        :return: a tuple of words for the specified fileids.
        """
        # Default to every file when no explicit selection is given.
        fileids = fileids or self.fileids()
        # Zip the per-language columns into aligned tuples.
        columns = [self.words(fileid) for fileid in fileids]
        return list(zip(*columns))
class NonbreakingPrefixesCorpusReader(WordListCorpusReader):
    """
    This is a class to read the nonbreaking prefixes textfiles from the
    Moses Machine Translation toolkit. These lists are used in the Python port
    of the Moses' word tokenizer.
    """

    # Maps full language names to their two-letter language codes; the
    # corpus files are named ``nonbreaking_prefix.<code>``.
    available_langs = {
        "catalan": "ca",
        "czech": "cs",
        "german": "de",
        "greek": "el",
        "english": "en",
        "spanish": "es",
        "finnish": "fi",
        "french": "fr",
        "hungarian": "hu",
        "icelandic": "is",
        "italian": "it",
        "latvian": "lv",
        "dutch": "nl",
        "polish": "pl",
        "portuguese": "pt",
        "romanian": "ro",
        "russian": "ru",
        "slovak": "sk",
        "slovenian": "sl",
        "swedish": "sv",
        "tamil": "ta",
    }
    # Also, add the lang IDs as the keys, so lookups work with either
    # the full language name or the short code.
    available_langs.update({v: v for v in available_langs.values()})

    def words(self, lang=None, fileids=None, ignore_lines_startswith="#"):
        """
        This module returns a list of nonbreaking prefixes for the specified
        language(s).

        >>> from nltk.corpus import nonbreaking_prefixes as nbp
        >>> nbp.words('en')[:10] == [u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H', u'I', u'J']
        True
        >>> nbp.words('ta')[:5] == [u'\u0b85', u'\u0b86', u'\u0b87', u'\u0b88', u'\u0b89']
        True

        :return: a list words for the specified language(s).
        """
        # If *lang* in list of languages available, allocate apt fileid.
        # Otherwise, the function returns non-breaking prefixes for
        # all languages when fileids==None.
        if lang in self.available_langs:
            lang = self.available_langs[lang]
            fileids = ["nonbreaking_prefix." + lang]
        # Lines beginning with the comment marker ('#' by default) are
        # skipped.
        return [
            line
            for line in line_tokenize(self.raw(fileids))
            if not line.startswith(ignore_lines_startswith)
        ]
class UnicharsCorpusReader(WordListCorpusReader):
    """
    This class is used to read lists of characters from the Perl Unicode
    Properties (see https://perldoc.perl.org/perluniprops.html).
    The files in the perluniprop.zip are extracted using the Unicode::Tussle
    module from https://search.cpan.org/~bdfoy/Unicode-Tussle-1.11/lib/Unicode/Tussle.pm
    """

    # These are categories similar to the Perl Unicode Properties;
    # each category corresponds to one ``<category>.txt`` corpus file.
    available_categories = [
        "Close_Punctuation",
        "Currency_Symbol",
        "IsAlnum",
        "IsAlpha",
        "IsLower",
        "IsN",
        "IsSc",
        "IsSo",
        "IsUpper",
        "Line_Separator",
        "Number",
        "Open_Punctuation",
        "Punctuation",
        "Separator",
        "Symbol",
    ]

    def chars(self, category=None, fileids=None):
        """
        This module returns a list of characters from  the Perl Unicode Properties.
        They are very useful when porting Perl tokenizers to Python.

        >>> from nltk.corpus import perluniprops as pup
        >>> pup.chars('Open_Punctuation')[:5] == [u'(', u'[', u'{', u'\u0f3a', u'\u0f3c']
        True
        >>> pup.chars('Currency_Symbol')[:5] == [u'$', u'\xa2', u'\xa3', u'\xa4', u'\xa5']
        True
        >>> pup.available_categories
        ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc', 'IsSo', 'IsUpper', 'Line_Separator', 'Number', 'Open_Punctuation', 'Punctuation', 'Separator', 'Symbol']

        :return: a list of characters given the specific unicode character category
        """
        # A known category selects its own single file; otherwise the
        # caller-supplied fileids (possibly None = all files) are used.
        if category in self.available_categories:
            fileids = [category + ".txt"]
        # Each file is one run of characters; strip surrounding
        # whitespace and explode into single-character strings.
        return list(self.raw(fileids).strip())
class MWAPPDBCorpusReader(WordListCorpusReader):
    """
    Reader for the subset of lexical pairs of The Paraphrase Database
    (PPDB) XXXL used in the Monolingual Word Alignment (MWA) algorithm
    described in Sultan et al. (2014a, 2014b, 2015):

    - http://acl2014.org/acl2014/Q14/pdf/Q14-1017
    - https://www.aclweb.org/anthology/S14-2039
    - https://www.aclweb.org/anthology/S15-2027

    The original source of the full PPDB corpus can be found on
    https://www.cis.upenn.edu/~ccb/ppdb/

    :return: a list of tuples of similar lexical terms.
    """

    # Default corpus file holding one tab-separated synonym pair per line.
    mwa_ppdb_xxxl_file = "ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs"

    def entries(self, fileids=mwa_ppdb_xxxl_file):
        """
        :return: a tuple of synonym word pairs.
        """
        lines = line_tokenize(self.raw(fileids))
        return [tuple(line.split("\t")) for line in lines]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,397 @@
# Natural Language Toolkit: XML Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for corpora whose documents are xml files.
(note -- not named 'xml' to avoid conflicting w/ standard xml package)
"""
import codecs
from xml.etree import ElementTree
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import *
from nltk.data import SeekableUnicodeStreamReader
from nltk.internals import ElementWrapper
from nltk.tokenize import WordPunctTokenizer
class XMLCorpusReader(CorpusReader):
    """
    Corpus reader for corpora whose documents are xml files.

    Note that the ``XMLCorpusReader`` constructor does not take an
    ``encoding`` argument, because the unicode encoding is specified by
    the XML files themselves.  See the XML specs for more info.
    """

    def __init__(self, root, fileids, wrap_etree=False):
        # If true, elements returned by xml() are wrapped in ElementWrapper.
        self._wrap_etree = wrap_etree
        CorpusReader.__init__(self, root, fileids)

    def xml(self, fileid=None):
        """Return the parsed XML document for the given fileid.

        :raises TypeError: if zero or more than one fileid is implied --
            XML documents cannot be concatenated.
        """
        # Make sure we have exactly one file -- no concatenating XML.
        if fileid is None and len(self._fileids) == 1:
            fileid = self._fileids[0]
        if not isinstance(fileid, str):
            raise TypeError("Expected a single file identifier string")
        # Read the XML in using ElementTree.
        with self.abspath(fileid).open() as fp:
            elt = ElementTree.parse(fp).getroot()
        # If requested, wrap it.
        if self._wrap_etree:
            elt = ElementWrapper(elt)
        # Return the ElementTree element.
        return elt

    def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- ie, tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        """
        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        word_tokenizer = WordPunctTokenizer()
        # ``Element.getiterator`` was removed in Python 3.9; fall back to
        # ``Element.iter`` on modern versions.  Catch only AttributeError
        # so that genuine errors are no longer silently swallowed (the
        # previous code used a bare ``except:``).
        try:
            iterator = elt.getiterator()
        except AttributeError:
            iterator = elt.iter()
        out = []
        for node in iterator:
            text = node.text
            if text is not None:
                # Decode byte strings using the file's declared encoding.
                if isinstance(text, bytes):
                    text = text.decode(encoding)
                toks = word_tokenizer.tokenize(text)
                out.extend(toks)
        return out
class XMLCorpusView(StreamBackedCorpusView):
    """
    A corpus view that selects out specified elements from an XML
    file, and provides a flat list-like interface for accessing them.
    (Note: ``XMLCorpusView`` is not used by ``XMLCorpusReader`` itself,
    but may be used by subclasses of ``XMLCorpusReader``.)

    Every XML corpus view has a "tag specification", indicating what
    XML elements should be included in the view; and each (non-nested)
    element that matches this specification corresponds to one item in
    the view.  Tag specifications are regular expressions over tag
    paths, where a tag path is a list of element tag names, separated
    by '/', indicating the ancestry of the element.  Some examples:

    - ``'foo'``: A top-level element whose tag is ``foo``.
    - ``'foo/bar'``: An element whose tag is ``bar`` and whose parent
      is a top-level element whose tag is ``foo``.
    - ``'.*/foo'``: An element whose tag is ``foo``, appearing anywhere
      in the xml tree.
    - ``'.*/(foo|bar)'``: An element whose tag is ``foo`` or ``bar``,
      appearing anywhere in the xml tree.

    The view items are generated from the selected XML elements via
    the method ``handle_elt()``.  By default, this method returns the
    element as-is (i.e., as an ElementTree object); but it can be
    overridden, either via subclassing or via the ``elt_handler``
    constructor parameter.
    """

    #: If true, then display debugging output to stdout when reading
    #: blocks.
    _DEBUG = False

    #: The number of characters read at a time by this corpus reader.
    _BLOCK_SIZE = 1024

    def __init__(self, fileid, tagspec, elt_handler=None):
        """
        Create a new corpus view based on a specified XML file.

        Note that the ``XMLCorpusView`` constructor does not take an
        ``encoding`` argument, because the unicode encoding is
        specified by the XML files themselves.

        :type tagspec: str
        :param tagspec: A tag specification, indicating what XML
            elements should be included in the view.  Each non-nested
            element that matches this specification corresponds to one
            item in the view.

        :param elt_handler: A function used to transform each element
            to a value for the view.  If no handler is specified, then
            ``self.handle_elt()`` is called, which returns the element
            as an ElementTree object.  The signature of elt_handler is::

                elt_handler(elt, tagspec) -> value
        """
        # Allow per-instance override of handle_elt without subclassing.
        if elt_handler:
            self.handle_elt = elt_handler

        # Anchor the tag spec so it must match the *entire* tag path.
        self._tagspec = re.compile(tagspec + r"\Z")
        """The tag specification for this corpus view."""

        self._tag_context = {0: ()}
        """A dictionary mapping from file positions (as returned by
        ``stream.seek()`` to XML contexts. An XML context is a
        tuple of XML tag names, indicating which tags have not yet
        been closed."""

        # The stream encoding is sniffed from the file itself (BOM or
        # XML declaration), not passed in by the caller.
        encoding = self._detect_encoding(fileid)
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)

    def _detect_encoding(self, fileid):
        # Sniff the first line of the file for a byte-order mark or an
        # XML encoding declaration.
        if isinstance(fileid, PathPointer):
            try:
                infile = fileid.open()
                s = infile.readline()
            finally:
                infile.close()
        else:
            with open(fileid, "rb") as infile:
                s = infile.readline()
        # BOM checks must come first: a BOM'd file may also carry an
        # (ASCII-incompatible) declaration.
        if s.startswith(codecs.BOM_UTF16_BE):
            return "utf-16-be"
        if s.startswith(codecs.BOM_UTF16_LE):
            return "utf-16-le"
        if s.startswith(codecs.BOM_UTF32_BE):
            return "utf-32-be"
        if s.startswith(codecs.BOM_UTF32_LE):
            return "utf-32-le"
        if s.startswith(codecs.BOM_UTF8):
            return "utf-8"
        # encoding="..." or encoding='...' in the <?xml ...?> declaration.
        m = re.match(rb'\s*<\?xml\b.*\bencoding="([^"]+)"', s)
        if m:
            return m.group(1).decode()
        m = re.match(rb"\s*<\?xml\b.*\bencoding='([^']+)'", s)
        if m:
            return m.group(1).decode()
        # No encoding found -- what should the default be?
        return "utf-8"

    def handle_elt(self, elt, context):
        """
        Convert an element into an appropriate value for inclusion in
        the view.  Unless overridden by a subclass or by the
        ``elt_handler`` constructor argument, this method simply
        returns ``elt``.

        :return: The view value corresponding to ``elt``.

        :type elt: ElementTree
        :param elt: The element that should be converted.

        :type context: str
        :param context: A string composed of element tags separated by
            forward slashes, indicating the XML context of the given
            element.  For example, the string ``'foo/bar/baz'``
            indicates that the element is a ``baz`` element whose
            parent is a ``bar`` element and whose grandparent is a
            top-level ``foo`` element.
        """
        return elt

    #: A regular expression that matches XML fragments that do not
    #: contain any un-closed tags.
    #: NOTE(review): the ``[CDATA[`` brackets below are unescaped, so
    #: they act as regex character classes rather than literal text;
    #: this matches the long-standing upstream pattern -- confirm
    #: before "fixing".
    _VALID_XML_RE = re.compile(
        r"""
        [^<]*
        (
          ((<!--.*?-->)                         |  # comment
           (<![CDATA[.*?]])                     |  # raw character data
           (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) |  # doctype decl
           (<[^!>][^>]*>))                         # tag or PI
          [^<]*)*
        \Z""",
        re.DOTALL | re.VERBOSE,
    )

    #: A regular expression used to extract the tag name from a start tag,
    #: end tag, or empty-elt tag string.
    _XML_TAG_NAME = re.compile(r"<\s*(?:/\s*)?([^\s>]+)")

    #: A regular expression used to find all start-tags, end-tags, and
    #: empty-elt tags in an XML file.  This regexp is more lenient than
    #: the XML spec -- e.g., it allows spaces in some places where the
    #: spec does not.
    _XML_PIECE = re.compile(
        r"""
        # Include these so we can skip them:
        (?P<COMMENT>        <!--.*?-->                          )|
        (?P<CDATA>          <![CDATA[.*?]]>                     )|
        (?P<PI>             <\?.*?\?>                           )|
        (?P<DOCTYPE>        <!DOCTYPE\s+[^\[^>]*(\[[^\]]*])?\s*>)|
        # These are the ones we actually care about:
        (?P<EMPTY_ELT_TAG>  <\s*[^>/\?!\s][^>]*/\s*>            )|
        (?P<START_TAG>      <\s*[^>/\?!\s][^>]*>                )|
        (?P<END_TAG>        <\s*/[^>/\?!\s][^>]*>               )""",
        re.DOTALL | re.VERBOSE,
    )

    def _read_xml_fragment(self, stream):
        """
        Read a string from the given stream that does not contain any
        un-closed tags.  In particular, this function first reads a
        block from the stream of size ``self._BLOCK_SIZE``.  It then
        checks if that block contains an un-closed tag.  If it does,
        then this function either backtracks to the last '<', or reads
        another block.
        """
        fragment = ""

        # Remember the start position so we can seek back character-wise
        # (SeekableUnicodeStreamReader positions are byte-based).
        if isinstance(stream, SeekableUnicodeStreamReader):
            startpos = stream.tell()
        while True:
            # Read a block and add it to the fragment.
            xml_block = stream.read(self._BLOCK_SIZE)
            fragment += xml_block

            # Do we have a well-formed xml fragment?
            if self._VALID_XML_RE.match(fragment):
                return fragment

            # Do we have a fragment that will never be well-formed?
            # (A '>' appearing before any '<' cannot be valid here.)
            if re.search("[<>]", fragment).group(0) == ">":
                pos = stream.tell() - (
                    len(fragment) - re.search("[<>]", fragment).end()
                )
                raise ValueError('Unexpected ">" near char %s' % pos)

            # End of file?
            if not xml_block:
                raise ValueError("Unexpected end of file: tag not closed")

            # If not, then we must be in the middle of a <..tag..>.
            # If appropriate, backtrack to the most recent '<'
            # character.
            last_open_bracket = fragment.rfind("<")
            if last_open_bracket > 0:
                if self._VALID_XML_RE.match(fragment[:last_open_bracket]):
                    if isinstance(stream, SeekableUnicodeStreamReader):
                        stream.seek(startpos)
                        stream.char_seek_forward(last_open_bracket)
                    else:
                        stream.seek(-(len(fragment) - last_open_bracket), 1)
                    return fragment[:last_open_bracket]

            # Otherwise, read another block. (i.e., return to the
            # top of the loop.)

    def read_block(self, stream, tagspec=None, elt_handler=None):
        """
        Read from ``stream`` until we find at least one element that
        matches ``tagspec``, and return the result of applying
        ``elt_handler`` to each element found.
        """
        if tagspec is None:
            tagspec = self._tagspec
        if elt_handler is None:
            elt_handler = self.handle_elt

        # Use a stack of strings to keep track of our context:
        context = list(self._tag_context.get(stream.tell()))
        assert context is not None  # check this -- could it ever happen?

        elts = []

        elt_start = None  # where does the elt start
        elt_depth = None  # what context depth
        elt_text = ""

        # Keep reading fragments until at least one complete element has
        # been collected (and any partially-read element is finished).
        while elts == [] or elt_start is not None:
            if isinstance(stream, SeekableUnicodeStreamReader):
                startpos = stream.tell()
            xml_fragment = self._read_xml_fragment(stream)

            # End of file.
            if not xml_fragment:
                if elt_start is None:
                    break
                else:
                    raise ValueError("Unexpected end of file")

            # Process each <tag> in the xml fragment.
            for piece in self._XML_PIECE.finditer(xml_fragment):
                if self._DEBUG:
                    print("{:>25} {}".format("/".join(context)[-20:], piece.group()))

                if piece.group("START_TAG"):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    # Keep context up-to-date.
                    context.append(name)
                    # Is this one of the elts we're looking for?
                    if elt_start is None:
                        if re.match(tagspec, "/".join(context)):
                            elt_start = piece.start()
                            elt_depth = len(context)

                elif piece.group("END_TAG"):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    # sanity checks:
                    if not context:
                        raise ValueError("Unmatched tag </%s>" % name)
                    if name != context[-1]:
                        raise ValueError(f"Unmatched tag <{context[-1]}>...</{name}>")
                    # Is this the end of an element?
                    if elt_start is not None and elt_depth == len(context):
                        elt_text += xml_fragment[elt_start : piece.end()]
                        elts.append((elt_text, "/".join(context)))
                        elt_start = elt_depth = None
                        elt_text = ""
                    # Keep context up-to-date
                    context.pop()

                elif piece.group("EMPTY_ELT_TAG"):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    # An empty element matches (and completes) in one step.
                    if elt_start is None:
                        if re.match(tagspec, "/".join(context) + "/" + name):
                            elts.append((piece.group(), "/".join(context) + "/" + name))

            if elt_start is not None:
                # If we haven't found any elements yet, then keep
                # looping until we do.
                if elts == []:
                    elt_text += xml_fragment[elt_start:]
                    elt_start = 0

                # If we've found at least one element, then try
                # backtracking to the start of the element that we're
                # inside of.
                else:
                    # take back the last start-tag, and return what
                    # we've gotten so far (elts is non-empty).
                    if self._DEBUG:
                        print(" " * 36 + "(backtrack)")
                    if isinstance(stream, SeekableUnicodeStreamReader):
                        stream.seek(startpos)
                        stream.char_seek_forward(elt_start)
                    else:
                        stream.seek(-(len(xml_fragment) - elt_start), 1)
                    context = context[: elt_depth - 1]
                    elt_start = elt_depth = None
                    elt_text = ""

        # Update the _tag_context dict.
        pos = stream.tell()
        if pos in self._tag_context:
            assert tuple(context) == self._tag_context[pos]
        else:
            self._tag_context[pos] = tuple(context)

        # Non-ASCII characters are escaped as character references so the
        # collected text can be parsed as bytes by ElementTree.
        return [
            elt_handler(
                ElementTree.fromstring(elt.encode("ascii", "xmlcharrefreplace")),
                context,
            )
            for (elt, context) in elts
        ]

View File

@@ -0,0 +1,256 @@
# Natural Language Toolkit: York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE)
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Selina Dennis <selina@tranzfusion.net>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
English Prose (YCOE), a 1.5 million word syntactically-annotated
corpus of Old English prose texts. The corpus is distributed by the
Oxford Text Archive: http://www.ota.ahds.ac.uk/ It is not included
with NLTK.
The YCOE corpus is divided into 100 files, each representing
an Old English prose text. Tags used within each text complies
to the YCOE standard: https://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm
"""
import os
import re
from nltk.corpus.reader.api import *
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
from nltk.corpus.reader.tagged import TaggedCorpusReader
from nltk.corpus.reader.util import *
from nltk.tokenize import RegexpTokenizer
class YCOECorpusReader(CorpusReader):
    """
    Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
    English Prose (YCOE), a 1.5 million word syntactically-annotated
    corpus of Old English prose texts.

    Each document exists in two forms: a parsed version under ``psd/``
    and a POS-tagged version under ``pos/``.  This reader keeps the two
    in sync and delegates to a specialized sub-reader for each.
    """

    def __init__(self, root, encoding="utf8"):
        CorpusReader.__init__(self, root, [], encoding)

        # NOTE(review): the third positional arguments (".psd" / ".pos")
        # fall onto later parameters of the sub-reader constructors;
        # this mirrors the long-standing upstream call and is left as-is.
        self._psd_reader = YCOEParseCorpusReader(
            self.root.join("psd"), ".*", ".psd", encoding=encoding
        )
        self._pos_reader = YCOETaggedCorpusReader(self.root.join("pos"), ".*", ".pos")

        # Make sure we have a consistent set of items:
        documents = {f[:-4] for f in self._psd_reader.fileids()}
        if {f[:-4] for f in self._pos_reader.fileids()} != documents:
            raise ValueError('Items in "psd" and "pos" ' "subdirectories do not match.")

        fileids = sorted(
            ["%s.psd" % doc for doc in documents]
            + ["%s.pos" % doc for doc in documents]
        )
        # Re-initialize with the complete fileid list now that it is known.
        CorpusReader.__init__(self, root, fileids, encoding)
        self._documents = sorted(documents)

    def documents(self, fileids=None):
        """
        Return a list of document identifiers for all documents in
        this corpus, or for the documents with the given file(s) if
        specified.

        :raises KeyError: if a given fileid is not part of this corpus.
        """
        if fileids is None:
            return self._documents
        if isinstance(fileids, str):
            fileids = [fileids]
        for f in fileids:
            if f not in self._fileids:
                # Bug fix: report the offending fileid ``f``, not the
                # whole ``fileids`` list.
                raise KeyError("File id %s not found" % f)
        # Strip off the '.pos' and '.psd' extensions.
        return sorted({f[:-4] for f in fileids})

    def fileids(self, documents=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that store the given document(s) if specified.
        """
        if documents is None:
            return self._fileids
        elif isinstance(documents, str):
            documents = [documents]
        # Each document contributes both its .pos and .psd file.
        return sorted(
            set(
                ["%s.pos" % doc for doc in documents]
                + ["%s.psd" % doc for doc in documents]
            )
        )

    def _getfileids(self, documents, subcorpus):
        """
        Helper that selects the appropriate fileids for a given set of
        documents from a given subcorpus (pos or psd).
        """
        if documents is None:
            documents = self._documents
        else:
            if isinstance(documents, str):
                documents = [documents]
            for document in documents:
                if document not in self._documents:
                    # Give a more helpful message when a fileid was
                    # passed where a document id was expected.
                    if document[-4:] in (".pos", ".psd"):
                        raise ValueError(
                            "Expected a document identifier, not a file "
                            "identifier. (Use corpus.documents() to get "
                            "a list of document identifiers."
                        )
                    else:
                        raise ValueError("Document identifier %s not found" % document)
        return [f"{d}.{subcorpus}" for d in documents]

    # Delegate to one of our two sub-readers:
    def words(self, documents=None):
        return self._pos_reader.words(self._getfileids(documents, "pos"))

    def sents(self, documents=None):
        return self._pos_reader.sents(self._getfileids(documents, "pos"))

    def paras(self, documents=None):
        return self._pos_reader.paras(self._getfileids(documents, "pos"))

    def tagged_words(self, documents=None):
        return self._pos_reader.tagged_words(self._getfileids(documents, "pos"))

    def tagged_sents(self, documents=None):
        return self._pos_reader.tagged_sents(self._getfileids(documents, "pos"))

    def tagged_paras(self, documents=None):
        return self._pos_reader.tagged_paras(self._getfileids(documents, "pos"))

    def parsed_sents(self, documents=None):
        return self._psd_reader.parsed_sents(self._getfileids(documents, "psd"))
class YCOEParseCorpusReader(BracketParseCorpusReader):
    """Specialized version of the standard bracket parse corpus reader
    that strips out (CODE ...) and (ID ...) nodes."""

    def _parse(self, t):
        # Remove (CODE ...) and (ID ...) annotation nodes before parsing.
        cleaned = re.sub(r"(?u)\((CODE|ID)[^\)]*\)", "", t)
        # A tree that is empty after stripping yields no parse.
        if re.match(r"\s*\(\s*\)\s*$", cleaned):
            return None
        return BracketParseCorpusReader._parse(self, cleaned)
class YCOETaggedCorpusReader(TaggedCorpusReader):
    # Tagged reader for the YCOE 'pos' files: words and tags are joined
    # with '_'; the sentence tokenizer splits after '/.' tokens and its
    # gap pattern also removes *_CODE and *_ID tokens.
    def __init__(self, root, items, encoding="utf8"):
        # NOTE(review): ``encoding`` is accepted but never forwarded to
        # TaggedCorpusReader.  Forwarding it looks tempting, but
        # YCOECorpusReader passes ".pos" as this positional argument, so
        # forwarding would install a bogus encoding -- confirm the
        # intended signature upstream before changing this.
        gaps_re = r"(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*"
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        TaggedCorpusReader.__init__(
            self, root, items, sep="_", sent_tokenizer=sent_tokenizer
        )
#: A dict mapping each YCOE document identifier to its human-readable
#: title.  (Identifiers match the corpus file basenames, without the
#: .pos/.psd extension.)
documents = {
    "coadrian.o34": "Adrian and Ritheus",
    "coaelhom.o3": "Ælfric, Supplemental Homilies",
    "coaelive.o3": "Ælfric's Lives of Saints",
    "coalcuin": "Alcuin De virtutibus et vitiis",
    "coalex.o23": "Alexander's Letter to Aristotle",
    "coapollo.o3": "Apollonius of Tyre",
    "coaugust": "Augustine",
    "cobede.o2": "Bede's History of the English Church",
    "cobenrul.o3": "Benedictine Rule",
    "coblick.o23": "Blickling Homilies",
    "coboeth.o2": "Boethius' Consolation of Philosophy",
    "cobyrhtf.o3": "Byrhtferth's Manual",
    "cocanedgD": "Canons of Edgar (D)",
    "cocanedgX": "Canons of Edgar (X)",
    "cocathom1.o3": "Ælfric's Catholic Homilies I",
    "cocathom2.o3": "Ælfric's Catholic Homilies II",
    "cochad.o24": "Saint Chad",
    "cochdrul": "Chrodegang of Metz, Rule",
    "cochristoph": "Saint Christopher",
    "cochronA.o23": "Anglo-Saxon Chronicle A",
    "cochronC": "Anglo-Saxon Chronicle C",
    "cochronD": "Anglo-Saxon Chronicle D",
    "cochronE.o34": "Anglo-Saxon Chronicle E",
    "cocura.o2": "Cura Pastoralis",
    "cocuraC": "Cura Pastoralis (Cotton)",
    "codicts.o34": "Dicts of Cato",
    "codocu1.o1": "Documents 1 (O1)",
    "codocu2.o12": "Documents 2 (O1/O2)",
    "codocu2.o2": "Documents 2 (O2)",
    "codocu3.o23": "Documents 3 (O2/O3)",
    "codocu3.o3": "Documents 3 (O3)",
    "codocu4.o24": "Documents 4 (O2/O4)",
    "coeluc1": "Honorius of Autun, Elucidarium 1",
    "coeluc2": "Honorius of Autun, Elucidarium 1",
    "coepigen.o3": "Ælfric's Epilogue to Genesis",
    "coeuphr": "Saint Euphrosyne",
    "coeust": "Saint Eustace and his companions",
    "coexodusP": "Exodus (P)",
    "cogenesiC": "Genesis (C)",
    "cogregdC.o24": "Gregory's Dialogues (C)",
    "cogregdH.o23": "Gregory's Dialogues (H)",
    "coherbar": "Pseudo-Apuleius, Herbarium",
    "coinspolD.o34": "Wulfstan's Institute of Polity (D)",
    "coinspolX": "Wulfstan's Institute of Polity (X)",
    "cojames": "Saint James",
    "colacnu.o23": "Lacnunga",
    "colaece.o2": "Leechdoms",
    "colaw1cn.o3": "Laws, Cnut I",
    "colaw2cn.o3": "Laws, Cnut II",
    "colaw5atr.o3": "Laws, Æthelred V",
    "colaw6atr.o3": "Laws, Æthelred VI",
    "colawaf.o2": "Laws, Alfred",
    "colawafint.o2": "Alfred's Introduction to Laws",
    "colawger.o34": "Laws, Gerefa",
    "colawine.ox2": "Laws, Ine",
    "colawnorthu.o3": "Northumbra Preosta Lagu",
    "colawwllad.o4": "Laws, William I, Lad",
    "coleofri.o4": "Leofric",
    "colsigef.o3": "Ælfric's Letter to Sigefyrth",
    "colsigewB": "Ælfric's Letter to Sigeweard (B)",
    "colsigewZ.o34": "Ælfric's Letter to Sigeweard (Z)",
    "colwgeat": "Ælfric's Letter to Wulfgeat",
    "colwsigeT": "Ælfric's Letter to Wulfsige (T)",
    "colwsigeXa.o34": "Ælfric's Letter to Wulfsige (Xa)",
    "colwstan1.o3": "Ælfric's Letter to Wulfstan I",
    "colwstan2.o3": "Ælfric's Letter to Wulfstan II",
    "comargaC.o34": "Saint Margaret (C)",
    "comargaT": "Saint Margaret (T)",
    "comart1": "Martyrology, I",
    "comart2": "Martyrology, II",
    "comart3.o23": "Martyrology, III",
    "comarvel.o23": "Marvels of the East",
    "comary": "Mary of Egypt",
    "coneot": "Saint Neot",
    "conicodA": "Gospel of Nicodemus (A)",
    "conicodC": "Gospel of Nicodemus (C)",
    "conicodD": "Gospel of Nicodemus (D)",
    "conicodE": "Gospel of Nicodemus (E)",
    "coorosiu.o2": "Orosius",
    "cootest.o3": "Heptateuch",
    "coprefcath1.o3": "Ælfric's Preface to Catholic Homilies I",
    "coprefcath2.o3": "Ælfric's Preface to Catholic Homilies II",
    "coprefcura.o2": "Preface to the Cura Pastoralis",
    "coprefgen.o3": "Ælfric's Preface to Genesis",
    "copreflives.o3": "Ælfric's Preface to Lives of Saints",
    "coprefsolilo": "Preface to Augustine's Soliloquies",
    "coquadru.o23": "Pseudo-Apuleius, Medicina de quadrupedibus",
    "corood": "History of the Holy Rood-Tree",
    "cosevensl": "Seven Sleepers",
    "cosolilo": "St. Augustine's Soliloquies",
    "cosolsat1.o4": "Solomon and Saturn I",
    "cosolsat2": "Solomon and Saturn II",
    "cotempo.o3": "Ælfric's De Temporibus Anni",
    "coverhom": "Vercelli Homilies",
    "coverhomE": "Vercelli Homilies (E)",
    "coverhomL": "Vercelli Homilies (L)",
    "covinceB": "Saint Vincent (Bodley 343)",
    "covinsal": "Vindicta Salvatoris",
    "cowsgosp.o3": "West-Saxon Gospels",
    "cowulf.o34": "Wulfstan's Homilies",
}

View File

@@ -0,0 +1,153 @@
# Natural Language Toolkit: Corpus Reader Utility Functions
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
######################################################################
# { Lazy Corpus Loader
######################################################################
import gc
import re
import nltk
TRY_ZIPFILE_FIRST = False
class LazyCorpusLoader:
    """
    To see the API documentation for this lazily loaded corpus, first
    run corpus.ensure_loaded(), and then run help(this_corpus).

    LazyCorpusLoader is a proxy object which is used to stand in for a
    corpus object before the corpus is loaded.  This allows NLTK to
    create an object for each corpus, but defer the costs associated
    with loading those corpora until the first time that they're
    actually accessed.

    The first time this object is accessed in any way, it will load
    the corresponding corpus, and transform itself into that corpus
    (by modifying its own ``__class__`` and ``__dict__`` attributes).

    If the corpus can not be found, then accessing this object will
    raise an exception, displaying installation instructions for the
    NLTK data package.  Once they've properly installed the data
    package (or modified ``nltk.data.path`` to point to its location),
    they can then use the corpus object without restarting python.

    :param name: The name of the corpus
    :type name: str
    :param reader_cls: The specific CorpusReader class, e.g. PlaintextCorpusReader, WordListCorpusReader
    :type reader: nltk.corpus.reader.api.CorpusReader
    :param nltk_data_subdir: The subdirectory where the corpus is stored.
    :type nltk_data_subdir: str
    :param `*args`: Any other non-keywords arguments that `reader_cls` might need.
    :param `**kwargs`: Any other keywords arguments that `reader_cls` might need.
    """

    def __init__(self, name, reader_cls, *args, **kwargs):
        # Imported locally rather than at module level, presumably to
        # avoid a circular import with nltk.corpus.reader -- TODO confirm.
        from nltk.corpus.reader.api import CorpusReader

        assert issubclass(reader_cls, CorpusReader)
        # ``__name__`` is set alongside the mangled attribute so the proxy
        # carries a name for introspection before the real corpus replaces it.
        self.__name = self.__name__ = name
        self.__reader_cls = reader_cls
        # If nltk_data_subdir is set explicitly
        if "nltk_data_subdir" in kwargs:
            # Use the specified subdirectory path
            self.subdir = kwargs["nltk_data_subdir"]
            # Pops the `nltk_data_subdir` argument, we don't need it anymore.
            kwargs.pop("nltk_data_subdir", None)
        else:  # Otherwise use 'nltk_data/corpora'
            self.subdir = "corpora"
        # Constructor arguments are saved so __load() can build the reader
        # lazily, and so _unload() can later rebuild an equivalent loader.
        self.__args = args
        self.__kwargs = kwargs

    def __load(self):
        """Locate the corpus data, construct the reader, and transform this
        proxy into the real corpus object (attaching an ``_unload`` hook)."""
        # Find the corpus root directory.  ``zip_name`` rewrites a corpus
        # path such as "name/sub" into "name.zip/name/sub/" so the same
        # corpus can be located inside its zip archive.
        zip_name = re.sub(r"(([^/]+)(/.*)?)", r"\2.zip/\1/", self.__name)
        if TRY_ZIPFILE_FIRST:
            try:
                root = nltk.data.find(f"{self.subdir}/{zip_name}")
            except LookupError as e:
                try:
                    root = nltk.data.find(f"{self.subdir}/{self.__name}")
                except LookupError:
                    # Re-raise the error from the preferred (zip) lookup so
                    # the user-facing message names the expected location.
                    raise e
        else:
            try:
                root = nltk.data.find(f"{self.subdir}/{self.__name}")
            except LookupError as e:
                try:
                    root = nltk.data.find(f"{self.subdir}/{zip_name}")
                except LookupError:
                    # Re-raise the error from the preferred (directory)
                    # lookup, not the zip fallback.
                    raise e

        # Load the corpus.
        corpus = self.__reader_cls(root, *self.__args, **self.__kwargs)

        # This is where the magic happens!  Transform ourselves into
        # the corpus by modifying our own __dict__ and __class__ to
        # match that of the corpus.

        # Capture the loader's state in locals *before* __dict__ is
        # replaced below -- after that point the name-mangled attributes
        # no longer exist on ``self``.
        args, kwargs = self.__args, self.__kwargs
        name, reader_cls = self.__name, self.__reader_cls

        self.__dict__ = corpus.__dict__
        self.__class__ = corpus.__class__

        # _unload support: assign __dict__ and __class__ back, then do GC.
        # after reassigning __dict__ there shouldn't be any references to
        # corpus data so the memory should be deallocated after gc.collect()
        def _unload(self):
            lazy_reader = LazyCorpusLoader(name, reader_cls, *args, **kwargs)
            self.__dict__ = lazy_reader.__dict__
            self.__class__ = lazy_reader.__class__
            gc.collect()

        self._unload = _make_bound_method(_unload, self)

    def __getattr__(self, attr):
        """Trigger corpus loading on first attribute access, then delegate
        the lookup to the (now loaded) corpus object."""
        # Fix for inspect.isclass under Python 2.6
        # (see https://bugs.python.org/issue1225107).
        # Without this fix tests may take extra 1.5GB RAM
        # because all corpora gets loaded during test collection.
        if attr == "__bases__":
            raise AttributeError("LazyCorpusLoader object has no attribute '__bases__'")

        self.__load()
        # This looks circular, but its not, since __load() changes our
        # __class__ to something new:
        return getattr(self, attr)

    def __repr__(self):
        # Deliberately does NOT touch __getattr__-guarded state beyond the
        # mangled attributes, so printing the loader never loads the corpus.
        return "<{} in {!r} (not loaded yet)>".format(
            self.__reader_cls.__name__,
            ".../corpora/" + self.__name,
        )

    def _unload(self):
        # If an exception occurs during corpus loading then
        # '_unload' method may be unattached, so __getattr__ can be called;
        # we shouldn't trigger corpus loading again in this case.
        pass
def _make_bound_method(func, self):
"""
Magic for creating bound methods (used for _unload).
"""
class Foo:
def meth(self):
pass
f = Foo()
bound_method = type(f.meth)
try:
return bound_method(func, self, self.__class__)
except TypeError: # python3
return bound_method(func, self)