Initial commit
This commit is contained in:
551
backend/venv/Lib/site-packages/nltk/corpus/__init__.py
Normal file
551
backend/venv/Lib/site-packages/nltk/corpus/__init__.py
Normal file
@@ -0,0 +1,551 @@
|
||||
# Natural Language Toolkit: Corpus Readers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
# TODO this docstring isn't up-to-date!
|
||||
"""
|
||||
NLTK corpus readers. The modules in this package provide functions
|
||||
that can be used to read corpus files in a variety of formats. These
|
||||
functions can be used to read both the corpus files that are
|
||||
distributed in the NLTK corpus package, and corpus files that are part
|
||||
of external corpora.
|
||||
|
||||
Available Corpora
|
||||
=================
|
||||
|
||||
Please see https://www.nltk.org/nltk_data/ for a complete list.
|
||||
Install corpora using nltk.download().
|
||||
|
||||
Corpus Reader Functions
|
||||
=======================
|
||||
Each corpus module defines one or more "corpus reader functions",
|
||||
which can be used to read documents from that corpus. These functions
|
||||
take an argument, ``item``, which is used to indicate which document
|
||||
should be read from the corpus:
|
||||
|
||||
- If ``item`` is one of the unique identifiers listed in the corpus
|
||||
module's ``items`` variable, then the corresponding document will
|
||||
be loaded from the NLTK corpus package.
|
||||
- If ``item`` is a filename, then that file will be read.
|
||||
|
||||
Additionally, corpus reader functions can be given lists of item
|
||||
names; in which case, they will return a concatenation of the
|
||||
corresponding documents.
|
||||
|
||||
Corpus reader functions are named based on the type of information
|
||||
they return. Some common examples, and their return types, are:
|
||||
|
||||
- words(): list of str
|
||||
- sents(): list of (list of str)
|
||||
- paras(): list of (list of (list of str))
|
||||
- tagged_words(): list of (str,str) tuple
|
||||
- tagged_sents(): list of (list of (str,str))
|
||||
- tagged_paras(): list of (list of (list of (str,str)))
|
||||
- chunked_sents(): list of (Tree w/ (str,str) leaves)
|
||||
- parsed_sents(): list of (Tree with str leaves)
|
||||
- parsed_paras(): list of (list of (Tree with str leaves))
|
||||
- xml(): A single xml ElementTree
|
||||
- raw(): unprocessed corpus contents
|
||||
|
||||
For example, to read a list of the words in the Brown Corpus, use
|
||||
``nltk.corpus.brown.words()``:
|
||||
|
||||
>>> from nltk.corpus import brown
|
||||
>>> print(", ".join(brown.words())) # doctest: +ELLIPSIS
|
||||
The, Fulton, County, Grand, Jury, said, ...
|
||||
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from nltk.corpus.reader import *
|
||||
from nltk.corpus.util import LazyCorpusLoader
|
||||
from nltk.tokenize import RegexpTokenizer
|
||||
|
||||
abc: PlaintextCorpusReader = LazyCorpusLoader(
|
||||
"abc",
|
||||
PlaintextCorpusReader,
|
||||
r"(?!\.).*\.txt",
|
||||
encoding=[("science", "latin_1"), ("rural", "utf8")],
|
||||
)
|
||||
alpino: AlpinoCorpusReader = LazyCorpusLoader(
|
||||
"alpino", AlpinoCorpusReader, tagset="alpino"
|
||||
)
|
||||
bcp47: BCP47CorpusReader = LazyCorpusLoader(
|
||||
"bcp47", BCP47CorpusReader, r"(cldr|iana)/*"
|
||||
)
|
||||
brown: CategorizedTaggedCorpusReader = LazyCorpusLoader(
|
||||
"brown",
|
||||
CategorizedTaggedCorpusReader,
|
||||
r"c[a-z]\d\d",
|
||||
cat_file="cats.txt",
|
||||
tagset="brown",
|
||||
encoding="ascii",
|
||||
)
|
||||
cess_cat: BracketParseCorpusReader = LazyCorpusLoader(
|
||||
"cess_cat",
|
||||
BracketParseCorpusReader,
|
||||
r"(?!\.).*\.tbf",
|
||||
tagset="unknown",
|
||||
encoding="ISO-8859-15",
|
||||
)
|
||||
cess_esp: BracketParseCorpusReader = LazyCorpusLoader(
|
||||
"cess_esp",
|
||||
BracketParseCorpusReader,
|
||||
r"(?!\.).*\.tbf",
|
||||
tagset="unknown",
|
||||
encoding="ISO-8859-15",
|
||||
)
|
||||
cmudict: CMUDictCorpusReader = LazyCorpusLoader(
|
||||
"cmudict", CMUDictCorpusReader, ["cmudict"]
|
||||
)
|
||||
comtrans: AlignedCorpusReader = LazyCorpusLoader(
|
||||
"comtrans", AlignedCorpusReader, r"(?!\.).*\.txt"
|
||||
)
|
||||
comparative_sentences: ComparativeSentencesCorpusReader = LazyCorpusLoader(
|
||||
"comparative_sentences",
|
||||
ComparativeSentencesCorpusReader,
|
||||
r"labeledSentences\.txt",
|
||||
encoding="latin-1",
|
||||
)
|
||||
conll2000: ConllChunkCorpusReader = LazyCorpusLoader(
|
||||
"conll2000",
|
||||
ConllChunkCorpusReader,
|
||||
["train.txt", "test.txt"],
|
||||
("NP", "VP", "PP"),
|
||||
tagset="wsj",
|
||||
encoding="ascii",
|
||||
)
|
||||
conll2002: ConllChunkCorpusReader = LazyCorpusLoader(
|
||||
"conll2002",
|
||||
ConllChunkCorpusReader,
|
||||
r".*\.(test|train).*",
|
||||
("LOC", "PER", "ORG", "MISC"),
|
||||
encoding="utf-8",
|
||||
)
|
||||
conll2007: DependencyCorpusReader = LazyCorpusLoader(
|
||||
"conll2007",
|
||||
DependencyCorpusReader,
|
||||
r".*\.(test|train).*",
|
||||
encoding=[("eus", "ISO-8859-2"), ("esp", "utf8")],
|
||||
)
|
||||
crubadan: CrubadanCorpusReader = LazyCorpusLoader(
|
||||
"crubadan", CrubadanCorpusReader, r".*\.txt"
|
||||
)
|
||||
dependency_treebank: DependencyCorpusReader = LazyCorpusLoader(
|
||||
"dependency_treebank", DependencyCorpusReader, r".*\.dp", encoding="ascii"
|
||||
)
|
||||
extended_omw: CorpusReader = LazyCorpusLoader(
|
||||
"extended_omw", CorpusReader, r".*/wn-[a-z\-]*\.tab", encoding="utf8"
|
||||
)
|
||||
floresta: BracketParseCorpusReader = LazyCorpusLoader(
|
||||
"floresta",
|
||||
BracketParseCorpusReader,
|
||||
r"(?!\.).*\.ptb",
|
||||
"#",
|
||||
tagset="unknown",
|
||||
encoding="ISO-8859-15",
|
||||
)
|
||||
framenet15: FramenetCorpusReader = LazyCorpusLoader(
|
||||
"framenet_v15",
|
||||
FramenetCorpusReader,
|
||||
[
|
||||
"frRelation.xml",
|
||||
"frameIndex.xml",
|
||||
"fulltextIndex.xml",
|
||||
"luIndex.xml",
|
||||
"semTypes.xml",
|
||||
],
|
||||
)
|
||||
framenet: FramenetCorpusReader = LazyCorpusLoader(
|
||||
"framenet_v17",
|
||||
FramenetCorpusReader,
|
||||
[
|
||||
"frRelation.xml",
|
||||
"frameIndex.xml",
|
||||
"fulltextIndex.xml",
|
||||
"luIndex.xml",
|
||||
"semTypes.xml",
|
||||
],
|
||||
)
|
||||
gazetteers: WordListCorpusReader = LazyCorpusLoader(
|
||||
"gazetteers", WordListCorpusReader, r"(?!LICENSE|\.).*\.txt", encoding="ISO-8859-2"
|
||||
)
|
||||
genesis: PlaintextCorpusReader = LazyCorpusLoader(
|
||||
"genesis",
|
||||
PlaintextCorpusReader,
|
||||
r"(?!\.).*\.txt",
|
||||
encoding=[
|
||||
("finnish|french|german", "latin_1"),
|
||||
("swedish", "cp865"),
|
||||
(".*", "utf_8"),
|
||||
],
|
||||
)
|
||||
gutenberg: PlaintextCorpusReader = LazyCorpusLoader(
|
||||
"gutenberg", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
|
||||
)
|
||||
ieer: IEERCorpusReader = LazyCorpusLoader("ieer", IEERCorpusReader, r"(?!README|\.).*")
|
||||
inaugural: PlaintextCorpusReader = LazyCorpusLoader(
|
||||
"inaugural", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
|
||||
)
|
||||
# [XX] This should probably just use TaggedCorpusReader:
|
||||
indian: IndianCorpusReader = LazyCorpusLoader(
|
||||
"indian", IndianCorpusReader, r"(?!\.).*\.pos", tagset="unknown", encoding="utf8"
|
||||
)
|
||||
|
||||
jeita: ChasenCorpusReader = LazyCorpusLoader(
|
||||
"jeita", ChasenCorpusReader, r".*\.chasen", encoding="utf-8"
|
||||
)
|
||||
knbc: KNBCorpusReader = LazyCorpusLoader(
|
||||
"knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
|
||||
)
|
||||
lin_thesaurus: LinThesaurusCorpusReader = LazyCorpusLoader(
|
||||
"lin_thesaurus", LinThesaurusCorpusReader, r".*\.lsp"
|
||||
)
|
||||
mac_morpho: MacMorphoCorpusReader = LazyCorpusLoader(
|
||||
"mac_morpho",
|
||||
MacMorphoCorpusReader,
|
||||
r"(?!\.).*\.txt",
|
||||
tagset="unknown",
|
||||
encoding="latin-1",
|
||||
)
|
||||
machado: PortugueseCategorizedPlaintextCorpusReader = LazyCorpusLoader(
|
||||
"machado",
|
||||
PortugueseCategorizedPlaintextCorpusReader,
|
||||
r"(?!\.).*\.txt",
|
||||
cat_pattern=r"([a-z]*)/.*",
|
||||
encoding="latin-1",
|
||||
)
|
||||
masc_tagged: CategorizedTaggedCorpusReader = LazyCorpusLoader(
|
||||
"masc_tagged",
|
||||
CategorizedTaggedCorpusReader,
|
||||
r"(spoken|written)/.*\.txt",
|
||||
cat_file="categories.txt",
|
||||
tagset="wsj",
|
||||
encoding="utf-8",
|
||||
sep="_",
|
||||
)
|
||||
movie_reviews: CategorizedPlaintextCorpusReader = LazyCorpusLoader(
|
||||
"movie_reviews",
|
||||
CategorizedPlaintextCorpusReader,
|
||||
r"(?!\.).*\.txt",
|
||||
cat_pattern=r"(neg|pos)/.*",
|
||||
encoding="ascii",
|
||||
)
|
||||
multext_east: MTECorpusReader = LazyCorpusLoader(
|
||||
"mte_teip5", MTECorpusReader, r"(oana).*\.xml", encoding="utf-8"
|
||||
)
|
||||
names: WordListCorpusReader = LazyCorpusLoader(
|
||||
"names", WordListCorpusReader, r"(?!\.).*\.txt", encoding="ascii"
|
||||
)
|
||||
nps_chat: NPSChatCorpusReader = LazyCorpusLoader(
|
||||
"nps_chat", NPSChatCorpusReader, r"(?!README|\.).*\.xml", tagset="wsj"
|
||||
)
|
||||
opinion_lexicon: OpinionLexiconCorpusReader = LazyCorpusLoader(
|
||||
"opinion_lexicon",
|
||||
OpinionLexiconCorpusReader,
|
||||
r"(\w+)\-words\.txt",
|
||||
encoding="ISO-8859-2",
|
||||
)
|
||||
ppattach: PPAttachmentCorpusReader = LazyCorpusLoader(
|
||||
"ppattach", PPAttachmentCorpusReader, ["training", "test", "devset"]
|
||||
)
|
||||
product_reviews_1: ReviewsCorpusReader = LazyCorpusLoader(
|
||||
"product_reviews_1", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
|
||||
)
|
||||
product_reviews_2: ReviewsCorpusReader = LazyCorpusLoader(
|
||||
"product_reviews_2", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
|
||||
)
|
||||
pros_cons: ProsConsCorpusReader = LazyCorpusLoader(
|
||||
"pros_cons",
|
||||
ProsConsCorpusReader,
|
||||
r"Integrated(Cons|Pros)\.txt",
|
||||
cat_pattern=r"Integrated(Cons|Pros)\.txt",
|
||||
encoding="ISO-8859-2",
|
||||
)
|
||||
ptb: CategorizedBracketParseCorpusReader = (
|
||||
LazyCorpusLoader( # Penn Treebank v3: WSJ and Brown portions
|
||||
"ptb",
|
||||
CategorizedBracketParseCorpusReader,
|
||||
r"(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG",
|
||||
cat_file="allcats.txt",
|
||||
tagset="wsj",
|
||||
)
|
||||
)
|
||||
qc: StringCategoryCorpusReader = LazyCorpusLoader(
|
||||
"qc", StringCategoryCorpusReader, ["train.txt", "test.txt"], encoding="ISO-8859-2"
|
||||
)
|
||||
reuters: CategorizedPlaintextCorpusReader = LazyCorpusLoader(
|
||||
"reuters",
|
||||
CategorizedPlaintextCorpusReader,
|
||||
"(training|test).*",
|
||||
cat_file="cats.txt",
|
||||
encoding="ISO-8859-2",
|
||||
)
|
||||
rte: RTECorpusReader = LazyCorpusLoader("rte", RTECorpusReader, r"(?!\.).*\.xml")
|
||||
senseval: SensevalCorpusReader = LazyCorpusLoader(
|
||||
"senseval", SensevalCorpusReader, r"(?!\.).*\.pos"
|
||||
)
|
||||
sentence_polarity: CategorizedSentencesCorpusReader = LazyCorpusLoader(
|
||||
"sentence_polarity",
|
||||
CategorizedSentencesCorpusReader,
|
||||
r"rt-polarity\.(neg|pos)",
|
||||
cat_pattern=r"rt-polarity\.(neg|pos)",
|
||||
encoding="utf-8",
|
||||
)
|
||||
sentiwordnet: SentiWordNetCorpusReader = LazyCorpusLoader(
|
||||
"sentiwordnet", SentiWordNetCorpusReader, "SentiWordNet_3.0.0.txt", encoding="utf-8"
|
||||
)
|
||||
shakespeare: XMLCorpusReader = LazyCorpusLoader(
|
||||
"shakespeare", XMLCorpusReader, r"(?!\.).*\.xml"
|
||||
)
|
||||
sinica_treebank: SinicaTreebankCorpusReader = LazyCorpusLoader(
|
||||
"sinica_treebank",
|
||||
SinicaTreebankCorpusReader,
|
||||
["parsed"],
|
||||
tagset="unknown",
|
||||
encoding="utf-8",
|
||||
)
|
||||
state_union: PlaintextCorpusReader = LazyCorpusLoader(
|
||||
"state_union", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="ISO-8859-2"
|
||||
)
|
||||
stopwords: WordListCorpusReader = LazyCorpusLoader(
|
||||
"stopwords", WordListCorpusReader, r"(?!README|\.).*", encoding="utf8"
|
||||
)
|
||||
subjectivity: CategorizedSentencesCorpusReader = LazyCorpusLoader(
|
||||
"subjectivity",
|
||||
CategorizedSentencesCorpusReader,
|
||||
r"(quote.tok.gt9|plot.tok.gt9)\.5000",
|
||||
cat_map={"quote.tok.gt9.5000": ["subj"], "plot.tok.gt9.5000": ["obj"]},
|
||||
encoding="latin-1",
|
||||
)
|
||||
swadesh: SwadeshCorpusReader = LazyCorpusLoader(
|
||||
"swadesh", SwadeshCorpusReader, r"(?!README|\.).*", encoding="utf8"
|
||||
)
|
||||
swadesh110: PanlexSwadeshCorpusReader = LazyCorpusLoader(
|
||||
"panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh110/.*\.txt", encoding="utf8"
|
||||
)
|
||||
swadesh207: PanlexSwadeshCorpusReader = LazyCorpusLoader(
|
||||
"panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh207/.*\.txt", encoding="utf8"
|
||||
)
|
||||
switchboard: SwitchboardCorpusReader = LazyCorpusLoader(
|
||||
"switchboard", SwitchboardCorpusReader, tagset="wsj"
|
||||
)
|
||||
timit: TimitCorpusReader = LazyCorpusLoader("timit", TimitCorpusReader)
|
||||
timit_tagged: TimitTaggedCorpusReader = LazyCorpusLoader(
|
||||
"timit", TimitTaggedCorpusReader, r".+\.tags", tagset="wsj", encoding="ascii"
|
||||
)
|
||||
toolbox: ToolboxCorpusReader = LazyCorpusLoader(
|
||||
"toolbox", ToolboxCorpusReader, r"(?!.*(README|\.)).*\.(dic|txt)"
|
||||
)
|
||||
treebank: BracketParseCorpusReader = LazyCorpusLoader(
|
||||
"treebank/combined",
|
||||
BracketParseCorpusReader,
|
||||
r"wsj_.*\.mrg",
|
||||
tagset="wsj",
|
||||
encoding="ascii",
|
||||
)
|
||||
treebank_chunk: ChunkedCorpusReader = LazyCorpusLoader(
|
||||
"treebank/tagged",
|
||||
ChunkedCorpusReader,
|
||||
r"wsj_.*\.pos",
|
||||
sent_tokenizer=RegexpTokenizer(r"(?<=/\.)\s*(?![^\[]*\])", gaps=True),
|
||||
para_block_reader=tagged_treebank_para_block_reader,
|
||||
tagset="wsj",
|
||||
encoding="ascii",
|
||||
)
|
||||
treebank_raw: PlaintextCorpusReader = LazyCorpusLoader(
|
||||
"treebank/raw", PlaintextCorpusReader, r"wsj_.*", encoding="ISO-8859-2"
|
||||
)
|
||||
twitter_samples: TwitterCorpusReader = LazyCorpusLoader(
|
||||
"twitter_samples", TwitterCorpusReader, r".*\.json"
|
||||
)
|
||||
udhr: UdhrCorpusReader = LazyCorpusLoader("udhr", UdhrCorpusReader)
|
||||
udhr2: PlaintextCorpusReader = LazyCorpusLoader(
|
||||
"udhr2", PlaintextCorpusReader, r".*\.txt", encoding="utf8"
|
||||
)
|
||||
universal_treebanks: ConllCorpusReader = LazyCorpusLoader(
|
||||
"universal_treebanks_v20",
|
||||
ConllCorpusReader,
|
||||
r".*\.conll",
|
||||
columntypes=(
|
||||
"ignore",
|
||||
"words",
|
||||
"ignore",
|
||||
"ignore",
|
||||
"pos",
|
||||
"ignore",
|
||||
"ignore",
|
||||
"ignore",
|
||||
"ignore",
|
||||
"ignore",
|
||||
),
|
||||
)
|
||||
verbnet: VerbnetCorpusReader = LazyCorpusLoader(
|
||||
"verbnet", VerbnetCorpusReader, r"(?!\.).*\.xml"
|
||||
)
|
||||
webtext: PlaintextCorpusReader = LazyCorpusLoader(
|
||||
"webtext", PlaintextCorpusReader, r"(?!README|\.).*\.txt", encoding="ISO-8859-2"
|
||||
)
|
||||
wordnet: WordNetCorpusReader = LazyCorpusLoader(
|
||||
"wordnet",
|
||||
WordNetCorpusReader,
|
||||
LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
|
||||
)
|
||||
## Use the following template to add a custom Wordnet package.
|
||||
## Just uncomment, and replace the identifier (my_wordnet) in two places:
|
||||
##
|
||||
# my_wordnet: WordNetCorpusReader = LazyCorpusLoader(
|
||||
# "my_wordnet",
|
||||
# WordNetCorpusReader,
|
||||
# LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
|
||||
# )
|
||||
wordnet31: WordNetCorpusReader = LazyCorpusLoader(
|
||||
"wordnet31",
|
||||
WordNetCorpusReader,
|
||||
LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
|
||||
)
|
||||
wordnet2021: WordNetCorpusReader = LazyCorpusLoader(
|
||||
# Obsolete, use english_wordnet instead.
|
||||
"wordnet2021",
|
||||
WordNetCorpusReader,
|
||||
LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
|
||||
)
|
||||
wordnet2022: WordNetCorpusReader = LazyCorpusLoader(
|
||||
# Obsolete, use english_wordnet instead.
|
||||
"wordnet2022",
|
||||
WordNetCorpusReader,
|
||||
LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
|
||||
)
|
||||
english_wordnet: WordNetCorpusReader = LazyCorpusLoader(
|
||||
# Latest Open English Wordnet
|
||||
"english_wordnet",
|
||||
WordNetCorpusReader,
|
||||
LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
|
||||
)
|
||||
wordnet_ic: WordNetICCorpusReader = LazyCorpusLoader(
|
||||
"wordnet_ic", WordNetICCorpusReader, r".*\.dat"
|
||||
)
|
||||
words: WordListCorpusReader = LazyCorpusLoader(
|
||||
"words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii"
|
||||
)
|
||||
|
||||
# defined after treebank
|
||||
propbank: PropbankCorpusReader = LazyCorpusLoader(
|
||||
"propbank",
|
||||
PropbankCorpusReader,
|
||||
"prop.txt",
|
||||
r"frames/.*\.xml",
|
||||
"verbs.txt",
|
||||
lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
|
||||
treebank,
|
||||
) # Must be defined *after* treebank corpus.
|
||||
nombank: NombankCorpusReader = LazyCorpusLoader(
|
||||
"nombank.1.0",
|
||||
NombankCorpusReader,
|
||||
"nombank.1.0",
|
||||
r"frames/.*\.xml",
|
||||
"nombank.1.0.words",
|
||||
lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
|
||||
treebank,
|
||||
) # Must be defined *after* treebank corpus.
|
||||
propbank_ptb: PropbankCorpusReader = LazyCorpusLoader(
|
||||
"propbank",
|
||||
PropbankCorpusReader,
|
||||
"prop.txt",
|
||||
r"frames/.*\.xml",
|
||||
"verbs.txt",
|
||||
lambda filename: filename.upper(),
|
||||
ptb,
|
||||
) # Must be defined *after* ptb corpus.
|
||||
nombank_ptb: NombankCorpusReader = LazyCorpusLoader(
|
||||
"nombank.1.0",
|
||||
NombankCorpusReader,
|
||||
"nombank.1.0",
|
||||
r"frames/.*\.xml",
|
||||
"nombank.1.0.words",
|
||||
lambda filename: filename.upper(),
|
||||
ptb,
|
||||
) # Must be defined *after* ptb corpus.
|
||||
semcor: SemcorCorpusReader = LazyCorpusLoader(
|
||||
"semcor", SemcorCorpusReader, r"brown./tagfiles/br-.*\.xml", wordnet
|
||||
) # Must be defined *after* wordnet corpus.
|
||||
|
||||
nonbreaking_prefixes: NonbreakingPrefixesCorpusReader = LazyCorpusLoader(
|
||||
"nonbreaking_prefixes",
|
||||
NonbreakingPrefixesCorpusReader,
|
||||
r"(?!README|\.).*",
|
||||
encoding="utf8",
|
||||
)
|
||||
perluniprops: UnicharsCorpusReader = LazyCorpusLoader(
|
||||
"perluniprops",
|
||||
UnicharsCorpusReader,
|
||||
r"(?!README|\.).*",
|
||||
nltk_data_subdir="misc",
|
||||
encoding="utf8",
|
||||
)
|
||||
|
||||
# mwa_ppdb = LazyCorpusLoader(
|
||||
# 'mwa_ppdb', MWAPPDBCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')
|
||||
|
||||
# See https://github.com/nltk/nltk/issues/1579
|
||||
# and https://github.com/nltk/nltk/issues/1716
|
||||
#
|
||||
# pl196x = LazyCorpusLoader(
|
||||
# 'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml',
|
||||
# cat_file='cats.txt', textid_file='textids.txt', encoding='utf8')
|
||||
#
|
||||
# ipipan = LazyCorpusLoader(
|
||||
# 'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml')
|
||||
#
|
||||
# nkjp = LazyCorpusLoader(
|
||||
# 'nkjp', NKJPCorpusReader, r'', encoding='utf8')
|
||||
#
|
||||
# panlex_lite = LazyCorpusLoader(
|
||||
# 'panlex_lite', PanLexLiteCorpusReader)
|
||||
#
|
||||
# ycoe = LazyCorpusLoader(
|
||||
# 'ycoe', YCOECorpusReader)
|
||||
#
|
||||
# corpus not available with NLTK; these lines caused help(nltk.corpus) to break
|
||||
# hebrew_treebank = LazyCorpusLoader(
|
||||
# 'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')
|
||||
|
||||
|
||||
# FIXME: override any imported demo from various corpora, see https://github.com/nltk/nltk/issues/2116
def demo():
    """Run the bundled demo() of each corpus reader listed below.

    NOTE(review): per the original comment this list is out-of-date, and
    every call lazily loads (and may require downloading) its corpus, so
    this is slow and has side effects — intended for interactive use only.
    """
    # Same corpora, same order, as the original hand-written call list.
    for corpus_reader in (
        abc,
        brown,
        # chat80.demo() was already disabled upstream.
        cmudict,
        conll2000,
        conll2002,
        genesis,
        gutenberg,
        ieer,
        inaugural,
        indian,
        names,
        ppattach,
        senseval,
        shakespeare,
        sinica_treebank,
        state_union,
        stopwords,
        timit,
        toolbox,
        treebank,
        udhr,
        webtext,
        words,
    ):
        corpus_reader.demo()


# ycoe.demo()


if __name__ == "__main__":
    # demo()  # disabled upstream: running the demos loads corpora and is slow
    pass
|
||||
56
backend/venv/Lib/site-packages/nltk/corpus/europarl_raw.py
Normal file
56
backend/venv/Lib/site-packages/nltk/corpus/europarl_raw.py
Normal file
@@ -0,0 +1,56 @@
|
||||
# Natural Language Toolkit: Europarl Corpus Readers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Nitin Madnani <nmadnani@umiacs.umd.edu>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import re
|
||||
|
||||
from nltk.corpus.reader import *
|
||||
from nltk.corpus.util import LazyCorpusLoader
|
||||
|
||||
# Create a new corpus reader instance for each European language
|
||||
danish: EuroparlCorpusReader = LazyCorpusLoader(
|
||||
"europarl_raw/danish", EuroparlCorpusReader, r"ep-.*\.da", encoding="utf-8"
|
||||
)
|
||||
|
||||
dutch: EuroparlCorpusReader = LazyCorpusLoader(
|
||||
"europarl_raw/dutch", EuroparlCorpusReader, r"ep-.*\.nl", encoding="utf-8"
|
||||
)
|
||||
|
||||
english: EuroparlCorpusReader = LazyCorpusLoader(
|
||||
"europarl_raw/english", EuroparlCorpusReader, r"ep-.*\.en", encoding="utf-8"
|
||||
)
|
||||
|
||||
finnish: EuroparlCorpusReader = LazyCorpusLoader(
|
||||
"europarl_raw/finnish", EuroparlCorpusReader, r"ep-.*\.fi", encoding="utf-8"
|
||||
)
|
||||
|
||||
french: EuroparlCorpusReader = LazyCorpusLoader(
|
||||
"europarl_raw/french", EuroparlCorpusReader, r"ep-.*\.fr", encoding="utf-8"
|
||||
)
|
||||
|
||||
german: EuroparlCorpusReader = LazyCorpusLoader(
|
||||
"europarl_raw/german", EuroparlCorpusReader, r"ep-.*\.de", encoding="utf-8"
|
||||
)
|
||||
|
||||
greek: EuroparlCorpusReader = LazyCorpusLoader(
|
||||
"europarl_raw/greek", EuroparlCorpusReader, r"ep-.*\.el", encoding="utf-8"
|
||||
)
|
||||
|
||||
italian: EuroparlCorpusReader = LazyCorpusLoader(
|
||||
"europarl_raw/italian", EuroparlCorpusReader, r"ep-.*\.it", encoding="utf-8"
|
||||
)
|
||||
|
||||
portuguese: EuroparlCorpusReader = LazyCorpusLoader(
|
||||
"europarl_raw/portuguese", EuroparlCorpusReader, r"ep-.*\.pt", encoding="utf-8"
|
||||
)
|
||||
|
||||
spanish: EuroparlCorpusReader = LazyCorpusLoader(
|
||||
"europarl_raw/spanish", EuroparlCorpusReader, r"ep-.*\.es", encoding="utf-8"
|
||||
)
|
||||
|
||||
swedish: EuroparlCorpusReader = LazyCorpusLoader(
|
||||
"europarl_raw/swedish", EuroparlCorpusReader, r"ep-.*\.sv", encoding="utf-8"
|
||||
)
|
||||
186
backend/venv/Lib/site-packages/nltk/corpus/reader/__init__.py
Normal file
186
backend/venv/Lib/site-packages/nltk/corpus/reader/__init__.py
Normal file
@@ -0,0 +1,186 @@
|
||||
# Natural Language Toolkit: Corpus Readers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
NLTK corpus readers. The modules in this package provide functions
|
||||
that can be used to read corpus fileids in a variety of formats. These
|
||||
functions can be used to read both the corpus fileids that are
|
||||
distributed in the NLTK corpus package, and corpus fileids that are part
|
||||
of external corpora.
|
||||
|
||||
Corpus Reader Functions
|
||||
=======================
|
||||
Each corpus module defines one or more "corpus reader functions",
|
||||
which can be used to read documents from that corpus. These functions
|
||||
take an argument, ``item``, which is used to indicate which document
|
||||
should be read from the corpus:
|
||||
|
||||
- If ``item`` is one of the unique identifiers listed in the corpus
|
||||
module's ``items`` variable, then the corresponding document will
|
||||
be loaded from the NLTK corpus package.
|
||||
- If ``item`` is a fileid, then that file will be read.
|
||||
|
||||
Additionally, corpus reader functions can be given lists of item
|
||||
names; in which case, they will return a concatenation of the
|
||||
corresponding documents.
|
||||
|
||||
Corpus reader functions are named based on the type of information
|
||||
they return. Some common examples, and their return types, are:
|
||||
|
||||
- words(): list of str
|
||||
- sents(): list of (list of str)
|
||||
- paras(): list of (list of (list of str))
|
||||
- tagged_words(): list of (str,str) tuple
|
||||
- tagged_sents(): list of (list of (str,str))
|
||||
- tagged_paras(): list of (list of (list of (str,str)))
|
||||
- chunked_sents(): list of (Tree w/ (str,str) leaves)
|
||||
- parsed_sents(): list of (Tree with str leaves)
|
||||
- parsed_paras(): list of (list of (Tree with str leaves))
|
||||
- xml(): A single xml ElementTree
|
||||
- raw(): unprocessed corpus contents
|
||||
|
||||
For example, to read a list of the words in the Brown Corpus, use
|
||||
``nltk.corpus.brown.words()``:
|
||||
|
||||
>>> from nltk.corpus import brown
|
||||
>>> print(", ".join(brown.words()[:6])) # only first 6 words
|
||||
The, Fulton, County, Grand, Jury, said
|
||||
|
||||
isort:skip_file
|
||||
"""
|
||||
|
||||
from nltk.corpus.reader.plaintext import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.tagged import *
|
||||
from nltk.corpus.reader.cmudict import *
|
||||
from nltk.corpus.reader.conll import *
|
||||
from nltk.corpus.reader.chunked import *
|
||||
from nltk.corpus.reader.wordlist import *
|
||||
from nltk.corpus.reader.xmldocs import *
|
||||
from nltk.corpus.reader.ppattach import *
|
||||
from nltk.corpus.reader.senseval import *
|
||||
from nltk.corpus.reader.ieer import *
|
||||
from nltk.corpus.reader.sinica_treebank import *
|
||||
from nltk.corpus.reader.bracket_parse import *
|
||||
from nltk.corpus.reader.indian import *
|
||||
from nltk.corpus.reader.toolbox import *
|
||||
from nltk.corpus.reader.timit import *
|
||||
from nltk.corpus.reader.ycoe import *
|
||||
from nltk.corpus.reader.rte import *
|
||||
from nltk.corpus.reader.string_category import *
|
||||
from nltk.corpus.reader.propbank import *
|
||||
from nltk.corpus.reader.verbnet import *
|
||||
from nltk.corpus.reader.bnc import *
|
||||
from nltk.corpus.reader.nps_chat import *
|
||||
from nltk.corpus.reader.wordnet import *
|
||||
from nltk.corpus.reader.switchboard import *
|
||||
from nltk.corpus.reader.dependency import *
|
||||
from nltk.corpus.reader.nombank import *
|
||||
from nltk.corpus.reader.ipipan import *
|
||||
from nltk.corpus.reader.pl196x import *
|
||||
from nltk.corpus.reader.knbc import *
|
||||
from nltk.corpus.reader.chasen import *
|
||||
from nltk.corpus.reader.childes import *
|
||||
from nltk.corpus.reader.aligned import *
|
||||
from nltk.corpus.reader.lin import *
|
||||
from nltk.corpus.reader.semcor import *
|
||||
from nltk.corpus.reader.framenet import *
|
||||
from nltk.corpus.reader.udhr import *
|
||||
from nltk.corpus.reader.bnc import *
|
||||
from nltk.corpus.reader.sentiwordnet import *
|
||||
from nltk.corpus.reader.twitter import *
|
||||
from nltk.corpus.reader.nkjp import *
|
||||
from nltk.corpus.reader.crubadan import *
|
||||
from nltk.corpus.reader.mte import *
|
||||
from nltk.corpus.reader.reviews import *
|
||||
from nltk.corpus.reader.opinion_lexicon import *
|
||||
from nltk.corpus.reader.pros_cons import *
|
||||
from nltk.corpus.reader.categorized_sents import *
|
||||
from nltk.corpus.reader.comparative_sents import *
|
||||
from nltk.corpus.reader.panlex_lite import *
|
||||
from nltk.corpus.reader.panlex_swadesh import *
|
||||
from nltk.corpus.reader.bcp47 import *
|
||||
|
||||
# Make sure that nltk.corpus.reader.bracket_parse gives the module, not
|
||||
# the function bracket_parse() defined in nltk.tree:
|
||||
from nltk.corpus.reader import bracket_parse
|
||||
|
||||
__all__ = [
|
||||
"CorpusReader",
|
||||
"CategorizedCorpusReader",
|
||||
"PlaintextCorpusReader",
|
||||
"find_corpus_fileids",
|
||||
"TaggedCorpusReader",
|
||||
"CMUDictCorpusReader",
|
||||
"ConllChunkCorpusReader",
|
||||
"WordListCorpusReader",
|
||||
"PPAttachmentCorpusReader",
|
||||
"SensevalCorpusReader",
|
||||
"IEERCorpusReader",
|
||||
"ChunkedCorpusReader",
|
||||
"SinicaTreebankCorpusReader",
|
||||
"BracketParseCorpusReader",
|
||||
"IndianCorpusReader",
|
||||
"ToolboxCorpusReader",
|
||||
"TimitCorpusReader",
|
||||
"YCOECorpusReader",
|
||||
"MacMorphoCorpusReader",
|
||||
"SyntaxCorpusReader",
|
||||
"AlpinoCorpusReader",
|
||||
"RTECorpusReader",
|
||||
"StringCategoryCorpusReader",
|
||||
"EuroparlCorpusReader",
|
||||
"CategorizedBracketParseCorpusReader",
|
||||
"CategorizedTaggedCorpusReader",
|
||||
"CategorizedPlaintextCorpusReader",
|
||||
"PortugueseCategorizedPlaintextCorpusReader",
|
||||
"tagged_treebank_para_block_reader",
|
||||
"PropbankCorpusReader",
|
||||
"VerbnetCorpusReader",
|
||||
"BNCCorpusReader",
|
||||
"ConllCorpusReader",
|
||||
"XMLCorpusReader",
|
||||
"NPSChatCorpusReader",
|
||||
"SwadeshCorpusReader",
|
||||
"WordNetCorpusReader",
|
||||
"WordNetICCorpusReader",
|
||||
"SwitchboardCorpusReader",
|
||||
"DependencyCorpusReader",
|
||||
"NombankCorpusReader",
|
||||
"IPIPANCorpusReader",
|
||||
"Pl196xCorpusReader",
|
||||
"TEICorpusView",
|
||||
"KNBCorpusReader",
|
||||
"ChasenCorpusReader",
|
||||
"CHILDESCorpusReader",
|
||||
"AlignedCorpusReader",
|
||||
"TimitTaggedCorpusReader",
|
||||
"LinThesaurusCorpusReader",
|
||||
"SemcorCorpusReader",
|
||||
"FramenetCorpusReader",
|
||||
"UdhrCorpusReader",
|
||||
"BNCCorpusReader",
|
||||
"SentiWordNetCorpusReader",
|
||||
"SentiSynset",
|
||||
"TwitterCorpusReader",
|
||||
"NKJPCorpusReader",
|
||||
"CrubadanCorpusReader",
|
||||
"MTECorpusReader",
|
||||
"ReviewsCorpusReader",
|
||||
"OpinionLexiconCorpusReader",
|
||||
"ProsConsCorpusReader",
|
||||
"CategorizedSentencesCorpusReader",
|
||||
"ComparativeSentencesCorpusReader",
|
||||
"PanLexLiteCorpusReader",
|
||||
"NonbreakingPrefixesCorpusReader",
|
||||
"UnicharsCorpusReader",
|
||||
"MWAPPDBCorpusReader",
|
||||
"PanlexSwadeshCorpusReader",
|
||||
"BCP47CorpusReader",
|
||||
]
|
||||
154
backend/venv/Lib/site-packages/nltk/corpus/reader/aligned.py
Normal file
154
backend/venv/Lib/site-packages/nltk/corpus/reader/aligned.py
Normal file
@@ -0,0 +1,154 @@
|
||||
# Natural Language Toolkit: Aligned Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# URL: <https://www.nltk.org/>
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from nltk.corpus.reader.api import CorpusReader
|
||||
from nltk.corpus.reader.util import (
|
||||
StreamBackedCorpusView,
|
||||
concat,
|
||||
read_alignedsent_block,
|
||||
)
|
||||
from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer
|
||||
from nltk.translate import AlignedSent, Alignment
|
||||
|
||||
|
||||
class AlignedCorpusReader(CorpusReader):
    """
    Reader for corpora of word-aligned sentences.  Tokens are assumed
    to be separated by whitespace.  Sentences begin on separate lines.
    """

    def __init__(
        self,
        root,
        fileids,
        sep="/",
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        alignedsent_block_reader=read_alignedsent_block,
        encoding="latin1",
    ):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param sep: Separator string, stored for use by corpus views
            (not referenced by this class's own methods).
        :param word_tokenizer: Tokenizer used to split a sentence into
            word tokens.
        :param sent_tokenizer: Tokenizer used to split text into
            sentences; by default one sentence per line.
        :param alignedsent_block_reader: Low-level block reader passed
            through to each ``AlignedSentCorpusView``.
        :param encoding: Default character encoding for the corpus files.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader

    def _views(self, fileids, aligned, group_by_sent):
        """Build one ``AlignedSentCorpusView`` per file and concatenate them.

        Shared implementation for ``words()``, ``sents()`` and
        ``aligned_sents()`` (previously triplicated verbatim); the two
        booleans select how much structure each view exposes.
        """
        return concat(
            [
                AlignedSentCorpusView(
                    fileid,
                    enc,
                    aligned,
                    group_by_sent,
                    self._word_tokenizer,
                    self._sent_tokenizer,
                    self._alignedsent_block_reader,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._views(fileids, aligned=False, group_by_sent=False)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._views(fileids, aligned=False, group_by_sent=True)

    def aligned_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of AlignedSent objects.
        :rtype: list(AlignedSent)
        """
        return self._views(fileids, aligned=True, group_by_sent=True)
|
||||
|
||||
|
||||
class AlignedSentCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for aligned sentences.
    ``AlignedSentCorpusView`` objects are typically created by
    ``AlignedCorpusReader`` (not directly by nltk users).
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        aligned,
        group_by_sent,
        word_tokenizer,
        sent_tokenizer,
        alignedsent_block_reader,
    ):
        # If True, read_block() returns AlignedSent objects; otherwise it
        # returns plain word lists.
        self._aligned = aligned
        # If True (and not aligned), keep each sentence as its own list of
        # words; if False, return a flat word list.
        self._group_by_sent = group_by_sent
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        # Split the next aligned-sentence block into sentences, then each
        # sentence into word tokens.  The aligned branch below indexes
        # block[2], so a block is assumed to yield (at least) three lines:
        # presumably source sentence, target sentence, and alignment string
        # -- TODO(review): confirm against read_alignedsent_block.
        block = [
            self._word_tokenizer.tokenize(sent_str)
            for alignedsent_str in self._alignedsent_block_reader(stream)
            for sent_str in self._sent_tokenizer.tokenize(alignedsent_str)
        ]
        if self._aligned:
            # Re-join the third line and parse it as an Alignment.
            block[2] = Alignment.fromstring(
                " ".join(block[2])
            )  # kludge; we shouldn't have tokenized the alignment string
            block = [AlignedSent(*block)]
        elif self._group_by_sent:
            # Only the first sentence of the block is returned.
            block = [block[0]]
        else:
            block = block[0]

        return block
|
||||
517
backend/venv/Lib/site-packages/nltk/corpus/reader/api.py
Normal file
517
backend/venv/Lib/site-packages/nltk/corpus/reader/api.py
Normal file
@@ -0,0 +1,517 @@
|
||||
# Natural Language Toolkit: API for Corpus Readers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
API for corpus readers.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from itertools import chain
|
||||
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.data import FileSystemPathPointer, PathPointer, ZipFilePathPointer
|
||||
|
||||
|
||||
class CorpusReader:
    """
    A base class for "corpus reader" classes, each of which can be
    used to read a specific corpus format.  Each individual corpus
    reader instance is used to read a specific corpus, consisting of
    one or more files under a common root directory.  Each file is
    identified by its ``file identifier``, which is the relative path
    to the file from the root directory.

    A separate subclass is defined for each corpus format.  These
    subclasses define one or more methods that provide 'views' on the
    corpus contents, such as ``words()`` (for a list of words) and
    ``parsed_sents()`` (for a list of parsed sentences).  Called with
    no arguments, these methods will return the contents of the entire
    corpus.  For most corpora, these methods define one or more
    selection arguments, such as ``fileids`` or ``categories``, which can
    be used to select which portion of the corpus should be returned.
    """

    def __init__(self, root, fileids, encoding="utf8", tagset=None):
        """
        :type root: PathPointer or str
        :param root: A path pointer identifying the root directory for
            this corpus.  If a string is specified, then it will be
            converted to a ``PathPointer`` automatically.
        :param fileids: A list of the files that make up this corpus.
            This list can either be specified explicitly, as a list of
            strings; or implicitly, as a regular expression over file
            paths.  The absolute path for each file will be constructed
            by joining the reader's root to each file name.
        :param encoding: The default unicode encoding for the files
            that make up the corpus.  The value of ``encoding`` can be any
            of the following:

            - A string: ``encoding`` is the encoding name for all files.
            - A dictionary: ``encoding[file_id]`` is the encoding
              name for the file whose identifier is ``file_id``.  If
              ``file_id`` is not in ``encoding``, then the file
              contents will be processed using non-unicode byte strings.
            - A list: ``encoding`` should be a list of ``(regexp, encoding)``
              tuples.  The encoding for a file whose identifier is ``file_id``
              will be the ``encoding`` value for the first tuple whose
              ``regexp`` matches the ``file_id``.  If no tuple's ``regexp``
              matches the ``file_id``, the file contents will be processed
              using non-unicode byte strings.
            - None: the file contents of all files will be
              processed using non-unicode byte strings.
        :param tagset: The name of the tagset used by this corpus, to be used
            for normalizing or converting the POS tags returned by the
            ``tagged_...()`` methods.
        """
        # Convert the root to a path pointer, if necessary.
        if isinstance(root, str) and not isinstance(root, PathPointer):
            # The trailing empty alternative ("|") makes this match always
            # succeed, so `m` is never None; group 1 is only set when the
            # root path contains a ".zip" component.
            m = re.match(r"(.*\.zip)/?(.*)$|", root)
            zipfile, zipentry = m.groups()
            if zipfile:
                root = ZipFilePathPointer(zipfile, zipentry)
            else:
                root = FileSystemPathPointer(root)
        elif not isinstance(root, PathPointer):
            raise TypeError("CorpusReader: expected a string or a PathPointer")

        # If `fileids` is a regexp, then expand it.
        if isinstance(fileids, str):
            fileids = find_corpus_fileids(root, fileids)

        self._fileids = fileids
        """A list of the relative paths for the fileids that make up
        this corpus."""

        self._root = root
        """The root directory for this corpus."""

        # Default metadata filenames used by readme(), license() and
        # citation(); subclasses may override these attributes.
        self._readme = "README"
        self._license = "LICENSE"
        self._citation = "citation.bib"

        # If encoding was specified as a list of regexps, then convert
        # it to a dictionary.
        if isinstance(encoding, list):
            encoding_dict = {}
            for fileid in self._fileids:
                for x in encoding:
                    (regexp, enc) = x
                    if re.match(regexp, fileid):
                        # The first matching regexp wins for this fileid.
                        encoding_dict[fileid] = enc
                        break
            encoding = encoding_dict

        self._encoding = encoding
        """The default unicode encoding for the fileids that make up
        this corpus.  If ``encoding`` is None, then the file
        contents are processed using byte strings."""
        self._tagset = tagset

    def __repr__(self):
        """Return a short representation naming the reader class and root."""
        if isinstance(self._root, ZipFilePathPointer):
            path = f"{self._root.zipfile.filename}/{self._root.entry}"
        else:
            path = "%s" % self._root.path
        return f"<{self.__class__.__name__} in {path!r}>"

    def ensure_loaded(self):
        """
        Load this corpus (if it has not already been loaded).  This is
        used by LazyCorpusLoader as a simple method that can be used to
        make sure a corpus is loaded -- e.g., in case a user wants to
        do help(some_corpus).
        """
        pass  # no need to actually do anything.

    def readme(self):
        """
        Return the contents of the corpus README file, if it exists.
        """
        with self.open(self._readme) as f:
            return f.read()

    def license(self):
        """
        Return the contents of the corpus LICENSE file, if it exists.
        """
        with self.open(self._license) as f:
            return f.read()

    def citation(self):
        """
        Return the contents of the corpus citation.bib file, if it exists.
        """
        with self.open(self._citation) as f:
            return f.read()

    def fileids(self):
        """
        Return a list of file identifiers for the fileids that make up
        this corpus.
        """
        return self._fileids

    def abspath(self, fileid):
        """
        Return the absolute path for the given file.

        :type fileid: str
        :param fileid: The file identifier for the file whose path
            should be returned.
        :rtype: PathPointer
        """
        return self._root.join(fileid)

    def abspaths(self, fileids=None, include_encoding=False, include_fileid=False):
        """
        Return a list of the absolute paths for all fileids in this corpus;
        or for the given list of fileids, if specified.

        :type fileids: None or str or list
        :param fileids: Specifies the set of fileids for which paths should
            be returned.  Can be None, for all fileids; a list of
            file identifiers, for a specified set of fileids; or a single
            file identifier, for a single file.  Note that the return
            value is always a list of paths, even if ``fileids`` is a
            single file identifier.

        :param include_encoding: If true, then return a list of
            ``(path_pointer, encoding)`` tuples.

        :param include_fileid: If true, also include the fileid in each
            returned tuple (after the encoding, when both flags are set).

        :rtype: list(PathPointer)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            # Normalize a single fileid to a one-element list.
            fileids = [fileids]

        paths = [self._root.join(f) for f in fileids]

        # Tuple shape depends on which of the two include flags are set.
        if include_encoding and include_fileid:
            return list(zip(paths, [self.encoding(f) for f in fileids], fileids))
        elif include_fileid:
            return list(zip(paths, fileids))
        elif include_encoding:
            return list(zip(paths, [self.encoding(f) for f in fileids]))
        else:
            return paths

    def raw(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        contents = []
        for f in fileids:
            with self.open(f) as fp:
                contents.append(fp.read())
        # concat() joins the per-file contents into a single value.
        return concat(contents)

    def open(self, file):
        """
        Return an open stream that can be used to read the given file.
        If the file's encoding is not None, then the stream will
        automatically decode the file's contents into unicode.

        :param file: The file identifier of the file to read.
        """
        encoding = self.encoding(file)
        stream = self._root.join(file).open(encoding)
        return stream

    def encoding(self, file):
        """
        Return the unicode encoding for the given corpus file, if known.
        If the encoding is unknown, or if the given file should be
        processed using byte strings (str), then return None.
        """
        if isinstance(self._encoding, dict):
            # Per-file encodings; missing entries mean "no decoding".
            return self._encoding.get(file)
        else:
            return self._encoding

    def _get_root(self):
        return self._root

    root = property(
        _get_root,
        doc="""
        The directory where this corpus is stored.

        :type: PathPointer""",
    )
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Corpora containing categorized items
|
||||
######################################################################
|
||||
|
||||
|
||||
class CategorizedCorpusReader:
    """
    A mixin class used to aid in the implementation of corpus readers
    for categorized corpora.  This class defines the method
    ``categories()``, which returns a list of the categories for the
    corpus or for a specified set of fileids; and overrides ``fileids()``
    to take a ``categories`` argument, restricting the set of fileids to
    be returned.

    Subclasses are expected to:

      - Call ``__init__()`` to set up the mapping.

      - Override all view methods to accept a ``categories`` parameter,
        which can be used *instead* of the ``fileids`` parameter, to
        select which fileids should be included in the returned view.
    """

    def __init__(self, kwargs):
        """
        Initialize this mapping based on keyword arguments, as
        follows:

          - cat_pattern: A regular expression pattern used to find the
            category for each file identifier.  The pattern will be
            applied to each file identifier, and the first matching
            group will be used as the category label for that file.

          - cat_map: A dictionary, mapping from file identifiers to
            category labels.

          - cat_file: The name of a file that contains the mapping
            from file identifiers to categories.  The argument
            ``cat_delimiter`` can be used to specify a delimiter.

        The corresponding argument will be deleted from ``kwargs``.  If
        more than one argument is specified, an exception will be
        raised.
        """
        # The two mapping directions are built lazily by _init() on first
        # use; None marks "not built yet".
        self._f2c = None  #: file-to-category mapping
        self._c2f = None  #: category-to-file mapping

        self._pattern = None  #: regexp specifying the mapping
        self._map = None  #: dict specifying the mapping
        self._file = None  #: fileid of file containing the mapping
        self._delimiter = None  #: delimiter for ``self._file``

        # Consume exactly one of the three cat_* options from kwargs.
        if "cat_pattern" in kwargs:
            self._pattern = kwargs["cat_pattern"]
            del kwargs["cat_pattern"]
        elif "cat_map" in kwargs:
            self._map = kwargs["cat_map"]
            del kwargs["cat_map"]
        elif "cat_file" in kwargs:
            self._file = kwargs["cat_file"]
            del kwargs["cat_file"]
            if "cat_delimiter" in kwargs:
                self._delimiter = kwargs["cat_delimiter"]
                del kwargs["cat_delimiter"]
        else:
            raise ValueError(
                "Expected keyword argument cat_pattern or " "cat_map or cat_file."
            )

        # Exactly one option was deleted above, so any cat_* key still
        # present means the caller passed more than one of them.
        if "cat_pattern" in kwargs or "cat_map" in kwargs or "cat_file" in kwargs:
            raise ValueError(
                "Specify exactly one of: cat_pattern, " "cat_map, cat_file."
            )

    def _init(self):
        # Build both lookup directions from whichever source (pattern,
        # dict, or mapping file) was configured in __init__().
        self._f2c = defaultdict(set)
        self._c2f = defaultdict(set)

        if self._pattern is not None:
            for file_id in self._fileids:
                # The first regexp group of the pattern is the category.
                category = re.match(self._pattern, file_id).group(1)
                self._add(file_id, category)

        elif self._map is not None:
            for file_id, categories in self._map.items():
                for category in categories:
                    self._add(file_id, category)

        elif self._file is not None:
            with self.open(self._file) as f:
                for line in f.readlines():
                    line = line.strip()
                    # First field is the fileid; the remaining fields are
                    # its categories, separated by the same delimiter.
                    file_id, categories = line.split(self._delimiter, 1)
                    if file_id not in self.fileids():
                        raise ValueError(
                            "In category mapping file %s: %s "
                            "not found" % (self._file, file_id)
                        )
                    for category in categories.split(self._delimiter):
                        self._add(file_id, category)

    def _add(self, file_id, category):
        # Record the pair in both lookup directions.
        self._f2c[file_id].add(category)
        self._c2f[category].add(file_id)

    def categories(self, fileids=None):
        """
        Return a list of the categories that are defined for this corpus,
        or for the file(s) if it is given.
        """
        if self._f2c is None:
            self._init()
        if fileids is None:
            return sorted(self._c2f)
        if isinstance(fileids, str):
            fileids = [fileids]
        return sorted(set.union(*(self._f2c[d] for d in fileids)))

    def fileids(self, categories=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that make up the given category(s) if specified.
        """
        if categories is None:
            return super().fileids()
        elif isinstance(categories, str):
            if self._f2c is None:
                self._init()
            if categories in self._c2f:
                return sorted(self._c2f[categories])
            else:
                raise ValueError("Category %s not found" % categories)
        else:
            if self._f2c is None:
                self._init()
            return sorted(set.union(*(self._c2f[c] for c in categories)))

    def _resolve(self, fileids, categories):
        # At most one of fileids/categories may be given; translate a
        # categories selection into the corresponding fileids.
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")
        if categories is not None:
            return self.fileids(categories)
        else:
            return fileids

    def raw(self, fileids=None, categories=None):
        """Delegate to the reader's raw() after resolving categories."""
        return super().raw(self._resolve(fileids, categories))

    def words(self, fileids=None, categories=None):
        """Delegate to the reader's words() after resolving categories."""
        return super().words(self._resolve(fileids, categories))

    def sents(self, fileids=None, categories=None):
        """Delegate to the reader's sents() after resolving categories."""
        return super().sents(self._resolve(fileids, categories))

    def paras(self, fileids=None, categories=None):
        """Delegate to the reader's paras() after resolving categories."""
        return super().paras(self._resolve(fileids, categories))
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Treebank readers
|
||||
######################################################################
|
||||
|
||||
|
||||
# [xx] is it worth it to factor this out?
|
||||
class SyntaxCorpusReader(CorpusReader):
    """
    An abstract base class for reading corpora consisting of
    syntactically parsed text.  Subclasses should define:

      - ``__init__``, which specifies the location of the corpus
        and a method for detecting the sentence blocks in corpus files.
      - ``_read_block``, which reads a block from the input stream.
      - ``_word``, which takes a block and returns a list of list of words.
      - ``_tag``, which takes a block and returns a list of list of tagged
        words.
      - ``_parse``, which takes a block and returns a list of parsed
        sentences.
    """

    def _parse(self, s):
        raise NotImplementedError()

    def _word(self, s):
        raise NotImplementedError()

    def _tag(self, s):
        raise NotImplementedError()

    def _read_block(self, stream):
        raise NotImplementedError()

    def parsed_sents(self, fileids=None):
        """Return the given file(s) as a list of parsed sentences."""
        reader = self._read_parsed_sent_block
        return concat(
            [
                StreamBackedCorpusView(fileid, reader, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )

    def tagged_sents(self, fileids=None, tagset=None):
        """Return the given file(s) as a list of tagged sentences."""

        # Closure binds `tagset` so the view's block reader takes only
        # the stream argument.
        def reader(stream):
            return self._read_tagged_sent_block(stream, tagset)

        return concat(
            [
                StreamBackedCorpusView(fileid, reader, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )

    def sents(self, fileids=None):
        """Return the given file(s) as a list of sentences (word lists)."""
        reader = self._read_sent_block
        return concat(
            [
                StreamBackedCorpusView(fileid, reader, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )

    def tagged_words(self, fileids=None, tagset=None):
        """Return the given file(s) as a list of (word, tag) tuples."""

        # Closure binds `tagset`, as in tagged_sents().
        def reader(stream):
            return self._read_tagged_word_block(stream, tagset)

        return concat(
            [
                StreamBackedCorpusView(fileid, reader, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """Return the given file(s) as a list of words."""
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_word_block, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )

    # ------------------------------------------------------------
    # { Block Readers

    def _read_word_block(self, stream):
        # Flatten one block of sentences into a single word list.
        return list(chain.from_iterable(self._read_sent_block(stream)))

    def _read_tagged_word_block(self, stream, tagset=None):
        # Flatten one block of tagged sentences into tagged words.
        return list(chain.from_iterable(self._read_tagged_sent_block(stream, tagset)))

    def _read_sent_block(self, stream):
        # filter(None, ...) drops blocks for which _word() returned an
        # empty/falsy result.
        return list(filter(None, [self._word(t) for t in self._read_block(stream)]))

    def _read_tagged_sent_block(self, stream, tagset=None):
        return list(
            filter(None, [self._tag(t, tagset) for t in self._read_block(stream)])
        )

    def _read_parsed_sent_block(self, stream):
        return list(filter(None, [self._parse(t) for t in self._read_block(stream)]))
|
||||
|
||||
# } End of Block Readers
|
||||
# ------------------------------------------------------------
|
||||
218
backend/venv/Lib/site-packages/nltk/corpus/reader/bcp47.py
Normal file
218
backend/venv/Lib/site-packages/nltk/corpus/reader/bcp47.py
Normal file
@@ -0,0 +1,218 @@
|
||||
# Natural Language Toolkit: BCP-47 language tags
|
||||
#
|
||||
# Copyright (C) 2022-2023 NLTK Project
|
||||
# Author: Eric Kafe <kafe.eric@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import re
|
||||
from warnings import warn
|
||||
from xml.etree import ElementTree as et
|
||||
|
||||
from nltk.corpus.reader import CorpusReader
|
||||
|
||||
|
||||
class BCP47CorpusReader(CorpusReader):
    """
    Parse BCP-47 composite language tags

    Supports all the main subtags, and the 'u-sd' extension:

    >>> from nltk.corpus import bcp47
    >>> bcp47.name('oc-gascon-u-sd-fr64')
    'Occitan (post 1500): Gascon: Pyrénées-Atlantiques'

    Can load a conversion table to Wikidata Q-codes:
    >>> bcp47.load_wiki_q()
    >>> bcp47.wiki_q['en-GI-spanglis']
    'Q79388'

    """

    def __init__(self, root, fileids):
        """Read the BCP-47 database"""
        super().__init__(root, fileids)
        self.langcode = {}
        # IANA subtag registry: records are separated by "%%" lines.
        with self.open("iana/language-subtag-registry.txt") as fp:
            self.db = self.data_dict(fp.read().split("%%\n"))
        # CLDR regional subdivisions, used for the 'u-sd' extension.
        with self.open("cldr/common-subdivisions-en.xml") as fp:
            self.subdiv = self.subdiv_dict(
                et.parse(fp).iterfind("localeDisplayNames/subdivisions/subdivision")
            )
        self.morphology()

    def load_wiki_q(self):
        """Load conversion table to Wikidata Q-codes (only if needed)"""
        with self.open("cldr/tools-cldr-rdf-external-entityToCode.tsv") as fp:
            # Skip the TSV header line.
            self.wiki_q = self.wiki_dict(fp.read().strip().split("\n")[1:])

    def wiki_dict(self, lines):
        """Convert Wikidata list of Q-codes to a BCP-47 dictionary"""
        return {
            pair[1]: pair[0].split("/")[-1]
            for pair in [line.strip().split("\t") for line in lines]
        }

    def subdiv_dict(self, subdivs):
        """Convert the CLDR subdivisions list to a dictionary"""
        return {sub.attrib["type"]: sub.text for sub in subdivs}

    def morphology(self):
        """Define per-subtag-type casing rules and format regexes."""
        self.casing = {
            "language": str.lower,
            "extlang": str.lower,
            "script": str.title,
            "region": str.upper,
            "variant": str.lower,
        }
        dig = "[0-9]"
        low = "[a-z]"
        up = "[A-Z]"
        alnum = "[a-zA-Z0-9]"
        # String repetition builds fixed-length character-class runs,
        # e.g. low*3 == "[a-z][a-z][a-z]".
        self.format = {
            "language": re.compile(f"{low*3}?"),
            "extlang": re.compile(f"{low*3}"),
            "script": re.compile(f"{up}{low*3}"),
            "region": re.compile(f"({up*2})|({dig*3})"),
            "variant": re.compile(f"{alnum*4}{(alnum+'?')*4}"),
            "singleton": re.compile(f"{low}"),
        }

    def data_dict(self, records):
        """Convert the BCP-47 language subtag registry to a dictionary"""
        # The first record holds only the registry's "File-Date" stamp.
        self.version = records[0].replace("File-Date:", "").strip()
        dic = {}
        dic["deprecated"] = {}
        for label in [
            "language",
            "extlang",
            "script",
            "region",
            "variant",
            "redundant",
            "grandfathered",
        ]:
            dic["deprecated"][label] = {}
        for record in records[1:]:
            fields = [field.split(": ") for field in record.strip().split("\n")]
            # By registry format, the first two fields are Type and the
            # tag/subtag value.
            typ = fields[0][1]
            tag = fields[1][1]
            if typ not in dic:
                dic[typ] = {}
            subfields = {}
            for field in fields[2:]:
                if len(field) == 2:
                    [key, val] = field
                    if key not in subfields:
                        subfields[key] = [val]
                    else:  # multiple value
                        subfields[key].append(val)
                else:  # multiline field: append to the previous key's value
                    subfields[key][-1] += " " + field[0].strip()
                if (
                    "Deprecated" not in record
                    and typ == "language"
                    and key == "Description"
                ):
                    # Reverse index: English name -> language code.
                    self.langcode[subfields[key][-1]] = tag
            for key in subfields:
                if len(subfields[key]) == 1:  # single value
                    subfields[key] = subfields[key][0]
            if "Deprecated" in record:
                dic["deprecated"][typ][tag] = subfields
            else:
                dic[typ][tag] = subfields
        return dic

    def val2str(self, val):
        """Return only first value"""
        if isinstance(val, list):
            # val = "/".join(val) # Concatenate all values
            val = val[0]
        return val

    def lang2str(self, lg_record):
        """Concatenate subtag values"""
        name = f"{lg_record['language']}"
        for label in ["extlang", "script", "region", "variant", "extension"]:
            if label in lg_record:
                name += f": {lg_record[label]}"
        return name

    def parse_tag(self, tag):
        """Convert a BCP-47 tag to a dictionary of labelled subtags"""
        subtags = tag.split("-")
        lang = {}
        labels = ["language", "extlang", "script", "region", "variant", "variant"]
        while subtags and labels:
            subtag = subtags.pop(0)
            found = False
            while labels:
                label = labels.pop(0)
                subtag = self.casing[label](subtag)
                if self.format[label].fullmatch(subtag):
                    if subtag in self.db[label]:
                        found = True
                        valstr = self.val2str(self.db[label][subtag]["Description"])
                        if label == "variant" and label in lang:
                            # Multiple variants are concatenated.
                            lang[label] += ": " + valstr
                        else:
                            lang[label] = valstr
                        break
                    elif subtag in self.db["deprecated"][label]:
                        found = True
                        note = f"The {subtag!r} {label} code is deprecated"
                        if "Preferred-Value" in self.db["deprecated"][label][subtag]:
                            prefer = self.db["deprecated"][label][subtag][
                                "Preferred-Value"
                            ]
                            # BUGFIX: the original note had mismatched
                            # quotes ("...', prefer '...'").
                            note += f", prefer {self.val2str(prefer)!r}"
                        lang[label] = self.val2str(
                            self.db["deprecated"][label][subtag]["Description"]
                        )
                        warn(note)
                        break
            if not found:
                # BUGFIX: guard against IndexError on tags ending in "-u"
                # or "-u-sd" (the original indexed subtags[0]/subtags[1]
                # unconditionally).
                if subtag == "u" and subtags[:1] == ["sd"]:  # CLDR subdivisions
                    sd = subtags[1] if len(subtags) > 1 else ""
                    if sd in self.subdiv:
                        ext = self.subdiv[sd]
                    else:
                        # BUGFIX: the original formatted `ext` here before
                        # it was ever assigned (NameError); report `sd`.
                        ext = f"<Unknown subdivision: {sd}>"
                else:  # other extension subtags are not supported yet
                    ext = f"{subtag}{''.join(['-'+ext for ext in subtags])}".lower()
                    if not self.format["singleton"].fullmatch(subtag):
                        ext = f"<Invalid extension: {ext}>"
                        warn(ext)
                lang["extension"] = ext
                subtags = []
        return lang

    def name(self, tag):
        """
        Convert a BCP-47 tag to a colon-separated string of subtag names

        >>> from nltk.corpus import bcp47
        >>> bcp47.name('ca-Latn-ES-valencia')
        'Catalan: Latin: Spain: Valencian'

        """
        # Whole-tag entries ("redundant"/"grandfathered") are looked up
        # before attempting a subtag-by-subtag parse.
        for label in ["redundant", "grandfathered"]:
            val = None
            if tag in self.db[label]:
                val = f"{self.db[label][tag]['Description']}"
                note = f"The {tag!r} code is {label}"
            elif tag in self.db["deprecated"][label]:
                val = f"{self.db['deprecated'][label][tag]['Description']}"
                note = f"The {tag!r} code is {label} and deprecated"
                if "Preferred-Value" in self.db["deprecated"][label][tag]:
                    prefer = self.db["deprecated"][label][tag]["Preferred-Value"]
                    note += f", prefer {self.val2str(prefer)!r}"
            if val:
                warn(note)
                return val
        try:
            return self.lang2str(self.parse_tag(tag))
        except Exception:  # was a bare except; keep the fallback behavior
            warn(f"Tag {tag!r} was not recognized")
            return None
|
||||
265
backend/venv/Lib/site-packages/nltk/corpus/reader/bnc.py
Normal file
265
backend/venv/Lib/site-packages/nltk/corpus/reader/bnc.py
Normal file
@@ -0,0 +1,265 @@
|
||||
# Natural Language Toolkit: Plaintext Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""Corpus reader for the XML version of the British National Corpus."""
|
||||
|
||||
from nltk.corpus.reader.util import concat
|
||||
from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader, XMLCorpusView
|
||||
|
||||
|
||||
class BNCCorpusReader(XMLCorpusReader):
|
||||
r"""Corpus reader for the XML version of the British National Corpus.
|
||||
|
||||
For access to the complete XML data structure, use the ``xml()``
|
||||
method. For access to simple word lists and tagged word lists, use
|
||||
``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
|
||||
|
||||
You can obtain the full version of the BNC corpus at
|
||||
https://www.ota.ox.ac.uk/desc/2554
|
||||
|
||||
If you extracted the archive to a directory called `BNC`, then you can
|
||||
instantiate the reader as::
|
||||
|
||||
BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, root, fileids, lazy=True):
|
||||
XMLCorpusReader.__init__(self, root, fileids)
|
||||
self._lazy = lazy
|
||||
|
||||
def words(self, fileids=None, strip_space=True, stem=False):
|
||||
"""
|
||||
:return: the given file(s) as a list of words
|
||||
and punctuation symbols.
|
||||
:rtype: list(str)
|
||||
|
||||
:param strip_space: If true, then strip trailing spaces from
|
||||
word tokens. Otherwise, leave the spaces on the tokens.
|
||||
:param stem: If true, then use word stems instead of word strings.
|
||||
"""
|
||||
return self._views(fileids, False, None, strip_space, stem)
|
||||
|
||||
def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False):
|
||||
"""
|
||||
:return: the given file(s) as a list of tagged
|
||||
words and punctuation symbols, encoded as tuples
|
||||
``(word,tag)``.
|
||||
:rtype: list(tuple(str,str))
|
||||
|
||||
:param c5: If true, then the tags used will be the more detailed
|
||||
c5 tags. Otherwise, the simplified tags will be used.
|
||||
:param strip_space: If true, then strip trailing spaces from
|
||||
word tokens. Otherwise, leave the spaces on the tokens.
|
||||
:param stem: If true, then use word stems instead of word strings.
|
||||
"""
|
||||
tag = "c5" if c5 else "pos"
|
||||
return self._views(fileids, False, tag, strip_space, stem)
|
||||
|
||||
def sents(self, fileids=None, strip_space=True, stem=False):
|
||||
"""
|
||||
:return: the given file(s) as a list of
|
||||
sentences or utterances, each encoded as a list of word
|
||||
strings.
|
||||
:rtype: list(list(str))
|
||||
|
||||
:param strip_space: If true, then strip trailing spaces from
|
||||
word tokens. Otherwise, leave the spaces on the tokens.
|
||||
:param stem: If true, then use word stems instead of word strings.
|
||||
"""
|
||||
return self._views(fileids, True, None, strip_space, stem)
|
||||
|
||||
def tagged_sents(self, fileids=None, c5=False, strip_space=True, stem=False):
|
||||
"""
|
||||
:return: the given file(s) as a list of
|
||||
sentences, each encoded as a list of ``(word,tag)`` tuples.
|
||||
:rtype: list(list(tuple(str,str)))
|
||||
|
||||
:param c5: If true, then the tags used will be the more detailed
|
||||
c5 tags. Otherwise, the simplified tags will be used.
|
||||
:param strip_space: If true, then strip trailing spaces from
|
||||
word tokens. Otherwise, leave the spaces on the tokens.
|
||||
:param stem: If true, then use word stems instead of word strings.
|
||||
"""
|
||||
tag = "c5" if c5 else "pos"
|
||||
return self._views(
|
||||
fileids, sent=True, tag=tag, strip_space=strip_space, stem=stem
|
||||
)
|
||||
|
||||
def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False):
|
||||
"""A helper function that instantiates BNCWordViews or the list of words/sentences."""
|
||||
f = BNCWordView if self._lazy else self._words
|
||||
return concat(
|
||||
[
|
||||
f(fileid, sent, tag, strip_space, stem)
|
||||
for fileid in self.abspaths(fileids)
|
||||
]
|
||||
)
|
||||
|
||||
    def _words(self, fileid, bracket_sent, tag, strip_space, stem):
        """
        Helper used to implement the view methods -- returns a list of
        words or a list of sentences, optionally tagged.

        :param fileid: The name of the underlying file.
        :param bracket_sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        result = []

        xmldoc = ElementTree.parse(fileid).getroot()
        # Each <s> element is one sentence; its <w>/<c> tokens may be
        # nested inside intermediate markup, so collect them recursively.
        for xmlsent in xmldoc.findall(".//s"):
            sent = []
            for xmlword in _all_xmlwords_in(xmlsent):
                word = xmlword.text
                if not word:
                    word = ""  # fixes issue 337?
                if strip_space or stem:
                    word = word.strip()
                if stem:
                    # Use the headword attribute, keeping the surface form
                    # when no "hw" is present.
                    word = xmlword.get("hw", word)
                if tag == "c5":
                    word = (word, xmlword.get("c5"))
                elif tag == "pos":
                    # Prefer the simplified tag; fall back to the C5 tag.
                    word = (word, xmlword.get("pos", xmlword.get("c5")))
                sent.append(word)
            if bracket_sent:
                # "n" is the sentence-number attribute from the XML.
                result.append(BNCSentence(xmlsent.attrib["n"], sent))
            else:
                result.extend(sent)

        assert None not in result
        return result
|
||||
|
||||
|
||||
def _all_xmlwords_in(elt, result=None):
|
||||
if result is None:
|
||||
result = []
|
||||
for child in elt:
|
||||
if child.tag in ("c", "w"):
|
||||
result.append(child)
|
||||
else:
|
||||
_all_xmlwords_in(child, result)
|
||||
return result
|
||||
|
||||
|
||||
class BNCSentence(list):
    """
    A list of word tokens carrying an extra attribute ``num``: the
    sentence identifier (the ``n`` attribute from the source XML).
    """

    def __init__(self, num, items):
        super().__init__(items)
        self.num = num
|
||||
|
||||
|
||||
class BNCWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the BNC corpus.

    Yields either word tokens or ``BNCSentence`` objects depending on the
    ``sent`` flag passed at construction time.
    """

    tags_to_ignore = {
        "pb",
        "gap",
        "vocal",
        "event",
        "unclear",
        "shift",
        "pause",
        "align",
    }
    """These tags are ignored. For their description refer to the
    technical documentation, for example,
    http://www.natcorp.ox.ac.uk/docs/URG/ref-vocal.html

    """

    def __init__(self, fileid, sent, tag, strip_space, stem):
        """
        :param fileid: The name of the underlying file.
        :param sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        # Match whole <s> elements when sentences are wanted; otherwise
        # match the individual <c>/<w> tokens nested (possibly deeply)
        # inside each <s>.
        if sent:
            tagspec = ".*/s"
        else:
            tagspec = ".*/s/(.*/)?(c|w)"
        self._sent = sent
        self._tag = tag
        self._strip_space = strip_space
        self._stem = stem

        self.title = None  #: Title of the document.
        self.author = None  #: Author of the document.
        self.editor = None  #: Editor
        self.resps = None  #: Statement of responsibility

        XMLCorpusView.__init__(self, fileid, tagspec)

        # Read in a tasty header: scan the TEI header once for metadata,
        # then close the stream so normal iteration starts fresh.
        self._open()
        self.read_block(self._stream, ".*/teiHeader$", self.handle_header)
        self.close()

        # Reset tag context.
        self._tag_context = {0: ()}

    def handle_header(self, elt, context):
        """Populate title/author/editor/resps from the TEI header."""
        # Set up some metadata!
        # NOTE(review): the joins below assume every matched element has
        # non-None .text; an empty element would raise AttributeError --
        # confirm the BNC headers guarantee text content.
        titles = elt.findall("titleStmt/title")
        if titles:
            self.title = "\n".join(title.text.strip() for title in titles)

        authors = elt.findall("titleStmt/author")
        if authors:
            self.author = "\n".join(author.text.strip() for author in authors)

        editors = elt.findall("titleStmt/editor")
        if editors:
            self.editor = "\n".join(editor.text.strip() for editor in editors)

        resps = elt.findall("titleStmt/respStmt")
        if resps:
            self.resps = "\n\n".join(
                "\n".join(resp_elt.text.strip() for resp_elt in resp) for resp in resps
            )

    def handle_elt(self, elt, context):
        # Dispatch on the mode chosen at construction time.
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        """Convert one <w> or <c> element into a token (or (token, tag))."""
        word = elt.text
        if not word:
            word = ""  # fixes issue 337?
        if self._strip_space or self._stem:
            word = word.strip()
        if self._stem:
            # Use the headword attribute, falling back to the surface form.
            word = elt.get("hw", word)
        if self._tag == "c5":
            word = (word, elt.get("c5"))
        elif self._tag == "pos":
            # Prefer the simplified tag; fall back to the C5 tag.
            word = (word, elt.get("pos", elt.get("c5")))
        return word

    def handle_sent(self, elt):
        """Convert one <s> element into a BNCSentence.

        :raises ValueError: if a child tag is neither a token container,
            a token, nor one of ``tags_to_ignore``.
        """
        sent = []
        for child in elt:
            # Multiword/markup wrappers contain the actual tokens.
            if child.tag in ("mw", "hi", "corr", "trunc"):
                sent += [self.handle_word(w) for w in child]
            elif child.tag in ("w", "c"):
                sent.append(self.handle_word(child))
            elif child.tag not in self.tags_to_ignore:
                raise ValueError("Unexpected element %s" % child.tag)
        return BNCSentence(elt.attrib["n"], sent)
|
||||
@@ -0,0 +1,237 @@
|
||||
# Natural Language Toolkit: Penn Treebank Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
"""
|
||||
Corpus reader for corpora that consist of parenthesis-delineated parse trees.
|
||||
"""
|
||||
|
||||
import sys
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.tag import map_tag
|
||||
from nltk.tree import Tree
|
||||
|
||||
# we use [^\s()]+ instead of \S+? to avoid matching ()
# (begin pos word) leaves with a numeric index -- used by the Alpino reader.
SORTTAGWRD = re.compile(r"\((\d+) ([^\s()]+) ([^\s()]+)\)")
# (tag word) leaves, e.g. "(NN dog)".
TAGWORD = re.compile(r"\(([^\s()]+) ([^\s()]+)\)")
# Same leaf shape as TAGWORD, but captures only the word.
WORD = re.compile(r"\([^\s()]+ ([^\s()]+)\)")
# A tree opening with two brackets, i.e. an empty top-level node.
EMPTY_BRACKETS = re.compile(r"\s*\(\s*\(")
|
||||
|
||||
|
||||
class BracketParseCorpusReader(SyntaxCorpusReader):
    """
    Reader for corpora that consist of parenthesis-delineated parse trees,
    like those found in the "combined" section of the Penn Treebank,
    e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))".

    """

    def __init__(
        self,
        root,
        fileids,
        comment_char=None,
        detect_blocks="unindented_paren",
        encoding="utf8",
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param comment_char: The character which can appear at the start of
            a line to indicate that the rest of the line is a comment.
        :param detect_blocks: The method that is used to find blocks
            in the corpus; can be 'unindented_paren' (every unindented
            parenthesis starts a new parse) or 'sexpr' (brackets are
            matched).
        :param tagset: The name of the tagset used by this corpus, to be used
            for normalizing or converting the POS tags returned by the
            ``tagged_...()`` methods.
        """
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)
        self._comment_char = comment_char
        self._detect_blocks = detect_blocks
        self._tagset = tagset

    def _read_block(self, stream):
        """Read one block (candidate parse) from *stream* using the
        block-detection strategy chosen at construction time."""
        if self._detect_blocks == "sexpr":
            return read_sexpr_block(stream, comment_char=self._comment_char)
        elif self._detect_blocks == "blankline":
            return read_blankline_block(stream)
        elif self._detect_blocks == "unindented_paren":
            # Tokens start with unindented left parens.
            toks = read_regexp_block(stream, start_re=r"^\(")
            # Strip any comments out of the tokens.
            if self._comment_char:
                toks = [
                    re.sub("(?m)^%s.*" % re.escape(self._comment_char), "", tok)
                    for tok in toks
                ]
            return toks
        else:
            assert 0, "bad block type"

    def _normalize(self, t):
        """Normalize a treebank string before parsing."""
        # Replace leaves of the form (!), (,), with (! !), (, ,)
        t = re.sub(r"\((.)\)", r"(\1 \1)", t)
        # Replace leaves of the form (tag word root) with (tag word)
        t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
        return t

    def _parse(self, t):
        """Parse one block into a Tree, attempting recovery on failure."""
        try:
            tree = Tree.fromstring(self._normalize(t))
            # If there's an empty node at the top, strip it off
            if tree.label() == "" and len(tree) == 1:
                return tree[0]
            else:
                return tree

        except ValueError as e:
            sys.stderr.write("Bad tree detected; trying to recover...\n")
            # Try to recover, if we can:
            if e.args == ("mismatched parens",):
                for n in range(1, 5):
                    try:
                        # Bug fix: Tree(...) takes a label plus a child list,
                        # so calling it with a single string raised TypeError
                        # and this recovery branch could never succeed.
                        # Tree.fromstring is the parser used above.
                        v = Tree.fromstring(self._normalize(t + ")" * n))
                        sys.stderr.write(
                            " Recovered by adding %d close paren(s)\n" % n
                        )
                        return v
                    except ValueError:
                        pass
            # Try something else:
            sys.stderr.write(" Recovered by returning a flat parse.\n")
            # sys.stderr.write(' '.join(t.split())+'\n')
            return Tree("S", self._tag(t))

    def _tag(self, t, tagset=None):
        """Return the block as a list of (word, tag), optionally mapping
        tags into *tagset*."""
        tagged_sent = [(w, p) for (p, w) in TAGWORD.findall(self._normalize(t))]
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (w, p) in tagged_sent
            ]
        return tagged_sent

    def _word(self, t):
        """Return the block as a plain list of words."""
        return WORD.findall(self._normalize(t))
|
||||
|
||||
|
||||
class CategorizedBracketParseCorpusReader(
    CategorizedCorpusReader, BracketParseCorpusReader
):
    """
    A reader for parsed corpora whose documents are
    divided into categories based on their file identifiers.

    Each ``tagged_*``/``parsed_*`` accessor simply resolves the requested
    categories to fileids and delegates to ``BracketParseCorpusReader``.

    @author: Nathan Schneider <nschneid@cs.cmu.edu>
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader. Categorization arguments
        (C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to
        the L{CategorizedCorpusReader constructor
        <CategorizedCorpusReader.__init__>}. The remaining arguments
        are passed to the L{BracketParseCorpusReader constructor
        <BracketParseCorpusReader.__init__>}.
        """
        # CategorizedCorpusReader consumes its keyword arguments from
        # `kwargs` before the remainder is handed to the parse reader.
        CategorizedCorpusReader.__init__(self, kwargs)
        BracketParseCorpusReader.__init__(self, *args, **kwargs)

    def tagged_words(self, fileids=None, categories=None, tagset=None):
        # `_resolve` maps a category selection onto concrete fileids.
        return super().tagged_words(self._resolve(fileids, categories), tagset)

    def tagged_sents(self, fileids=None, categories=None, tagset=None):
        return super().tagged_sents(self._resolve(fileids, categories), tagset)

    def tagged_paras(self, fileids=None, categories=None, tagset=None):
        return super().tagged_paras(self._resolve(fileids, categories), tagset)

    def parsed_words(self, fileids=None, categories=None):
        return super().parsed_words(self._resolve(fileids, categories))

    def parsed_sents(self, fileids=None, categories=None):
        return super().parsed_sents(self._resolve(fileids, categories))

    def parsed_paras(self, fileids=None, categories=None):
        return super().parsed_paras(self._resolve(fileids, categories))
|
||||
|
||||
|
||||
class AlpinoCorpusReader(BracketParseCorpusReader):
    """
    Reader for the Alpino Dutch Treebank.
    This corpus has a lexical breakdown structure embedded, as read by `_parse`
    Unfortunately this puts punctuation and some other words out of the sentence
    order in the xml element tree. This is no good for `tag_` and `word_`
    `_tag` and `_word` will be overridden to use a non-default new parameter 'ordered'
    to the overridden _normalize function. The _parse function can then remain
    untouched.
    """

    def __init__(self, root, encoding="ISO-8859-1", tagset=None):
        """
        :param root: The root directory for this corpus.
        :param encoding: text encoding of the corpus file (Latin-1 by default).
        :param tagset: optional target tagset for the ``tagged_...`` methods.
        """
        BracketParseCorpusReader.__init__(
            self,
            root,
            r"alpino\.xml",
            detect_blocks="blankline",
            encoding=encoding,
            tagset=tagset,
        )

    def _normalize(self, t, ordered=False):
        """Normalize the xml sentence element in t.
        The sentence elements <alpino_ds>, although embedded in a few overall
        xml elements, are separated by blank lines. That's how the reader can
        deliver them one at a time.
        Each sentence has a few category subnodes that are of no use to us.
        The remaining word nodes may or may not appear in the proper order.
        Each word node has attributes, among which:
        - begin : the position of the word in the sentence
        - pos : Part of Speech: the Tag
        - word : the actual word
        The return value is a string with all xml elementes replaced by
        clauses: either a cat clause with nested clauses, or a word clause.
        The order of the bracket clauses closely follows the xml.
        If ordered == True, the word clauses include an order sequence number.
        If ordered == False, the word clauses only have pos and word parts.
        """
        if t[:10] != "<alpino_ds":
            return ""
        # convert XML to sexpr notation
        t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t)
        if ordered:
            # NOTE(review): '<node. *?begin' looks like a typo for
            # '<node .*?begin' -- as written it only matches when "begin"
            # is the first attribute of the node. Confirm against the
            # Alpino XML layout before changing.
            t = re.sub(
                r' <node. *?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>',
                r"(\1 \2 \3)",
                t,
            )
        else:
            t = re.sub(r' <node .*?pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2)", t)
        t = re.sub(r" </node>", r")", t)
        t = re.sub(r"<sentence>.*</sentence>", r"", t)
        t = re.sub(r"</?alpino_ds.*>", r"", t)
        return t

    def _tag(self, t, tagset=None):
        """Return (word, tag) pairs restored to true sentence order.

        The numeric "begin" attribute captured by SORTTAGWRD is used as
        the sort key, then dropped.
        """
        tagged_sent = [
            (int(o), w, p)
            for (o, p, w) in SORTTAGWRD.findall(self._normalize(t, ordered=True))
        ]
        tagged_sent.sort()
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (o, w, p) in tagged_sent
            ]
        else:
            tagged_sent = [(w, p) for (o, w, p) in tagged_sent]
        return tagged_sent

    def _word(self, t):
        """Return a correctly ordered list if words"""
        tagged_sent = self._tag(t)
        return [w for (w, p) in tagged_sent]
|
||||
@@ -0,0 +1,168 @@
|
||||
# Natural Language Toolkit: Categorized Sentences Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
CorpusReader structured for corpora that contain one instance on each row.
|
||||
This CorpusReader is specifically used for the Subjectivity Dataset and the
|
||||
Sentence Polarity Dataset.
|
||||
|
||||
- Subjectivity Dataset information -
|
||||
|
||||
Authors: Bo Pang and Lillian Lee.
|
||||
Url: https://www.cs.cornell.edu/people/pabo/movie-review-data
|
||||
|
||||
Distributed with permission.
|
||||
|
||||
Related papers:
|
||||
|
||||
- Bo Pang and Lillian Lee. "A Sentimental Education: Sentiment Analysis Using
|
||||
Subjectivity Summarization Based on Minimum Cuts". Proceedings of the ACL,
|
||||
2004.
|
||||
|
||||
- Sentence Polarity Dataset information -
|
||||
|
||||
Authors: Bo Pang and Lillian Lee.
|
||||
Url: https://www.cs.cornell.edu/people/pabo/movie-review-data
|
||||
|
||||
Related papers:
|
||||
|
||||
- Bo Pang and Lillian Lee. "Seeing stars: Exploiting class relationships for
|
||||
sentiment categorization with respect to rating scales". Proceedings of the
|
||||
ACL, 2005.
|
||||
"""
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.tokenize import *
|
||||
|
||||
|
||||
class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    A reader for corpora in which each row represents a single instance, mainly
    a sentence. Instances are divided into categories based on their file
    identifiers (see CategorizedCorpusReader).
    Since many corpora allow rows that contain more than one sentence, it is
    possible to specify a sentence tokenizer to retrieve all sentences instead
    of all rows.

    Examples using the Subjectivity Dataset:

    >>> from nltk.corpus import subjectivity
    >>> subjectivity.sents()[23] # doctest: +NORMALIZE_WHITESPACE
    ['television', 'made', 'him', 'famous', ',', 'but', 'his', 'biggest', 'hits',
    'happened', 'off', 'screen', '.']
    >>> subjectivity.categories()
    ['obj', 'subj']
    >>> subjectivity.words(categories='subj')
    ['smart', 'and', 'alert', ',', 'thirteen', ...]

    Examples using the Sentence Polarity Dataset:

    >>> from nltk.corpus import sentence_polarity
    >>> sentence_polarity.sents() # doctest: +NORMALIZE_WHITESPACE
    [['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish',
    'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find',
    'it', 'funny', '.'], ...]
    >>> sentence_polarity.categories()
    ['neg', 'pos']
    """

    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=None,
        encoding="utf8",
        **kwargs
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param sent_tokenizer: a tokenizer for breaking paragraphs into sentences.
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        """

        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer

    def sents(self, fileids=None, categories=None):
        """
        Return all sentences in the corpus or in the specified file(s).

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences have
            to be returned.
        :return: the given file(s) as a list of sentences.
            Each sentence is tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None, categories=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        file(s).

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have to
            be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_sent_block(self, stream):
        """Read up to 20 lines from *stream* and tokenize them into
        sentences (word lists)."""
        sents = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # End of stream: the original `continue` merely spun the
                # remaining iterations on empty reads; stop immediately.
                break
            if self._sent_tokenizer:
                sents.extend(
                    [
                        self._word_tokenizer.tokenize(sent)
                        for sent in self._sent_tokenizer.tokenize(line)
                    ]
                )
            else:
                sents.append(self._word_tokenizer.tokenize(line))
        return sents

    def _read_word_block(self, stream):
        """Read one sentence block and flatten it into a word list."""
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words
|
||||
154
backend/venv/Lib/site-packages/nltk/corpus/reader/chasen.py
Normal file
154
backend/venv/Lib/site-packages/nltk/corpus/reader/chasen.py
Normal file
@@ -0,0 +1,154 @@
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Masato Hagiwara <hagisan@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import sys
|
||||
|
||||
from nltk.corpus.reader import util
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
|
||||
|
||||
class ChasenCorpusReader(CorpusReader):
    """Reader for ChaSen-format morphologically analysed Japanese text.

    All public accessors return a concatenated view over the requested
    files. The ``tagged_*`` variants keep the ChaSen annotation columns;
    the ``*sents``/``*paras`` variants add sentence/paragraph grouping.
    """

    def __init__(self, root, fileids, encoding="utf8", sent_splitter=None):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param encoding: text encoding of the corpus files.
        :param sent_splitter: optional predicate on a ``(word, tag)`` pair
            marking an additional sentence boundary (besides "EOS" lines).
        """
        self._sent_splitter = sent_splitter
        CorpusReader.__init__(self, root, fileids, encoding)

    def _view(self, fileids, tagged, group_by_sent, group_by_para):
        """Build one ChasenCorpusView per file and concatenate them.

        The three booleans select tagging and sentence/paragraph grouping;
        the six public methods below differ only in these flags.
        """
        return concat(
            [
                ChasenCorpusView(
                    fileid,
                    enc,
                    tagged,
                    group_by_sent,
                    group_by_para,
                    self._sent_splitter,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        return self._view(fileids, False, False, False)

    def tagged_words(self, fileids=None):
        return self._view(fileids, True, False, False)

    def sents(self, fileids=None):
        return self._view(fileids, False, True, False)

    def tagged_sents(self, fileids=None):
        return self._view(fileids, True, True, False)

    def paras(self, fileids=None):
        return self._view(fileids, False, True, True)

    def tagged_paras(self, fileids=None):
        return self._view(fileids, True, True, True)
|
||||
|
||||
|
||||
class ChasenCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for ChasenReader. Similar to ``TaggedCorpusView``,
    but this'll use fixed sets of word and sentence tokenizer.
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        sent_splitter=None,
    ):
        """
        :param corpus_file: path of the underlying ChaSen-format file.
        :param encoding: text encoding of the file.
        :param tagged: if true, yield (word, tag) pairs; otherwise bare words.
        :param group_by_sent: if true, group tokens into sentence lists.
        :param group_by_para: if true, group sentences into paragraph lists.
        :param sent_splitter: optional predicate on a (word, tag) pair that
            marks an extra sentence boundary besides "EOS" lines.
        """
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._sent_splitter = sent_splitter
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Reads one paragraph at a time."""
        block = []
        # Paragraphs are runs of non-empty lines terminated by an "EOS" line.
        for para_str in read_regexp_block(stream, r".", r"^EOS\n"):
            para = []

            sent = []
            for line in para_str.splitlines():
                _eos = line.strip() == "EOS"
                # ChaSen lines are tab-separated: surface form first,
                # then the annotation columns (re-joined as one tag string).
                _cells = line.split("\t")
                w = (_cells[0], "\t".join(_cells[1:]))
                if not _eos:
                    sent.append(w)

                # Close the current sentence on "EOS" or when the optional
                # splitter flags this token as a boundary.
                if _eos or (self._sent_splitter and self._sent_splitter(w)):
                    if not self._tagged:
                        sent = [w for (w, t) in sent]
                    if self._group_by_sent:
                        para.append(sent)
                    else:
                        para.extend(sent)
                    sent = []

            # Flush a trailing sentence with no closing boundary.
            if len(sent) > 0:
                if not self._tagged:
                    sent = [w for (w, t) in sent]

                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)

            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)

        return block
|
||||
|
||||
|
||||
def demo():
    """Print a small sample of the JEITA corpus: a run of words, then a
    few tagged sentences rendered as word/POS lines."""
    # Removed unused `import nltk` (nothing in the body referenced it).
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
    print("/".join(jeita.words()[22100:22140]))

    print(
        "\nEOS\n".join(
            "\n".join("{}/{}".format(w[0], w[1].split("\t")[2]) for w in sent)
            for sent in jeita.tagged_sents()[2170:2173]
        )
    )
|
||||
|
||||
|
||||
def test():
    """Smoke test: the tag component of a tagged word decodes to str."""
    from nltk.corpus.util import LazyCorpusLoader

    corpus = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
    first_tag = corpus.tagged_words()[0][1]
    assert isinstance(first_tag, str)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the demo and the smoke test when executed as a script.
    demo()
    test()
|
||||
630
backend/venv/Lib/site-packages/nltk/corpus/reader/childes.py
Normal file
630
backend/venv/Lib/site-packages/nltk/corpus/reader/childes.py
Normal file
@@ -0,0 +1,630 @@
|
||||
# CHILDES XML Corpus Reader
|
||||
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Tomonori Nagano <tnagano@gc.cuny.edu>
|
||||
# Alexis Dimitriadis <A.Dimitriadis@uu.nl>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Corpus reader for the XML version of the CHILDES corpus.
|
||||
"""
|
||||
|
||||
__docformat__ = "epytext en"
|
||||
|
||||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
from nltk.corpus.reader.util import concat
|
||||
from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader
|
||||
from nltk.util import LazyConcatenation, LazyMap, flatten
|
||||
|
||||
# to resolve the namespace issue
|
||||
NS = "http://www.talkbank.org/ns/talkbank"
|
||||
|
||||
|
||||
class CHILDESCorpusReader(XMLCorpusReader):
|
||||
"""
|
||||
Corpus reader for the XML version of the CHILDES corpus.
|
||||
The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML
|
||||
version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``.
|
||||
Copy the needed parts of the CHILDES XML corpus into the NLTK data directory
|
||||
(``nltk_data/corpora/CHILDES/``).
|
||||
|
||||
For access to the file text use the usual nltk functions,
|
||||
``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``.
|
||||
"""
|
||||
|
||||
    def __init__(self, root, fileids, lazy=True):
        # lazy=True makes the word/sentence accessors return lazy
        # concatenations; lazy=False materializes results per file.
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy
|
||||
|
||||
    def words(
        self,
        fileids=None,
        speaker="ALL",
        stem=False,
        relation=False,
        strip_space=True,
        replace=False,
    ):
        """
        :return: the given file(s) as a list of words
        :rtype: list(str)

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of (stem, index,
            dependent_index)
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """
        sent = None  # flat word list: no sentence bracketing
        pos = False  # no POS tags
        if not self._lazy:
            # NOTE(review): this eager branch returns one list per file
            # (not flattened), whereas the lazy branch below concatenates
            # across files -- possible inconsistency; confirm intended.
            return [
                self._get_words(
                    fileid, speaker, sent, stem, relation, pos, strip_space, replace
                )
                for fileid in self.abspaths(fileids)
            ]

        # Lazy path: each file is parsed only when its words are accessed.
        get_words = lambda fileid: self._get_words(
            fileid, speaker, sent, stem, relation, pos, strip_space, replace
        )
        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
|
||||
|
||||
    def tagged_words(
        self,
        fileids=None,
        speaker="ALL",
        stem=False,
        relation=False,
        strip_space=True,
        replace=False,
    ):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of (stem, index,
            dependent_index)
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """
        sent = None  # flat word list: no sentence bracketing
        pos = True  # include POS tags
        if not self._lazy:
            # Eager branch: one result list per file (see note on words()).
            return [
                self._get_words(
                    fileid, speaker, sent, stem, relation, pos, strip_space, replace
                )
                for fileid in self.abspaths(fileids)
            ]

        # Lazy path: each file is parsed only when its words are accessed.
        get_words = lambda fileid: self._get_words(
            fileid, speaker, sent, stem, relation, pos, strip_space, replace
        )
        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
|
||||
|
||||
def sents(
|
||||
self,
|
||||
fileids=None,
|
||||
speaker="ALL",
|
||||
stem=False,
|
||||
relation=None,
|
||||
strip_space=True,
|
||||
replace=False,
|
||||
):
|
||||
"""
|
||||
:return: the given file(s) as a list of sentences or utterances, each
|
||||
encoded as a list of word strings.
|
||||
:rtype: list(list(str))
|
||||
|
||||
:param speaker: If specified, select specific speaker(s) defined
|
||||
in the corpus. Default is 'ALL' (all participants). Common choices
|
||||
are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
|
||||
researchers)
|
||||
:param stem: If true, then use word stems instead of word strings.
|
||||
:param relation: If true, then return tuples of ``(str,pos,relation_list)``.
|
||||
If there is manually-annotated relation info, it will return
|
||||
tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
|
||||
:param strip_space: If true, then strip trailing spaces from word
|
||||
tokens. Otherwise, leave the spaces on the tokens.
|
||||
:param replace: If true, then use the replaced (intended) word instead
|
||||
of the original word (e.g., 'wat' will be replaced with 'watch')
|
||||
"""
|
||||
sent = True
|
||||
pos = False
|
||||
if not self._lazy:
|
||||
return [
|
||||
self._get_words(
|
||||
fileid, speaker, sent, stem, relation, pos, strip_space, replace
|
||||
)
|
||||
for fileid in self.abspaths(fileids)
|
||||
]
|
||||
|
||||
get_words = lambda fileid: self._get_words(
|
||||
fileid, speaker, sent, stem, relation, pos, strip_space, replace
|
||||
)
|
||||
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
|
||||
|
||||
def tagged_sents(
|
||||
self,
|
||||
fileids=None,
|
||||
speaker="ALL",
|
||||
stem=False,
|
||||
relation=None,
|
||||
strip_space=True,
|
||||
replace=False,
|
||||
):
|
||||
"""
|
||||
:return: the given file(s) as a list of
|
||||
sentences, each encoded as a list of ``(word,tag)`` tuples.
|
||||
:rtype: list(list(tuple(str,str)))
|
||||
|
||||
:param speaker: If specified, select specific speaker(s) defined
|
||||
in the corpus. Default is 'ALL' (all participants). Common choices
|
||||
are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
|
||||
researchers)
|
||||
:param stem: If true, then use word stems instead of word strings.
|
||||
:param relation: If true, then return tuples of ``(str,pos,relation_list)``.
|
||||
If there is manually-annotated relation info, it will return
|
||||
tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
|
||||
:param strip_space: If true, then strip trailing spaces from word
|
||||
tokens. Otherwise, leave the spaces on the tokens.
|
||||
:param replace: If true, then use the replaced (intended) word instead
|
||||
of the original word (e.g., 'wat' will be replaced with 'watch')
|
||||
"""
|
||||
sent = True
|
||||
pos = True
|
||||
if not self._lazy:
|
||||
return [
|
||||
self._get_words(
|
||||
fileid, speaker, sent, stem, relation, pos, strip_space, replace
|
||||
)
|
||||
for fileid in self.abspaths(fileids)
|
||||
]
|
||||
|
||||
get_words = lambda fileid: self._get_words(
|
||||
fileid, speaker, sent, stem, relation, pos, strip_space, replace
|
||||
)
|
||||
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
|
||||
|
||||
def corpus(self, fileids=None):
|
||||
"""
|
||||
:return: the given file(s) as a dict of ``(corpus_property_key, value)``
|
||||
:rtype: list(dict)
|
||||
"""
|
||||
if not self._lazy:
|
||||
return [self._get_corpus(fileid) for fileid in self.abspaths(fileids)]
|
||||
return LazyMap(self._get_corpus, self.abspaths(fileids))
|
||||
|
||||
def _get_corpus(self, fileid):
|
||||
results = dict()
|
||||
xmldoc = ElementTree.parse(fileid).getroot()
|
||||
for key, value in xmldoc.items():
|
||||
results[key] = value
|
||||
return results
|
||||
|
||||
def participants(self, fileids=None):
|
||||
"""
|
||||
:return: the given file(s) as a dict of
|
||||
``(participant_property_key, value)``
|
||||
:rtype: list(dict)
|
||||
"""
|
||||
if not self._lazy:
|
||||
return [self._get_participants(fileid) for fileid in self.abspaths(fileids)]
|
||||
return LazyMap(self._get_participants, self.abspaths(fileids))
|
||||
|
||||
def _get_participants(self, fileid):
|
||||
# multidimensional dicts
|
||||
def dictOfDicts():
|
||||
return defaultdict(dictOfDicts)
|
||||
|
||||
xmldoc = ElementTree.parse(fileid).getroot()
|
||||
# getting participants' data
|
||||
pat = dictOfDicts()
|
||||
for participant in xmldoc.findall(
|
||||
f".//{{{NS}}}Participants/{{{NS}}}participant"
|
||||
):
|
||||
for key, value in participant.items():
|
||||
pat[participant.get("id")][key] = value
|
||||
return pat
|
||||
|
||||
def age(self, fileids=None, speaker="CHI", month=False):
|
||||
"""
|
||||
:return: the given file(s) as string or int
|
||||
:rtype: list or int
|
||||
|
||||
:param month: If true, return months instead of year-month-date
|
||||
"""
|
||||
if not self._lazy:
|
||||
return [
|
||||
self._get_age(fileid, speaker, month)
|
||||
for fileid in self.abspaths(fileids)
|
||||
]
|
||||
get_age = lambda fileid: self._get_age(fileid, speaker, month)
|
||||
return LazyMap(get_age, self.abspaths(fileids))
|
||||
|
||||
def _get_age(self, fileid, speaker, month):
|
||||
xmldoc = ElementTree.parse(fileid).getroot()
|
||||
for pat in xmldoc.findall(f".//{{{NS}}}Participants/{{{NS}}}participant"):
|
||||
try:
|
||||
if pat.get("id") == speaker:
|
||||
age = pat.get("age")
|
||||
if month:
|
||||
age = self.convert_age(age)
|
||||
return age
|
||||
# some files don't have age data
|
||||
except (TypeError, AttributeError) as e:
|
||||
return None
|
||||
|
||||
def convert_age(self, age_year):
|
||||
"Caclculate age in months from a string in CHILDES format"
|
||||
m = re.match(r"P(\d+)Y(\d+)M?(\d?\d?)D?", age_year)
|
||||
age_month = int(m.group(1)) * 12 + int(m.group(2))
|
||||
try:
|
||||
if int(m.group(3)) > 15:
|
||||
age_month += 1
|
||||
# some corpora don't have age information?
|
||||
except ValueError as e:
|
||||
pass
|
||||
return age_month
|
||||
|
||||
def MLU(self, fileids=None, speaker="CHI"):
|
||||
"""
|
||||
:return: the given file(s) as a floating number
|
||||
:rtype: list(float)
|
||||
"""
|
||||
if not self._lazy:
|
||||
return [
|
||||
self._getMLU(fileid, speaker=speaker)
|
||||
for fileid in self.abspaths(fileids)
|
||||
]
|
||||
get_MLU = lambda fileid: self._getMLU(fileid, speaker=speaker)
|
||||
return LazyMap(get_MLU, self.abspaths(fileids))
|
||||
|
||||
    def _getMLU(self, fileid, speaker):
        """Compute the mean length of utterance (in morphemes) for one file.

        Sentences containing unintelligible material (POS 'unk'), empty
        sentences, and exact repetitions of the previous sentence are
        excluded.  Fillers (POS 'co' or None) are subtracted from the
        word count, and each sentence containing fillers is discounted
        from the sentence count.  Returns 0 when no sentences remain.
        """
        sents = self._get_words(
            fileid,
            speaker=speaker,
            sent=True,
            stem=True,
            relation=False,
            pos=True,
            strip_space=True,
            replace=True,
        )
        results = []
        lastSent = []
        numFillers = 0
        sentDiscount = 0
        for sent in sents:
            posList = [pos for (word, pos) in sent]
            # skip the sentence if any part of it is unintelligible ('unk')
            if any(pos == "unk" for pos in posList):
                continue
            # if the sentence is null
            elif sent == []:
                continue
            # if the sentence is the same as the last sent
            elif sent == lastSent:
                continue
            else:
                results.append([word for (word, pos) in sent])
                # count number of fillers ('co' or missing POS)
                if len({"co", None}.intersection(posList)) > 0:
                    numFillers += posList.count("co")
                    numFillers += posList.count(None)
                    sentDiscount += 1
            lastSent = sent
        try:
            thisWordList = flatten(results)
            # count number of morphemes
            # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
            numWords = (
                len(flatten([word.split("-") for word in thisWordList])) - numFillers
            )
            numSents = len(results) - sentDiscount
            mlu = numWords / numSents
        except ZeroDivisionError:
            # no usable sentences in this file
            mlu = 0
        # return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents}
        return mlu
|
||||
|
||||
    def _get_words(
        self, fileid, speaker, sent, stem, relation, pos, strip_space, replace
    ):
        """Parse one CHAT XML file and extract its words.

        Depending on the flags, each word is a plain string, a
        ``(word, tag)`` tuple (when ``pos`` or ``relation``), or a tuple
        extended with dependency-relation info (when ``relation``).
        Words are grouped per utterance when ``sent`` or ``relation`` is
        set; otherwise a flat sequence is returned.
        """
        if (
            isinstance(speaker, str) and speaker != "ALL"
        ):  # ensure we have a list of speakers
            speaker = [speaker]
        xmldoc = ElementTree.parse(fileid).getroot()
        # processing each xml doc
        results = []
        for xmlsent in xmldoc.findall(".//{%s}u" % NS):
            sents = []
            # select speakers
            if speaker == "ALL" or xmlsent.get("who") in speaker:
                for xmlword in xmlsent.findall(".//{%s}w" % NS):
                    infl = None
                    suffixStem = None
                    suffixTag = None
                    # getting replaced words: if the utterance carries a
                    # <replacement> (or <wk>) node, use it instead of the
                    # original word element
                    if replace and xmlsent.find(f".//{{{NS}}}w/{{{NS}}}replacement"):
                        xmlword = xmlsent.find(
                            f".//{{{NS}}}w/{{{NS}}}replacement/{{{NS}}}w"
                        )
                    elif replace and xmlsent.find(f".//{{{NS}}}w/{{{NS}}}wk"):
                        xmlword = xmlsent.find(f".//{{{NS}}}w/{{{NS}}}wk")
                    # get text
                    if xmlword.text:
                        word = xmlword.text
                    else:
                        word = ""
                    # strip tailing space
                    if strip_space:
                        word = word.strip()
                    # stem: replace the surface form with the <stem> text
                    if relation or stem:
                        try:
                            xmlstem = xmlword.find(".//{%s}stem" % NS)
                            word = xmlstem.text
                        except AttributeError as e:
                            # no <stem> node; keep the surface form
                            pass
                        # if there is an inflection, append it as '-INFL'
                        try:
                            xmlinfl = xmlword.find(
                                f".//{{{NS}}}mor/{{{NS}}}mw/{{{NS}}}mk"
                            )
                            word += "-" + xmlinfl.text
                        except:
                            pass
                        # if there is a suffix, append it as '~SUFFIX'
                        try:
                            xmlsuffix = xmlword.find(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem"
                                % (NS, NS, NS, NS)
                            )
                            suffixStem = xmlsuffix.text
                        except AttributeError:
                            suffixStem = ""
                        if suffixStem:
                            word += "~" + suffixStem
                    # pos: build the tag, optionally 'major:sub' when a
                    # subcategory <s> node is present
                    if relation or pos:
                        try:
                            xmlpos = xmlword.findall(".//{%s}c" % NS)
                            xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                            if xmlpos2 != []:
                                tag = xmlpos[0].text + ":" + xmlpos2[0].text
                            else:
                                tag = xmlpos[0].text
                        except (AttributeError, IndexError) as e:
                            tag = ""
                        try:
                            xmlsuffixpos = xmlword.findall(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c"
                                % (NS, NS, NS, NS, NS)
                            )
                            xmlsuffixpos2 = xmlword.findall(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s"
                                % (NS, NS, NS, NS, NS)
                            )
                            if xmlsuffixpos2:
                                suffixTag = (
                                    xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text
                                )
                            else:
                                suffixTag = xmlsuffixpos[0].text
                        except:
                            pass
                        if suffixTag:
                            tag += "~" + suffixTag
                        word = (word, tag)
                    # relational
                    # the gold standard is stored in
                    # <mor></mor><mor type="trn"><gra type="grt">
                    if relation == True:
                        for xmlstem_rel in xmlword.findall(
                            f".//{{{NS}}}mor/{{{NS}}}gra"
                        ):
                            # non-gold relation: extend to (word, tag, 'i|head|rel')
                            if not xmlstem_rel.get("type") == "grt":
                                word = (
                                    word[0],
                                    word[1],
                                    xmlstem_rel.get("index")
                                    + "|"
                                    + xmlstem_rel.get("head")
                                    + "|"
                                    + xmlstem_rel.get("relation"),
                                )
                            else:
                                # gold relation: repeat word/tag and append
                                # the gold 'i|head|rel' triple
                                word = (
                                    word[0],
                                    word[1],
                                    word[2],
                                    word[0],
                                    word[1],
                                    xmlstem_rel.get("index")
                                    + "|"
                                    + xmlstem_rel.get("head")
                                    + "|"
                                    + xmlstem_rel.get("relation"),
                                )
                        try:
                            # same treatment for the suffix's relation info
                            for xmlpost_rel in xmlword.findall(
                                f".//{{{NS}}}mor/{{{NS}}}mor-post/{{{NS}}}gra"
                            ):
                                if not xmlpost_rel.get("type") == "grt":
                                    suffixStem = (
                                        suffixStem[0],
                                        suffixStem[1],
                                        xmlpost_rel.get("index")
                                        + "|"
                                        + xmlpost_rel.get("head")
                                        + "|"
                                        + xmlpost_rel.get("relation"),
                                    )
                                else:
                                    suffixStem = (
                                        suffixStem[0],
                                        suffixStem[1],
                                        suffixStem[2],
                                        suffixStem[0],
                                        suffixStem[1],
                                        xmlpost_rel.get("index")
                                        + "|"
                                        + xmlpost_rel.get("head")
                                        + "|"
                                        + xmlpost_rel.get("relation"),
                                    )
                        except:
                            pass
                    sents.append(word)
                # group per utterance, or flatten into one long sequence
                if sent or relation:
                    results.append(sents)
                else:
                    results.extend(sents)
        return LazyMap(lambda x: x, results)
|
||||
|
||||
    # Ready-to-use browser opener

    """
    The base URL for viewing files on the childes website. This
    shouldn't need to be changed, unless CHILDES changes the configuration
    of their server or unless the user sets up their own corpus webserver.
    """
    # Used by webview_file() to build the full browser URL.
    childes_url_base = r"https://childes.talkbank.org/browser/index.php?url="
|
||||
|
||||
def webview_file(self, fileid, urlbase=None):
|
||||
"""Map a corpus file to its web version on the CHILDES website,
|
||||
and open it in a web browser.
|
||||
|
||||
The complete URL to be used is:
|
||||
childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha')
|
||||
|
||||
If no urlbase is passed, we try to calculate it. This
|
||||
requires that the childes corpus was set up to mirror the
|
||||
folder hierarchy under childes.psy.cmu.edu/data-xml/, e.g.:
|
||||
nltk_data/corpora/childes/Eng-USA/Cornell/??? or
|
||||
nltk_data/corpora/childes/Romance/Spanish/Aguirre/???
|
||||
|
||||
The function first looks (as a special case) if "Eng-USA" is
|
||||
on the path consisting of <corpus root>+fileid; then if
|
||||
"childes", possibly followed by "data-xml", appears. If neither
|
||||
one is found, we use the unmodified fileid and hope for the best.
|
||||
If this is not right, specify urlbase explicitly, e.g., if the
|
||||
corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'.
|
||||
"""
|
||||
|
||||
import webbrowser
|
||||
|
||||
if urlbase:
|
||||
path = urlbase + "/" + fileid
|
||||
else:
|
||||
full = self.root + "/" + fileid
|
||||
full = re.sub(r"\\", "/", full)
|
||||
if "/childes/" in full.lower():
|
||||
# Discard /data-xml/ if present
|
||||
path = re.findall(r"(?i)/childes(?:/data-xml)?/(.*)\.xml", full)[0]
|
||||
elif "eng-usa" in full.lower():
|
||||
path = "Eng-USA/" + re.findall(r"/(?i)Eng-USA/(.*)\.xml", full)[0]
|
||||
else:
|
||||
path = fileid
|
||||
|
||||
# Strip ".xml" and add ".cha", as necessary:
|
||||
if path.endswith(".xml"):
|
||||
path = path[:-4]
|
||||
|
||||
if not path.endswith(".cha"):
|
||||
path = path + ".cha"
|
||||
|
||||
url = self.childes_url_base + path
|
||||
|
||||
webbrowser.open_new_tab(url)
|
||||
print("Opening in browser:", url)
|
||||
# Pausing is a good idea, but it's up to the user...
|
||||
# raw_input("Hit Return to continue")
|
||||
|
||||
|
||||
def demo(corpus_root=None):
    """
    Demonstrate the CHILDES corpus reader on a few files.

    The CHILDES corpus should be manually downloaded and saved
    to ``[NLTK_Data_Dir]/corpora/childes/``
    """
    if not corpus_root:
        from nltk.data import find

        corpus_root = find("corpora/childes/data-xml/Eng-USA/")

    try:
        childes = CHILDESCorpusReader(corpus_root, ".*.xml")
        # describe all corpus
        for file in childes.fileids()[:5]:
            corpus = ""
            corpus_id = ""
            for key, value in childes.corpus(file)[0].items():
                if key == "Corpus":
                    corpus = value
                if key == "Id":
                    corpus_id = value
            print("Reading", corpus, corpus_id, " .....")
            print("words:", childes.words(file)[:7], "...")
            print(
                "words with replaced words:",
                childes.words(file, replace=True)[:7],
                " ...",
            )
            print("words with pos tags:", childes.tagged_words(file)[:7], " ...")
            print("words (only MOT):", childes.words(file, speaker="MOT")[:7], "...")
            print("words (only CHI):", childes.words(file, speaker="CHI")[:7], "...")
            print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
            print(
                "words with relations and pos-tag:",
                childes.words(file, relation=True)[:5],
                " ...",
            )
            print("sentence:", childes.sents(file)[:2], " ...")
            for participant, values in childes.participants(file)[0].items():
                for key, value in values.items():
                    print("\tparticipant", participant, key, ":", value)
            print("num of sent:", len(childes.sents(file)))
            print("num of morphemes:", len(childes.words(file, stem=True)))
            print("age:", childes.age(file))
            print("age in month:", childes.age(file, month=True))
            print("MLU:", childes.MLU(file))
            print()

    except LookupError as e:
        # Fixed the mismatched quote in the example call below ( ' vs " ).
        print(
            """The CHILDES corpus, or the parts you need, should be manually
        downloaded from https://childes.talkbank.org/data-xml/ and saved at
        [NLTK_Data_Dir]/corpora/childes/
        Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
        demo('/path/to/childes/data-xml/Eng-USA/')
        """
        )
        # corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
        # corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
        ##this fails
        # childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())


if __name__ == "__main__":
    demo()
|
||||
273
backend/venv/Lib/site-packages/nltk/corpus/reader/chunked.py
Normal file
273
backend/venv/Lib/site-packages/nltk/corpus/reader/chunked.py
Normal file
@@ -0,0 +1,273 @@
|
||||
# Natural Language Toolkit: Chunked Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A reader for corpora that contain chunked (and optionally tagged)
|
||||
documents.
|
||||
"""
|
||||
|
||||
import codecs
|
||||
import os.path
|
||||
|
||||
import nltk
|
||||
from nltk.chunk import tagstr2tree
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.tokenize import *
|
||||
from nltk.tree import Tree
|
||||
|
||||
|
||||
class ChunkedCorpusReader(CorpusReader):
    """
    Reader for chunked (and optionally tagged) corpora. Paragraphs
    are split using a block reader. They are then tokenized into
    sentences using a sentence tokenizer. Finally, these sentences
    are parsed into chunk trees using a string-to-chunktree conversion
    function. Each of these steps can be performed using a default
    function or a custom function. By default, paragraphs are split
    on blank lines; sentences are listed one per line; and sentences
    are parsed into chunk trees using ``nltk.chunk.tagstr2tree``.
    """

    def __init__(
        self,
        root,
        fileids,
        extension="",
        str2chunktree=tagstr2tree,
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
        encoding="utf8",
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param extension: accepted for backward compatibility; not used
            anywhere in this class.
        :param str2chunktree: function converting one sentence string to
            a chunk Tree.
        :param sent_tokenizer: tokenizer splitting a paragraph block into
            sentence strings (default: one sentence per line).
        :param para_block_reader: function reading paragraph blocks from
            a stream (default: blank-line-separated blocks).
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader, tagset)
        """Arguments for corpus views generated by this corpus: a tuple
        (str2chunktree, sent_tokenizer, para_block_tokenizer)"""

    # NOTE: in each method below, the four ints passed to ChunkedCorpusView
    # are the flags (tagged, group_by_sent, group_by_para, chunked).

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                ChunkedCorpusView(f, enc, 0, 0, 0, 0, *self._cv_args)
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return concat(
            [
                ChunkedCorpusView(f, enc, 0, 1, 0, 0, *self._cv_args)
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        return concat(
            [
                ChunkedCorpusView(f, enc, 0, 1, 1, 0, *self._cv_args)
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def tagged_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))
        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 0, 0, 0, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def tagged_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.

        :rtype: list(list(tuple(str,str)))
        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 1, 0, 0, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def tagged_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of ``(word,tag)`` tuples.
        :rtype: list(list(list(tuple(str,str))))
        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 1, 1, 0, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def chunked_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and chunks. Words are encoded as ``(word, tag)``
            tuples (if the corpus has tags) or word strings (if the
            corpus has no tags). Chunks are encoded as depth-one
            trees over ``(word,tag)`` tuples or word strings.
        :rtype: list(tuple(str,str) and Tree)
        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 0, 0, 1, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def chunked_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a shallow Tree. The leaves
            of these trees are encoded as ``(word, tag)`` tuples (if
            the corpus has tags) or word strings (if the corpus has no
            tags).
        :rtype: list(Tree)
        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 1, 0, 1, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def chunked_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as a shallow Tree. The leaves of these
            trees are encoded as ``(word, tag)`` tuples (if the corpus
            has tags) or word strings (if the corpus has no tags).
        :rtype: list(list(Tree))
        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 1, 1, 1, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def _read_block(self, stream):
        # Parse one blank-line-separated block into chunk trees.
        return [tagstr2tree(t) for t in read_blankline_block(stream)]
|
||||
|
||||
|
||||
class ChunkedCorpusView(StreamBackedCorpusView):
    """A stream-backed corpus view over a chunked (and optionally tagged)
    corpus file.

    The boolean flags ``tagged``, ``group_by_sent``, ``group_by_para``
    and ``chunked`` select whether POS tags and chunk structure are
    kept, and whether tokens are grouped into sentences and paragraphs.
    """

    def __init__(
        self,
        fileid,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        chunked,
        str2chunktree,
        sent_tokenizer,
        para_block_reader,
        source_tagset=None,
        target_tagset=None,
    ):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._chunked = chunked
        self._str2chunktree = str2chunktree
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._source_tagset = source_tagset
        self._target_tagset = target_tagset

    def read_block(self, stream):
        """Read one paragraph block from ``stream`` and convert it
        according to the view's flags."""
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                sent = self._str2chunktree(
                    sent_str,
                    source_tagset=self._source_tagset,
                    target_tagset=self._target_tagset,
                )

                # If requested, throw away the tags.
                if not self._tagged:
                    sent = self._untag(sent)

                # If requested, throw away the chunks.
                if not self._chunked:
                    sent = sent.leaves()

                # Add the sentence to `para`.
                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)

            # Add the paragraph to `block`.
            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)

        # Return the block
        return block

    def _untag(self, tree):
        # Recursively replace each (word, tag) leaf with the bare word,
        # modifying the tree in place.
        for i, child in enumerate(tree):
            if isinstance(child, Tree):
                self._untag(child)
            elif isinstance(child, tuple):
                tree[i] = child[0]
            else:
                raise ValueError("expected child to be Tree or tuple")
        return tree
|
||||
88
backend/venv/Lib/site-packages/nltk/corpus/reader/cmudict.py
Normal file
88
backend/venv/Lib/site-packages/nltk/corpus/reader/cmudict.py
Normal file
@@ -0,0 +1,88 @@
|
||||
# Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6]
|
||||
ftp://ftp.cs.cmu.edu/project/speech/dict/
|
||||
Copyright 1998 Carnegie Mellon University
|
||||
|
||||
File Format: Each line consists of an uppercased word, a counter
|
||||
(for alternative pronunciations), and a transcription. Vowels are
|
||||
marked for stress (1=primary, 2=secondary, 0=no stress). E.g.:
|
||||
NATURAL 1 N AE1 CH ER0 AH0 L
|
||||
|
||||
The dictionary contains 127069 entries. Of these, 119400 words are assigned
|
||||
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
|
||||
three or more pronunciations. Many of these are fast-speech variants.
|
||||
|
||||
Phonemes: There are 39 phonemes, as shown below:
|
||||
|
||||
Phoneme Example Translation Phoneme Example Translation
|
||||
------- ------- ----------- ------- ------- -----------
|
||||
AA odd AA D AE at AE T
|
||||
AH hut HH AH T AO ought AO T
|
||||
AW cow K AW AY hide HH AY D
|
||||
B be B IY CH cheese CH IY Z
|
||||
D dee D IY DH thee DH IY
|
||||
EH Ed EH D ER hurt HH ER T
|
||||
EY ate EY T F fee F IY
|
||||
G green G R IY N HH he HH IY
|
||||
IH it IH T IY eat IY T
|
||||
JH gee JH IY K key K IY
|
||||
L lee L IY M me M IY
|
||||
N knee N IY NG ping P IH NG
|
||||
OW oat OW T OY toy T OY
|
||||
P pee P IY R read R IY D
|
||||
S sea S IY SH she SH IY
|
||||
T tea T IY TH theta TH EY T AH
|
||||
UH hood HH UH D UW two T UW
|
||||
V vee V IY W we W IY
|
||||
Y yield Y IY L D Z zee Z IY
|
||||
ZH seizure S IY ZH ER
|
||||
"""
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.util import Index
|
||||
|
||||
|
||||
class CMUDictCorpusReader(CorpusReader):
    """Corpus reader for the CMU Pronouncing Dictionary (cmudict)."""

    def entries(self):
        """
        :return: the cmudict lexicon as a list of entries
            containing (word, transcriptions) tuples.
        """
        return concat(
            [
                StreamBackedCorpusView(fileid, read_cmudict_block, encoding=enc)
                for fileid, enc in self.abspaths(None, True)
            ]
        )

    def words(self):
        """
        :return: a list of all words defined in the cmudict lexicon.
        """
        # entries() already lowercases words in read_cmudict_block;
        # the extra .lower() here is a harmless no-op.
        return [word.lower() for (word, _) in self.entries()]

    def dict(self):
        """
        :return: the cmudict lexicon as a dictionary, whose keys are
            lowercase words and whose values are lists of pronunciations.
        """
        return dict(Index(self.entries()))
|
||||
|
||||
|
||||
def read_cmudict_block(stream):
    """Read up to 100 cmudict entries from ``stream``.

    Each line has the form ``WORD COUNTER PHONEME...``; the counter is
    discarded and the word is lowercased, yielding
    ``(word, phoneme_list)`` tuples.  Stops early at end of file.
    """
    block = []
    for _ in range(100):  # Read 100 at a time.
        line = stream.readline()
        if not line:  # end of file.
            break
        fields = line.split()
        block.append((fields[0].lower(), fields[2:]))
    return block
|
||||
@@ -0,0 +1,309 @@
|
||||
# Natural Language Toolkit: Comparative Sentence Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
CorpusReader for the Comparative Sentence Dataset.
|
||||
|
||||
- Comparative Sentence Dataset information -
|
||||
|
||||
Annotated by: Nitin Jindal and Bing Liu, 2006.
|
||||
Department of Computer Sicence
|
||||
University of Illinois at Chicago
|
||||
|
||||
Contact: Nitin Jindal, njindal@cs.uic.edu
|
||||
Bing Liu, liub@cs.uic.edu (https://www.cs.uic.edu/~liub)
|
||||
|
||||
Distributed with permission.
|
||||
|
||||
Related papers:
|
||||
|
||||
- Nitin Jindal and Bing Liu. "Identifying Comparative Sentences in Text Documents".
|
||||
Proceedings of the ACM SIGIR International Conference on Information Retrieval
|
||||
(SIGIR-06), 2006.
|
||||
|
||||
- Nitin Jindal and Bing Liu. "Mining Comprative Sentences and Relations".
|
||||
Proceedings of Twenty First National Conference on Artificial Intelligence
|
||||
(AAAI-2006), 2006.
|
||||
|
||||
- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
|
||||
Proceedings of the 22nd International Conference on Computational Linguistics
|
||||
(Coling-2008), Manchester, 18-22 August, 2008.
|
||||
"""
|
||||
import re
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.tokenize import *
|
||||
|
||||
# Regular expressions for dataset components
STARS = re.compile(r"^\*+$")  # separator line made only of asterisks
COMPARISON = re.compile(r"<cs-[1234]>")  # opening tag, any comparison type 1-4
CLOSE_COMPARISON = re.compile(r"</cs-[1234]>")  # matching closing tag
GRAD_COMPARISON = re.compile(r"<cs-[123]>")  # gradable comparison types (1-3)
NON_GRAD_COMPARISON = re.compile(r"<cs-4>")  # non-gradable comparison (type 4)
# "N_text" fields: a digit label, '_', then the entity/feature text
ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)")
KEYWORD = re.compile(r"\(([^\(]*)\)$")  # trailing parenthesized keyword
|
||||
|
||||
|
||||
class Comparison:
    """
    A Comparison represents a comparative sentence and its constituents.
    """

    def __init__(
        self,
        text=None,
        comp_type=None,
        entity_1=None,
        entity_2=None,
        feature=None,
        keyword=None,
    ):
        """
        :param text: a string (optionally tokenized) containing a comparison.
        :param comp_type: an integer defining the type of comparison expressed.
            Values can be: 1 (Non-equal gradable), 2 (Equative), 3 (Superlative),
            4 (Non-gradable).
        :param entity_1: the first entity considered in the comparison relation.
        :param entity_2: the second entity considered in the comparison relation.
        :param feature: the feature considered in the comparison relation.
        :param keyword: the word or phrase which is used for that comparative relation.
        """
        self.text = text
        self.comp_type = comp_type
        self.entity_1 = entity_1
        self.entity_2 = entity_2
        self.feature = feature
        self.keyword = keyword

    def __repr__(self):
        # Reconstruct a constructor-like representation of this comparison.
        return (
            f'Comparison(text="{self.text}", comp_type={self.comp_type}, '
            f'entity_1="{self.entity_1}", entity_2="{self.entity_2}", '
            f'feature="{self.feature}", keyword="{self.keyword}")'
        )
|
||||
|
||||
|
||||
class ComparativeSentencesCorpusReader(CorpusReader):
    """
    Reader for the Comparative Sentence Dataset by Jindal and Liu (2006).

        >>> from nltk.corpus import comparative_sentences
        >>> comparison = comparative_sentences.comparisons()[0]
        >>> comparison.text # doctest: +NORMALIZE_WHITESPACE
        ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly',
        'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve",
        'had', '.']
        >>> comparison.entity_2
        'models'
        >>> (comparison.feature, comparison.keyword)
        ('rewind', 'more')
        >>> len(comparative_sentences.comparisons())
        853
    """

    #: Corpus view class used by this reader (lazily streams blocks from disk).
    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: a list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param sent_tokenizer: tokenizer for breaking paragraphs into sentences.
        :param encoding: the encoding that should be used to read the corpus.
        """

        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        # Name of the corpus readme file.
        self._readme = "README.txt"

    def comparisons(self, fileids=None):
        """
        Return all comparisons in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            comparisons have to be returned.
        :return: the given file(s) as a list of Comparison objects.
        :rtype: list(Comparison)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_comparison_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def keywords(self, fileids=None):
        """
        Return a set of all keywords used in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            keywords have to be returned.
        :return: the set of keywords and comparative phrases used in the corpus.
        :rtype: set(str)
        """
        all_keywords = concat(
            [
                self.CorpusView(path, self._read_keyword_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

        # Lower-case and deduplicate; drop comparisons with no keyword (None).
        keywords_set = {keyword.lower() for keyword in all_keywords if keyword}
        return keywords_set

    def keywords_readme(self):
        """
        Return the list of words and constituents considered as clues of a
        comparison (from listOfkeywords.txt).
        """
        keywords = []
        with self.open("listOfkeywords.txt") as fp:
            raw_text = fp.read()
        for line in raw_text.split("\n"):
            # Skip blank lines and "//" comment lines in the keyword file.
            if not line or line.startswith("//"):
                continue
            keywords.append(line.strip())
        return keywords

    def sents(self, fileids=None):
        """
        Return all sentences in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: all sentences of the corpus as lists of tokens (or as plain
            strings, if no word tokenizer is specified).
        :rtype: list(list(str)) or list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None):
        """
        Return all words and punctuation symbols in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_comparison_block(self, stream):
        """
        Read past non-annotated lines until a comparison-tagged line is
        found, then parse that annotated sentence and return its
        comparisons as a list of ``Comparison`` objects.  Returns ``[]``
        at end of file.
        """
        while True:
            line = stream.readline()
            if not line:
                return []  # end of file.
            comparison_tags = re.findall(COMPARISON, line)
            if comparison_tags:
                grad_comparisons = re.findall(GRAD_COMPARISON, line)
                non_grad_comparisons = re.findall(NON_GRAD_COMPARISON, line)
                # Advance to the next line (it contains the comparative sentence)
                comparison_text = stream.readline().strip()
                if self._word_tokenizer:
                    comparison_text = self._word_tokenizer.tokenize(comparison_text)
                # Skip the next line (it contains closing comparison tags)
                stream.readline()
                # If gradable comparisons are found, create Comparison instances
                # and populate their fields
                comparison_bundle = []
                if grad_comparisons:
                    # Each comparison tag has its own relations on a separate line
                    for comp in grad_comparisons:
                        comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
                        comparison = Comparison(
                            text=comparison_text, comp_type=comp_type
                        )
                        line = stream.readline()
                        entities_feats = ENTITIES_FEATS.findall(line)
                        if entities_feats:
                            for code, entity_feat in entities_feats:
                                # Codes map annotation slots: 1/2 are the
                                # compared entities, 3 is the compared feature.
                                if code == "1":
                                    comparison.entity_1 = entity_feat.strip()
                                elif code == "2":
                                    comparison.entity_2 = entity_feat.strip()
                                elif code == "3":
                                    comparison.feature = entity_feat.strip()
                        keyword = KEYWORD.findall(line)
                        if keyword:
                            comparison.keyword = keyword[0]
                        comparison_bundle.append(comparison)
                # If non-gradable comparisons are found, create a simple Comparison
                # instance for each one
                if non_grad_comparisons:
                    for comp in non_grad_comparisons:
                        # comp_type in this case should always be 4.
                        comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
                        comparison = Comparison(
                            text=comparison_text, comp_type=comp_type
                        )
                        comparison_bundle.append(comparison)
                # Flatten the list of comparisons before returning them
                # return concat([comparison_bundle])
                return comparison_bundle

    def _read_keyword_block(self, stream):
        """Read one annotated sentence and return its keywords (may be None)."""
        keywords = []
        for comparison in self._read_comparison_block(stream):
            keywords.append(comparison.keyword)
        return keywords

    def _read_sent_block(self, stream):
        """
        Read and return the next plain sentence, skipping header sections
        delimited by lines of asterisks and all annotation lines (tags,
        entity/feature relations, closing tags).
        """
        while True:
            line = stream.readline()
            if re.match(STARS, line):
                # Skip everything up to (and including) the closing star line.
                while True:
                    line = stream.readline()
                    if re.match(STARS, line):
                        break
                continue
            # A sentence line is one that carries no annotation markup.
            if (
                not re.findall(COMPARISON, line)
                and not ENTITIES_FEATS.findall(line)
                and not re.findall(CLOSE_COMPARISON, line)
            ):
                if self._sent_tokenizer:
                    return [
                        self._word_tokenizer.tokenize(sent)
                        for sent in self._sent_tokenizer.tokenize(line)
                    ]
                else:
                    return [self._word_tokenizer.tokenize(line)]

    def _read_word_block(self, stream):
        """Read one sentence block and flatten it into a list of tokens."""
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words
|
||||
579
backend/venv/Lib/site-packages/nltk/corpus/reader/conll.py
Normal file
579
backend/venv/Lib/site-packages/nltk/corpus/reader/conll.py
Normal file
@@ -0,0 +1,579 @@
|
||||
# Natural Language Toolkit: CONLL Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Read CoNLL-style chunk fileids.
|
||||
"""
|
||||
|
||||
import textwrap
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.tag import map_tag
|
||||
from nltk.tree import Tree
|
||||
from nltk.util import LazyConcatenation, LazyMap
|
||||
|
||||
|
||||
class ConllCorpusReader(CorpusReader):
    """
    A corpus reader for CoNLL-style files. These files consist of a
    series of sentences, separated by blank lines. Each sentence is
    encoded using a table (or "grid") of values, where each line
    corresponds to a single word, and each column corresponds to an
    annotation type. The set of columns used by CoNLL-style files can
    vary from corpus to corpus; the ``ConllCorpusReader`` constructor
    therefore takes an argument, ``columntypes``, which is used to
    specify the columns that are used by a given corpus. By default
    columns are split by consecutive whitespaces, with the
    ``separator`` argument you can set a string to split by (e.g.
    ``\'\t\'``).

    @todo: Add support for reading from corpora where different
        parallel files contain different columns.
    @todo: Possibly add caching of the grid corpus view? This would
        allow the same grid view to be used by different data access
        methods (eg words() and parsed_sents() could both share the
        same grid corpus view object).
    @todo: Better support for -DOCSTART-. Currently, we just ignore
        it, but it could be used to define methods that retrieve a
        document at a time (eg parsed_documents()).
    """

    # /////////////////////////////////////////////////////////////////
    # Column Types
    # /////////////////////////////////////////////////////////////////

    WORDS = "words"  #: column type for words
    POS = "pos"  #: column type for part-of-speech tags
    TREE = "tree"  #: column type for parse trees
    CHUNK = "chunk"  #: column type for chunk structures
    NE = "ne"  #: column type for named entities
    SRL = "srl"  #: column type for semantic role labels
    IGNORE = "ignore"  #: column type for column that should be ignored

    #: A list of all column types supported by the conll corpus reader.
    COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE)

    # /////////////////////////////////////////////////////////////////
    # Constructor
    # /////////////////////////////////////////////////////////////////

    def __init__(
        self,
        root,
        fileids,
        columntypes,
        chunk_types=None,
        root_label="S",
        pos_in_tree=False,
        srl_includes_roleset=True,
        encoding="utf8",
        tree_class=Tree,
        tagset=None,
        separator=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: a list or regexp specifying the fileids in this corpus.
        :param columntypes: sequence of column types (from ``COLUMN_TYPES``),
            one per column, in file order.
        :param chunk_types: chunk type(s) to keep when building chunk trees;
            all other chunks are treated as "O".
        :param root_label: node label for the root of chunk trees.
        :param pos_in_tree: whether parse trees keep POS tags as internal
            nodes (True) or as (word, tag) leaf tuples (False).
        :param srl_includes_roleset: whether an extra roleset column precedes
            the SRL predicate column.
        :param encoding: the encoding that should be used to read the corpus.
        :param tree_class: class used to build parse trees.
        :param tagset: the POS tagset the corpus is annotated with (used for
            tagset mapping in the access methods).
        :param separator: column separator string; ``None`` splits on runs
            of whitespace.
        """
        for columntype in columntypes:
            if columntype not in self.COLUMN_TYPES:
                raise ValueError("Bad column type %r" % columntype)
        if isinstance(chunk_types, str):
            chunk_types = [chunk_types]
        self._chunk_types = chunk_types
        # Map each declared column type to its column index.
        self._colmap = {c: i for (i, c) in enumerate(columntypes)}
        self._pos_in_tree = pos_in_tree
        self._root_label = root_label  # for chunks
        self._srl_includes_roleset = srl_includes_roleset
        self._tree_class = tree_class
        CorpusReader.__init__(self, root, fileids, encoding)
        self._tagset = tagset
        self.sep = separator

    # /////////////////////////////////////////////////////////////////
    # Data Access Methods
    # /////////////////////////////////////////////////////////////////

    def words(self, fileids=None):
        """Return the WORDS column as one flat list of str."""
        self._require(self.WORDS)
        return LazyConcatenation(LazyMap(self._get_words, self._grids(fileids)))

    def sents(self, fileids=None):
        """Return the WORDS column as a list of sentences (lists of str)."""
        self._require(self.WORDS)
        return LazyMap(self._get_words, self._grids(fileids))

    def tagged_words(self, fileids=None, tagset=None):
        """Return a flat list of (word, pos) tuples, optionally mapped to
        ``tagset``."""
        self._require(self.WORDS, self.POS)

        def get_tagged_words(grid):
            return self._get_tagged_words(grid, tagset)

        return LazyConcatenation(LazyMap(get_tagged_words, self._grids(fileids)))

    def tagged_sents(self, fileids=None, tagset=None):
        """Return sentences as lists of (word, pos) tuples, optionally mapped
        to ``tagset``."""
        self._require(self.WORDS, self.POS)

        def get_tagged_words(grid):
            return self._get_tagged_words(grid, tagset)

        return LazyMap(get_tagged_words, self._grids(fileids))

    def chunked_words(self, fileids=None, chunk_types=None, tagset=None):
        """Return a flat list of chunk trees and (word, pos) leaf tuples."""
        self._require(self.WORDS, self.POS, self.CHUNK)
        if chunk_types is None:
            chunk_types = self._chunk_types

        def get_chunked_words(grid):  # capture chunk_types as local var
            return self._get_chunked_words(grid, chunk_types, tagset)

        return LazyConcatenation(LazyMap(get_chunked_words, self._grids(fileids)))

    def chunked_sents(self, fileids=None, chunk_types=None, tagset=None):
        """Return each sentence as a chunk tree rooted at ``root_label``."""
        self._require(self.WORDS, self.POS, self.CHUNK)
        if chunk_types is None:
            chunk_types = self._chunk_types

        def get_chunked_words(grid):  # capture chunk_types as local var
            return self._get_chunked_words(grid, chunk_types, tagset)

        return LazyMap(get_chunked_words, self._grids(fileids))

    def parsed_sents(self, fileids=None, pos_in_tree=None, tagset=None):
        """Return each sentence as a parse tree built from the TREE column."""
        self._require(self.WORDS, self.POS, self.TREE)
        if pos_in_tree is None:
            pos_in_tree = self._pos_in_tree

        def get_parsed_sent(grid):  # capture pos_in_tree as local var
            return self._get_parsed_sent(grid, pos_in_tree, tagset)

        return LazyMap(get_parsed_sent, self._grids(fileids))

    def srl_spans(self, fileids=None):
        """Return, per sentence, one spanlist of ((start, end), tag) per
        predicate."""
        self._require(self.SRL)
        return LazyMap(self._get_srl_spans, self._grids(fileids))

    def srl_instances(self, fileids=None, pos_in_tree=None, flatten=True):
        """Return SRL instances; one ``ConllSRLInstanceList`` per sentence,
        or a flat sequence if ``flatten`` is true."""
        self._require(self.WORDS, self.POS, self.TREE, self.SRL)
        if pos_in_tree is None:
            pos_in_tree = self._pos_in_tree

        def get_srl_instances(grid):  # capture pos_in_tree as local var
            return self._get_srl_instances(grid, pos_in_tree)

        result = LazyMap(get_srl_instances, self._grids(fileids))
        if flatten:
            result = LazyConcatenation(result)
        return result

    def iob_words(self, fileids=None, tagset=None):
        """
        :return: a list of word/tag/IOB tuples
        :rtype: list(tuple)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        """
        self._require(self.WORDS, self.POS, self.CHUNK)

        def get_iob_words(grid):
            return self._get_iob_words(grid, tagset)

        return LazyConcatenation(LazyMap(get_iob_words, self._grids(fileids)))

    def iob_sents(self, fileids=None, tagset=None):
        """
        :return: a list of lists of word/tag/IOB tuples
        :rtype: list(list)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        """
        self._require(self.WORDS, self.POS, self.CHUNK)

        def get_iob_words(grid):
            return self._get_iob_words(grid, tagset)

        return LazyMap(get_iob_words, self._grids(fileids))

    # /////////////////////////////////////////////////////////////////
    # Grid Reading
    # /////////////////////////////////////////////////////////////////

    def _grids(self, fileids=None):
        """Return a lazy view of all sentence grids in the given files."""
        # n.b.: we could cache the object returned here (keyed on
        # fileids), which would let us reuse the same corpus view for
        # different things (eg srl and parse trees).
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_grid_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def _read_grid_block(self, stream):
        """Read one blank-line-separated block and return its grids
        (list of rows, each row a list of column values)."""
        grids = []
        for block in read_blankline_block(stream):
            block = block.strip()
            if not block:
                continue

            grid = [line.split(self.sep) for line in block.split("\n")]

            # If there's a docstart row, then discard. ([xx] eventually it
            # would be good to actually use it)
            if grid[0][self._colmap.get("words", 0)] == "-DOCSTART-":
                del grid[0]

            # Check that the grid is consistent.
            for row in grid:
                if len(row) != len(grid[0]):
                    raise ValueError("Inconsistent number of columns:\n%s" % block)
            grids.append(grid)
        return grids

    # /////////////////////////////////////////////////////////////////
    # Transforms
    # /////////////////////////////////////////////////////////////////
    # given a grid, transform it into some representation (e.g.,
    # a list of words or a parse tree).

    def _get_words(self, grid):
        # Extract the word column for one sentence.
        return self._get_column(grid, self._colmap["words"])

    def _get_tagged_words(self, grid, tagset=None):
        # Pair each word with its (optionally tagset-mapped) POS tag.
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        return list(zip(self._get_column(grid, self._colmap["words"]), pos_tags))

    def _get_iob_words(self, grid, tagset=None):
        # Triple each word with its POS tag and IOB chunk tag.
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        return list(
            zip(
                self._get_column(grid, self._colmap["words"]),
                pos_tags,
                self._get_column(grid, self._colmap["chunk"]),
            )
        )

    def _get_chunked_words(self, grid, chunk_types, tagset=None):
        # n.b.: this method is very similar to conllstr2tree.
        words = self._get_column(grid, self._colmap["words"])
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        chunk_tags = self._get_column(grid, self._colmap["chunk"])

        # stack[0] is the root; stack[-1] is the currently open chunk (if any).
        stack = [Tree(self._root_label, [])]

        for word, pos_tag, chunk_tag in zip(words, pos_tags, chunk_tags):
            if chunk_tag == "O":
                state, chunk_type = "O", ""
            else:
                (state, chunk_type) = chunk_tag.split("-")
            # If it's a chunk we don't care about, treat it as O.
            if chunk_types is not None and chunk_type not in chunk_types:
                state = "O"
            # Treat a mismatching I like a B.
            if state == "I" and chunk_type != stack[-1].label():
                state = "B"
            # For B or I: close any open chunks
            if state in "BO" and len(stack) == 2:
                stack.pop()
            # For B: start a new chunk.
            if state == "B":
                new_chunk = Tree(chunk_type, [])
                stack[-1].append(new_chunk)
                stack.append(new_chunk)
            # Add the word token.
            stack[-1].append((word, pos_tag))

        return stack[0]

    def _get_parsed_sent(self, grid, pos_in_tree, tagset=None):
        words = self._get_column(grid, self._colmap["words"])
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        parse_tags = self._get_column(grid, self._colmap["tree"])

        # Rebuild a bracketed tree string: each TREE cell is "<open>*<close>",
        # where "*" stands for this word's terminal position.
        treestr = ""
        for word, pos_tag, parse_tag in zip(words, pos_tags, parse_tags):
            # Escape literal parentheses so they can't break the brackets.
            if word == "(":
                word = "-LRB-"
            if word == ")":
                word = "-RRB-"
            if pos_tag == "(":
                pos_tag = "-LRB-"
            if pos_tag == ")":
                pos_tag = "-RRB-"
            (left, right) = parse_tag.split("*")
            right = right.count(")") * ")"  # only keep ')'.
            treestr += f"{left} ({pos_tag} {word}) {right}"
        try:
            tree = self._tree_class.fromstring(treestr)
        except (ValueError, IndexError):
            # Fall back to wrapping in an explicit root when treestr is
            # not a single well-formed tree.
            tree = self._tree_class.fromstring(f"({self._root_label} {treestr})")

        if not pos_in_tree:
            # Collapse unary POS nodes into (word, tag) leaf tuples.
            for subtree in tree.subtrees():
                for i, child in enumerate(subtree):
                    if (
                        isinstance(child, Tree)
                        and len(child) == 1
                        and isinstance(child[0], str)
                    ):
                        subtree[i] = (child[0], child.label())

        return tree

    def _get_srl_spans(self, grid):
        """
        list of list of (start, end), tag) tuples
        """
        # With rolesets, the predicate column is offset by one and the
        # per-predicate argument columns start one further right.
        if self._srl_includes_roleset:
            predicates = self._get_column(grid, self._colmap["srl"] + 1)
            start_col = self._colmap["srl"] + 2
        else:
            predicates = self._get_column(grid, self._colmap["srl"])
            start_col = self._colmap["srl"] + 1

        # Count how many predicates there are. This tells us how many
        # columns to expect for SRL data.
        num_preds = len([p for p in predicates if p != "-"])

        spanlists = []
        for i in range(num_preds):
            col = self._get_column(grid, start_col + i)
            spanlist = []
            stack = []
            for wordnum, srl_tag in enumerate(col):
                # Each cell is "<opens>*<closes>"; push opens, pop closes.
                (left, right) = srl_tag.split("*")
                for tag in left.split("("):
                    if tag:
                        stack.append((tag, wordnum))
                for i in range(right.count(")")):
                    (tag, start) = stack.pop()
                    spanlist.append(((start, wordnum + 1), tag))
            spanlists.append(spanlist)

        return spanlists

    def _get_srl_instances(self, grid, pos_in_tree):
        # Build one ConllSRLInstance per predicate in the sentence.
        tree = self._get_parsed_sent(grid, pos_in_tree)
        spanlists = self._get_srl_spans(grid)
        if self._srl_includes_roleset:
            predicates = self._get_column(grid, self._colmap["srl"] + 1)
            rolesets = self._get_column(grid, self._colmap["srl"])
        else:
            predicates = self._get_column(grid, self._colmap["srl"])
            rolesets = [None] * len(predicates)

        instances = ConllSRLInstanceList(tree)
        for wordnum, predicate in enumerate(predicates):
            if predicate == "-":
                continue
            # Decide which spanlist to use. Don't assume that they're
            # sorted in the same order as the predicates (even though
            # they usually are).
            for spanlist in spanlists:
                for (start, end), tag in spanlist:
                    if wordnum in range(start, end) and tag in ("V", "C-V"):
                        break
                else:
                    continue
                break
            else:
                raise ValueError("No srl column found for %r" % predicate)
            instances.append(
                ConllSRLInstance(tree, wordnum, predicate, rolesets[wordnum], spanlist)
            )

        return instances

    # /////////////////////////////////////////////////////////////////
    # Helper Methods
    # /////////////////////////////////////////////////////////////////

    def _require(self, *columntypes):
        # Raise if this corpus was not declared with the needed columns.
        for columntype in columntypes:
            if columntype not in self._colmap:
                raise ValueError(
                    "This corpus does not contain a %s " "column." % columntype
                )

    @staticmethod
    def _get_column(grid, column_index):
        # Extract a single column of the sentence grid as a list.
        return [grid[i][column_index] for i in range(len(grid))]
|
||||
|
||||
|
||||
class ConllSRLInstance:
    """
    An SRL instance from a CoNLL corpus, which identifies and
    provides labels for the arguments of a single verb.
    """

    # [xx] add inst.core_arguments, inst.argm_arguments?

    def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans):
        # Word indices of the words composing the verb; several indices
        # when the verb is multi-word (e.g. 'turn on').
        self.verb = []

        # Word index of the verb's head word (e.g. 'turn' in 'turn on').
        self.verb_head = verb_head

        self.verb_stem = verb_stem

        self.roleset = roleset

        # (argspan, argid) pairs for each identified argument; argspan is a
        # (start, end) pair covering words[start:end].
        self.arguments = []

        # All (span, id) pairs — arguments plus the verb pieces themselves.
        self.tagged_spans = tagged_spans

        # Parse tree of the containing sentence, and its leaf sequence.
        self.tree = tree
        self.words = tree.leaves()

        # Partition the tagged spans: verb pieces go into self.verb,
        # everything else becomes an argument.
        for (start, end), tag in tagged_spans:
            if tag in ("V", "C-V"):
                self.verb.extend(range(start, end))
            else:
                self.arguments.append(((start, end), tag))

    def __repr__(self):
        suffix = "" if len(self.arguments) == 1 else "s"
        return "<ConllSRLInstance for %r with %d argument%s>" % (
            self.verb_stem,
            len(self.arguments),
            suffix,
        )

    def pprint(self):
        """Return a human-readable rendering with bracketed arguments and
        the verb marked as <<verb>>."""
        verbstr = " ".join(self.words[idx][0] for idx in self.verb)
        header = f"SRL for {verbstr!r} (stem={self.verb_stem!r}):\n"
        body = ""
        for idx, word in enumerate(self.words):
            if isinstance(word, tuple):
                word = word[0]
            for (start, end), argid in self.arguments:
                if idx == start:
                    body += f"[{argid} "
                if idx == end:
                    body += "] "
            if idx in self.verb:
                word = f"<<{word}>>"
            body += word + " "
        return header + textwrap.fill(
            body.replace(" ]", "]"), initial_indent="    ", subsequent_indent="    "
        )
|
||||
|
||||
|
||||
class ConllSRLInstanceList(list):
    """
    Set of instances for a single sentence
    """

    def __init__(self, tree, instances=()):
        self.tree = tree
        list.__init__(self, instances)

    def __str__(self):
        return self.pprint()

    def pprint(self, include_tree=False):
        """
        Return a CoNLL-style rendering of the instances, one row per word.
        If ``include_tree`` is true, prepend word, POS and parse columns.
        """
        # Sanity check: trees should be the same
        for inst in self:
            if inst.tree != self.tree:
                raise ValueError("Tree mismatch!")

        # The word list is needed to size the rows even without the tree
        # columns.  (Bug fix: previously `words` was only assigned inside
        # the `if include_tree:` branch, so pprint() with the default
        # include_tree=False raised NameError.)
        words = self.tree.leaves()

        # If desired, add trees:
        if include_tree:
            pos = [None] * len(words)
            synt = ["*"] * len(words)
            self._tree2conll(self.tree, 0, words, pos, synt)

        s = ""
        for i in range(len(words)):
            # optional tree columns
            if include_tree:
                s += "%-20s " % words[i]
                s += "%-8s " % pos[i]
                s += "%15s*%-8s " % tuple(synt[i].split("*"))

            # verb head column
            for inst in self:
                if i == inst.verb_head:
                    s += "%-20s " % inst.verb_stem
                    break
            else:
                s += "%-20s " % "-"
            # Remaining columns: one bracketed-span column per instance.
            for inst in self:
                argstr = "*"
                for (start, end), argid in inst.tagged_spans:
                    if i == start:
                        argstr = f"({argid}{argstr}"
                    if i == (end - 1):
                        argstr += ")"
                s += "%-12s " % argstr
            s += "\n"
        return s

    def _tree2conll(self, tree, wordnum, words, pos, synt):
        """
        Walk ``tree``, filling the parallel ``pos`` and ``synt`` column
        lists (and, for tuple leaves, ``words``) in place.  Returns the
        word index just past this subtree.
        """
        assert isinstance(tree, Tree)
        if len(tree) == 1 and isinstance(tree[0], str):
            pos[wordnum] = tree.label()
            assert words[wordnum] == tree[0]
            return wordnum + 1
        elif len(tree) == 1 and isinstance(tree[0], tuple):
            assert len(tree[0]) == 2
            # Bug fix: the (word, tag) pair was unpacked as
            # `pos[wordnum], pos[wordnum] = tree[0]`, assigning pos twice
            # and discarding the word.  Store the word in `words` so the
            # word column of pprint(include_tree=True) shows the word,
            # not the (word, tag) tuple.
            words[wordnum], pos[wordnum] = tree[0]
            return wordnum + 1
        else:
            synt[wordnum] = f"({tree.label()}{synt[wordnum]}"
            for child in tree:
                wordnum = self._tree2conll(child, wordnum, words, pos, synt)
            synt[wordnum - 1] += ")"
            return wordnum
|
||||
|
||||
|
||||
class ConllChunkCorpusReader(ConllCorpusReader):
    """
    A ConllCorpusReader whose data file contains three columns: words,
    pos, and chunk.
    """

    def __init__(
        self, root, fileids, chunk_types, encoding="utf8", tagset=None, separator=None
    ):
        # Delegate to the generic reader with a fixed three-column layout.
        super().__init__(
            root,
            fileids,
            ("words", "pos", "chunk"),
            chunk_types=chunk_types,
            encoding=encoding,
            tagset=tagset,
            separator=separator,
        )
|
||||
106
backend/venv/Lib/site-packages/nltk/corpus/reader/crubadan.py
Normal file
106
backend/venv/Lib/site-packages/nltk/corpus/reader/crubadan.py
Normal file
@@ -0,0 +1,106 @@
|
||||
# Natural Language Toolkit: An Crubadan N-grams Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Avital Pekker <avital.pekker@utoronto.ca>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
An NLTK interface for the n-gram statistics gathered from
|
||||
the corpora for each language using An Crubadan.
|
||||
|
||||
There are multiple potential applications for the data but
|
||||
this reader was created with the goal of using it in the
|
||||
context of language identification.
|
||||
|
||||
For details about An Crubadan, this data, and its potential uses, see:
|
||||
http://borel.slu.edu/crubadan/index.html
|
||||
"""
|
||||
|
||||
import re
|
||||
from os import path
|
||||
|
||||
from nltk.corpus.reader import CorpusReader
|
||||
from nltk.data import ZipFilePathPointer
|
||||
from nltk.probability import FreqDist
|
||||
|
||||
|
||||
class CrubadanCorpusReader(CorpusReader):
    """
    A corpus reader used to access language An Crubadan n-gram files.
    """

    # Filename of the language-code mapping table shipped with the corpus.
    _LANG_MAPPER_FILE = "table.txt"

    def __init__(self, root, fileids, encoding="utf8", tagset=None):
        # Bug fix: pass the caller's encoding through instead of the
        # hard-coded "utf8" (the parameter was silently ignored; the
        # default is unchanged, so existing callers behave identically).
        super().__init__(root, fileids, encoding=encoding)
        # Bug fix: the n-gram cache used to be a class-level dict shared
        # by every CrubadanCorpusReader instance, so readers rooted at
        # different corpora would see each other's cached counts.  It is
        # now a per-instance dict.
        self._all_lang_freq = {}
        self._lang_mapping_data = []
        self._load_lang_mapping_data()

    def lang_freq(self, lang):
        """Return n-gram FreqDist for a specific language
        given ISO 639-3 language code"""

        # Lazily load and cache each language's n-gram counts.
        if lang not in self._all_lang_freq:
            self._all_lang_freq[lang] = self._load_lang_ngrams(lang)

        return self._all_lang_freq[lang]

    def langs(self):
        """Return a list of supported languages as ISO 639-3 codes"""
        return [row[1] for row in self._lang_mapping_data]

    def iso_to_crubadan(self, lang):
        """Return internal Crubadan code based on ISO 639-3 code
        (or None if the code is unknown)."""
        for i in self._lang_mapping_data:
            if i[1].lower() == lang.lower():
                return i[0]

    def crubadan_to_iso(self, lang):
        """Return ISO 639-3 code given internal Crubadan code
        (or None if the code is unknown)."""
        for i in self._lang_mapping_data:
            if i[0].lower() == lang.lower():
                return i[1]

    def _load_lang_mapping_data(self):
        """Load language mappings between codes and description from table.txt"""
        # The mapping table can't be read out of a zip archive, so require
        # the unzipped corpus.
        if isinstance(self.root, ZipFilePathPointer):
            raise RuntimeError(
                "Please install the 'crubadan' corpus first, use nltk.download()"
            )

        mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
        if self._LANG_MAPPER_FILE not in self.fileids():
            raise RuntimeError("Could not find language mapper file: " + mapper_file)

        with open(mapper_file, encoding="utf-8") as raw:
            strip_raw = raw.read().strip()

        # One tab-separated row per language: (crubadan code, ISO 639-3, ...).
        self._lang_mapping_data = [row.split("\t") for row in strip_raw.split("\n")]

    def _load_lang_ngrams(self, lang):
        """Load single n-gram language file given the ISO 639-3 language code
        and return its FreqDist"""

        if lang not in self.langs():
            raise RuntimeError("Unsupported language.")

        crubadan_code = self.iso_to_crubadan(lang)
        ngram_file = path.join(self.root, crubadan_code + "-3grams.txt")

        if not path.isfile(ngram_file):
            raise RuntimeError("No N-gram file found for requested language.")

        counts = FreqDist()
        with open(ngram_file, encoding="utf-8") as f:
            # Each line is "<count> <ngram>".
            for line in f:
                data = line.split(" ")

                ngram = data[1].strip("\n")
                freq = int(data[0])

                counts[ngram] = freq

        return counts
|
||||
115
backend/venv/Lib/site-packages/nltk/corpus/reader/dependency.py
Normal file
115
backend/venv/Lib/site-packages/nltk/corpus/reader/dependency.py
Normal file
@@ -0,0 +1,115 @@
|
||||
# Natural Language Toolkit: Dependency Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Kepa Sarasola <kepa.sarasola@ehu.es>
|
||||
# Iker Manterola <returntothehangar@hotmail.com>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.parse import DependencyGraph
|
||||
from nltk.tokenize import *
|
||||
|
||||
|
||||
class DependencyCorpusReader(SyntaxCorpusReader):
    """Reader for dependency-annotated corpora (CoNLL-style column files)."""

    def __init__(
        self,
        root,
        fileids,
        encoding="utf8",
        word_tokenizer=TabTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
    ):
        # NOTE: the tokenizer/block-reader arguments are accepted for API
        # compatibility but are not used by this reader.
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)

    #########################################################

    def _views(self, fileids, tagged, group_by_sent, dependencies):
        """Build one DependencyCorpusView per file and concatenate them."""
        views = [
            DependencyCorpusView(
                fileid, tagged, group_by_sent, dependencies, encoding=enc
            )
            for fileid, enc in self.abspaths(fileids, include_encoding=True)
        ]
        return concat(views)

    def words(self, fileids=None):
        """Return a flat list of word tokens."""
        return self._views(fileids, False, False, False)

    def tagged_words(self, fileids=None):
        """Return a flat list of (word, tag) tuples."""
        return self._views(fileids, True, False, False)

    def sents(self, fileids=None):
        """Return sentences, each a list of words."""
        return self._views(fileids, False, True, False)

    def tagged_sents(self, fileids=None):
        """Return sentences, each a list of (word, tag) tuples."""
        return self._views(fileids, True, True, False)

    def parsed_sents(self, fileids=None):
        """Return one ``DependencyGraph`` per sentence."""
        raw_sents = self._views(fileids, False, True, True)
        return [DependencyGraph(raw) for raw in raw_sents]
|
||||
|
||||
|
||||
class DependencyCorpusView(StreamBackedCorpusView):
    """Stream-backed view over a dependency corpus file.

    Depending on the constructor flags, ``read_block`` yields raw words,
    (word, tag) pairs, whole sentences, or the unparsed sentence text used
    to build dependency graphs.
    """

    _DOCSTART = "-DOCSTART- -DOCSTART- O\n"  # marks the start of a document

    def __init__(
        self,
        corpus_file,
        tagged,
        group_by_sent,
        dependencies,
        chunk_types=None,
        encoding="utf8",
    ):
        # tagged: keep POS tags on each token.
        self._tagged = tagged
        # dependencies: return the raw sentence text (for DependencyGraph)
        # instead of extracting word/tag columns.
        self._dependencies = dependencies
        # group_by_sent: wrap each sentence in its own list.
        self._group_by_sent = group_by_sent
        # chunk_types: stored but not used by read_block below.
        self._chunk_types = chunk_types
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Read and convert one blank-line-separated sentence block."""
        # Read the next sentence.
        sent = read_blankline_block(stream)[0].strip()
        # Strip off the docstart marker, if present.
        if sent.startswith(self._DOCSTART):
            sent = sent[len(self._DOCSTART) :].lstrip()

        # extract word and tag from any of the formats
        if not self._dependencies:
            lines = [line.split("\t") for line in sent.split("\n")]
            if len(lines[0]) == 3 or len(lines[0]) == 4:
                # 3/4-column format: word in column 0, tag in column 1.
                sent = [(line[0], line[1]) for line in lines]
            elif len(lines[0]) == 10:
                # 10-column CoNLL format: word in column 1, tag in column 4.
                sent = [(line[1], line[4]) for line in lines]
            else:
                raise ValueError("Unexpected number of fields in dependency tree file")

            # discard tags if they weren't requested
            if not self._tagged:
                sent = [word for (word, tag) in sent]

        # Return the result.
        if self._group_by_sent:
            return [sent]
        else:
            return list(sent)
|
||||
3428
backend/venv/Lib/site-packages/nltk/corpus/reader/framenet.py
Normal file
3428
backend/venv/Lib/site-packages/nltk/corpus/reader/framenet.py
Normal file
File diff suppressed because it is too large
Load Diff
116
backend/venv/Lib/site-packages/nltk/corpus/reader/ieer.py
Normal file
116
backend/venv/Lib/site-packages/nltk/corpus/reader/ieer.py
Normal file
@@ -0,0 +1,116 @@
|
||||
# Natural Language Toolkit: IEER Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Corpus reader for the Information Extraction and Entity Recognition Corpus.
|
||||
|
||||
NIST 1999 Information Extraction: Entity Recognition Evaluation
|
||||
https://www.itl.nist.gov/iad/894.01/tests/ie-er/er_99/er_99.htm
|
||||
|
||||
This corpus contains the NEWSWIRE development test data for the
|
||||
NIST 1999 IE-ER Evaluation. The files were taken from the
|
||||
subdirectory: ``/ie_er_99/english/devtest/newswire/*.ref.nwt``
|
||||
and filenames were shortened.
|
||||
|
||||
The corpus contains the following files: APW_19980314, APW_19980424,
|
||||
APW_19980429, NYT_19980315, NYT_19980403, and NYT_19980407.
|
||||
"""
|
||||
|
||||
import nltk
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
#: A dictionary whose keys are the names of documents in this corpus;
#: and whose values are descriptions of those documents' contents.
titles = {
    "APW_19980314": "Associated Press Weekly, 14 March 1998",
    "APW_19980424": "Associated Press Weekly, 24 April 1998",
    "APW_19980429": "Associated Press Weekly, 29 April 1998",
    "NYT_19980315": "New York Times, 15 March 1998",
    "NYT_19980403": "New York Times, 3 April 1998",
    "NYT_19980407": "New York Times, 7 April 1998",
}

#: A list of all documents in this corpus.
#: (``sorted`` over the dict iterates its keys, yielding sorted names.)
documents = sorted(titles)
|
||||
|
||||
|
||||
class IEERDocument:
    """One IEER document: its (chunk-parsed) text plus SGML metadata."""

    def __init__(self, text, docno=None, doctype=None, date_time=None, headline=""):
        self.text = text
        self.docno = docno
        self.doctype = doctype
        self.date_time = date_time
        self.headline = headline

    def __repr__(self):
        # Summarize with the headline if present, otherwise with the first
        # dozen non-markup words of the body.
        if self.headline:
            summary = " ".join(self.headline.leaves())
        else:
            body_words = [w for w in self.text.leaves() if w[:1] != "<"]
            summary = " ".join(body_words[:12]) + "..."
        if self.docno is None:
            return "<IEERDocument: %r>" % summary
        return f"<IEERDocument {self.docno}: {summary!r}>"
|
||||
|
||||
|
||||
class IEERCorpusReader(CorpusReader):
    """
    Corpus reader for the NIST 1999 IE-ER newswire files; each file
    contains a sequence of ``<DOC>...</DOC>`` entries.
    """

    def docs(self, fileids=None):
        """Return the raw text of each document, one string per document."""
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def parsed_docs(self, fileids=None):
        """Return each document chunk-parsed into an ``IEERDocument``."""
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_parsed_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def _read_parsed_block(self, stream):
        # TODO: figure out why empty documents are being returned
        # Parse each raw document once.  The previous implementation called
        # self._parse(doc) twice per document (once in the filter condition
        # and once for the kept value), doubling the parsing work.
        parsed = [self._parse(doc) for doc in self._read_block(stream)]
        return [doc for doc in parsed if doc.docno is not None]

    def _parse(self, doc):
        """Chunk-parse one raw document string into an ``IEERDocument``."""
        val = nltk.chunk.ieerstr2tree(doc, root_label="DOCUMENT")
        # ieerstr2tree returns a dict of fields when it recognizes the
        # document structure, otherwise a bare tree.
        if isinstance(val, dict):
            return IEERDocument(**val)
        else:
            return IEERDocument(val)

    def _read_block(self, stream):
        """Read one ``<DOC>...</DOC>`` block; returns a one-element list."""
        out = []
        # Skip any preamble.
        # NOTE(review): despite the comment, pre-<DOC> lines are appended to
        # ``out`` and end up in the returned text — confirm against upstream
        # before changing.
        while True:
            line = stream.readline()
            if not line:
                break
            if line.strip() == "<DOC>":
                break
            out.append(line)
        # Read the document
        while True:
            line = stream.readline()
            if not line:
                break
            out.append(line)
            if line.strip() == "</DOC>":
                break
        # Return the document
        return ["\n".join(out)]
|
||||
93
backend/venv/Lib/site-packages/nltk/corpus/reader/indian.py
Normal file
93
backend/venv/Lib/site-packages/nltk/corpus/reader/indian.py
Normal file
@@ -0,0 +1,93 @@
|
||||
# Natural Language Toolkit: Indian Language POS-Tagged Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Indian Language POS-Tagged Corpus
|
||||
Collected by A Kumaran, Microsoft Research, India
|
||||
Distributed with permission
|
||||
|
||||
Contents:
|
||||
- Bangla: IIT Kharagpur
|
||||
- Hindi: Microsoft Research India
|
||||
- Marathi: IIT Bombay
|
||||
- Telugu: IIIT Hyderabad
|
||||
"""
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.tag import map_tag, str2tuple
|
||||
|
||||
|
||||
class IndianCorpusReader(CorpusReader):
    """
    List of words, one per line.  Blank lines are ignored.
    """

    def _tag_mapper(self, tagset):
        """Return a tag-conversion callable for ``tagset``, or None."""
        if tagset and tagset != self._tagset:
            return lambda t: map_tag(self._tagset, tagset, t)
        return None

    def _views(self, fileids, tagged, grouped, mapper=None):
        """One IndianCorpusView per file, concatenated."""
        views = [
            IndianCorpusView(fileid, enc, tagged, grouped, mapper)
            for (fileid, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)

    def words(self, fileids=None):
        """Return all words as a flat list."""
        return self._views(fileids, False, False)

    def tagged_words(self, fileids=None, tagset=None):
        """Return all (word, tag) pairs, optionally mapped to ``tagset``."""
        return self._views(fileids, True, False, self._tag_mapper(tagset))

    def sents(self, fileids=None):
        """Return sentences, each a list of words."""
        return self._views(fileids, False, True)

    def tagged_sents(self, fileids=None, tagset=None):
        """Return sentences of (word, tag) pairs, optionally mapped."""
        return self._views(fileids, True, True, self._tag_mapper(tagset))
|
||||
|
||||
|
||||
class IndianCorpusView(StreamBackedCorpusView):
    """View over one Indian-language POS file: ``word_TAG`` tokens, one
    sentence per line; lines starting with ``<`` are markup."""

    def __init__(
        self, corpus_file, encoding, tagged, group_by_sent, tag_mapping_function=None
    ):
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._tag_mapping_function = tag_mapping_function
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Read one line and convert it into a sentence of tokens."""
        line = stream.readline()
        # Markup lines carry no token data.
        if line.startswith("<"):
            return []
        tokens = [str2tuple(tok, sep="_") for tok in line.split()]
        if self._tag_mapping_function:
            tokens = [(word, self._tag_mapping_function(tag)) for word, tag in tokens]
        if not self._tagged:
            tokens = [word for word, _tag in tokens]
        return [tokens] if self._group_by_sent else tokens
|
||||
354
backend/venv/Lib/site-packages/nltk/corpus/reader/ipipan.py
Normal file
354
backend/venv/Lib/site-packages/nltk/corpus/reader/ipipan.py
Normal file
@@ -0,0 +1,354 @@
|
||||
# Natural Language Toolkit: IPI PAN Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Konrad Goluchowski <kodie@mimuw.edu.pl>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import functools
|
||||
|
||||
from nltk.corpus.reader.api import CorpusReader
|
||||
from nltk.corpus.reader.util import StreamBackedCorpusView, concat
|
||||
|
||||
|
||||
def _parse_args(fun):
|
||||
@functools.wraps(fun)
|
||||
def decorator(self, fileids=None, **kwargs):
|
||||
kwargs.pop("tags", None)
|
||||
if not fileids:
|
||||
fileids = self.fileids()
|
||||
return fun(self, fileids, **kwargs)
|
||||
|
||||
return decorator
|
||||
|
||||
|
||||
class IPIPANCorpusReader(CorpusReader):
    """
    Corpus reader designed to work with corpus created by IPI PAN.
    See http://korpus.pl/en/ for more details about IPI PAN corpus.

    The corpus includes information about text domain, channel and categories.
    You can access possible values using ``domains()``, ``channels()`` and
    ``categories()``. You can use also this metadata to filter files, e.g.:
    ``fileids(channel='prasa')``, ``fileids(categories='publicystyczny')``.

    The reader supports methods: words, sents, paras and their tagged versions.
    You can get part of speech instead of full tag by giving "simplify_tags=True"
    parameter, e.g.: ``tagged_sents(simplify_tags=True)``.

    Also you can get all tags disambiguated tags specifying parameter
    "one_tag=False", e.g.: ``tagged_paras(one_tag=False)``.

    You can get all tags that were assigned by a morphological analyzer specifying
    parameter "disamb_only=False", e.g. ``tagged_words(disamb_only=False)``.

    The IPIPAN Corpus contains tags indicating if there is a space between two
    tokens. To add special "no space" markers, you should specify parameter
    "append_no_space=True", e.g. ``tagged_words(append_no_space=True)``.
    As a result in place where there should be no space between two tokens new
    pair ('', 'no-space') will be inserted (for tagged data) and just '' for
    methods without tags.

    The corpus reader can also try to append spaces between words. To enable this
    option, specify parameter "append_space=True", e.g. ``words(append_space=True)``.
    As a result either ' ' or (' ', 'space') will be inserted between tokens.

    By default, xml entities like &quot; and &amp; are replaced by corresponding
    characters. You can turn off this feature, specifying parameter
    "replace_xmlentities=False", e.g. ``words(replace_xmlentities=False)``.
    """

    def __init__(self, root, fileids):
        CorpusReader.__init__(self, root, fileids, None, None)

    def channels(self, fileids=None):
        """Return all channel values found in the headers of ``fileids``."""
        if not fileids:
            fileids = self.fileids()
        return self._parse_header(fileids, "channel")

    def domains(self, fileids=None):
        """Return all domain values found in the headers of ``fileids``."""
        if not fileids:
            fileids = self.fileids()
        return self._parse_header(fileids, "domain")

    def categories(self, fileids=None):
        """Return all category (keyTerm) values found in the headers."""
        if not fileids:
            fileids = self.fileids()
        return [
            self._map_category(cat) for cat in self._parse_header(fileids, "keyTerm")
        ]

    def fileids(self, channels=None, domains=None, categories=None):
        """Return fileids, optionally filtered by exactly one of
        ``channels``, ``domains`` or ``categories``."""
        # Bug fix: the original check used ``and`` across all three
        # arguments, so the error was only raised when all three filters
        # were given at once; two filters together were silently accepted
        # (and only one of them applied).  Enforce the documented
        # "only one at once" contract.
        specified = [arg for arg in (channels, domains, categories) if arg is not None]
        if len(specified) > 1:
            raise ValueError(
                "You can specify only one of channels, domains "
                "and categories parameter at once"
            )
        if not specified:
            return CorpusReader.fileids(self)
        # Accept a bare string as a one-element list for convenience.
        if isinstance(channels, str):
            channels = [channels]
        if isinstance(domains, str):
            domains = [domains]
        if isinstance(categories, str):
            categories = [categories]
        if channels:
            return self._list_morph_files_by("channel", channels)
        elif domains:
            return self._list_morph_files_by("domain", domains)
        else:
            return self._list_morph_files_by(
                "keyTerm", categories, map=self._map_category
            )

    @_parse_args
    def sents(self, fileids=None, **kwargs):
        """Return untagged sentences."""
        return concat(
            [
                self._view(
                    fileid, mode=IPIPANCorpusView.SENTS_MODE, tags=False, **kwargs
                )
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def paras(self, fileids=None, **kwargs):
        """Return untagged paragraphs (lists of sentences)."""
        return concat(
            [
                self._view(
                    fileid, mode=IPIPANCorpusView.PARAS_MODE, tags=False, **kwargs
                )
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def words(self, fileids=None, **kwargs):
        """Return untagged words."""
        return concat(
            [
                self._view(fileid, tags=False, **kwargs)
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def tagged_sents(self, fileids=None, **kwargs):
        """Return tagged sentences."""
        return concat(
            [
                self._view(fileid, mode=IPIPANCorpusView.SENTS_MODE, **kwargs)
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def tagged_paras(self, fileids=None, **kwargs):
        """Return tagged paragraphs."""
        return concat(
            [
                self._view(fileid, mode=IPIPANCorpusView.PARAS_MODE, **kwargs)
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def tagged_words(self, fileids=None, **kwargs):
        """Return tagged words."""
        return concat(
            [self._view(fileid, **kwargs) for fileid in self._list_morph_files(fileids)]
        )

    def _list_morph_files(self, fileids):
        """Absolute paths of the morph.xml files for ``fileids``."""
        return [f for f in self.abspaths(fileids)]

    def _list_header_files(self, fileids):
        """Header-file paths corresponding to the morph files."""
        return [
            f.replace("morph.xml", "header.xml")
            for f in self._list_morph_files(fileids)
        ]

    def _parse_header(self, fileids, tag):
        """Collect the distinct values of ``tag`` across all header files."""
        values = set()
        for f in self._list_header_files(fileids):
            values_list = self._get_tag(f, tag)
            for v in values_list:
                values.add(v)
        return list(values)

    def _list_morph_files_by(self, tag, values, map=None):
        """Fileids whose header contains a ``tag`` value in ``values``."""
        fileids = self.fileids()
        ret_fileids = set()
        for f in fileids:
            fp = self.abspath(f).replace("morph.xml", "header.xml")
            values_list = self._get_tag(fp, tag)
            for value in values_list:
                if map is not None:
                    value = map(value)
                if value in values:
                    ret_fileids.add(f)
        return list(ret_fileids)

    def _get_tag(self, f, tag):
        """Extract all ``<tag>...</tag>`` text spans from header file ``f``."""
        tags = []
        with open(f) as infile:
            header = infile.read()
        tag_end = 0
        while True:
            tag_pos = header.find("<" + tag, tag_end)
            if tag_pos < 0:
                return tags
            tag_end = header.find("</" + tag + ">", tag_pos)
            # "+ 2" skips the "<" and the ">" surrounding the tag name.
            tags.append(header[tag_pos + len(tag) + 2 : tag_end])

    def _map_category(self, cat):
        """Strip a leading "...>" qualifier from a category value."""
        pos = cat.find(">")
        if pos == -1:
            return cat
        else:
            return cat[pos + 1 :]

    def _view(self, filename, **kwargs):
        """Build an IPIPANCorpusView after validating the keyword options."""
        tags = kwargs.pop("tags", True)
        mode = kwargs.pop("mode", 0)
        simplify_tags = kwargs.pop("simplify_tags", False)
        one_tag = kwargs.pop("one_tag", True)
        disamb_only = kwargs.pop("disamb_only", True)
        append_no_space = kwargs.pop("append_no_space", False)
        append_space = kwargs.pop("append_space", False)
        replace_xmlentities = kwargs.pop("replace_xmlentities", True)

        if len(kwargs) > 0:
            raise ValueError("Unexpected arguments: %s" % kwargs.keys())
        if not one_tag and not disamb_only:
            raise ValueError(
                "You cannot specify both one_tag=False and " "disamb_only=False"
            )
        if not tags and (simplify_tags or not one_tag or not disamb_only):
            raise ValueError(
                "You cannot specify simplify_tags, one_tag or "
                "disamb_only with functions other than tagged_*"
            )

        return IPIPANCorpusView(
            filename,
            tags=tags,
            mode=mode,
            simplify_tags=simplify_tags,
            one_tag=one_tag,
            disamb_only=disamb_only,
            append_no_space=append_no_space,
            append_space=append_space,
            replace_xmlentities=replace_xmlentities,
        )
|
||||
|
||||
|
||||
class IPIPANCorpusView(StreamBackedCorpusView):
    """Stream-backed view over one IPI PAN morph.xml file.

    ``mode`` selects the granularity of each block: words, sentences,
    or paragraphs.
    """

    WORDS_MODE = 0
    SENTS_MODE = 1
    PARAS_MODE = 2

    def __init__(self, filename, startpos=0, **kwargs):
        StreamBackedCorpusView.__init__(self, filename, None, startpos, None)
        # Parser state: whether we are inside a <chunk type="s"> element,
        # and the stream offset of the last fully consumed line.
        self.in_sentence = False
        self.position = 0

        self.show_tags = kwargs.pop("tags", True)
        self.disamb_only = kwargs.pop("disamb_only", True)
        self.mode = kwargs.pop("mode", IPIPANCorpusView.WORDS_MODE)
        self.simplify_tags = kwargs.pop("simplify_tags", False)
        self.one_tag = kwargs.pop("one_tag", True)
        self.append_no_space = kwargs.pop("append_no_space", False)
        self.append_space = kwargs.pop("append_space", False)
        self.replace_xmlentities = kwargs.pop("replace_xmlentities", True)

    def read_block(self, stream):
        """Read and return the next block (word list, sentence, or paragraph)."""
        sentence = []
        sentences = []
        space = False
        no_space = False

        tags = set()

        lines = self._read_data(stream)

        while True:
            # we may have only part of last line
            if len(lines) <= 1:
                self._seek(stream)
                lines = self._read_data(stream)

            if lines == [""]:
                assert not sentences
                return []

            line = lines.pop()
            self.position += len(line) + 1

            if line.startswith('<chunk type="s"'):
                self.in_sentence = True
            elif line.startswith('<chunk type="p"'):
                pass
            elif line.startswith("<tok"):
                # New token: optionally emit a space separator first.
                if self.append_space and space and not no_space:
                    self._append_space(sentence)
                space = True
                no_space = False
                orth = ""
                tags = set()
            elif line.startswith("</chunk"):
                if self.in_sentence:
                    self.in_sentence = False
                    self._seek(stream)
                    if self.mode == self.SENTS_MODE:
                        return [sentence]
                    elif self.mode == self.WORDS_MODE:
                        if self.append_space:
                            self._append_space(sentence)
                        return sentence
                    else:
                        # NOTE(review): ``sentence`` is not reset after being
                        # appended, so in PARAS_MODE every entry of
                        # ``sentences`` aliases the same growing list —
                        # confirm against upstream nltk before relying on
                        # paragraph output.
                        sentences.append(sentence)
                elif self.mode == self.PARAS_MODE:
                    self._seek(stream)
                    return [sentences]
            elif line.startswith("<orth"):
                # <orth>...</orth>: the surface form of the current token.
                orth = line[6:-7]
                if self.replace_xmlentities:
                    # Bug fix: restore the XML-entity literals.  The source
                    # had been corrupted by HTML-entity decoding into
                    # replace(""", '"'), which is not valid Python.
                    orth = orth.replace("&quot;", '"').replace("&amp;", "&")
            elif line.startswith("<lex"):
                if not self.disamb_only or line.find("disamb=") != -1:
                    tag = line[line.index("<ctag") + 6 : line.index("</ctag")]
                    tags.add(tag)
            elif line.startswith("</tok"):
                # Token finished: emit it with whatever tag detail was asked for.
                if self.show_tags:
                    if self.simplify_tags:
                        tags = [t.split(":")[0] for t in tags]
                    if not self.one_tag or not self.disamb_only:
                        sentence.append((orth, tuple(tags)))
                    else:
                        sentence.append((orth, tags.pop()))
                else:
                    sentence.append(orth)
            elif line.startswith("<ns/>"):
                # "No space" marker between the surrounding tokens.
                if self.append_space:
                    no_space = True
                if self.append_no_space:
                    if self.show_tags:
                        sentence.append(("", "no-space"))
                    else:
                        sentence.append("")
            elif line.startswith("</cesAna"):
                pass

    def _read_data(self, stream):
        """Read a chunk of the file and return its lines in reverse order."""
        self.position = stream.tell()
        buff = stream.read(4096)
        lines = buff.split("\n")
        lines.reverse()
        return lines

    def _seek(self, stream):
        """Rewind the stream to the last fully consumed position."""
        stream.seek(self.position)

    def _append_space(self, sentence):
        """Append a space separator, tagged or untagged to match the output."""
        if self.show_tags:
            sentence.append((" ", "space"))
        else:
            sentence.append(" ")
|
||||
186
backend/venv/Lib/site-packages/nltk/corpus/reader/knbc.py
Normal file
186
backend/venv/Lib/site-packages/nltk/corpus/reader/knbc.py
Normal file
@@ -0,0 +1,186 @@
|
||||
#! /usr/bin/env python
|
||||
# KNB Corpus reader
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Masato Hagiwara <hagisan@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
|
||||
|
||||
import re
|
||||
|
||||
from nltk.corpus.reader.api import CorpusReader, SyntaxCorpusReader
|
||||
from nltk.corpus.reader.util import (
|
||||
FileSystemPathPointer,
|
||||
find_corpus_fileids,
|
||||
read_blankline_block,
|
||||
)
|
||||
from nltk.parse import DependencyGraph
|
||||
|
||||
# default function to convert morphlist to str for tree representation
|
||||
_morphs2str_default = lambda morphs: "/".join(m[0] for m in morphs if m[0] != "EOS")
|
||||
|
||||
|
||||
class KNBCorpusReader(SyntaxCorpusReader):
    """
    This class implements:
    - ``__init__``, which specifies the location of the corpus
      and a method for detecting the sentence blocks in corpus files.
    - ``_read_block``, which reads a block from the input stream.
    - ``_word``, which takes a block and returns a list of list of words.
    - ``_tag``, which takes a block and returns a list of list of tagged
      words.
    - ``_parse``, which takes a block and returns a list of parsed
      sentences.

    The structure of tagged words:
    tagged_word = (word(str), tags(tuple))
    tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...)

    Usage example

    >>> from nltk.corpus.util import LazyCorpusLoader
    >>> knbc = LazyCorpusLoader(
    ...     'knbc/corpus1',
    ...     KNBCorpusReader,
    ...     r'.*/KN.*',
    ...     encoding='euc-jp',
    ... )

    >>> len(knbc.sents()[0])
    9

    """

    def __init__(self, root, fileids, encoding="utf8", morphs2str=_morphs2str_default):
        """
        Initialize KNBCorpusReader
        morphs2str is a function to convert morphlist to str for tree representation
        for _parse()
        """
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)
        self.morphs2str = morphs2str

    def _read_block(self, stream):
        # blocks are split by blankline (or EOF) - default
        return read_blankline_block(stream)

    def _word(self, t):
        """Return the surface words of block ``t`` (one token per data line)."""
        res = []
        for line in t.splitlines():
            # ignore the Bunsets headers
            if not re.match(r"EOS|\*|\#|\+", line):
                cells = line.strip().split(" ")
                # The first cell is the surface form.
                res.append(cells[0])

        return res

    # ignores tagset argument
    def _tag(self, t, tagset=None):
        """Return (word, tag-string) pairs for block ``t``."""
        res = []
        for line in t.splitlines():
            # ignore the Bunsets headers
            if not re.match(r"EOS|\*|\#|\+", line):
                cells = line.strip().split(" ")
                # convert cells to morph tuples
                res.append((cells[0], " ".join(cells[1:])))

        return res

    def _parse(self, t):
        """Build a dependency tree for block ``t``.

        Lines starting with "*" or "+" open a new bunsetsu/tag node whose
        second cell encodes "<parent-index><relation-letter>"; other
        non-comment lines are morphs attached to the most recent node.
        """
        dg = DependencyGraph()
        i = 0
        for line in t.splitlines():
            if line[0] in "*+":
                # start of bunsetsu or tag

                cells = line.strip().split(" ", 3)
                # e.g. "-1D": parent index -1 (root), relation "D".
                m = re.match(r"([\-0-9]*)([ADIP])", cells[1])

                assert m is not None

                node = dg.nodes[i]
                node.update({"address": i, "rel": m.group(2), "word": []})

                dep_parent = int(m.group(1))

                # Parent index -1 marks the root of the sentence.
                if dep_parent == -1:
                    dg.root = node
                else:
                    dg.nodes[dep_parent]["deps"].append(i)

                i += 1
            elif line[0] != "#":
                # normal morph
                cells = line.strip().split(" ")
                # convert cells to morph tuples
                morph = cells[0], " ".join(cells[1:])
                # Attach the morph to the most recently opened node.
                dg.nodes[i - 1]["word"].append(morph)

        # Collapse each node's morph list into a display string, if a
        # converter was supplied.
        if self.morphs2str:
            for node in dg.nodes.values():
                node["word"] = self.morphs2str(node["word"])

        return dg.tree()
|
||||
|
||||
|
||||
######################################################################
|
||||
# Demo
|
||||
######################################################################
|
||||
|
||||
|
||||
def demo():
    """Demonstrate the KNB corpus reader (requires the 'knbc' corpus data).

    Prints fileids, raw words, parsed trees (with two different
    morphs2str converters), and tagged sentences.
    """
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find("corpora/knbc/corpus1")
    # Keep only fileids shaped like "d-d-NNN-NNN".
    fileids = [
        f
        for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    def _knbc_fileids_sort(x):
        # Sort numerically on the three trailing components.
        cells = x.split("-")
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader(
        "knbc/corpus1",
        KNBCorpusReader,
        sorted(fileids, key=_knbc_fileids_sort),
        encoding="euc-jp",
    )

    print(knbc.fileids()[:10])
    print("".join(knbc.words()[:100]))

    print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2]))

    # Swap in a converter that shows "surface(pos)" for each morph.
    knbc.morphs2str = lambda morphs: "/".join(
        "{}({})".format(m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
    ).encode("utf-8")

    print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))

    print(
        "\n".join(
            " ".join("{}/{}".format(w[0], w[1].split(" ")[2]) for w in sent)
            for sent in knbc.tagged_sents()[0:2]
        )
    )
|
||||
|
||||
|
||||
def test():
    """Smoke-test the reader's return types (requires the 'knbc' corpus data)."""
    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader(
        "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
    )
    assert isinstance(knbc.words()[0], str)
    assert isinstance(knbc.sents()[0][0], str)
    assert isinstance(knbc.tagged_words()[0], tuple)
    assert isinstance(knbc.tagged_sents()[0][0], tuple)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the interactive demo (requires the 'knbc' corpus to be installed).
    demo()
|
||||
183
backend/venv/Lib/site-packages/nltk/corpus/reader/lin.py
Normal file
183
backend/venv/Lib/site-packages/nltk/corpus/reader/lin.py
Normal file
@@ -0,0 +1,183 @@
|
||||
# Natural Language Toolkit: Lin's Thesaurus
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Dan Blanchard <dblanchard@ets.org>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.txt
|
||||
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from functools import reduce
|
||||
|
||||
from nltk.corpus.reader import CorpusReader
|
||||
|
||||
|
||||
class LinThesaurusCorpusReader(CorpusReader):
    """Wrapper for the LISP-formatted thesauruses distributed by Dekang Lin."""

    # Compiled regular expression for extracting the key from the first line of
    # each thesaurus entry.
    _key_re = re.compile(r'\("?([^"]+)"? \(desc [0-9.]+\).+')

    @staticmethod
    def __defaultdict_factory():
        """Factory for creating defaultdict of defaultdict(dict)s"""
        return defaultdict(dict)

    def __init__(self, root, badscore=0.0):
        """
        Initialize the thesaurus.

        :param root: root directory containing thesaurus LISP files
        :type root: C{string}
        :param badscore: the score to give to words which do not appear in each other's sets of synonyms
        :type badscore: C{float}
        """

        super().__init__(root, r"sim[A-Z]\.lsp")
        # Nested mapping: fileid -> head word -> ngram -> similarity score.
        self._thesaurus = defaultdict(LinThesaurusCorpusReader.__defaultdict_factory)
        self._badscore = badscore
        for path, encoding, fileid in self.abspaths(
            include_encoding=True, include_fileid=True
        ):
            # Fix: honor the per-file encoding reported by abspaths().  The
            # previous code opened the file with the platform default
            # encoding, silently ignoring the reader's configured encoding.
            # (encoding may be None, in which case open() falls back to the
            # default, preserving the old behavior.)
            with open(path, encoding=encoding) as lin_file:
                first = True
                for line in lin_file:
                    line = line.strip()
                    # Start of entry
                    if first:
                        key = LinThesaurusCorpusReader._key_re.sub(r"\1", line)
                        first = False
                    # End of entry
                    elif line == "))":
                        first = True
                    # Lines with pairs of ngrams and scores
                    else:
                        split_line = line.split("\t")
                        if len(split_line) == 2:
                            ngram, score = split_line
                            self._thesaurus[fileid][key][ngram.strip('"')] = float(
                                score
                            )

    def similarity(self, ngram1, ngram2, fileid=None):
        """
        Returns the similarity score for two ngrams.

        :param ngram1: first ngram to compare
        :type ngram1: C{string}
        :param ngram2: second ngram to compare
        :type ngram2: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, just the score for the two ngrams; otherwise,
                 list of tuples of fileids and scores.
        """
        # Entries don't contain themselves, so make sure similarity between item and itself is 1.0
        if ngram1 == ngram2:
            if fileid:
                return 1.0
            else:
                return [(fid, 1.0) for fid in self._fileids]
        else:
            if fileid:
                return (
                    self._thesaurus[fileid][ngram1][ngram2]
                    if ngram2 in self._thesaurus[fileid][ngram1]
                    else self._badscore
                )
            else:
                return [
                    (
                        fid,
                        (
                            self._thesaurus[fid][ngram1][ngram2]
                            if ngram2 in self._thesaurus[fid][ngram1]
                            else self._badscore
                        ),
                    )
                    for fid in self._fileids
                ]

    def scored_synonyms(self, ngram, fileid=None):
        """
        Returns a list of scored synonyms (tuples of synonyms and scores) for the current ngram

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, list of tuples of scores and synonyms; otherwise,
                 list of tuples of fileids and lists, where inner lists consist of tuples of
                 scores and synonyms.
        """
        if fileid:
            return self._thesaurus[fileid][ngram].items()
        else:
            return [
                (fileid, self._thesaurus[fileid][ngram].items())
                for fileid in self._fileids
            ]

    def synonyms(self, ngram, fileid=None):
        """
        Returns a list of synonyms for the current ngram.

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, list of synonyms; otherwise, list of tuples of fileids and
                 lists, where inner lists contain synonyms.
        """
        if fileid:
            return self._thesaurus[fileid][ngram].keys()
        else:
            return [
                (fileid, self._thesaurus[fileid][ngram].keys())
                for fileid in self._fileids
            ]

    def __contains__(self, ngram):
        """
        Determines whether or not the given ngram is in the thesaurus.

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :return: whether the given ngram is in the thesaurus.
        """
        return reduce(
            lambda accum, fileid: accum or (ngram in self._thesaurus[fileid]),
            self._fileids,
            False,
        )
|
||||
|
||||
|
||||
######################################################################
|
||||
# Demo
|
||||
######################################################################
|
||||
|
||||
|
||||
def demo():
    """Demonstrate basic lookups against the Lin thesaurus corpus."""
    from nltk.corpus import lin_thesaurus as thes

    word1 = "business"
    word2 = "enterprise"
    print("Getting synonyms for " + word1)
    print(thes.synonyms(word1))

    print("Getting scored synonyms for " + word1)
    print(thes.scored_synonyms(word1))

    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))

    # The original demo repeated the previous lookup verbatim; demonstrate the
    # scored variant restricted to the noun subsection instead.
    print("Getting scored synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.scored_synonyms(word1, fileid="simN.lsp"))

    print(f"Similarity score for {word1} and {word2}:")
    print(thes.similarity(word1, word2))
|
||||
|
||||
|
||||
# Run the thesaurus demonstration when this module is executed as a script.
if __name__ == "__main__":
    demo()
|
||||
344
backend/venv/Lib/site-packages/nltk/corpus/reader/markdown.py
Normal file
344
backend/venv/Lib/site-packages/nltk/corpus/reader/markdown.py
Normal file
@@ -0,0 +1,344 @@
|
||||
from collections import namedtuple
|
||||
from functools import partial, wraps
|
||||
|
||||
from nltk.corpus.reader.api import CategorizedCorpusReader
|
||||
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
|
||||
from nltk.corpus.reader.util import concat, read_blankline_block
|
||||
from nltk.tokenize import blankline_tokenize, sent_tokenize, word_tokenize
|
||||
|
||||
|
||||
def comma_separated_string_args(func):
    """
    A decorator that allows a function to be called with
    a single string of comma-separated values which become
    individual function arguments.
    """

    def _positional_to_set(value):
        # Strings split on commas (parts stripped); lists become sets;
        # everything else passes through untouched.
        if isinstance(value, str):
            return {part.strip() for part in value.split(",")}
        if isinstance(value, list):
            return set(value)
        return value

    @wraps(func)
    def wrapper(*args, **kwargs):
        converted = [_positional_to_set(arg) for arg in args]
        # Keyword arguments: only string values are converted.
        for name in list(kwargs):
            value = kwargs[name]
            if isinstance(value, str):
                kwargs[name] = {part.strip() for part in value.split(",")}
        return func(*converted, **kwargs)

    return wrapper
|
||||
|
||||
|
||||
def read_parse_blankline_block(stream, parser):
    """Read one blankline-delimited block from *stream* and render it through
    *parser*, returning a one-element list (or the empty read result)."""
    raw = read_blankline_block(stream)
    if not raw:
        # Nothing left in the stream; propagate the empty result unchanged.
        return raw
    return [parser.render(raw[0])]
|
||||
|
||||
|
||||
class MarkdownBlock:
    """A chunk of markdown-derived text exposing tokenized views of itself."""

    def __init__(self, content):
        self.content = content
        # Maximum number of characters shown by str()/repr() previews.
        self.truncate_at = 16

    def __repr__(self):
        return f"{type(self).__name__}(content={str(self)!r})"

    def __str__(self):
        preview = self.content[: self.truncate_at]
        if len(self.content) > self.truncate_at:
            preview += "..."
        return preview

    @property
    def raw(self):
        """The untokenized text."""
        return self.content

    @property
    def words(self):
        """The content as a flat list of word tokens."""
        return word_tokenize(self.content)

    @property
    def sents(self):
        """The content as a list of sentences, each a list of word tokens."""
        return [word_tokenize(s) for s in sent_tokenize(self.content)]

    @property
    def paras(self):
        """The content as blankline-delimited paragraphs, each a list of
        tokenized sentences."""
        return [
            [word_tokenize(s) for s in sent_tokenize(p)]
            for p in blankline_tokenize(self.content)
        ]
|
||||
|
||||
|
||||
class CodeBlock(MarkdownBlock):
    """A fenced or indented code block, tagged with its declared language."""

    def __init__(self, language, *args):
        # Language hint from the fence info string (may be empty).
        self.language = language
        super().__init__(*args)

    @property
    def sents(self):
        """One tokenized 'sentence' per source line."""
        return [word_tokenize(one_line) for one_line in self.content.splitlines()]

    @property
    def lines(self):
        """The raw source lines of the block."""
        return self.content.splitlines()

    @property
    def paras(self):
        """Blankline-delimited chunks of the block, each a list of tokenized
        lines."""
        return [
            [word_tokenize(one_line) for one_line in chunk.splitlines()]
            for chunk in blankline_tokenize(self.content)
        ]
|
||||
|
||||
|
||||
class MarkdownSection(MarkdownBlock):
    """A markdown section: a heading plus the content beneath it."""

    def __init__(self, heading, level, *args):
        # Heading text and its depth (number of leading '#' characters).
        self.heading = heading
        self.level = level
        super().__init__(*args)
|
||||
|
||||
|
||||
# Lightweight records for inline markdown elements extracted by the readers
# below: images, hyperlinks, and (possibly ordered) lists.
Image = namedtuple("Image", "label, src, title")
Link = namedtuple("Link", "label, href, title")
List = namedtuple("List", "is_ordered, items")
|
||||
|
||||
|
||||
class MarkdownCorpusReader(PlaintextCorpusReader):
    """A plaintext-style corpus reader for Markdown documents.

    Markup is stripped by rendering each blankline-delimited block through a
    markdown-it parser with a plain-text renderer before tokenization.
    """

    def __init__(self, *args, parser=None, **kwargs):
        # Third-party imports are deferred so the module can be imported
        # without markdown-it installed.
        from markdown_it import MarkdownIt
        from mdit_plain.renderer import RendererPlain
        from mdit_py_plugins.front_matter import front_matter_plugin

        self.parser = parser
        if self.parser is None:
            # Default: render CommonMark to plain text and recognize YAML
            # front matter (consumed later by metadata_reader in subclasses).
            self.parser = MarkdownIt("commonmark", renderer_cls=RendererPlain)
            self.parser.use(front_matter_plugin)

        # Route paragraph reading through the markup-stripping block reader,
        # unless the caller supplied their own.
        kwargs.setdefault(
            "para_block_reader", partial(read_parse_blankline_block, parser=self.parser)
        )
        super().__init__(*args, **kwargs)

    # This override takes care of removing markup.
    def _read_word_block(self, stream):
        words = list()
        for para in self._para_block_reader(stream):
            words.extend(self._word_tokenizer.tokenize(para))
        return words
|
||||
|
||||
|
||||
class CategorizedMarkdownCorpusReader(CategorizedCorpusReader, MarkdownCorpusReader):
    """
    A reader for markdown corpora whose documents are divided into
    categories based on their file identifiers.

    Based on nltk.corpus.reader.plaintext.CategorizedPlaintextCorpusReader:
    https://www.nltk.org/_modules/nltk/corpus/reader/api.html#CategorizedCorpusReader
    """

    def __init__(self, *args, cat_field="tags", **kwargs):
        """
        Initialize the corpus reader. Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor. The remaining arguments
        are passed to the ``MarkdownCorpusReader`` constructor.
        """
        cat_args = ["cat_pattern", "cat_map", "cat_file"]
        if not any(arg in kwargs for arg in cat_args):
            # Initialize with a blank map now,
            # and try to build categories from document metadata later.
            kwargs["cat_map"] = dict()
        CategorizedCorpusReader.__init__(self, kwargs)
        MarkdownCorpusReader.__init__(self, *args, **kwargs)

        # Map file IDs to categories if self._map exists but is still empty:
        # categories come from the ``cat_field`` key of each document's
        # front-matter metadata (first metadata block only).
        if self._map is not None and not self._map:
            for file_id in self._fileids:
                metadata = self.metadata(file_id)
                if metadata:
                    self._map[file_id] = metadata[0].get(cat_field, [])

    ### Begin CategorizedCorpusReader Overrides
    @comma_separated_string_args
    def categories(self, fileids=None):
        return super().categories(fileids)

    @comma_separated_string_args
    def fileids(self, categories=None):
        if categories is None:
            return self._fileids
        return super().fileids(categories)

    ### End CategorizedCorpusReader Overrides

    ### Begin MarkdownCorpusReader Overrides
    @comma_separated_string_args
    def raw(self, fileids=None, categories=None):
        return super().raw(self._resolve(fileids, categories))

    @comma_separated_string_args
    def words(self, fileids=None, categories=None):
        return super().words(self._resolve(fileids, categories))

    @comma_separated_string_args
    def sents(self, fileids=None, categories=None):
        return super().sents(self._resolve(fileids, categories))

    @comma_separated_string_args
    def paras(self, fileids=None, categories=None):
        return super().paras(self._resolve(fileids, categories))

    ### End MarkdownCorpusReader Overrides

    def concatenated_view(self, reader, fileids, categories):
        # Build one lazy corpus view per selected file, all driven by the
        # same ``reader`` block function, and concatenate them.
        return concat(
            [
                self.CorpusView(path, reader, encoding=enc)
                for (path, enc) in self.abspaths(
                    self._resolve(fileids, categories), include_encoding=True
                )
            ]
        )

    def metadata_reader(self, stream):
        # Parse YAML front-matter tokens into dicts.
        from yaml import safe_load

        return [
            safe_load(t.content)
            for t in self.parser.parse(stream.read())
            if t.type == "front_matter"
        ]

    @comma_separated_string_args
    def metadata(self, fileids=None, categories=None):
        return self.concatenated_view(self.metadata_reader, fileids, categories)

    def blockquote_reader(self, stream):
        # Pair top-level blockquote open/close tokens and re-render each
        # enclosed token span as plain text.
        tokens = self.parser.parse(stream.read())
        opening_tokens = filter(
            lambda t: t.level == 0 and t.type == "blockquote_open", tokens
        )
        closing_tokens = filter(
            lambda t: t.level == 0 and t.type == "blockquote_close", tokens
        )
        blockquotes = list()
        for o, c in zip(opening_tokens, closing_tokens):
            opening_index = tokens.index(o)
            closing_index = tokens.index(c, opening_index)
            blockquotes.append(tokens[opening_index : closing_index + 1])
        return [
            MarkdownBlock(
                self.parser.renderer.render(block, self.parser.options, env=None)
            )
            for block in blockquotes
        ]

    @comma_separated_string_args
    def blockquotes(self, fileids=None, categories=None):
        return self.concatenated_view(self.blockquote_reader, fileids, categories)

    def code_block_reader(self, stream):
        # Fenced blocks carry their language in ``t.info``; indented code
        # blocks have an empty info string.
        return [
            CodeBlock(
                t.info,
                t.content,
            )
            for t in self.parser.parse(stream.read())
            if t.level == 0 and t.type in ("fence", "code_block")
        ]

    @comma_separated_string_args
    def code_blocks(self, fileids=None, categories=None):
        return self.concatenated_view(self.code_block_reader, fileids, categories)

    def image_reader(self, stream):
        # Images are child tokens of top-level inline tokens.
        return [
            Image(
                child_token.content,
                child_token.attrGet("src"),
                child_token.attrGet("title"),
            )
            for inline_token in filter(
                lambda t: t.type == "inline", self.parser.parse(stream.read())
            )
            for child_token in inline_token.children
            if child_token.type == "image"
        ]

    @comma_separated_string_args
    def images(self, fileids=None, categories=None):
        return self.concatenated_view(self.image_reader, fileids, categories)

    def link_reader(self, stream):
        # The link label is the text token immediately following the
        # ``link_open`` token (index i + 1 among the inline children).
        return [
            Link(
                inline_token.children[i + 1].content,
                child_token.attrGet("href"),
                child_token.attrGet("title"),
            )
            for inline_token in filter(
                lambda t: t.type == "inline", self.parser.parse(stream.read())
            )
            for i, child_token in enumerate(inline_token.children)
            if child_token.type == "link_open"
        ]

    @comma_separated_string_args
    def links(self, fileids=None, categories=None):
        return self.concatenated_view(self.link_reader, fileids, categories)

    def list_reader(self, stream):
        # Pair top-level bullet/ordered list open/close tokens; each span
        # becomes a List record of its non-empty item contents.
        tokens = self.parser.parse(stream.read())
        opening_types = ("bullet_list_open", "ordered_list_open")
        opening_tokens = filter(
            lambda t: t.level == 0 and t.type in opening_types, tokens
        )
        closing_types = ("bullet_list_close", "ordered_list_close")
        closing_tokens = filter(
            lambda t: t.level == 0 and t.type in closing_types, tokens
        )
        list_blocks = list()
        for o, c in zip(opening_tokens, closing_tokens):
            opening_index = tokens.index(o)
            closing_index = tokens.index(c, opening_index)
            list_blocks.append(tokens[opening_index : closing_index + 1])
        return [
            List(
                tokens[0].type == "ordered_list_open",
                [t.content for t in tokens if t.content],
            )
            for tokens in list_blocks
        ]

    @comma_separated_string_args
    def lists(self, fileids=None, categories=None):
        return self.concatenated_view(self.list_reader, fileids, categories)

    def section_reader(self, stream):
        # Split the token stream at top-level headings; each section runs
        # from one heading to just before the next.
        section_blocks, block = list(), list()
        for t in self.parser.parse(stream.read()):
            if t.level == 0 and t.type == "heading_open":
                if not block:
                    block.append(t)
                else:
                    section_blocks.append(block)
                    block = [t]
            elif block:
                block.append(t)
        if block:
            section_blocks.append(block)
        # block[1] is the heading's inline token; block[0].markup is the run
        # of '#' characters giving the heading level.
        return [
            MarkdownSection(
                block[1].content,
                block[0].markup.count("#"),
                self.parser.renderer.render(block, self.parser.options, env=None),
            )
            for block in section_blocks
        ]

    @comma_separated_string_args
    def sections(self, fileids=None, categories=None):
        return self.concatenated_view(self.section_reader, fileids, categories)
|
||||
398
backend/venv/Lib/site-packages/nltk/corpus/reader/mte.py
Normal file
398
backend/venv/Lib/site-packages/nltk/corpus/reader/mte.py
Normal file
@@ -0,0 +1,398 @@
|
||||
"""
|
||||
A reader for corpora whose documents are in MTE format.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
from functools import reduce
|
||||
|
||||
from nltk.corpus.reader import TaggedCorpusReader, concat
|
||||
from nltk.corpus.reader.xmldocs import XMLCorpusView
|
||||
|
||||
|
||||
def xpath(root, path, ns):
    """Return all descendants of *root* matching *path*, with prefixes in
    *path* resolved against the namespace mapping *ns*."""
    return list(root.iterfind(path, ns))
|
||||
|
||||
|
||||
class MTECorpusView(XMLCorpusView):
    """
    Class for lazy viewing the MTE Corpus.
    """

    def __init__(self, fileid, tagspec, elt_handler=None):
        XMLCorpusView.__init__(self, fileid, tagspec, elt_handler)

    def read_block(self, stream, tagspec=None, elt_handler=None):
        # Element handlers may return None for filtered-out elements;
        # drop those entries from the block.
        block = XMLCorpusView.read_block(self, stream, tagspec, elt_handler)
        return [entry for entry in block if entry is not None]
|
||||
|
||||
|
||||
class MTEFileReader:
    """
    Class for loading the content of the multext-east corpus. It
    parses the xml files and does some tag-filtering depending on the
    given method parameters.
    """

    # XML namespaces are opaque identifiers fixed by the TEI-P5 and XML
    # specifications.  They use the http scheme and must match the source
    # documents byte-for-byte; the previous https variants would match no
    # elements at all.
    ns = {
        "tei": "http://www.tei-c.org/ns/1.0",
        "xml": "http://www.w3.org/XML/1998/namespace",
    }
    tag_ns = "{http://www.tei-c.org/ns/1.0}"
    xml_ns = "{http://www.w3.org/XML/1998/namespace}"
    # Element paths within a TEI document for words, sentences, paragraphs.
    word_path = "TEI/text/body/div/div/p/s/(w|c)"
    sent_path = "TEI/text/body/div/div/p/s"
    para_path = "TEI/text/body/div/div/p"

    def __init__(self, file_path):
        self.__file_path = file_path

    @classmethod
    def _word_elt(cls, elt, context):
        # A word/punctuation element's text is the token itself.
        return elt.text

    @classmethod
    def _sent_elt(cls, elt, context):
        return [cls._word_elt(w, None) for w in xpath(elt, "*", cls.ns)]

    @classmethod
    def _para_elt(cls, elt, context):
        return [cls._sent_elt(s, None) for s in xpath(elt, "*", cls.ns)]

    @classmethod
    def _tagged_word_elt(cls, elt, context):
        # 'ana' holds the MSD annotation; absent annotation yields an
        # empty tag.  Returns None when a tag filter excludes the word.
        if "ana" not in elt.attrib:
            return (elt.text, "")

        if cls.__tags == "" and cls.__tagset == "msd":
            return (elt.text, elt.attrib["ana"])
        elif cls.__tags == "" and cls.__tagset == "universal":
            return (elt.text, MTETagConverter.msd_to_universal(elt.attrib["ana"]))
        else:
            # Build a prefix pattern from the tag filter; '-' acts as a
            # single-character wildcard in MSD tags.
            tags = re.compile("^" + re.sub("-", ".", cls.__tags) + ".*$")
            if tags.match(elt.attrib["ana"]):
                if cls.__tagset == "msd":
                    return (elt.text, elt.attrib["ana"])
                else:
                    return (
                        elt.text,
                        MTETagConverter.msd_to_universal(elt.attrib["ana"]),
                    )
            else:
                return None

    @classmethod
    def _tagged_sent_elt(cls, elt, context):
        return list(
            filter(
                lambda x: x is not None,
                [cls._tagged_word_elt(w, None) for w in xpath(elt, "*", cls.ns)],
            )
        )

    @classmethod
    def _tagged_para_elt(cls, elt, context):
        return list(
            filter(
                lambda x: x is not None,
                [cls._tagged_sent_elt(s, None) for s in xpath(elt, "*", cls.ns)],
            )
        )

    @classmethod
    def _lemma_word_elt(cls, elt, context):
        if "lemma" not in elt.attrib:
            return (elt.text, "")
        else:
            return (elt.text, elt.attrib["lemma"])

    @classmethod
    def _lemma_sent_elt(cls, elt, context):
        return [cls._lemma_word_elt(w, None) for w in xpath(elt, "*", cls.ns)]

    @classmethod
    def _lemma_para_elt(cls, elt, context):
        return [cls._lemma_sent_elt(s, None) for s in xpath(elt, "*", cls.ns)]

    def words(self):
        """Lazy view over all word/punctuation tokens in the file."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.word_path, MTEFileReader._word_elt
        )

    def sents(self):
        """Lazy view over sentences, each a list of tokens."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.sent_path, MTEFileReader._sent_elt
        )

    def paras(self):
        """Lazy view over paragraphs, each a list of sentences."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.para_path, MTEFileReader._para_elt
        )

    def lemma_words(self):
        """Lazy view over (word, lemma) tuples."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.word_path, MTEFileReader._lemma_word_elt
        )

    def tagged_words(self, tagset, tags):
        """Lazy view over (word, tag) tuples filtered by *tags*.

        NOTE(review): the tagset/tags filter is stored as class-level state,
        so it is shared by all readers and by previously created lazy views —
        presumably safe for sequential use only; confirm before using
        concurrently.
        """
        MTEFileReader.__tagset = tagset
        MTEFileReader.__tags = tags
        return MTECorpusView(
            self.__file_path, MTEFileReader.word_path, MTEFileReader._tagged_word_elt
        )

    def lemma_sents(self):
        """Lazy view over sentences of (word, lemma) tuples."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.sent_path, MTEFileReader._lemma_sent_elt
        )

    def tagged_sents(self, tagset, tags):
        """Lazy view over sentences of (word, tag) tuples (see tagged_words)."""
        MTEFileReader.__tagset = tagset
        MTEFileReader.__tags = tags
        return MTECorpusView(
            self.__file_path, MTEFileReader.sent_path, MTEFileReader._tagged_sent_elt
        )

    def lemma_paras(self):
        """Lazy view over paragraphs of sentences of (word, lemma) tuples."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.para_path, MTEFileReader._lemma_para_elt
        )

    def tagged_paras(self, tagset, tags):
        """Lazy view over paragraphs of tagged sentences (see tagged_words)."""
        MTEFileReader.__tagset = tagset
        MTEFileReader.__tags = tags
        return MTECorpusView(
            self.__file_path, MTEFileReader.para_path, MTEFileReader._tagged_para_elt
        )
|
||||
|
||||
|
||||
class MTETagConverter:
    """
    Class for converting msd tags to universal tags, more conversion
    options are currently not implemented.
    """

    # Mapping from the leading MSD category letter to the universal tagset.
    mapping_msd_universal = {
        "A": "ADJ",
        "S": "ADP",
        "R": "ADV",
        "C": "CONJ",
        "D": "DET",
        "N": "NOUN",
        "M": "NUM",
        "Q": "PRT",
        "P": "PRON",
        "V": "VERB",
        ".": ".",
        "-": "X",
    }

    @staticmethod
    def msd_to_universal(tag):
        """
        This function converts the annotation from the Multex-East to the universal tagset
        as described in Chapter 5 of the NLTK-Book.

        Unknown tags will be mapped to X. Punctuation marks are not supported
        in MSD tags, so they are routed through the "." entry of the mapping.
        """
        # Tags may carry a leading '#'; the category letter then follows it.
        indicator = tag[0] if not tag[0] == "#" else tag[1]

        if indicator not in MTETagConverter.mapping_msd_universal:
            indicator = "-"

        return MTETagConverter.mapping_msd_universal[indicator]
|
||||
|
||||
|
||||
class MTECorpusReader(TaggedCorpusReader):
    """
    Reader for corpora following the TEI-p5 xml scheme, such as MULTEXT-East.
    MULTEXT-East contains part-of-speech-tagged words with a quite precise tagging
    scheme. These tags can be converted to the Universal tagset
    """

    def __init__(self, root=None, fileids=None, encoding="utf8"):
        """
        Construct a new MTECorpusreader for a set of documents
        located at the given root directory. Example usage:

        >>> root = '/...path to corpus.../'
        >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP

        :param root: The root directory for this corpus. (default points to location in multext config file)
        :param fileids: A list or regexp specifying the fileids in this corpus. (default is oana-en.xml)
        :param encoding: The encoding of the given files (default is utf8)
        """
        TaggedCorpusReader.__init__(self, root, fileids, encoding)
        self._readme = "00README.txt"

    def __fileids(self, fileids):
        """Normalize and validate a fileids argument, returning a list."""
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        # filter wrong userinput
        fileids = [f for f in fileids if f in self._fileids]
        # filter multext-east sourcefiles that are not compatible to the teip5 specification
        fileids = [f for f in fileids if f not in ("oana-bg.xml", "oana-mk.xml")]
        # Fix: the previous code tested a filter object here, which is always
        # truthy, so this warning could never fire (and the lazy filters were
        # consumed on first use).  Materializing lists makes the emptiness
        # check meaningful and the result safely re-iterable.
        if not fileids:
            print("No valid multext-east file specified")
        return fileids

    def words(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).words()
                for f in self.__fileids(fileids)
            ]
        )

    def sents(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of sentences or utterances,
                 each encoded as a list of word strings
        :rtype: list(list(str))
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).sents()
                for f in self.__fileids(fileids)
            ]
        )

    def paras(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of paragraphs, each encoded as a list
                 of sentences, which are in turn encoded as lists of word string
        :rtype: list(list(list(str)))
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).paras()
                for f in self.__fileids(fileids)
            ]
        )

    def lemma_words(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of words, the corresponding lemmas
                 and punctuation symbols, encoded as tuples (word, lemma)
        :rtype: list(tuple(str,str))
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).lemma_words()
                for f in self.__fileids(fileids)
            ]
        )

    def tagged_words(self, fileids=None, tagset="msd", tags=""):
        """
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of tagged words and punctuation symbols
                 encoded as tuples (word, tag)
        :rtype: list(tuple(str, str))
        """
        if tagset == "universal" or tagset == "msd":
            return concat(
                [
                    MTEFileReader(os.path.join(self._root, f)).tagged_words(
                        tagset, tags
                    )
                    for f in self.__fileids(fileids)
                ]
            )
        else:
            # Best-effort: warn and return None, matching the established
            # behavior of this reader's other tagged accessors.
            print("Unknown tagset specified.")

    def lemma_sents(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of sentences or utterances, each
                 encoded as a list of tuples of the word and the corresponding
                 lemma (word, lemma)
        :rtype: list(list(tuple(str, str)))
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).lemma_sents()
                for f in self.__fileids(fileids)
            ]
        )

    def tagged_sents(self, fileids=None, tagset="msd", tags=""):
        """
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of sentences or utterances, each
                 each encoded as a list of (word,tag) tuples
        :rtype: list(list(tuple(str, str)))
        """
        if tagset == "universal" or tagset == "msd":
            return concat(
                [
                    MTEFileReader(os.path.join(self._root, f)).tagged_sents(
                        tagset, tags
                    )
                    for f in self.__fileids(fileids)
                ]
            )
        else:
            print("Unknown tagset specified.")

    def lemma_paras(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of paragraphs, each encoded as a
                 list of sentences, which are in turn encoded as a list of
                 tuples of the word and the corresponding lemma (word, lemma)
        :rtype: list(List(List(tuple(str, str))))
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).lemma_paras()
                for f in self.__fileids(fileids)
            ]
        )

    def tagged_paras(self, fileids=None, tagset="msd", tags=""):
        """
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of paragraphs, each encoded as a
                 list of sentences, which are in turn encoded as a list
                 of (word,tag) tuples
        :rtype: list(list(list(tuple(str, str))))
        """
        if tagset == "universal" or tagset == "msd":
            return concat(
                [
                    MTEFileReader(os.path.join(self._root, f)).tagged_paras(
                        tagset, tags
                    )
                    for f in self.__fileids(fileids)
                ]
            )
        else:
            print("Unknown tagset specified.")
|
||||
486
backend/venv/Lib/site-packages/nltk/corpus/reader/nkjp.py
Normal file
486
backend/venv/Lib/site-packages/nltk/corpus/reader/nkjp.py
Normal file
@@ -0,0 +1,486 @@
|
||||
# Natural Language Toolkit: NKJP Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Gabriela Kaczka
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import functools
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
|
||||
from nltk.corpus.reader.util import concat
|
||||
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
|
||||
|
||||
|
||||
def _parse_args(fun):
|
||||
"""
|
||||
Wraps function arguments:
|
||||
if fileids not specified then function set NKJPCorpusReader paths.
|
||||
"""
|
||||
|
||||
@functools.wraps(fun)
|
||||
def decorator(self, fileids=None, **kwargs):
|
||||
if not fileids:
|
||||
fileids = self._paths
|
||||
return fun(self, fileids, **kwargs)
|
||||
|
||||
return decorator
|
||||
|
||||
|
||||
class NKJPCorpusReader(XMLCorpusReader):
    # View modes: which annotation layer of the corpus a query reads.
    WORDS_MODE = 0
    SENTS_MODE = 1
    HEADER_MODE = 2
    RAW_MODE = 3

    def __init__(self, root, fileids=".*"):
        """
        Corpus reader designed to work with National Corpus of Polish.
        See http://nkjp.pl/ for more details about NKJP.
        use example:
        import nltk
        import nkjp
        from nkjp import NKJPCorpusReader
        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='') # obtain the whole corpus
        x.header()
        x.raw()
        x.words()
        x.tagged_words(tags=['subst', 'comp']) #Link to find more tags: nkjp.pl/poliqarp/help/ense2.html
        x.sents()
        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='Wilk*') # obtain particular file(s)
        x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'])
        x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp'])
        """
        # Each NKJP document is a directory containing header.xml; the
        # reader is anchored on those header files.
        if isinstance(fileids, str):
            XMLCorpusReader.__init__(self, root, fileids + ".*/header.xml")
        else:
            XMLCorpusReader.__init__(
                self, root, [fileid + "/header.xml" for fileid in fileids]
            )
        self._paths = self.get_paths()

    def get_paths(self):
        """Return the absolute document directories (header.xml stripped)."""
        return [
            os.path.join(str(self._root), f.split("header.xml")[0])
            for f in self._fileids
        ]

    def fileids(self):
        """
        Returns a list of file identifiers for the fileids that make up
        this corpus.
        """
        return [f.split("header.xml")[0] for f in self._fileids]

    def _view(self, filename, tags=None, **kwargs):
        """
        Returns a view specialised for use with particular corpus file.

        :raises NameError: if ``mode`` is not one of the mode constants.
        """
        mode = kwargs.pop("mode", NKJPCorpusReader.WORDS_MODE)
        if mode is NKJPCorpusReader.WORDS_MODE:
            return NKJPCorpus_Morph_View(filename, tags=tags)
        elif mode is NKJPCorpusReader.SENTS_MODE:
            return NKJPCorpus_Segmentation_View(filename, tags=tags)
        elif mode is NKJPCorpusReader.HEADER_MODE:
            return NKJPCorpus_Header_View(filename, tags=tags)
        elif mode is NKJPCorpusReader.RAW_MODE:
            return NKJPCorpus_Text_View(
                filename, tags=tags, mode=NKJPCorpus_Text_View.RAW_MODE
            )
        else:
            raise NameError("No such mode!")

    def add_root(self, fileid):
        """
        Add root if necessary to specified fileid.
        """
        if self.root in fileid:
            return fileid
        return self.root + fileid

    @_parse_args
    def header(self, fileids=None, **kwargs):
        """
        Returns header(s) of specified fileids.
        """
        return concat(
            [
                self._view(
                    self.add_root(fileid), mode=NKJPCorpusReader.HEADER_MODE, **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )

    @_parse_args
    def sents(self, fileids=None, **kwargs):
        """
        Returns sentences in specified fileids.
        """
        return concat(
            [
                self._view(
                    self.add_root(fileid), mode=NKJPCorpusReader.SENTS_MODE, **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )

    @_parse_args
    def words(self, fileids=None, **kwargs):
        """
        Returns words in specified fileids.
        """
        return concat(
            [
                self._view(
                    self.add_root(fileid), mode=NKJPCorpusReader.WORDS_MODE, **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )

    @_parse_args
    def tagged_words(self, fileids=None, **kwargs):
        """
        Call with specified tags as a list, e.g. tags=['subst', 'comp'].
        Returns tagged words in specified fileids.
        """
        tags = kwargs.pop("tags", [])
        return concat(
            [
                self._view(
                    self.add_root(fileid),
                    mode=NKJPCorpusReader.WORDS_MODE,
                    tags=tags,
                    **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )

    @_parse_args
    def raw(self, fileids=None, **kwargs):
        """
        Returns raw text of specified fileids.
        """
        # NOTE: the original docstring incorrectly said "Returns words";
        # RAW_MODE produces the plain document text.
        return concat(
            [
                self._view(
                    self.add_root(fileid), mode=NKJPCorpusReader.RAW_MODE, **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )
|
||||
|
||||
|
||||
class NKJPCorpus_Header_View(XMLCorpusView):
    def __init__(self, filename, **kwargs):
        """
        HEADER_MODE
        A stream backed corpus view specialized for use with
        header.xml files in NKJP corpus.
        """
        self.tagspec = ".*/sourceDesc$"
        XMLCorpusView.__init__(self, filename + "header.xml", self.tagspec)

    def handle_query(self):
        """Read every sourceDesc block in the file and return the entries."""
        self._open()
        header = []
        while True:
            segm = XMLCorpusView.read_block(self, self._stream)
            if len(segm) == 0:
                break
            header.extend(segm)
        self.close()
        return header

    @staticmethod
    def _joined_text(elt, path):
        # Join the stripped text of all matching child elements.  When there
        # are no matches, return [] — this preserves the original behavior
        # of leaving missing bibliographic fields as an empty list.
        found = elt.findall(path)
        if found:
            return "\n".join(e.text.strip() for e in found)
        return []

    def handle_elt(self, elt, context):
        """Extract the bibliographic fields of a sourceDesc element."""
        # The original repeated the find/join stanza six times; the single
        # helper above replaces that duplication.
        return {
            "title": self._joined_text(elt, "bibl/title"),
            "author": self._joined_text(elt, "bibl/author"),
            "date": self._joined_text(elt, "bibl/date"),
            "publisher": self._joined_text(elt, "bibl/publisher"),
            "idno": self._joined_text(elt, "bibl/idno"),
            "note": self._joined_text(elt, "bibl/note"),
        }
|
||||
|
||||
|
||||
class XML_Tool:
    """
    Helper class creating xml file to one without references to nkjp: namespace.
    That's needed because the XMLCorpusView assumes that one can find short substrings
    of XML that are valid XML, which is not true if a namespace is declared at top level
    """

    # Patterns stripped from the input.  Each match is replaced by a single
    # space, which reproduces the original split-then-join behavior.
    _PATTERNS = (
        r"nkjp:[^ ]* ",  # namespaced attributes, in all files
        "<nkjp:paren>",  # in ann_segmentation.xml
        "</nkjp:paren>",  # in ann_segmentation.xml
        "<choice>",  # in ann_segmentation.xml
        "</choice>",  # in ann_segmentation.xml
    )

    def __init__(self, root, filename):
        self.read_file = os.path.join(root, filename)
        # Text mode ("w") is required: the preprocessed content is written
        # as str, and the original default binary mode made every write
        # raise TypeError.
        self.write_file = tempfile.NamedTemporaryFile("w", delete=False)

    def build_preprocessed_file(self):
        """
        Copy ``read_file`` into the temp file with all nkjp: namespace
        references removed, and return the temp file's path.
        """
        try:
            fw = self.write_file
            with open(self.read_file) as fr:
                for line in fr:
                    for pattern in self._PATTERNS:
                        line = " ".join(re.split(pattern, line))
                    fw.write(line)
            fw.close()
            return self.write_file.name
        except Exception:
            # Don't leave a half-written temp file behind; re-raise the
            # original error (the original code raised a bare ``Exception``
            # chained to it, discarding the message and type).
            self.remove_preprocessed_file()
            raise

    def remove_preprocessed_file(self):
        """Delete the temporary preprocessed file."""
        os.remove(self.write_file.name)
|
||||
|
||||
|
||||
class NKJPCorpus_Segmentation_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    ann_segmentation.xml files in NKJP corpus.
    """

    def __init__(self, filename, **kwargs):
        self.tagspec = ".*p/.*s"
        # intersperse NKJPCorpus_Text_View: the segmentation layer stores
        # only character ranges; the text itself comes from text.xml
        self.text_view = NKJPCorpus_Text_View(
            filename, mode=NKJPCorpus_Text_View.SENTS_MODE
        )
        self.text_view.handle_query()
        # xml preprocessing
        self.xml_tool = XML_Tool(filename, "ann_segmentation.xml")
        # base class init
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )

    def get_segm_id(self, example_word):
        """Return the text-segment id from a ``string-range(...)`` pointer."""
        return example_word.split("(")[1].split(",")[0]

    def get_sent_beg(self, beg_word):
        # returns index of beginning letter in sentence
        return int(beg_word.split(",")[1])

    def get_sent_end(self, end_word):
        # returns index of end letter in sentence: offset + length of the
        # final word pointer
        splitted = end_word.split(")")[0].split(",")
        return int(splitted[1]) + int(splitted[2])

    def get_sentences(self, sent_segm):
        """Return the sentence text spanned by the given word pointers."""
        # ``segm_id`` renamed from ``id`` to avoid shadowing the builtin.
        segm_id = self.get_segm_id(sent_segm[0])
        segm = self.text_view.segm_dict[segm_id]  # text segment
        beg = self.get_sent_beg(sent_segm[0])
        end = self.get_sent_end(sent_segm[len(sent_segm) - 1])
        return segm[beg:end]

    def remove_choice(self, segm):
        """Drop alternative segmentations, keeping the first possibility."""
        ret = []
        prev_txt_end = -1
        prev_txt_nr = -1
        for word in segm:
            txt_nr = self.get_segm_id(word)
            # get increasing sequence of ids: in case of choice get first possibility
            if self.get_sent_beg(word) > prev_txt_end - 1 or prev_txt_nr != txt_nr:
                ret.append(word)
                prev_txt_end = self.get_sent_end(word)
            prev_txt_nr = txt_nr

        return ret

    def handle_query(self):
        """Return the file's sentences as a list of strings."""
        try:
            self._open()
            sentences = []
            while True:
                sent_segm = XMLCorpusView.read_block(self, self._stream)
                if len(sent_segm) == 0:
                    break
                for segm in sent_segm:
                    segm = self.remove_choice(segm)
                    sentences.append(self.get_sentences(segm))
            self.close()
            self.xml_tool.remove_preprocessed_file()
            return sentences
        except Exception:
            # Clean up the temp file, then re-raise the original error
            # (``raise Exception from e`` in the original discarded the
            # actual exception type and message).
            self.xml_tool.remove_preprocessed_file()
            raise

    def handle_elt(self, elt, context):
        """Collect the ``corresp`` pointer of every segment in a sentence."""
        return [seg.get("corresp") for seg in elt]
|
||||
|
||||
|
||||
class NKJPCorpus_Text_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    text.xml files in NKJP corpus.
    """

    # SENTS_MODE additionally fills ``segm_dict`` for the segmentation
    # view; RAW_MODE just returns the document text.
    SENTS_MODE = 0
    RAW_MODE = 1

    def __init__(self, filename, **kwargs):
        self.mode = kwargs.pop("mode", 0)
        self.tagspec = ".*/div/ab"
        # segment id -> segment text; populated in SENTS_MODE
        self.segm_dict = dict()
        # xml preprocessing
        self.xml_tool = XML_Tool(filename, "text.xml")
        # base class init
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )

    def handle_query(self):
        """Read the whole file and return its text."""
        try:
            self._open()
            x = self.read_block(self._stream)
            self.close()
            self.xml_tool.remove_preprocessed_file()
            return x
        except Exception:
            # Clean up the temp file, then re-raise the original error
            # (``raise Exception from e`` in the original discarded the
            # actual exception type and message).
            self.xml_tool.remove_preprocessed_file()
            raise

    def read_block(self, stream, tagspec=None, elt_handler=None):
        """
        Returns text as a list of sentences.
        """
        txt = []
        while True:
            segm = XMLCorpusView.read_block(self, stream)
            if len(segm) == 0:
                break
            txt.extend(segm)

        return [" ".join(txt)]

    def get_segm_id(self, elt):
        """Return the element's id attribute, whatever prefix it carries."""
        for attr in elt.attrib:
            if attr.endswith("id"):
                return elt.get(attr)

    def handle_elt(self, elt, context):
        # fill dictionary to use later in sents mode
        if self.mode is NKJPCorpus_Text_View.SENTS_MODE:
            self.segm_dict[self.get_segm_id(elt)] = elt.text
        return elt.text
|
||||
|
||||
|
||||
class NKJPCorpus_Morph_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    ann_morphosyntax.xml files in NKJP corpus.
    """

    def __init__(self, filename, **kwargs):
        # Optional list of morphosyntactic tags used to filter words.
        self.tags = kwargs.pop("tags", None)
        self.tagspec = ".*/seg/fs"
        self.xml_tool = XML_Tool(filename, "ann_morphosyntax.xml")
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )

    def handle_query(self):
        """Return all (tag-filtered) words found in the file."""
        try:
            self._open()
            words = []
            while True:
                segm = XMLCorpusView.read_block(self, self._stream)
                if len(segm) == 0:
                    break
                for part in segm:
                    # handle_elt returns None for filtered-out words
                    if part is not None:
                        words.append(part)
            self.close()
            self.xml_tool.remove_preprocessed_file()
            return words
        except Exception:
            # Clean up the temp file, then re-raise the original error
            # (``raise Exception from e`` in the original discarded the
            # actual exception type and message).
            self.xml_tool.remove_preprocessed_file()
            raise

    def handle_elt(self, elt, context):
        """
        Return the word of one <seg>/<fs> element, or None when the word
        is punctuation ("interp") or does not match ``self.tags``.
        """
        word = ""
        flag = False
        is_not_interp = True
        # if tags not specified, then always return word
        if self.tags is None:
            flag = True

        for child in elt:
            # get word
            if "name" in child.keys() and child.attrib["name"] == "orth":
                for symbol in child:
                    if symbol.tag == "string":
                        word = symbol.text
            elif "name" in child.keys() and child.attrib["name"] == "interps":
                for symbol in child:
                    if "type" in symbol.keys() and symbol.attrib["type"] == "lex":
                        for symbol2 in symbol:
                            if (
                                "name" in symbol2.keys()
                                and symbol2.attrib["name"] == "ctag"
                            ):
                                for symbol3 in symbol2:
                                    if (
                                        "value" in symbol3.keys()
                                        and self.tags is not None
                                        and symbol3.attrib["value"] in self.tags
                                    ):
                                        flag = True
                                    elif (
                                        "value" in symbol3.keys()
                                        and symbol3.attrib["value"] == "interp"
                                    ):
                                        is_not_interp = False
        if flag and is_not_interp:
            return word
|
||||
465
backend/venv/Lib/site-packages/nltk/corpus/reader/nombank.py
Normal file
465
backend/venv/Lib/site-packages/nltk/corpus/reader/nombank.py
Normal file
@@ -0,0 +1,465 @@
|
||||
# Natural Language Toolkit: NomBank Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Authors: Paul Bedaride <paul.bedaride@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from functools import total_ordering
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.internals import raise_unorderable_types
|
||||
from nltk.tree import Tree
|
||||
|
||||
|
||||
class NombankCorpusReader(CorpusReader):
    """
    Corpus reader for the nombank corpus, which augments the Penn
    Treebank with information about the predicate argument structure
    of every noun instance.  The corpus consists of two parts: the
    predicate-argument annotations themselves, and a set of "frameset
    files" which define the argument labels used by the annotations,
    on a per-noun basis.  Each "frameset file" contains one or more
    predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
    divided into coarse-grained word senses called "rolesets".  For
    each "roleset", the frameset file provides descriptions of the
    argument roles, along with examples.
    """

    def __init__(
        self,
        root,
        nomfile,
        framefiles="",
        nounsfile=None,
        parse_fileid_xform=None,
        parse_corpus=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param nomfile: The name of the file containing the predicate-
            argument annotations (relative to ``root``).
        :param framefiles: A list or regexp specifying the frameset
            fileids for this corpus.
        :param parse_fileid_xform: A transform that should be applied
            to the fileids in this corpus.  This should be a function
            of one argument (a fileid) that returns a string (the new
            fileid).
        :param parse_corpus: The corpus containing the parse trees
            corresponding to this corpus.  These parse trees are
            necessary to resolve the tree pointers used by nombank.
        """
        # If framefiles is specified as a regexp, expand it.
        # BUG FIX: the original stored the expansion in ``self._fileids``
        # and then unconditionally overwrote it with ``list(framefiles)``,
        # which for a regexp string produced a list of single characters.
        if isinstance(framefiles, str):
            framefiles = find_corpus_fileids(root, framefiles)
        framefiles = list(framefiles)
        # Initialize the corpus reader.
        CorpusReader.__init__(self, root, framefiles, encoding)

        # Record our nom file & nouns file.
        self._nomfile = nomfile
        self._nounsfile = nounsfile
        self._parse_fileid_xform = parse_fileid_xform
        self._parse_corpus = parse_corpus

    def instances(self, baseform=None):
        """
        :return: a corpus view that acts as a list of
            ``NombankInstance`` objects, one for each noun in the corpus.
        """
        kwargs = {}
        if baseform is not None:
            kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
        return StreamBackedCorpusView(
            self.abspath(self._nomfile),
            lambda stream: self._read_instance_block(stream, **kwargs),
            encoding=self.encoding(self._nomfile),
        )

    def lines(self):
        """
        :return: a corpus view that acts as a list of strings, one for
            each line in the predicate-argument annotation file.
        """
        return StreamBackedCorpusView(
            self.abspath(self._nomfile),
            read_line_block,
            encoding=self.encoding(self._nomfile),
        )

    def roleset(self, roleset_id):
        """
        :return: the xml description for the given roleset.
        :raises ValueError: if the frameset file or the roleset is missing.
        """
        baseform = roleset_id.split(".")[0]
        baseform = baseform.replace("perc-sign", "%")
        baseform = baseform.replace("oneslashonezero", "1/10").replace(
            "1/10", "1-slash-10"
        )
        framefile = "frames/%s.xml" % baseform
        if framefile not in self.fileids():
            raise ValueError("Frameset file for %s not found" % roleset_id)

        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        with self.abspath(framefile).open() as fp:
            etree = ElementTree.parse(fp).getroot()
        for roleset in etree.findall("predicate/roleset"):
            if roleset.attrib["id"] == roleset_id:
                return roleset
        raise ValueError(f"Roleset {roleset_id} not found in {framefile}")

    def rolesets(self, baseform=None):
        """
        :return: list of xml descriptions for rolesets.
        """
        if baseform is not None:
            framefile = "frames/%s.xml" % baseform
            if framefile not in self.fileids():
                raise ValueError("Frameset file for %s not found" % baseform)
            framefiles = [framefile]
        else:
            framefiles = self.fileids()

        rsets = []
        for framefile in framefiles:
            # n.b.: The encoding for XML fileids is specified by the file
            # itself; so we ignore self._encoding here.
            with self.abspath(framefile).open() as fp:
                etree = ElementTree.parse(fp).getroot()
            rsets.append(etree.findall("predicate/roleset"))
        return LazyConcatenation(rsets)

    def nouns(self):
        """
        :return: a corpus view that acts as a list of all noun lemmas
            in this corpus (from the nombank.1.0.words file).
        """
        return StreamBackedCorpusView(
            self.abspath(self._nounsfile),
            read_line_block,
            encoding=self.encoding(self._nounsfile),
        )

    def _read_instance_block(self, stream, instance_filter=lambda inst: True):
        """Read up to 100 instances from ``stream``, applying the filter."""
        block = []

        # Read 100 at a time.
        for _ in range(100):
            line = stream.readline().strip()
            if line:
                inst = NombankInstance.parse(
                    line, self._parse_fileid_xform, self._parse_corpus
                )
                if instance_filter(inst):
                    block.append(inst)

        return block
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Nombank Instance & related datatypes
|
||||
######################################################################
|
||||
|
||||
|
||||
class NombankInstance:
    """One predicate-argument annotation line from the nombank corpus."""

    def __init__(
        self,
        fileid,
        sentnum,
        wordnum,
        baseform,
        sensenumber,
        predicate,
        predid,
        arguments,
        parse_corpus=None,
    ):
        self.fileid = fileid
        """The name of the file containing the parse tree for this
        instance's sentence."""

        self.sentnum = sentnum
        """The sentence number of this sentence within ``fileid``.
        Indexing starts from zero."""

        self.wordnum = wordnum
        """The word number of this instance's predicate within its
        containing sentence.  Word numbers are indexed starting from
        zero, and include traces and other empty parse elements."""

        self.baseform = baseform
        """The baseform of the predicate."""

        self.sensenumber = sensenumber
        """The sense number of the predicate."""

        self.predicate = predicate
        """A ``NombankTreePointer`` indicating the position of this
        instance's predicate within its containing sentence."""

        self.predid = predid
        """Identifier of the predicate."""

        self.arguments = tuple(arguments)
        """A list of tuples (argloc, argid), specifying the location
        and identifier for each of the predicate's argument in the
        containing sentence.  Argument identifiers are strings such as
        ``'ARG0'`` or ``'ARGM-TMP'``.  This list does *not* contain
        the predicate."""

        self.parse_corpus = parse_corpus
        """A corpus reader for the parse trees corresponding to the
        instances in this nombank corpus."""

    @property
    def roleset(self):
        """The name of the roleset used by this instance's predicate.
        Use ``nombank.roleset() <NombankCorpusReader.roleset>`` to
        look up information about the roleset."""
        r = self.baseform.replace("%", "perc-sign")
        r = r.replace("1/10", "1-slash-10").replace("1-slash-10", "oneslashonezero")
        return f"{r}.{self.sensenumber}"

    def __repr__(self):
        return "<NombankInstance: {}, sent {}, word {}>".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
        )

    def __str__(self):
        s = "{} {} {} {} {}".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
            self.baseform,
            self.sensenumber,
        )
        items = self.arguments + ((self.predicate, "rel"),)
        for argloc, argid in sorted(items):
            s += f" {argloc}-{argid}"
        return s

    def _get_tree(self):
        # Returns None when no parse corpus is attached or the file is
        # missing from it, rather than raising.
        if self.parse_corpus is None:
            return None
        if self.fileid not in self.parse_corpus.fileids():
            return None
        return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]

    tree = property(
        _get_tree,
        doc="""
        The parse tree corresponding to this instance, or None if
        the corresponding tree is not available.""",
    )

    @staticmethod
    def parse(s, parse_fileid_xform=None, parse_corpus=None):
        """Parse one line of the nombank annotation file.

        :raises ValueError: if the line is malformed.
        """
        pieces = s.split()
        if len(pieces) < 6:
            raise ValueError("Badly formatted nombank line: %r" % s)

        # Divide the line into its basic pieces.
        (fileid, sentnum, wordnum, baseform, sensenumber) = pieces[:5]

        args = pieces[5:]
        # Separate the predicate ("-rel") entry from the arguments.
        # BUG FIX: the original popped from ``args`` while enumerating it,
        # which skipped the element that followed each match.
        rel = [p for p in args if "-rel" in p]
        args = [p for p in args if "-rel" not in p]
        if len(rel) != 1:
            raise ValueError("Badly formatted nombank line: %r" % s)

        # Apply the fileid selector, if any.
        if parse_fileid_xform is not None:
            fileid = parse_fileid_xform(fileid)

        # Convert sentence & word numbers to ints.
        sentnum = int(sentnum)
        wordnum = int(wordnum)

        # Parse the predicate location.
        predloc, predid = rel[0].split("-", 1)
        predicate = NombankTreePointer.parse(predloc)

        # Parse the arguments.
        arguments = []
        for arg in args:
            argloc, argid = arg.split("-", 1)
            arguments.append((NombankTreePointer.parse(argloc), argid))

        # Put it all together.
        return NombankInstance(
            fileid,
            sentnum,
            wordnum,
            baseform,
            sensenumber,
            predicate,
            predid,
            arguments,
            parse_corpus,
        )
|
||||
|
||||
|
||||
class NombankPointer:
    """
    A pointer used by nombank to identify one or more constituents in
    a parse tree.  ``NombankPointer`` is an abstract base class with
    three concrete subclasses:

    - ``NombankTreePointer`` points at a single constituent.
    - ``NombankSplitTreePointer`` points at a 'split' constituent,
      i.e. a sequence of two or more ``NombankTreePointer`` values.
    - ``NombankChainTreePointer`` points at an entire trace chain in a
      tree; its pieces may be ``NombankTreePointer`` or
      ``NombankSplitTreePointer`` values.
    """

    def __init__(self):
        # Abstract: only the concrete subclasses may be instantiated.
        if self.__class__ is NombankPointer:
            raise NotImplementedError()


class NombankChainTreePointer(NombankPointer):
    def __init__(self, pieces):
        # Pieces making up this chain; each is a NombankSplitTreePointer
        # or a NombankTreePointer.
        self.pieces = pieces

    def __str__(self):
        return "*".join("%s" % piece for piece in self.pieces)

    def __repr__(self):
        return "<NombankChainTreePointer: %s>" % self

    def select(self, tree):
        """Return a ``*CHAIN*`` tree whose children are the selected pieces."""
        if tree is None:
            raise ValueError("Parse tree not available")
        children = [piece.select(tree) for piece in self.pieces]
        return Tree("*CHAIN*", children)


class NombankSplitTreePointer(NombankPointer):
    def __init__(self, pieces):
        # Pieces making up this split constituent; all are
        # NombankTreePointer values.
        self.pieces = pieces

    def __str__(self):
        return ",".join("%s" % piece for piece in self.pieces)

    def __repr__(self):
        return "<NombankSplitTreePointer: %s>" % self

    def select(self, tree):
        """Return a ``*SPLIT*`` tree whose children are the selected pieces."""
        if tree is None:
            raise ValueError("Parse tree not available")
        children = [piece.select(tree) for piece in self.pieces]
        return Tree("*SPLIT*", children)


@total_ordering
class NombankTreePointer(NombankPointer):
    """
    A single-constituent pointer of the form ``wordnum:height``.
    Compound forms are ``wordnum:height*wordnum:height*...`` (chains)
    and ``wordnum:height,wordnum:height`` (splits).
    """

    def __init__(self, wordnum, height):
        self.wordnum = wordnum
        self.height = height

    @staticmethod
    def parse(s):
        """Parse a pointer string into the appropriate pointer object."""
        # Chains (xx*yy*zz) are checked first, then splits (xx,yy,zz).
        chain = s.split("*")
        if len(chain) > 1:
            return NombankChainTreePointer(
                [NombankTreePointer.parse(part) for part in chain]
            )

        split = s.split(",")
        if len(split) > 1:
            return NombankSplitTreePointer(
                [NombankTreePointer.parse(part) for part in split]
            )

        # Plain wordnum:height pointer.
        parts = s.split(":")
        if len(parts) != 2:
            raise ValueError("bad nombank pointer %r" % s)
        return NombankTreePointer(int(parts[0]), int(parts[1]))

    def __str__(self):
        return f"{self.wordnum}:{self.height}"

    def __repr__(self):
        return "NombankTreePointer(%d, %d)" % (self.wordnum, self.height)

    @staticmethod
    def _leftmost_leaf_pointer(other):
        # Chain/split pointers compare via their first piece.
        while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)):
            other = other.pieces[0]
        return other

    def __eq__(self, other):
        other = self._leftmost_leaf_pointer(other)
        if not isinstance(other, NombankTreePointer):
            return self is other
        return (self.wordnum, self.height) == (other.wordnum, other.height)

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        other = self._leftmost_leaf_pointer(other)
        if not isinstance(other, NombankTreePointer):
            return id(self) < id(other)
        return (self.wordnum, -self.height) < (other.wordnum, -other.height)

    def select(self, tree):
        """Return the constituent this pointer picks out of ``tree``."""
        if tree is None:
            raise ValueError("Parse tree not available")
        return tree[self.treepos(tree)]

    def treepos(self, tree):
        """
        Convert this pointer to a standard 'tree position' pointer,
        given that it points to the given tree.
        """
        if tree is None:
            raise ValueError("Parse tree not available")
        # Depth-first walk, counting leaves until self.wordnum is reached,
        # then strip ``height + 1`` trailing indices from the path.
        stack = [tree]
        treepos = []
        wordnum = 0
        while True:
            if isinstance(stack[-1], Tree):
                # Select the next child of the node on top of the stack.
                if len(treepos) < len(stack):
                    treepos.append(0)
                else:
                    treepos[-1] += 1
                # Update the stack.
                if treepos[-1] < len(stack[-1]):
                    stack.append(stack[-1][treepos[-1]])
                else:
                    # End of node's child list: pop up a level.
                    stack.pop()
                    treepos.pop()
            else:
                # Leaf (word) node.
                if wordnum == self.wordnum:
                    return tuple(treepos[: len(treepos) - self.height - 1])
                wordnum += 1
                stack.pop()
|
||||
@@ -0,0 +1,90 @@
|
||||
# Natural Language Toolkit: NPS Chat Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import re
|
||||
import textwrap
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.xmldocs import *
|
||||
from nltk.internals import ElementWrapper
|
||||
from nltk.tag import map_tag
|
||||
from nltk.util import LazyConcatenation
|
||||
|
||||
|
||||
class NPSChatCorpusReader(XMLCorpusReader):
    """Reader for the NPS Chat corpus (XML chat-session transcripts)."""

    def __init__(self, root, fileids, wrap_etree=False, tagset=None):
        XMLCorpusReader.__init__(self, root, fileids, wrap_etree)
        # Tagset used by the corpus annotations; tagged_posts()/tagged_words()
        # map to another tagset on request.
        self._tagset = tagset

    def xml_posts(self, fileids=None):
        """Return each post as an XML element (wrapped if requested)."""
        if self._wrap_etree:
            return concat(
                [
                    XMLCorpusView(fileid, "Session/Posts/Post", self._wrap_elt)
                    for fileid in self.abspaths(fileids)
                ]
            )
        else:
            return concat(
                [
                    XMLCorpusView(fileid, "Session/Posts/Post")
                    for fileid in self.abspaths(fileids)
                ]
            )

    def posts(self, fileids=None):
        """Return each post as a list of words."""
        return concat(
            [
                XMLCorpusView(
                    fileid, "Session/Posts/Post/terminals", self._elt_to_words
                )
                for fileid in self.abspaths(fileids)
            ]
        )

    def tagged_posts(self, fileids=None, tagset=None):
        """Return each post as a list of (word, tag) tuples."""

        def reader(elt, handler):
            return self._elt_to_tagged_words(elt, handler, tagset)

        return concat(
            [
                XMLCorpusView(fileid, "Session/Posts/Post/terminals", reader)
                for fileid in self.abspaths(fileids)
            ]
        )

    def words(self, fileids=None):
        """Return all words with post boundaries flattened away."""
        return LazyConcatenation(self.posts(fileids))

    def tagged_words(self, fileids=None, tagset=None):
        """Return all (word, tag) tuples with post boundaries flattened away."""
        return LazyConcatenation(self.tagged_posts(fileids, tagset))

    def _wrap_elt(self, elt, handler):
        return ElementWrapper(elt)

    def _elt_to_words(self, elt, handler):
        return [self._simplify_username(t.attrib["word"]) for t in elt.findall("t")]

    def _elt_to_tagged_words(self, elt, handler, tagset=None):
        tagged_post = [
            (self._simplify_username(t.attrib["word"]), t.attrib["pos"])
            for t in elt.findall("t")
        ]
        if tagset and tagset != self._tagset:
            tagged_post = [
                (w, map_tag(self._tagset, tagset, t)) for (w, t) in tagged_post
            ]
        return tagged_post

    @staticmethod
    def _simplify_username(word):
        """Shorten session-specific usernames, e.g. '10-19-20sUser7' -> 'U7'."""
        # BUG FIX: decode bytes first.  The original tested ``"User" in word``
        # before the bytes branch, so a bytes word raised TypeError and the
        # decode branch was unreachable.
        if isinstance(word, bytes):
            word = word.decode("ascii")
        if "User" in word:
            word = "U" + word.split("User", 1)[1]
        return word
|
||||
@@ -0,0 +1,125 @@
|
||||
# Natural Language Toolkit: Opinion Lexicon Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
CorpusReader for the Opinion Lexicon.
|
||||
|
||||
Opinion Lexicon information
|
||||
===========================
|
||||
|
||||
Authors: Minqing Hu and Bing Liu, 2004.
|
||||
Department of Computer Science
|
||||
University of Illinois at Chicago
|
||||
|
||||
Contact: Bing Liu, liub@cs.uic.edu
|
||||
https://www.cs.uic.edu/~liub
|
||||
|
||||
Distributed with permission.
|
||||
|
||||
Related papers:
|
||||
|
||||
- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
|
||||
Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery
|
||||
& Data Mining (KDD-04), Aug 22-25, 2004, Seattle, Washington, USA.
|
||||
|
||||
- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and
|
||||
Comparing Opinions on the Web". Proceedings of the 14th International World
|
||||
Wide Web conference (WWW-2005), May 10-14, 2005, Chiba, Japan.
|
||||
"""
|
||||
|
||||
from nltk.corpus.reader import WordListCorpusReader
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
|
||||
class IgnoreReadmeCorpusView(StreamBackedCorpusView):
    """Corpus view that skips the leading readme block of each file."""

    def __init__(self, *args, **kwargs):
        StreamBackedCorpusView.__init__(self, *args, **kwargs)
        # Open the underlying stream up front and consume the readme
        # header (everything up to the first blank-line boundary), so
        # that the recorded starting file position lies past it.
        self._open()
        read_blankline_block(self._stream)
        self._filepos = [self._stream.tell()]
|
||||
|
||||
|
||||
class OpinionLexiconCorpusReader(WordListCorpusReader):
    """
    Reader for Liu and Hu opinion lexicon. Blank lines and readme are ignored.

        >>> from nltk.corpus import opinion_lexicon
        >>> opinion_lexicon.words()
        ['2-faced', '2-faces', 'abnormal', 'abolish', ...]

    The OpinionLexiconCorpusReader provides shortcuts to retrieve positive/negative
    words:

        >>> opinion_lexicon.negative()
        ['2-faced', '2-faces', 'abnormal', 'abolish', ...]

    Note that words from `words()` method are sorted by file id, not alphabetically:

        >>> opinion_lexicon.words()[0:10] # doctest: +NORMALIZE_WHITESPACE
        ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably',
        'abominate', 'abomination', 'abort', 'aborted']
        >>> sorted(opinion_lexicon.words())[0:10] # doctest: +NORMALIZE_WHITESPACE
        ['2-faced', '2-faces', 'a+', 'abnormal', 'abolish', 'abominable', 'abominably',
        'abominate', 'abomination', 'abort']
    """

    # View class that skips each file's readme header before reading words.
    CorpusView = IgnoreReadmeCorpusView

    def words(self, fileids=None):
        """
        Return all words in the opinion lexicon. Note that these words are not
        sorted in alphabetical order.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def positive(self):
        """
        Return all positive words in alphabetical order.

        :return: a list of positive words.
        :rtype: list(str)
        """
        return self.words("positive-words.txt")

    def negative(self):
        """
        Return all negative words in alphabetical order.

        :return: a list of negative words.
        :rtype: list(str)
        """
        return self.words("negative-words.txt")

    def _read_word_block(self, stream):
        """Read up to 20 lines from *stream*, one lexicon word per line.

        Fixes relative to the previous version: stop at end-of-file instead
        of uselessly polling the exhausted stream (``readline`` returns ``''``
        at EOF), and skip blank lines instead of appending empty strings, as
        promised by the class docstring.
        """
        words = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:  # '' signals EOF -- no more data in this block.
                break
            word = line.strip()
            if word:  # Ignore blank lines, per the class contract.
                words.append(word)
        return words
|
||||
174
backend/venv/Lib/site-packages/nltk/corpus/reader/panlex_lite.py
Normal file
174
backend/venv/Lib/site-packages/nltk/corpus/reader/panlex_lite.py
Normal file
@@ -0,0 +1,174 @@
|
||||
# Natural Language Toolkit: PanLex Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: David Kamholz <kamholz@panlex.org>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
CorpusReader for PanLex Lite, a stripped down version of PanLex distributed
|
||||
as an SQLite database. See the README.txt in the panlex_lite corpus directory
|
||||
for more information on PanLex Lite.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sqlite3
|
||||
|
||||
from nltk.corpus.reader.api import CorpusReader
|
||||
|
||||
|
||||
class PanLexLiteCorpusReader(CorpusReader):
    """Reader for PanLex Lite, distributed as an SQLite database (db.sqlite).

    Provides lookups of language varieties, meanings, and translations via
    parameterized SQL queries against the ``lv``, ``dnx`` and ``ex`` tables.
    """

    # Meanings that share an expression: every other expression (dnx2/ex2)
    # attached to a meaning that also contains the queried expression.
    MEANING_Q = """
        SELECT dnx2.mn, dnx2.uq, dnx2.ap, dnx2.ui, ex2.tt, ex2.lv
        FROM dnx
        JOIN ex ON (ex.ex = dnx.ex)
        JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
        JOIN ex ex2 ON (ex2.ex = dnx2.ex)
        WHERE dnx.ex != dnx2.ex AND ex.tt = ? AND ex.lv = ?
        ORDER BY dnx2.uq DESC
    """

    # Translations of an expression into one target variety, scored by
    # summing, over sources, the best quality linking the two expressions.
    TRANSLATION_Q = """
        SELECT s.tt, sum(s.uq) AS trq FROM (
            SELECT ex2.tt, max(dnx.uq) AS uq
            FROM dnx
            JOIN ex ON (ex.ex = dnx.ex)
            JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
            JOIN ex ex2 ON (ex2.ex = dnx2.ex)
            WHERE dnx.ex != dnx2.ex AND ex.lv = ? AND ex.tt = ? AND ex2.lv = ?
            GROUP BY ex2.tt, dnx.ui
        ) s
        GROUP BY s.tt
        ORDER BY trq DESC, s.tt
    """

    def __init__(self, root):
        # Only the cursor is kept; it holds a reference to the connection,
        # which therefore stays open for the reader's lifetime.
        self._c = sqlite3.connect(os.path.join(root, "db.sqlite")).cursor()

        # Bidirectional maps between seven-character uniform identifiers
        # (uid) and internal language-variety ids (lv).
        self._uid_lv = {}
        self._lv_uid = {}

        for row in self._c.execute("SELECT uid, lv FROM lv"):
            self._uid_lv[row[0]] = row[1]
            self._lv_uid[row[1]] = row[0]

    def language_varieties(self, lc=None):
        """
        Return a list of PanLex language varieties.

        :param lc: ISO 639 alpha-3 code. If specified, filters returned varieties
            by this code. If unspecified, all varieties are returned.
        :return: the specified language varieties as a list of tuples. The first
            element is the language variety's seven-character uniform identifier,
            and the second element is its default name.
        :rtype: list(tuple)
        """

        if lc is None:
            return self._c.execute("SELECT uid, tt FROM lv ORDER BY uid").fetchall()
        else:
            return self._c.execute(
                "SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid", (lc,)
            ).fetchall()

    def meanings(self, expr_uid, expr_tt):
        """
        Return a list of meanings for an expression.

        :param expr_uid: the expression's language variety, as a seven-character
            uniform identifier.
        :param expr_tt: the expression's text.
        :return: a list of Meaning objects.
        :rtype: list(Meaning)
        """

        expr_lv = self._uid_lv[expr_uid]

        # Accumulate one attribute dict per meaning id as rows stream in.
        mn_info = {}

        for i in self._c.execute(self.MEANING_Q, (expr_tt, expr_lv)):
            mn = i[0]
            uid = self._lv_uid[i[5]]

            if mn not in mn_info:
                mn_info[mn] = {
                    "uq": i[1],
                    "ap": i[2],
                    "ui": i[3],
                    "ex": {expr_uid: [expr_tt]},
                }

            if uid not in mn_info[mn]["ex"]:
                mn_info[mn]["ex"][uid] = []

            mn_info[mn]["ex"][uid].append(i[4])

        return [Meaning(mn, mn_info[mn]) for mn in mn_info]

    def translations(self, from_uid, from_tt, to_uid):
        """
        Return a list of translations for an expression into a single language
        variety.

        :param from_uid: the source expression's language variety, as a
            seven-character uniform identifier.
        :param from_tt: the source expression's text.
        :param to_uid: the target language variety, as a seven-character
            uniform identifier.
        :return: a list of translation tuples. The first element is the expression
            text and the second element is the translation quality.
        :rtype: list(tuple)
        """

        from_lv = self._uid_lv[from_uid]
        to_lv = self._uid_lv[to_uid]

        return self._c.execute(self.TRANSLATION_Q, (from_lv, from_tt, to_lv)).fetchall()
|
||||
|
||||
|
||||
class Meaning(dict):
    """A single PanLex meaning: a translation set derived from one source.

    Behaves as a plain dict of the meaning's attributes, with the meaning
    id stored under the ``"mn"`` key and convenience accessors for the
    standard attributes.
    """

    def __init__(self, mn, attr):
        # Seed the dict with the attribute mapping, then record the id.
        super().__init__(**attr)
        self["mn"] = mn

    def id(self):
        """The meaning's id (int)."""
        return self["mn"]

    def quality(self):
        """The quality of the meaning's source, 0 (worst) to 9 (best) (int)."""
        return self["uq"]

    def source(self):
        """The id of the meaning's source (int)."""
        return self["ap"]

    def source_group(self):
        """The id of the meaning's source group (int)."""
        return self["ui"]

    def expressions(self):
        """Dict mapping language-variety uniform identifiers to lists of
        expression texts for this meaning."""
        return self["ex"]
|
||||
@@ -0,0 +1,95 @@
|
||||
# Natural Language Toolkit: Word List Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
|
||||
import re
|
||||
from collections import defaultdict, namedtuple
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.wordlist import WordListCorpusReader
|
||||
from nltk.tokenize import line_tokenize
|
||||
|
||||
# Record describing one PanLex language variety, as listed (tab-separated,
# one variety per row) in the langs{n}.txt metadata file.
PanlexLanguage = namedtuple(
    "PanlexLanguage",
    (
        "panlex_uid",  # (1) PanLex UID
        "iso639",  # (2) ISO 639 language code
        "iso639_type",  # (3) ISO 639 language type, see README
        "script",  # (4) normal scripts of expressions
        "name",  # (5) PanLex default name
        "langvar_uid",  # (6) UID of the language variety in which the default name is an expression
    ),
)
|
||||
|
||||
|
||||
class PanlexSwadeshCorpusReader(WordListCorpusReader):
    """
    This is a class to read the PanLex Swadesh list from

    David Kamholz, Jonathan Pool, and Susan M. Colowick (2014).
    PanLex: Building a Resource for Panlingual Lexical Translation.
    In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf

    License: CC0 1.0 Universal
    https://creativecommons.org/publicdomain/zero/1.0/legalcode
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Find the swadesh size using the fileids' path.
        self.swadesh_size = re.match(r"swadesh([0-9].*)\/", self.fileids()[0]).group(1)
        self._languages = {lang.panlex_uid: lang for lang in self.get_languages()}
        self._macro_languages = self.get_macrolanguages()
        # Backward-compatible alias for the historical misspelling of this
        # (private) attribute name.
        self._macro_langauges = self._macro_languages

    def license(self):
        """Return the corpus license string."""
        return "CC0 1.0 Universal"

    def language_codes(self):
        """Return the PanLex UIDs of all languages in the list."""
        return self._languages.keys()

    def get_languages(self):
        """Yield a ``PanlexLanguage`` record for each row of the langs metadata file."""
        for line in self.raw(f"langs{self.swadesh_size}.txt").split("\n"):
            if not line.strip():  # Skip empty lines.
                continue
            yield PanlexLanguage(*line.strip().split("\t"))

    def get_macrolanguages(self):
        """Group language-variety UIDs by their ISO 639 code.

        :return: dict mapping ISO 639 code -> list of PanLex UIDs.
        """
        macro_languages = defaultdict(list)
        for lang in self._languages.values():
            macro_languages[lang.iso639].append(lang.panlex_uid)
        return macro_languages

    def words_by_lang(self, lang_code):
        """
        :param lang_code: a PanLex UID, e.g. ``'eng-000'``.
        :return: a list of list(str)
        """
        fileid = f"swadesh{self.swadesh_size}/{lang_code}.txt"
        return [concept.split("\t") for concept in self.words(fileid)]

    def words_by_iso639(self, iso63_code):
        """
        :param iso63_code: an ISO 639 code (parameter name kept, typo and
            all, for backward compatibility with keyword callers).
        :return: a list of list(str)
        """
        fileids = [
            f"swadesh{self.swadesh_size}/{lang_code}.txt"
            for lang_code in self._macro_languages[iso63_code]
        ]
        return [
            concept.split("\t") for fileid in fileids for concept in self.words(fileid)
        ]

    def entries(self, fileids=None):
        """
        :return: a tuple of words for the specified fileids.
        """
        if not fileids:
            fileids = self.fileids()

        wordlists = [self.words(f) for f in fileids]
        return list(zip(*wordlists))
|
||||
373
backend/venv/Lib/site-packages/nltk/corpus/reader/pl196x.py
Normal file
373
backend/venv/Lib/site-packages/nltk/corpus/reader/pl196x.py
Normal file
@@ -0,0 +1,373 @@
|
||||
# Natural Language Toolkit:
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Piotr Kasprzyk <p.j.kasprzyk@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.xmldocs import XMLCorpusReader
|
||||
|
||||
# Regexes for carving the (flattened, newline-free) TEI-style XML into
# structural units; all content captures are non-greedy so each match
# covers exactly one element.
PARA = re.compile(r"<p(?: [^>]*){0,1}>(.*?)</p>")
SENT = re.compile(r"<s(?: [^>]*){0,1}>(.*?)</s>")

# <w> and <c> elements are the tokens.  TAGGEDWORD also captures the
# opening tag (group 1) so its attributes can be parsed for the POS tag;
# WORD captures only the token text.
TAGGEDWORD = re.compile(r"<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>")
WORD = re.compile(r"<[wc](?: [^>]*){0,1}>(.*?)</[wc]>")

# Attribute extractors used by TEICorpusView._parse_tag: 'type' for <c>
# elements and 'ana' for <w> elements.
TYPE = re.compile(r'type="(.*?)"')
ANA = re.compile(r'ana="(.*?)"')

# Opening <text id="..."> attribute, used to filter blocks by text id.
TEXTID = re.compile(r'text id="(.*?)"')
|
||||
|
||||
|
||||
class TEICorpusView(StreamBackedCorpusView):
    """Corpus view over a single pl196x TEI-encoded file.

    Yields words, sentences or paragraphs (optionally POS-tagged)
    depending on the ``tagged`` / ``group_by_sent`` / ``group_by_para``
    flags, and can restrict output to a subset of ``<text id=...>`` units.
    """

    def __init__(
        self,
        corpus_file,
        tagged,  # True: yield (word, tag) pairs; False: bare word strings
        group_by_sent,  # True: group tokens into per-sentence lists
        group_by_para,  # True: group sentences into per-paragraph lists
        tagset=None,  # accepted but never read here
        head_len=0,  # byte offset of the file header to skip
        textids=None,  # optional collection of text ids to keep
    ):
        self._tagged = tagged
        self._textids = textids

        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        # WARNING -- skip header
        StreamBackedCorpusView.__init__(self, corpus_file, startpos=head_len)

    # Minimum number of bytes handed to readlines() per block.
    _pagesize = 4096

    def read_block(self, stream):
        """Read one block containing only complete <text> units and parse it."""
        block = stream.readlines(self._pagesize)
        block = concat(block)
        # Keep reading lines until every opened <text id=...> element has a
        # matching </text> (and at least one has been seen), so that a block
        # never splits a text unit in half.
        while (block.count("<text id") > block.count("</text>")) or block.count(
            "<text id"
        ) == 0:
            tmp = stream.readline()
            if len(tmp) <= 0:
                break
            block += tmp

        # Flatten to a single line so the regexes can span original newlines.
        block = block.replace("\n", "")

        textids = TEXTID.findall(block)
        if self._textids:
            # Excise every <text> unit whose id was not requested.
            for tid in textids:
                if tid not in self._textids:
                    beg = block.find(tid) - 1
                    end = block[beg:].find("</text>") + len("</text>")
                    block = block[:beg] + block[beg + end :]

        output = []
        for para_str in PARA.findall(block):
            para = []
            for sent_str in SENT.findall(para_str):
                if not self._tagged:
                    sent = WORD.findall(sent_str)
                else:
                    sent = list(map(self._parse_tag, TAGGEDWORD.findall(sent_str)))
                # Flatten or nest according to the grouping flags.
                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)
            if self._group_by_para:
                output.append(para)
            else:
                output.extend(para)
        return output

    def _parse_tag(self, tag_word_tuple):
        """Convert an (opening-tag, word) capture into a (word, tag) pair."""
        (tag, word) = tag_word_tuple
        if tag.startswith("w"):
            # <w> (word) elements carry the POS tag in their 'ana' attribute.
            tag = ANA.search(tag).group(1)
        else:  # tag.startswith('c')
            # <c> (character/punctuation) elements use 'type' instead.
            tag = TYPE.search(tag).group(1)
        return word, tag
|
||||
|
||||
|
||||
class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
    """Reader for the pl196x corpus (TEI-encoded Polish of the 1960s).

    Each category is stored in a single file; the non-standard ``textids``
    selector gives finer-grained access to individual texts inside a file.
    Each accessor takes at most one of ``fileids``, ``categories`` or
    ``textids``; with none given, the whole corpus is used.
    """

    # Byte length of the fixed TEI header at the start of every corpus
    # file; TEICorpusView skips it before reading blocks.
    head_len = 2770

    def __init__(self, *args, **kwargs):
        if "textid_file" in kwargs:
            self._textids = kwargs["textid_file"]
        else:
            self._textids = None

        XMLCorpusReader.__init__(self, *args)
        # NOTE(review): 'textid_file' is left in kwargs when this is called;
        # presumably CategorizedCorpusReader tolerates the extra key -- confirm.
        CategorizedCorpusReader.__init__(self, kwargs)

        self._init_textids()

    def _init_textids(self):
        """Build the fileid <-> textid maps from the textid mapping file."""
        self._f2t = defaultdict(list)  # fileid -> [textid, ...]
        self._t2f = defaultdict(list)  # textid -> [fileid, ...]
        if self._textids is not None:
            with open(self._textids) as fp:
                for line in fp:
                    line = line.strip()
                    file_id, text_ids = line.split(" ", 1)
                    if file_id not in self.fileids():
                        raise ValueError(
                            "In text_id mapping file %s: %s not found"
                            % (self._textids, file_id)
                        )
                    for text_id in text_ids.split(self._delimiter):
                        self._add_textids(file_id, text_id)

    def _add_textids(self, file_id, text_id):
        # Record the association in both directions.
        self._f2t[file_id].append(text_id)
        self._t2f[text_id].append(file_id)

    def _resolve(self, fileids, categories, textids=None):
        """Normalize the caller's selector to ``(fileids, textid_map)``.

        At most one of the three selectors may be supplied.  With none
        supplied, ``(None, None)`` is returned so callers fall back to the
        full fileid list.  ``textid_map`` maps each fileid to the set of
        requested textids it contains, or is None when selection is not by
        textid.

        :raises ValueError: if more than one selector is supplied.
        """
        # BUG FIX: the previous check counted the selectors that were *None*
        # (and required exactly one None), which rejected every legitimate
        # call -- a single selector, or no selector at all -- while letting
        # two-selector calls through.  Count the supplied selectors instead.
        supplied = [s for s in (fileids, categories, textids) if s is not None]
        if len(supplied) > 1:
            raise ValueError("Specify exactly one of: fileids, categories or textids")

        if fileids is not None:
            return fileids, None

        if categories is not None:
            return self.fileids(categories), None

        if textids is not None:
            if isinstance(textids, str):
                textids = [textids]
            files = sum((self._t2f[t] for t in textids), [])
            tdict = dict()
            for f in files:
                tdict[f] = set(self._f2t[f]) & set(textids)
            return files, tdict

        # No selector supplied: let callers default to all fileids.
        return None, None

    def decode_tag(self, tag):
        # to be implemented
        return tag

    def textids(self, fileids=None, categories=None):
        """
        In the pl196x corpus each category is stored in single
        file and thus both methods provide identical functionality. In order
        to accommodate finer granularity, a non-standard textids() method was
        implemented. All the main functions can be supplied with a list
        of required chunks---giving much more control to the user.
        """
        fileids, _ = self._resolve(fileids, categories)
        if fileids is None:
            return sorted(self._t2f)

        if isinstance(fileids, str):
            fileids = [fileids]
        return sorted(sum((self._f2t[d] for d in fileids), []))

    def _views(self, fileids, textids, tagged, group_by_sent, group_by_para):
        """Build the concatenated TEICorpusView for a resolved selection.

        Shared implementation of the six public accessors, which differ only
        in the (tagged, group_by_sent, group_by_para) flags.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                TEICorpusView(
                    self.abspath(fileid),
                    tagged,
                    group_by_sent,
                    group_by_para,
                    head_len=self.head_len,
                    textids=textids[fileid] if textids else None,
                )
                for fileid in fileids
            ]
        )

    def words(self, fileids=None, categories=None, textids=None):
        """Return the selection as a flat list of words."""
        fileids, textids = self._resolve(fileids, categories, textids)
        return self._views(fileids, textids, False, False, False)

    def sents(self, fileids=None, categories=None, textids=None):
        """Return the selection as a list of sentences (lists of words)."""
        fileids, textids = self._resolve(fileids, categories, textids)
        return self._views(fileids, textids, False, True, False)

    def paras(self, fileids=None, categories=None, textids=None):
        """Return the selection as paragraphs of sentences of words."""
        fileids, textids = self._resolve(fileids, categories, textids)
        return self._views(fileids, textids, False, True, True)

    def tagged_words(self, fileids=None, categories=None, textids=None):
        """Return the selection as a flat list of (word, tag) pairs."""
        fileids, textids = self._resolve(fileids, categories, textids)
        return self._views(fileids, textids, True, False, False)

    def tagged_sents(self, fileids=None, categories=None, textids=None):
        """Return the selection as sentences of (word, tag) pairs."""
        fileids, textids = self._resolve(fileids, categories, textids)
        return self._views(fileids, textids, True, True, False)

    def tagged_paras(self, fileids=None, categories=None, textids=None):
        """Return the selection as paragraphs of tagged sentences."""
        fileids, textids = self._resolve(fileids, categories, textids)
        return self._views(fileids, textids, True, True, True)

    def xml(self, fileids=None, categories=None):
        """Return the parsed XML of exactly one corpus file.

        :raises TypeError: if the selection does not name exactly one file.
        """
        fileids, _ = self._resolve(fileids, categories)
        if len(fileids) == 1:
            return XMLCorpusReader.xml(self, fileids[0])
        else:
            raise TypeError("Expected a single file")
|
||||
237
backend/venv/Lib/site-packages/nltk/corpus/reader/plaintext.py
Normal file
237
backend/venv/Lib/site-packages/nltk/corpus/reader/plaintext.py
Normal file
@@ -0,0 +1,237 @@
|
||||
# Natural Language Toolkit: Plaintext Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# Nitin Madnani <nmadnani@umiacs.umd.edu>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A reader for corpora that consist of plaintext documents.
|
||||
"""
|
||||
|
||||
import nltk.data
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.tokenize import *
|
||||
|
||||
|
||||
class PlaintextCorpusReader(CorpusReader):
    """
    Reader for corpora that consist of plaintext documents.  Paragraphs
    are assumed to be split using blank lines.  Sentences and words can
    be tokenized using the default tokenizers, or by custom tokenizers
    specified as parameters to the constructor.

    This corpus reader can be customized (e.g., to skip preface
    sections of specific document formats) by creating a subclass and
    overriding the ``CorpusView`` class variable.
    """

    CorpusView = StreamBackedCorpusView
    """The corpus view class used by this reader.  Subclasses of
       ``PlaintextCorpusReader`` may specify alternative corpus view
       classes (e.g., to skip the preface sections of documents.)"""

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WordPunctTokenizer(),
        sent_tokenizer=None,
        para_block_reader=read_blankline_block,
        encoding="utf8",
    ):
        r"""
        Construct a new plaintext corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/usr/local/share/nltk_data/corpora/webtext/'
            >>> reader = PlaintextCorpusReader(root, '.*\.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking sentences or
            paragraphs into words.
        :param sent_tokenizer: Tokenizer for breaking paragraphs
            into sentences.  When None, a default tokenizer is created
            lazily on first use.
        :param para_block_reader: The block reader used to divide the
            corpus into paragraph blocks.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader

    def _ensure_sent_tokenizer(self):
        """Create the default sentence tokenizer lazily, on first use.

        Previously this logic was duplicated in ``sents()`` and ``paras()``
        and used a bare ``except:``, which also swallowed
        ``KeyboardInterrupt``/``SystemExit`` and discarded the original
        error; now only ``Exception`` is caught and the cause is chained.
        """
        if self._sent_tokenizer is None:
            try:
                self._sent_tokenizer = PunktTokenizer()
            except Exception as err:  # e.g. the punkt model is unavailable
                raise ValueError("No sentence tokenizer for this corpus") from err

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        self._ensure_sent_tokenizer()
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        self._ensure_sent_tokenizer()
        return concat(
            [
                self.CorpusView(path, self._read_para_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_word_block(self, stream):
        """Tokenize up to 20 lines of *stream* into a flat word list."""
        words = []
        for i in range(20):  # Read 20 lines at a time.
            words.extend(self._word_tokenizer.tokenize(stream.readline()))
        return words

    def _read_sent_block(self, stream):
        """Read one paragraph block and return its tokenized sentences."""
        sents = []
        for para in self._para_block_reader(stream):
            sents.extend(
                [
                    self._word_tokenizer.tokenize(sent)
                    for sent in self._sent_tokenizer.tokenize(para)
                ]
            )
        return sents

    def _read_para_block(self, stream):
        """Read one paragraph block and return it as sentences of words."""
        paras = []
        for para in self._para_block_reader(stream):
            paras.append(
                [
                    self._word_tokenizer.tokenize(sent)
                    for sent in self._sent_tokenizer.tokenize(para)
                ]
            )
        return paras
|
||||
|
||||
|
||||
class CategorizedPlaintextCorpusReader(CategorizedCorpusReader, PlaintextCorpusReader):
    """
    A reader for plaintext corpora whose documents are divided into
    categories based on their file identifiers.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``PlaintextCorpusReader`` constructor.
        """
        # NOTE(review): CategorizedCorpusReader is handed the kwargs dict
        # first, presumably so it can consume the categorization arguments
        # before PlaintextCorpusReader sees the rest -- confirm the ordering
        # is required before changing it.
        CategorizedCorpusReader.__init__(self, kwargs)
        PlaintextCorpusReader.__init__(self, *args, **kwargs)
|
||||
|
||||
|
||||
class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader):
    """
    This class is identical with CategorizedPlaintextCorpusReader,
    except that it initializes a Portuguese PunktTokenizer:

        >>> from nltk.corpus import machado
        >>> print(machado._sent_tokenizer._lang)
        portuguese

    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Replace the default sentence tokenizer with the Portuguese
        # Punkt model (updated invocation style, @ekaf 2025).
        self._sent_tokenizer = PunktTokenizer("portuguese")
|
||||
|
||||
|
||||
class EuroparlCorpusReader(PlaintextCorpusReader):
    """
    Reader for Europarl corpora that consist of plaintext documents.
    Documents are divided into chapters instead of paragraphs as
    for regular plaintext documents. Chapters are separated using blank
    lines. Everything is inherited from ``PlaintextCorpusReader`` except
    that:

    - Since the corpus is pre-processed and pre-tokenized, the
      word tokenizer should just split the line at whitespaces.
    - For the same reason, the sentence tokenizer should just
      split the paragraph at line breaks.
    - There is a new 'chapters()' method that returns chapters
      instead of paragraphs.
    - The 'paras()' method inherited from PlaintextCorpusReader is
      made non-functional to remove any confusion between chapters
      and paragraphs for Europarl.
    """

    def _read_word_block(self, stream):
        # Pre-tokenized text: whitespace splitting is all that is needed.
        tokens = []
        for _ in range(20):  # Process 20 lines per block.
            tokens.extend(stream.readline().split())
        return tokens

    def _read_sent_block(self, stream):
        # One sentence per line within each blank-line-separated chapter.
        return [
            sentence.split()
            for chapter in self._para_block_reader(stream)
            for sentence in chapter.splitlines()
        ]

    def _read_para_block(self, stream):
        # A "paragraph" block here is really a chapter: a list of sentences.
        return [
            [sentence.split() for sentence in chapter.splitlines()]
            for chapter in self._para_block_reader(stream)
        ]

    def chapters(self, fileids=None):
        """
        :return: the given file(s) as a list of
            chapters, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        views = [
            self.CorpusView(fileid, self._read_para_block, encoding=enc)
            for (fileid, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)

    def paras(self, fileids=None):
        """Unsupported for Europarl; use :meth:`chapters` instead."""
        raise NotImplementedError(
            "The Europarl corpus reader does not support paragraphs. Please use chapters() instead."
        )
|
||||
@@ -0,0 +1,95 @@
|
||||
# Natural Language Toolkit: PP Attachment Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Read lines from the Prepositional Phrase Attachment Corpus.
|
||||
|
||||
The PP Attachment Corpus contains several files having the format:
|
||||
|
||||
sentence_id verb noun1 preposition noun2 attachment
|
||||
|
||||
For example:
|
||||
|
||||
42960 gives authority to administration V
|
||||
46742 gives inventors of microchip N
|
||||
|
||||
The PP attachment is to the verb phrase (V) or noun phrase (N), i.e.:
|
||||
|
||||
(VP gives (NP authority) (PP to administration))
|
||||
(VP gives (NP inventors (PP of microchip)))
|
||||
|
||||
The corpus contains the following files:
|
||||
|
||||
training: training set
|
||||
devset: development test set, used for algorithm development.
|
||||
test: test set, used to report results
|
||||
bitstrings: word classes derived from Mutual Information Clustering for the Wall Street Journal.
|
||||
|
||||
Ratnaparkhi, Adwait (1994). A Maximum Entropy Model for Prepositional
|
||||
Phrase Attachment. Proceedings of the ARPA Human Language Technology
|
||||
Conference. [http://www.cis.upenn.edu/~adwait/papers/hlt94.ps]
|
||||
|
||||
The PP Attachment Corpus is distributed with NLTK with the permission
|
||||
of the author.
|
||||
"""
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
|
||||
|
||||
class PPAttachment:
    """
    A single PP-attachment decision: the six whitespace-separated
    fields of one corpus line, stored verbatim as strings.
    """

    def __init__(self, sent, verb, noun1, prep, noun2, attachment):
        # Fields mirror the corpus line format:
        #   sentence_id verb noun1 preposition noun2 attachment
        self.sent = sent
        self.verb = verb
        self.noun1 = noun1
        self.prep = prep
        self.noun2 = noun2
        self.attachment = attachment

    def __repr__(self):
        return (
            f"PPAttachment(sent={self.sent!r}, verb={self.verb!r}, "
            f"noun1={self.noun1!r}, prep={self.prep!r}, "
            f"noun2={self.noun2!r}, attachment={self.attachment!r})"
        )
|
||||
|
||||
|
||||
class PPAttachmentCorpusReader(CorpusReader):
    """
    Reader for corpus files containing one PP-attachment record per
    line, in the format:

        sentence_id verb noun1 preposition noun2 attachment
    """

    def attachments(self, fileids):
        """Return the records in the given files as ``PPAttachment`` objects."""
        views = [
            StreamBackedCorpusView(fileid, self._read_obj_block, encoding=enc)
            for (fileid, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)

    def tuples(self, fileids):
        """Return the records in the given files as plain tuples of strings."""
        views = [
            StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc)
            for (fileid, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)

    def _read_tuple_block(self, stream):
        # One record per line; an empty readline() means end-of-file.
        line = stream.readline()
        return [tuple(line.split())] if line else []

    def _read_obj_block(self, stream):
        # Same as _read_tuple_block, but wraps the fields in a
        # PPAttachment object.
        line = stream.readline()
        return [PPAttachment(*line.split())] if line else []
|
||||
519
backend/venv/Lib/site-packages/nltk/corpus/reader/propbank.py
Normal file
519
backend/venv/Lib/site-packages/nltk/corpus/reader/propbank.py
Normal file
@@ -0,0 +1,519 @@
|
||||
# Natural Language Toolkit: PropBank Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import re
|
||||
from functools import total_ordering
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.internals import raise_unorderable_types
|
||||
from nltk.tree import Tree
|
||||
|
||||
|
||||
class PropbankCorpusReader(CorpusReader):
    """
    Corpus reader for the propbank corpus, which augments the Penn
    Treebank with information about the predicate argument structure
    of every verb instance.  The corpus consists of two parts: the
    predicate-argument annotations themselves, and a set of "frameset
    files" which define the argument labels used by the annotations,
    on a per-verb basis.  Each "frameset file" contains one or more
    predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
    divided into coarse-grained word senses called "rolesets".  For
    each "roleset", the frameset file provides descriptions of the
    argument roles, along with examples.
    """

    def __init__(
        self,
        root,
        propfile,
        framefiles="",
        verbsfile=None,
        parse_fileid_xform=None,
        parse_corpus=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param propfile: The name of the file containing the predicate-
            argument annotations (relative to ``root``).
        :param framefiles: A list or regexp specifying the frameset
            fileids for this corpus.
        :param verbsfile: The name of the file listing the annotated
            verb lemmas, one per line (relative to ``root``).
        :param parse_fileid_xform: A transform that should be applied
            to the fileids in this corpus.  This should be a function
            of one argument (a fileid) that returns a string (the new
            fileid).
        :param parse_corpus: The corpus containing the parse trees
            corresponding to this corpus.  These parse trees are
            necessary to resolve the tree pointers used by propbank.
        :param encoding: The default encoding for the corpus files.
        """
        # If framefiles is specified as a regexp, expand it.
        if isinstance(framefiles, str):
            framefiles = find_corpus_fileids(root, framefiles)
        framefiles = list(framefiles)
        # Initialize the corpus reader.
        CorpusReader.__init__(self, root, [propfile, verbsfile] + framefiles, encoding)

        # Record our frame fileids & prop file.
        self._propfile = propfile
        self._framefiles = framefiles
        self._verbsfile = verbsfile
        self._parse_fileid_xform = parse_fileid_xform
        self._parse_corpus = parse_corpus

    def instances(self, baseform=None):
        """
        :param baseform: If given, only include instances whose
            predicate has this base form.
        :return: a corpus view that acts as a list of
            ``PropBankInstance`` objects, one for each annotated verb
            instance in the corpus.
        """
        kwargs = {}
        if baseform is not None:
            kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
        return StreamBackedCorpusView(
            self.abspath(self._propfile),
            lambda stream: self._read_instance_block(stream, **kwargs),
            encoding=self.encoding(self._propfile),
        )

    def lines(self):
        """
        :return: a corpus view that acts as a list of strings, one for
            each line in the predicate-argument annotation file.
        """
        return StreamBackedCorpusView(
            self.abspath(self._propfile),
            read_line_block,
            encoding=self.encoding(self._propfile),
        )

    def roleset(self, roleset_id):
        """
        :return: the xml description for the given roleset.
        :raise ValueError: if no frameset file or roleset matches
            ``roleset_id``.
        """
        # Roleset ids look like 'turn.01'; the frameset file is named
        # after the base form ('frames/turn.xml').
        baseform = roleset_id.split(".")[0]
        framefile = "frames/%s.xml" % baseform
        if framefile not in self._framefiles:
            raise ValueError("Frameset file for %s not found" % roleset_id)

        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        with self.abspath(framefile).open() as fp:
            etree = ElementTree.parse(fp).getroot()
        for roleset in etree.findall("predicate/roleset"):
            if roleset.attrib["id"] == roleset_id:
                return roleset
        raise ValueError(f"Roleset {roleset_id} not found in {framefile}")

    def rolesets(self, baseform=None):
        """
        :param baseform: If given, only include rolesets for this
            predicate base form; otherwise include every frameset file.
        :return: list of xml descriptions for rolesets.
        """
        if baseform is not None:
            framefile = "frames/%s.xml" % baseform
            if framefile not in self._framefiles:
                raise ValueError("Frameset file for %s not found" % baseform)
            framefiles = [framefile]
        else:
            framefiles = self._framefiles

        rsets = []
        for framefile in framefiles:
            # n.b.: The encoding for XML fileids is specified by the file
            # itself; so we ignore self._encoding here.
            with self.abspath(framefile).open() as fp:
                etree = ElementTree.parse(fp).getroot()
            rsets.append(etree.findall("predicate/roleset"))
        return LazyConcatenation(rsets)

    def verbs(self):
        """
        :return: a corpus view that acts as a list of all verb lemmas
            in this corpus (from the verbs.txt file).
        """
        return StreamBackedCorpusView(
            self.abspath(self._verbsfile),
            read_line_block,
            encoding=self.encoding(self._verbsfile),
        )

    def _read_instance_block(self, stream, instance_filter=lambda inst: True):
        # Block reader used by instances(): parse up to 100 annotation
        # lines, keeping only those accepted by instance_filter.
        block = []

        # Read 100 at a time.
        for i in range(100):
            line = stream.readline().strip()
            if line:
                inst = PropbankInstance.parse(
                    line, self._parse_fileid_xform, self._parse_corpus
                )
                if instance_filter(inst):
                    block.append(inst)

        return block
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Propbank Instance & related datatypes
|
||||
######################################################################
|
||||
|
||||
|
||||
class PropbankInstance:
    """
    A single predicate-argument annotation from the propbank corpus:
    one predicate occurrence, together with pointers locating the
    predicate and each of its arguments in the corresponding Penn
    Treebank parse tree.
    """

    def __init__(
        self,
        fileid,
        sentnum,
        wordnum,
        tagger,
        roleset,
        inflection,
        predicate,
        arguments,
        parse_corpus=None,
    ):
        self.fileid = fileid
        """The name of the file containing the parse tree for this
        instance's sentence."""

        self.sentnum = sentnum
        """The sentence number of this sentence within ``fileid``.
        Indexing starts from zero."""

        self.wordnum = wordnum
        """The word number of this instance's predicate within its
        containing sentence.  Word numbers are indexed starting from
        zero, and include traces and other empty parse elements."""

        self.tagger = tagger
        """An identifier for the tagger who tagged this instance; or
        ``'gold'`` if this is an adjudicated instance."""

        self.roleset = roleset
        """The name of the roleset used by this instance's predicate.
        Use ``propbank.roleset() <PropbankCorpusReader.roleset>`` to
        look up information about the roleset."""

        self.inflection = inflection
        """A ``PropbankInflection`` object describing the inflection of
        this instance's predicate."""

        self.predicate = predicate
        """A ``PropbankTreePointer`` indicating the position of this
        instance's predicate within its containing sentence."""

        self.arguments = tuple(arguments)
        """A list of tuples (argloc, argid), specifying the location
        and identifier for each of the predicate's argument in the
        containing sentence.  Argument identifiers are strings such as
        ``'ARG0'`` or ``'ARGM-TMP'``.  This list does *not* contain
        the predicate."""

        self.parse_corpus = parse_corpus
        """A corpus reader for the parse trees corresponding to the
        instances in this propbank corpus."""

    @property
    def baseform(self):
        """The baseform of the predicate."""
        return self.roleset.split(".")[0]

    @property
    def sensenumber(self):
        """The sense number of the predicate."""
        return self.roleset.split(".")[1]

    @property
    def predid(self):
        """Identifier of the predicate."""
        return "rel"

    def __repr__(self):
        return "<PropbankInstance: {}, sent {}, word {}>".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
        )

    def __str__(self):
        # Render the instance back into the one-line propbank file
        # format: six fixed fields followed by the argument pointers
        # and the predicate ('rel') pointer, sorted by tree position.
        s = "{} {} {} {} {} {}".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
            self.tagger,
            self.roleset,
            self.inflection,
        )
        items = self.arguments + ((self.predicate, "rel"),)
        for argloc, argid in sorted(items):
            s += f" {argloc}-{argid}"
        return s

    def _get_tree(self):
        # Resolve the parse tree lazily through the parse corpus, if
        # one was supplied and it contains this instance's file.
        if self.parse_corpus is None:
            return None
        if self.fileid not in self.parse_corpus.fileids():
            return None
        return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]

    tree = property(
        _get_tree,
        doc="""
        The parse tree corresponding to this instance, or None if
        the corresponding tree is not available.""",
    )

    @staticmethod
    def parse(s, parse_fileid_xform=None, parse_corpus=None):
        """
        Parse one line of a propbank annotation file into a
        ``PropbankInstance``.

        :param s: the annotation line to parse.
        :param parse_fileid_xform: optional transform applied to the
            fileid field.
        :param parse_corpus: corpus reader used to resolve this
            instance's parse tree.
        :raise ValueError: if the line does not have at least seven
            fields, or does not contain exactly one '-rel' pointer.
        """
        pieces = s.split()
        if len(pieces) < 7:
            raise ValueError("Badly formatted propbank line: %r" % s)

        # Divide the line into its basic pieces.
        (fileid, sentnum, wordnum, tagger, roleset, inflection) = pieces[:6]
        rel = [p for p in pieces[6:] if p.endswith("-rel")]
        args = [p for p in pieces[6:] if not p.endswith("-rel")]
        if len(rel) != 1:
            raise ValueError("Badly formatted propbank line: %r" % s)

        # Apply the fileid selector, if any.
        if parse_fileid_xform is not None:
            fileid = parse_fileid_xform(fileid)

        # Convert sentence & word numbers to ints.
        sentnum = int(sentnum)
        wordnum = int(wordnum)

        # Parse the inflection
        inflection = PropbankInflection.parse(inflection)

        # Parse the predicate location.
        # (Strip the '-rel' suffix, leaving just the tree pointer.)
        predicate = PropbankTreePointer.parse(rel[0][:-4])

        # Parse the arguments.
        arguments = []
        for arg in args:
            argloc, argid = arg.split("-", 1)
            arguments.append((PropbankTreePointer.parse(argloc), argid))

        # Put it all together.
        return PropbankInstance(
            fileid,
            sentnum,
            wordnum,
            tagger,
            roleset,
            inflection,
            predicate,
            arguments,
            parse_corpus,
        )
|
||||
|
||||
|
||||
class PropbankPointer:
    """
    Abstract base class for pointers used by propbank to identify one
    or more constituents in a parse tree.  Three concrete subclasses
    exist:

    - ``PropbankTreePointer`` points to a single constituent.
    - ``PropbankSplitTreePointer`` points to a 'split' constituent,
      which consists of a sequence of two or more
      ``PropbankTreePointer`` pointers.
    - ``PropbankChainTreePointer`` points to an entire trace chain in
      a tree.  It consists of a sequence of pieces, which can be
      ``PropbankTreePointer`` or ``PropbankSplitTreePointer`` pointers.
    """

    def __init__(self):
        # Abstract: refuse direct instantiation while still allowing
        # subclasses to call up the chain.
        if type(self) is PropbankPointer:
            raise NotImplementedError()
|
||||
|
||||
|
||||
class PropbankChainTreePointer(PropbankPointer):
    """A pointer to an entire trace chain in a parse tree."""

    def __init__(self, pieces):
        # Pieces may be PropbankSplitTreePointer or PropbankTreePointer
        # pointers; together they make up one chain.
        self.pieces = pieces

    def __str__(self):
        return "*".join(str(piece) for piece in self.pieces)

    def __repr__(self):
        return "<PropbankChainTreePointer: %s>" % self

    def select(self, tree):
        """Return a ``*CHAIN*`` tree grouping the selected constituents."""
        if tree is None:
            raise ValueError("Parse tree not available")
        subtrees = [piece.select(tree) for piece in self.pieces]
        return Tree("*CHAIN*", subtrees)
|
||||
|
||||
|
||||
class PropbankSplitTreePointer(PropbankPointer):
    """A pointer to a 'split' constituent in a parse tree."""

    def __init__(self, pieces):
        # Pieces are all PropbankTreePointer pointers; together they
        # make up one split constituent.
        self.pieces = pieces

    def __str__(self):
        return ",".join(str(piece) for piece in self.pieces)

    def __repr__(self):
        return "<PropbankSplitTreePointer: %s>" % self

    def select(self, tree):
        """Return a ``*SPLIT*`` tree grouping the selected constituents."""
        if tree is None:
            raise ValueError("Parse tree not available")
        subtrees = [piece.select(tree) for piece in self.pieces]
        return Tree("*SPLIT*", subtrees)
|
||||
|
||||
|
||||
@total_ordering
class PropbankTreePointer(PropbankPointer):
    """
    A pointer to a single constituent, encoded as ``wordnum:height``:
    ``wordnum`` is the index of the constituent's first word (counting
    traces and other empty elements), and ``height`` is the number of
    levels up from that word to the constituent.  Chains and splits of
    such pointers are written as::

        wordnum:height*wordnum:height*...
        wordnum:height,wordnum:height,...
    """

    def __init__(self, wordnum, height):
        # Index of the constituent's first word in the sentence.
        self.wordnum = wordnum
        # Number of tree levels above that word.
        self.height = height

    @staticmethod
    def parse(s):
        """
        Parse a propbank pointer string, returning a
        ``PropbankChainTreePointer``, ``PropbankSplitTreePointer``, or
        ``PropbankTreePointer`` as appropriate.

        :raise ValueError: if ``s`` is not a well-formed pointer.
        """
        # Deal with chains (xx*yy*zz)
        pieces = s.split("*")
        if len(pieces) > 1:
            return PropbankChainTreePointer(
                [PropbankTreePointer.parse(elt) for elt in pieces]
            )

        # Deal with split args (xx,yy,zz)
        pieces = s.split(",")
        if len(pieces) > 1:
            return PropbankSplitTreePointer(
                [PropbankTreePointer.parse(elt) for elt in pieces]
            )

        # Deal with normal pointers.
        pieces = s.split(":")
        if len(pieces) != 2:
            raise ValueError("bad propbank pointer %r" % s)
        return PropbankTreePointer(int(pieces[0]), int(pieces[1]))

    def __str__(self):
        return f"{self.wordnum}:{self.height}"

    def __repr__(self):
        return "PropbankTreePointer(%d, %d)" % (self.wordnum, self.height)

    def __eq__(self, other):
        # Chain/split pointers compare via their first piece.
        while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
            other = other.pieces[0]

        if not isinstance(other, PropbankTreePointer):
            return self is other

        return self.wordnum == other.wordnum and self.height == other.height

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        # Chain/split pointers compare via their first piece.
        while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
            other = other.pieces[0]

        if not isinstance(other, PropbankTreePointer):
            # Unrelated type: fall back to an arbitrary but consistent order.
            return id(self) < id(other)

        # At equal word positions, larger height (higher constituent)
        # sorts first, hence the negated height in the key.
        return (self.wordnum, -self.height) < (other.wordnum, -other.height)

    def select(self, tree):
        """Return the constituent of ``tree`` that this pointer picks out."""
        if tree is None:
            raise ValueError("Parse tree not available")
        return tree[self.treepos(tree)]

    def treepos(self, tree):
        """
        Convert this pointer to a standard 'tree position' pointer,
        given that it points to the given tree.
        """
        if tree is None:
            raise ValueError("Parse tree not available")
        # Depth-first traversal: `stack` holds the path of nodes from
        # the root, `treepos` the child indices taken at each level.
        stack = [tree]
        treepos = []

        # Count leaves until we reach self.wordnum, then strip
        # self.height + 1 levels off the position to land on the
        # desired constituent.
        wordnum = 0
        while True:
            # tree node:
            if isinstance(stack[-1], Tree):
                # Select the next child.
                if len(treepos) < len(stack):
                    treepos.append(0)
                else:
                    treepos[-1] += 1
                # Update the stack.
                if treepos[-1] < len(stack[-1]):
                    stack.append(stack[-1][treepos[-1]])
                else:
                    # End of node's child list: pop up a level.
                    stack.pop()
                    treepos.pop()
            # word node:
            else:
                if wordnum == self.wordnum:
                    return tuple(treepos[: len(treepos) - self.height - 1])
                else:
                    wordnum += 1
                    stack.pop()
|
||||
|
||||
|
||||
class PropbankInflection:
    """
    A five-character code describing the inflection of a propbank
    predicate: form, tense, aspect, person, and voice, in that order,
    with ``'-'`` for any unspecified field.
    """

    # { Inflection Form
    INFINITIVE = "i"
    GERUND = "g"
    PARTICIPLE = "p"
    FINITE = "v"
    # { Inflection Tense
    FUTURE = "f"
    PAST = "p"
    PRESENT = "n"
    # { Inflection Aspect
    PERFECT = "p"
    PROGRESSIVE = "o"
    PERFECT_AND_PROGRESSIVE = "b"
    # { Inflection Person
    THIRD_PERSON = "3"
    # { Inflection Voice
    ACTIVE = "a"
    PASSIVE = "p"
    # { Inflection
    NONE = "-"
    # }

    def __init__(self, form="-", tense="-", aspect="-", person="-", voice="-"):
        self.form = form
        self.tense = tense
        self.aspect = aspect
        self.person = person
        self.voice = voice

    def __str__(self):
        # Reassemble the five single-character fields in order.
        return "".join((self.form, self.tense, self.aspect, self.person, self.voice))

    def __repr__(self):
        return "<PropbankInflection: %s>" % self

    # One character class per field, each also admitting '-'.
    _VALIDATE = re.compile(r"[igpv\-][fpn\-][pob\-][3\-][ap\-]$")

    @staticmethod
    def parse(s):
        """
        Parse a five-character inflection string.

        :raise TypeError: if ``s`` is not a string.
        :raise ValueError: if ``s`` is not a valid inflection code.
        """
        if not isinstance(s, str):
            raise TypeError("expected a string")
        well_formed = len(s) == 5 and PropbankInflection._VALIDATE.match(s)
        if not well_formed:
            raise ValueError("Bad propbank inflection string %r" % s)
        form, tense, aspect, person, voice = s
        return PropbankInflection(form, tense, aspect, person, voice)
|
||||
133
backend/venv/Lib/site-packages/nltk/corpus/reader/pros_cons.py
Normal file
133
backend/venv/Lib/site-packages/nltk/corpus/reader/pros_cons.py
Normal file
@@ -0,0 +1,133 @@
|
||||
# Natural Language Toolkit: Pros and Cons Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
CorpusReader for the Pros and Cons dataset.
|
||||
|
||||
- Pros and Cons dataset information -
|
||||
|
||||
Contact: Bing Liu, liub@cs.uic.edu
|
||||
https://www.cs.uic.edu/~liub
|
||||
|
||||
Distributed with permission.
|
||||
|
||||
Related papers:
|
||||
|
||||
- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
|
||||
Proceedings of the 22nd International Conference on Computational Linguistics
|
||||
(Coling-2008), Manchester, 18-22 August, 2008.
|
||||
|
||||
- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing
|
||||
Opinions on the Web". Proceedings of the 14th international World Wide Web
|
||||
conference (WWW-2005), May 10-14, 2005, in Chiba, Japan.
|
||||
"""
|
||||
import re
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.tokenize import *
|
||||
|
||||
|
||||
class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    Reader for the Pros and Cons sentence dataset.

    >>> from nltk.corpus import pros_cons
    >>> pros_cons.sents(categories='Cons') # doctest: +NORMALIZE_WHITESPACE
    [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy',
    'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'],
    ...]
    >>> pros_cons.words('IntegratedPros.txt')
    ['Easy', 'to', 'use', ',', 'economical', '!', ...]
    """

    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WordPunctTokenizer(),
        encoding="utf8",
        **kwargs
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        """

        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
        self._word_tokenizer = word_tokenizer

    def sents(self, fileids=None, categories=None):
        """
        Return all sentences in the corpus or in the specified files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences
            have to be returned.
        :return: the given file(s) as a list of sentences. Each sentence is
            tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None, categories=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have
            to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_sent_block(self, stream):
        # Block reader: tokenize the <Pros>/<Cons> payload of up to 20
        # lines at a time.
        sents = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # End of file: stop immediately instead of spinning
                # through the remaining iterations.
                break
            sent = re.match(r"^(?!\n)\s*<(Pros|Cons)>(.*)</(?:Pros|Cons)>", line)
            if sent:
                sents.append(self._word_tokenizer.tokenize(sent.group(2).strip()))
        return sents

    def _read_word_block(self, stream):
        # Flatten the sentence block into a single list of tokens.
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words
|
||||
331
backend/venv/Lib/site-packages/nltk/corpus/reader/reviews.py
Normal file
331
backend/venv/Lib/site-packages/nltk/corpus/reader/reviews.py
Normal file
@@ -0,0 +1,331 @@
|
||||
# Natural Language Toolkit: Product Reviews Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
CorpusReader for reviews corpora (syntax based on Customer Review Corpus).
|
||||
|
||||
Customer Review Corpus information
|
||||
==================================
|
||||
|
||||
Annotated by: Minqing Hu and Bing Liu, 2004.
|
||||
Department of Computer Science
|
||||
University of Illinois at Chicago
|
||||
|
||||
Contact: Bing Liu, liub@cs.uic.edu
|
||||
https://www.cs.uic.edu/~liub
|
||||
|
||||
Distributed with permission.
|
||||
|
||||
The "product_reviews_1" and "product_reviews_2" datasets respectively contain
|
||||
annotated customer reviews of 5 and 9 products from amazon.com.
|
||||
|
||||
Related papers:
|
||||
|
||||
- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
|
||||
Proceedings of the ACM SIGKDD International Conference on Knowledge
|
||||
Discovery & Data Mining (KDD-04), 2004.
|
||||
|
||||
- Minqing Hu and Bing Liu. "Mining Opinion Features in Customer Reviews".
|
||||
Proceedings of Nineteeth National Conference on Artificial Intelligence
|
||||
(AAAI-2004), 2004.
|
||||
|
||||
- Xiaowen Ding, Bing Liu and Philip S. Yu. "A Holistic Lexicon-Based Appraoch to
|
||||
Opinion Mining." Proceedings of First ACM International Conference on Web
|
||||
Search and Data Mining (WSDM-2008), Feb 11-12, 2008, Stanford University,
|
||||
Stanford, California, USA.
|
||||
|
||||
Symbols used in the annotated reviews:
|
||||
|
||||
:[t]: the title of the review: Each [t] tag starts a review.
|
||||
:xxxx[+|-n]: xxxx is a product feature.
|
||||
:[+n]: Positive opinion, n is the opinion strength: 3 strongest, and 1 weakest.
|
||||
Note that the strength is quite subjective.
|
||||
You may want ignore it, but only considering + and -
|
||||
:[-n]: Negative opinion
|
||||
:##: start of each sentence. Each line is a sentence.
|
||||
:[u]: feature not appeared in the sentence.
|
||||
:[p]: feature not appeared in the sentence. Pronoun resolution is needed.
|
||||
:[s]: suggestion or recommendation.
|
||||
:[cc]: comparison with a competing product from a different brand.
|
||||
:[cs]: comparison with a competing product from the same brand.
|
||||
|
||||
Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not
|
||||
provide separation between different reviews. This is due to the fact that
|
||||
the dataset was specifically designed for aspect/feature-based sentiment
|
||||
analysis, for which sentence-level annotation is sufficient. For document-
|
||||
level classification and analysis, this peculiarity should be taken into
|
||||
consideration.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.tokenize import *
|
||||
|
||||
# Regexes matching the annotation markup of the Customer Review corpus.
TITLE = re.compile(r"^\[t\](.*)$")  # [t] Title: each [t] line starts a review
FEATURES = re.compile(
    r"((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]"
)  # find 'feature' in feature[+3] (captures feature text and signed score)
NOTES = re.compile(r"\[(?!t)(p|u|s|cc|cs)\]")  # find 'p' in camera[+2][p]
SENT = re.compile(r"##(.*)$")  # find tokenized sentence (text after '##')
|
||||
|
||||
|
||||
class Review:
    """
    One customer review: an optional title plus the ReviewLines that
    make up its body.  A Review is the main block of a
    ReviewsCorpusReader.
    """

    def __init__(self, title=None, review_lines=None):
        """
        :param title: the title of the review.
        :param review_lines: the list of the ReviewLines that belong to the Review.
        """
        self.title = title
        self.review_lines = [] if review_lines is None else review_lines

    def add_line(self, review_line):
        """
        Add a line (ReviewLine) to the review.

        :param review_line: a ReviewLine instance that belongs to the Review.
        """
        assert isinstance(review_line, ReviewLine)
        self.review_lines.append(review_line)

    def features(self):
        """
        Return a list of features in the review. Each feature is a tuple made of
        the specific item feature and the opinion strength about that feature.

        :return: all features of the review as a list of tuples (feat, score).
        :rtype: list(tuple)
        """
        return [
            feature
            for review_line in self.review_lines
            for feature in review_line.features
        ]

    def sents(self):
        """
        Return all tokenized sentences in the review.

        :return: all sentences of the review as lists of tokens.
        :rtype: list(list(str))
        """
        sentences = []
        for review_line in self.review_lines:
            sentences.append(review_line.sent)
        return sentences

    def __repr__(self):
        return f'Review(title="{self.title}", review_lines={self.review_lines})'
|
||||
|
||||
|
||||
class ReviewLine:
    """
    One sentence of a review, together with (optional) annotations of
    its features and notes about the reviewed item.
    """

    def __init__(self, sent, features=None, notes=None):
        # Tokenized sentence, plus defensive defaults for the optional
        # annotation lists (never share a mutable default).
        self.sent = sent
        self.features = [] if features is None else features
        self.notes = [] if notes is None else notes

    def __repr__(self):
        return (
            f"ReviewLine(features={self.features}, "
            f"notes={self.notes}, sent={self.sent})"
        )
|
||||
|
||||
|
||||
class ReviewsCorpusReader(CorpusReader):
    """
    Reader for the Customer Review Data dataset by Hu, Liu (2004).
    Note: we are not applying any sentence tokenization at the moment, just word
    tokenization.

    >>> from nltk.corpus import product_reviews_1
    >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt')
    >>> review = camera_reviews[0]
    >>> review.sents()[0] # doctest: +NORMALIZE_WHITESPACE
    ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am',
    'extremely', 'satisfied', 'with', 'the', 'purchase', '.']
    >>> review.features() # doctest: +NORMALIZE_WHITESPACE
    [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'),
    ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'),
    ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'),
    ('option', '+1')]

    We can also reach the same information directly from the stream:

    >>> product_reviews_1.features('Canon_G3.txt')
    [('canon powershot g3', '+3'), ('use', '+2'), ...]

    We can compute stats for specific product features:

    >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
    >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
    >>> mean = tot / n_reviews
    >>> print(n_reviews, tot, mean)
    15 24 1.6
    """

    # View class used to lazily stream blocks from disk.
    CorpusView = StreamBackedCorpusView

    def __init__(
        self, root, fileids, word_tokenizer=WordPunctTokenizer(), encoding="utf8"
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        """

        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._readme = "README.txt"

    def features(self, fileids=None):
        """
        Return a list of features. Each feature is a tuple made of the specific
        item feature and the opinion strength about that feature.

        :param fileids: a list or regexp specifying the ids of the files whose
            features have to be returned.
        :return: all features for the item(s) in the given file(s).
        :rtype: list(tuple)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(fileid, self._read_features, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def reviews(self, fileids=None):
        """
        Return all the reviews as a list of Review objects. If `fileids` is
        specified, return all the reviews from each of the specified files.

        :param fileids: a list or regexp specifying the ids of the files whose
            reviews have to be returned.
        :return: the given file(s) as a list of reviews.
        """
        # NOTE(review): unlike features(), a bare string fileid is not wrapped
        # in a list here; abspaths() appears to accept it anyway (see the
        # class doctest) -- confirm before relying on it.
        if fileids is None:
            fileids = self._fileids
        return concat(
            [
                self.CorpusView(fileid, self._read_review_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def sents(self, fileids=None):
        """
        Return all sentences in the corpus or in the specified files.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: the given file(s) as a list of sentences, each encoded as a
            list of word strings.
        :rtype: list(list(str))
        """
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        files.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_features(self, stream):
        """Block reader: collect feature annotations from up to 20 lines."""
        features = []
        # Read the stream in batches of 20 lines per block, so the corpus
        # view can lazily page through the file.
        for i in range(20):
            line = stream.readline()
            if not line:
                return features
            features.extend(re.findall(FEATURES, line))
        return features

    def _read_review_block(self, stream):
        """Block reader: parse one complete Review starting at a [t] title."""
        # Skip forward to the next title line, which starts a review.
        while True:
            line = stream.readline()
            if not line:
                return []  # end of file.
            title_match = re.match(TITLE, line)
            if title_match:
                review = Review(
                    title=title_match.group(1).strip()
                )  # We create a new review
                break

        # Scan until we find another line matching the regexp, or EOF.
        while True:
            oldpos = stream.tell()
            line = stream.readline()
            # End of file:
            if not line:
                return [review]
            # Start of a new review: backup to just before it starts, and
            # return the review we've already collected.
            if re.match(TITLE, line):
                stream.seek(oldpos)
                return [review]
            # Anything else is part of the review line.
            feats = re.findall(FEATURES, line)
            notes = re.findall(NOTES, line)
            sent = re.findall(SENT, line)
            if sent:
                sent = self._word_tokenizer.tokenize(sent[0])
            review_line = ReviewLine(sent=sent, features=feats, notes=notes)
            review.add_line(review_line)

    def _read_sent_block(self, stream):
        """Block reader: return the tokenized sentences of the next review."""
        sents = []
        for review in self._read_review_block(stream):
            sents.extend([sent for sent in review.sents()])
        return sents

    def _read_word_block(self, stream):
        """Block reader: collect tokenized words from up to 20 lines."""
        words = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            sent = re.findall(SENT, line)
            if sent:
                words.extend(self._word_tokenizer.tokenize(sent[0]))
        return words
|
||||
146
backend/venv/Lib/site-packages/nltk/corpus/reader/rte.py
Normal file
146
backend/venv/Lib/site-packages/nltk/corpus/reader/rte.py
Normal file
@@ -0,0 +1,146 @@
|
||||
# Natural Language Toolkit: RTE Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Corpus reader for the Recognizing Textual Entailment (RTE) Challenge Corpora.
|
||||
|
||||
The files were taken from the RTE1, RTE2 and RTE3 datasets and the files
|
||||
were regularized.
|
||||
|
||||
Filenames are of the form rte*_dev.xml and rte*_test.xml. The latter are the
|
||||
gold standard annotated files.
|
||||
|
||||
Each entailment corpus is a list of 'text'/'hypothesis' pairs. The following
|
||||
example is taken from RTE3::
|
||||
|
||||
<pair id="1" entailment="YES" task="IE" length="short" >
|
||||
|
||||
<t>The sale was made to pay Yukos' US$ 27.5 billion tax bill,
|
||||
Yuganskneftegaz was originally sold for US$ 9.4 billion to a little known
|
||||
company Baikalfinansgroup which was later bought by the Russian
|
||||
state-owned oil company Rosneft .</t>
|
||||
|
||||
<h>Baikalfinansgroup was sold to Rosneft.</h>
|
||||
</pair>
|
||||
|
||||
In order to provide globally unique IDs for each pair, a new attribute
|
||||
``challenge`` has been added to the root element ``entailment-corpus`` of each
|
||||
file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the
|
||||
challenge number and 'n' is the pair ID.
|
||||
"""
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.xmldocs import *
|
||||
|
||||
|
||||
def norm(value_string):
    """
    Normalize the string value in an RTE pair's ``value`` or ``entailment``
    attribute as an integer (1, 0).

    :param value_string: the label used to classify a text/hypothesis pair
    :type value_string: str
    :rtype: int
    :raise KeyError: if the label is not one of TRUE/FALSE/YES/NO.
    """
    # RTE1 labels entailment TRUE/FALSE; RTE2 and RTE3 use YES/NO.
    mapping = {"TRUE": 1, "YES": 1, "FALSE": 0, "NO": 0}
    return mapping[value_string.upper()]
|
||||
|
||||
|
||||
class RTEPair:
    """
    Container for RTE text-hypothesis pairs.

    The entailment relation is signalled by the ``value`` attribute in RTE1, and by
    ``entailment`` in RTE2 and RTE3. These both get mapped on to the ``value``
    attribute of this class.
    """

    def __init__(
        self,
        pair,
        challenge=None,
        id=None,
        text=None,
        hyp=None,
        value=None,
        task=None,
        length=None,
    ):
        """
        :param pair: the ``<pair>`` XML element for this example
        :param challenge: version of the RTE challenge (i.e., RTE1, RTE2 or RTE3)
        :param id: identifier for the pair (accepted but read from the XML)
        :param text: the text component of the pair (accepted but read from the XML)
        :param hyp: the hypothesis component of the pair (accepted but read from the XML)
        :param value: fallback classification label, used only when the XML
            carries neither a ``value`` nor an ``entailment`` attribute
        :param task: fallback for the NLP task that the data was drawn from
        :param length: fallback for the length attribute of the pair
        """
        attrs = pair.attrib
        self.challenge = challenge
        # id, text and hyp always come from the XML element itself.
        self.id = attrs["id"]
        self.gid = f"{self.challenge}-{self.id}"
        self.text = pair[0].text
        self.hyp = pair[1].text

        # RTE1 marks entailment with "value"; RTE2/RTE3 with "entailment".
        if "value" in attrs:
            self.value = norm(attrs["value"])
        elif "entailment" in attrs:
            self.value = norm(attrs["entailment"])
        else:
            self.value = value
        self.task = attrs.get("task", task)
        self.length = attrs.get("length", length)

    def __repr__(self):
        if self.challenge:
            return "<RTEPair: gid=%s-%s>" % (self.challenge, self.id)
        return f"<RTEPair: id={self.id}>"
|
||||
|
||||
|
||||
class RTECorpusReader(XMLCorpusReader):
    """
    Corpus reader for corpora in RTE challenges.

    This is just a wrapper around the XMLCorpusReader. See module docstring above for the expected
    structure of input documents.
    """

    def _read_etree(self, doc):
        """
        Map the XML input into a list of RTEPair objects.

        Iterates over every ``<pair>`` element in the parsed document.

        :param doc: a parsed XML document
        :rtype: list(RTEPair)
        """
        # The "challenge" attribute was added to the root element during
        # regularization; older files may lack it.
        challenge = doc.attrib.get("challenge")
        return [RTEPair(pair, challenge=challenge) for pair in doc.iter("pair")]

    def pairs(self, fileids):
        """
        Build a list of RTEPairs from a RTE corpus.

        :param fileids: a list of RTE corpus fileids
        :type: list
        :rtype: list(RTEPair)
        """
        fileid_list = [fileids] if isinstance(fileids, str) else fileids
        return concat([self._read_etree(self.xml(fileid)) for fileid in fileid_list])
|
||||
296
backend/venv/Lib/site-packages/nltk/corpus/reader/semcor.py
Normal file
296
backend/venv/Lib/site-packages/nltk/corpus/reader/semcor.py
Normal file
@@ -0,0 +1,296 @@
|
||||
# Natural Language Toolkit: SemCor Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Nathan Schneider <nschneid@cs.cmu.edu>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Corpus reader for the SemCor Corpus.
|
||||
"""
|
||||
|
||||
__docformat__ = "epytext en"
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
|
||||
from nltk.tree import Tree
|
||||
|
||||
|
||||
class SemcorCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the SemCor Corpus.
    For access to the complete XML data structure, use the ``xml()``
    method. For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
    """

    def __init__(self, root, fileids, wordnet, lazy=True):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param wordnet: the wordnet corpus module, used to resolve sense keys.
        :param lazy: if true, expose stream-backed views instead of eagerly
            parsed lists.
        """
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy
        self._wordnet = wordnet

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return self._items(fileids, "word", False, False, False)

    def chunks(self, fileids=None):
        """
        :return: the given file(s) as a list of chunks,
            each of which is a list of words and punctuation symbols
            that form a unit.
        :rtype: list(list(str))
        """
        return self._items(fileids, "chunk", False, False, False)

    # NOTE: the historical default was written ``("pos" or "sem" or "both")``,
    # which the ``or`` operator short-circuits to the constant "pos"; the
    # literal default below is equivalent but no longer misleading.
    def tagged_chunks(self, fileids=None, tag="pos"):
        """
        :return: the given file(s) as a list of tagged chunks, represented
            in tree form.
        :rtype: list(Tree)

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include (default `'pos'`).
            Semantic tags consist of WordNet lemma IDs, plus an `'NE'` node if
            the chunk is a named entity without a specific entry in WordNet.
            (Named entities of type 'other' have no lemma. Other chunks not in
            WordNet have no semantic tag. Punctuation tokens have `None` for
            their part of speech tag.)
        """
        return self._items(fileids, "chunk", False, tag != "sem", tag != "pos")

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of word strings.
        :rtype: list(list(str))
        """
        return self._items(fileids, "word", True, False, False)

    def chunk_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of chunks.
        :rtype: list(list(list(str)))
        """
        return self._items(fileids, "chunk", True, False, False)

    def tagged_sents(self, fileids=None, tag="pos"):
        """
        :return: the given file(s) as a list of sentences. Each sentence
            is represented as a list of tagged chunks (in tree form).
        :rtype: list(list(Tree))

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include (default `'pos'`).
            Semantic tags consist of WordNet lemma IDs, plus an `'NE'` node if
            the chunk is a named entity without a specific entry in WordNet.
            (Named entities of type 'other' have no lemma. Other chunks not in
            WordNet have no semantic tag. Punctuation tokens have `None` for
            their part of speech tag.)
        """
        return self._items(fileids, "chunk", True, tag != "sem", tag != "pos")

    def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag):
        """Dispatch to a lazy view or an eager list, one per fileid."""
        if unit == "word" and not bracket_sent:
            # the result of the SemcorWordView may be a multiword unit, so the
            # LazyConcatenation will make sure the sentence is flattened
            make_view = lambda *args: LazyConcatenation(
                (SemcorWordView if self._lazy else self._words)(*args)
            )
        else:
            make_view = SemcorWordView if self._lazy else self._words
        return concat(
            [
                make_view(fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet)
                for fileid in self.abspaths(fileids)
            ]
        )

    def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
        """
        Helper used to implement the view methods -- returns a list of
        tokens, (segmented) words, chunks, or sentences. The tokens
        and chunks may optionally be tagged (with POS and sense
        information).

        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        assert unit in ("token", "word", "chunk")
        result = []

        xmldoc = ElementTree.parse(fileid).getroot()
        for xmlsent in xmldoc.findall(".//s"):
            sent = []
            for xmlword in _all_xmlwords_in(xmlsent):
                itm = SemcorCorpusReader._word(
                    xmlword, unit, pos_tag, sem_tag, self._wordnet
                )
                if unit == "word":
                    # A multiword token expands to several words.
                    sent.extend(itm)
                else:
                    sent.append(itm)

            if bracket_sent:
                result.append(SemcorSentence(xmlsent.attrib["snum"], sent))
            else:
                result.extend(sent)

        assert None not in result
        return result

    @staticmethod
    def _word(xmlword, unit, pos_tag, sem_tag, wordnet):
        """
        Convert one ``<wf>``/``<punc>`` element into a token, word list, or
        (optionally tagged) chunk, according to *unit* and the tag flags.
        """
        tkn = xmlword.text
        if not tkn:
            tkn = ""  # fixes issue 337?

        lemma = xmlword.get("lemma", tkn)  # lemma or NE class
        lexsn = xmlword.get("lexsn")  # lex_sense (locator for the lemma's sense)
        if lexsn is not None:
            sense_key = lemma + "%" + lexsn
            wnpos = ("n", "v", "a", "r", "s")[
                int(lexsn.split(":")[0]) - 1
            ]  # see http://wordnet.princeton.edu/man/senseidx.5WN.html
        else:
            sense_key = wnpos = None
        redef = xmlword.get(
            "rdf", tkn
        )  # redefinition--this indicates the lookup string
        # does not exactly match the enclosed string, e.g. due to typographical adjustments
        # or discontinuity of a multiword expression. If a redefinition has occurred,
        # the "rdf" attribute holds its inflected form and "lemma" holds its lemma.
        # For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class).
        # (currently unused, kept as format documentation)
        sensenum = xmlword.get("wnsn")  # WordNet sense number
        isOOVEntity = "pn" in xmlword.keys()  # a "personal name" (NE) not in WordNet
        pos = xmlword.get(
            "pos"
        )  # part of speech for the whole chunk (None for punctuation)

        if unit == "token":
            if not pos_tag and not sem_tag:
                itm = tkn
            else:
                itm = (
                    (tkn,)
                    + ((pos,) if pos_tag else ())
                    + ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ())
                )
            return itm
        else:
            ww = tkn.split("_")  # TODO: case where punctuation intervenes in MWE
            if unit == "word":
                return ww
            else:
                if sensenum is not None:
                    try:
                        sense = wordnet.lemma_from_key(sense_key)  # Lemma object
                    except Exception:
                        # cannot retrieve the wordnet.Lemma object. possible reasons:
                        # (a) the wordnet corpus is not downloaded;
                        # (b) a nonexistent sense is annotated: e.g., such.s.00 triggers:
                        # nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00'
                        # solution: just use the lemma name as a string
                        try:
                            sense = "%s.%s.%02d" % (
                                lemma,
                                wnpos,
                                int(sensenum),
                            )  # e.g.: reach.v.02
                        except ValueError:
                            sense = (
                                lemma + "." + wnpos + "." + sensenum
                            )  # e.g. the sense number may be "2;1"

                bottom = [Tree(pos, ww)] if pos_tag else ww

                if sem_tag and isOOVEntity:
                    if sensenum is not None:
                        return Tree(sense, [Tree("NE", bottom)])
                    else:  # 'other' NE
                        return Tree("NE", bottom)
                elif sem_tag and sensenum is not None:
                    return Tree(sense, bottom)
                elif pos_tag:
                    return bottom[0]
                else:
                    return bottom  # chunk as a list
|
||||
|
||||
|
||||
def _all_xmlwords_in(elt, result=None):
|
||||
if result is None:
|
||||
result = []
|
||||
for child in elt:
|
||||
if child.tag in ("wf", "punc"):
|
||||
result.append(child)
|
||||
else:
|
||||
_all_xmlwords_in(child, result)
|
||||
return result
|
||||
|
||||
|
||||
class SemcorSentence(list):
    """
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (the ``n`` attribute from the XML).
    """

    def __init__(self, num, items):
        """
        :param num: the sentence identifier from the XML.
        :param items: the words of the sentence.
        """
        super().__init__(items)
        self.num = num
|
||||
|
||||
|
||||
class SemcorWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the SemCor corpus.
    """

    def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet):
        """
        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        :param wordnet: The wordnet corpus module, forwarded to
            ``SemcorCorpusReader._word`` for sense-key resolution.
        """
        # When yielding whole sentences, match <s> elements; otherwise match
        # the individual word/punctuation elements inside them.
        if bracket_sent:
            tagspec = ".*/s"
        else:
            tagspec = ".*/s/(punc|wf)"

        self._unit = unit
        self._sent = bracket_sent
        self._pos_tag = pos_tag
        self._sem_tag = sem_tag
        self._wordnet = wordnet

        XMLCorpusView.__init__(self, fileid, tagspec)

    def handle_elt(self, elt, context):
        # Dispatch based on whether the view yields sentences or tokens.
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        # Delegate to the shared word/chunk construction logic on the reader.
        return SemcorCorpusReader._word(
            elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet
        )

    def handle_sent(self, elt):
        """Build one SemcorSentence from an <s> element."""
        sent = []
        for child in elt:
            if child.tag in ("wf", "punc"):
                itm = self.handle_word(child)
                if self._unit == "word":
                    # Multiword tokens expand to several words.
                    sent.extend(itm)
                else:
                    sent.append(itm)
            else:
                raise ValueError("Unexpected element %s" % child.tag)
        return SemcorSentence(elt.attrib["snum"], sent)
|
||||
196
backend/venv/Lib/site-packages/nltk/corpus/reader/senseval.py
Normal file
196
backend/venv/Lib/site-packages/nltk/corpus/reader/senseval.py
Normal file
@@ -0,0 +1,196 @@
|
||||
# Natural Language Toolkit: Senseval 2 Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
|
||||
# Steven Bird <stevenbird1@gmail.com> (modifications)
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Read from the Senseval 2 Corpus.
|
||||
|
||||
SENSEVAL [http://www.senseval.org/]
|
||||
Evaluation exercises for Word Sense Disambiguation.
|
||||
Organized by ACL-SIGLEX [https://www.siglex.org/]
|
||||
|
||||
Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,
|
||||
https://www.d.umn.edu/~tpederse/data.html
|
||||
Distributed with permission.
|
||||
|
||||
The NLTK version of the Senseval 2 files uses well-formed XML.
|
||||
Each instance of the ambiguous words "hard", "interest", "line", and "serve"
|
||||
is tagged with a sense identifier, and supplied with context.
|
||||
"""
|
||||
|
||||
import re
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.tokenize import *
|
||||
|
||||
|
||||
class SensevalInstance:
    """
    A single Senseval-2 instance: an ambiguous word, its position within the
    tagged context, and the sense label(s) assigned to it.
    """

    def __init__(self, word, position, context, senses):
        self.word = word
        self.position = position
        self.context = context
        # Store the sense labels as an immutable tuple.
        self.senses = tuple(senses)

    def __repr__(self):
        return (
            f"SensevalInstance(word={self.word!r}, position={self.position!r}, "
            f"context={self.context!r}, senses={self.senses!r})"
        )
|
||||
|
||||
|
||||
class SensevalCorpusReader(CorpusReader):
    """Reader that yields SensevalInstance objects from Senseval-2 files."""

    def instances(self, fileids=None):
        """Return all instances from the given files, lazily concatenated."""
        views = [
            SensevalCorpusView(path, enc)
            for (path, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)

    def _entry(self, tree):
        """Extract (senseid, tagged-context) pairs from a parsed corpus tree."""
        entries = []
        for lexelt in tree.findall("lexelt"):
            for inst in lexelt.findall("instance"):
                sense = inst[0].attrib["senseid"]
                tagged = [(w.text, w.attrib["pos"]) for w in inst[1]]
                entries.append((sense, tagged))
        return entries
|
||||
|
||||
|
||||
class SensevalCorpusView(StreamBackedCorpusView):
    """
    Stream-backed view that parses one SensevalInstance per block from a
    Senseval-2 pseudo-XML file, repairing the markup with ``_fixXML`` before
    handing it to ElementTree.
    """

    def __init__(self, fileid, encoding):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)

        self._word_tokenizer = WhitespaceTokenizer()
        # Parallel lists mapping lexelt index -> stream position / name, so a
        # block read that starts mid-file can recover which lexelt it is in.
        self._lexelt_starts = [0]  # list of streampos
        self._lexelts = [None]  # list of lexelt names

    def read_block(self, stream):
        """Read and parse the next <instance>...</instance> block."""
        # Decide which lexical element we're in.
        lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell()) - 1
        lexelt = self._lexelts[lexelt_num]

        instance_lines = []
        in_instance = False
        while True:
            line = stream.readline()
            if line == "":
                assert instance_lines == []
                return []

            # Start of a lexical element?
            if line.lstrip().startswith("<lexelt"):
                lexelt_num += 1
                m = re.search("item=(\"[^\"]+\"|'[^']+')", line)
                assert m is not None  # <lexelt> has no 'item=...'
                lexelt = m.group(1)[1:-1]
                if lexelt_num < len(self._lexelts):
                    # Revisiting a lexelt we have seen before; names must agree.
                    assert lexelt == self._lexelts[lexelt_num]
                else:
                    self._lexelts.append(lexelt)
                    self._lexelt_starts.append(stream.tell())

            # Start of an instance?
            if line.lstrip().startswith("<instance"):
                assert instance_lines == []
                in_instance = True

            # Body of an instance?
            if in_instance:
                instance_lines.append(line)

            # End of an instance?
            if line.lstrip().startswith("</instance"):
                xml_block = "\n".join(instance_lines)
                # Repair the pseudo-XML so ElementTree can parse it.
                xml_block = _fixXML(xml_block)
                inst = ElementTree.fromstring(xml_block)
                return [self._parse_instance(inst, lexelt)]

    def _parse_instance(self, instance, lexelt):
        """Convert a parsed <instance> element into a SensevalInstance."""
        senses = []
        context = []
        position = None
        for child in instance:
            if child.tag == "answer":
                senses.append(child.attrib["senseid"])
            elif child.tag == "context":
                context += self._word_tokenizer.tokenize(child.text)
                for cword in child:
                    if cword.tag == "compound":
                        cword = cword[0]  # is this ok to do?

                    if cword.tag == "head":
                        # Some sanity checks:
                        assert position is None, "head specified twice"
                        assert cword.text.strip() or len(cword) == 1
                        assert not (cword.text.strip() and len(cword) == 1)
                        # Record the position of the head:
                        position = len(context)
                        # Add on the head word itself:
                        if cword.text.strip():
                            context.append(cword.text.strip())
                        elif cword[0].tag == "wf":
                            context.append((cword[0].text, cword[0].attrib["pos"]))
                            if cword[0].tail:
                                context += self._word_tokenizer.tokenize(cword[0].tail)
                        else:
                            assert False, "expected CDATA or wf in <head>"
                    elif cword.tag == "wf":
                        context.append((cword.text, cword.attrib["pos"]))
                    elif cword.tag == "s":
                        pass  # Sentence boundary marker.

                    else:
                        print("ACK", cword.tag)
                        assert False, "expected CDATA or <wf> or <head>"
                    if cword.tail:
                        context += self._word_tokenizer.tokenize(cword.tail)
            else:
                assert False, "unexpected tag %s" % child.tag
        return SensevalInstance(lexelt, position, context, senses)
|
||||
|
||||
|
||||
def _fixXML(text):
|
||||
"""
|
||||
Fix the various issues with Senseval pseudo-XML.
|
||||
"""
|
||||
# <~> or <^> => ~ or ^
|
||||
text = re.sub(r"<([~\^])>", r"\1", text)
|
||||
# fix lone &
|
||||
text = re.sub(r"(\s+)\&(\s+)", r"\1&\2", text)
|
||||
# fix """
|
||||
text = re.sub(r'"""', "'\"'", text)
|
||||
# fix <s snum=dd> => <s snum="dd"/>
|
||||
text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text)
|
||||
# fix foreign word tag
|
||||
text = re.sub(r"<\&frasl>\s*<p[^>]*>", "FRASL", text)
|
||||
# remove <&I .>
|
||||
text = re.sub(r"<\&I[^>]*>", "", text)
|
||||
# fix <{word}>
|
||||
text = re.sub(r"<{([^}]+)}>", r"\1", text)
|
||||
# remove <@>, <p>, </p>
|
||||
text = re.sub(r"<(@|/?p)>", r"", text)
|
||||
# remove <&M .> and <&T .> and <&Ms .>
|
||||
text = re.sub(r"<&\w+ \.>", r"", text)
|
||||
# remove <!DOCTYPE... > lines
|
||||
text = re.sub(r"<!DOCTYPE[^>]*>", r"", text)
|
||||
# remove <[hi]> and <[/p]> etc
|
||||
text = re.sub(r"<\[\/?[^>]+\]*>", r"", text)
|
||||
# take the thing out of the brackets: <…>
|
||||
text = re.sub(r"<(\&\w+;)>", r"\1", text)
|
||||
# and remove the & for those patterns that aren't regular XML
|
||||
text = re.sub(r"&(?!amp|gt|lt|apos|quot)", r"", text)
|
||||
# fix 'abc <p="foo"/>' style tags - now <wf pos="foo">abc</wf>
|
||||
text = re.sub(
|
||||
r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>', r' <wf pos="\2">\1</wf>', text
|
||||
)
|
||||
text = re.sub(r'\s*"\s*<p=\'"\'/>', " <wf pos='\"'>\"</wf>", text)
|
||||
return text
|
||||
@@ -0,0 +1,136 @@
|
||||
# Natural Language Toolkit: SentiWordNet
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Christopher Potts <cgpotts@stanford.edu>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
An NLTK interface for SentiWordNet
|
||||
|
||||
SentiWordNet is a lexical resource for opinion mining.
|
||||
SentiWordNet assigns to each synset of WordNet three
|
||||
sentiment scores: positivity, negativity, and objectivity.
|
||||
|
||||
For details about SentiWordNet see:
|
||||
http://sentiwordnet.isti.cnr.it/
|
||||
|
||||
>>> from nltk.corpus import sentiwordnet as swn
|
||||
>>> print(swn.senti_synset('breakdown.n.03'))
|
||||
<breakdown.n.03: PosScore=0.0 NegScore=0.25>
|
||||
>>> list(swn.senti_synsets('slow'))
|
||||
[SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'),\
|
||||
SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),\
|
||||
SentiSynset('slow.a.02'), SentiSynset('dense.s.04'),\
|
||||
SentiSynset('slow.a.04'), SentiSynset('boring.s.01'),\
|
||||
SentiSynset('dull.s.05'), SentiSynset('slowly.r.01'),\
|
||||
SentiSynset('behind.r.03')]
|
||||
>>> happy = swn.senti_synsets('happy', 'a')
|
||||
>>> happy0 = list(happy)[0]
|
||||
>>> happy0.pos_score()
|
||||
0.875
|
||||
>>> happy0.neg_score()
|
||||
0.0
|
||||
>>> happy0.obj_score()
|
||||
0.125
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from nltk.corpus.reader import CorpusReader
|
||||
|
||||
|
||||
class SentiWordNetCorpusReader(CorpusReader):
    """
    Reader for the SentiWordNet lexical resource: maps each WordNet synset
    (keyed by POS tag and offset) to a (positivity, negativity) score pair.
    """

    def __init__(self, root, fileids, encoding="utf-8"):
        """
        Construct a new SentiWordNet Corpus Reader, using data from
        the specified file.

        :param root: The root directory for the corpus.
        :param fileids: must specify exactly one data file.
        :param encoding: the encoding used to read the corpus file.
        :raise ValueError: if more than one fileid is given.
        """
        super().__init__(root, fileids, encoding=encoding)
        if len(self._fileids) != 1:
            raise ValueError("Exactly one file must be specified")
        # Maps (pos, offset) -> (pos_score, neg_score).
        self._db = {}
        self._parse_src_file()

    def _parse_src_file(self):
        """Parse the tab-separated source file into the in-memory score DB."""
        lines = self.open(self._fileids[0]).read().splitlines()
        # Skip comment lines (leading '#', possibly indented).
        lines = filter((lambda x: not re.search(r"^\s*#", x)), lines)
        for i, line in enumerate(lines):
            fields = [field.strip() for field in re.split(r"\t+", line)]
            try:
                pos, offset, pos_score, neg_score, synset_terms, gloss = fields
            except BaseException as e:
                raise ValueError(f"Line {i} formatted incorrectly: {line}\n") from e
            if pos and offset:
                offset = int(offset)
                self._db[(pos, offset)] = (float(pos_score), float(neg_score))

    def senti_synset(self, *vals):
        """
        Return the SentiSynset for either a (pos, offset) pair or a single
        synset name string (e.g. ``'breakdown.n.03'``); None if it is not
        in the database.
        """
        # Imported lazily to avoid a circular import at module load time
        # (nltk.corpus imports this reader).
        from nltk.corpus import wordnet as wn

        if tuple(vals) in self._db:
            pos_score, neg_score = self._db[tuple(vals)]
            pos, offset = vals
            # Satellite adjectives ("s") are stored in WordNet under "a".
            if pos == "s":
                pos = "a"
            synset = wn.synset_from_pos_and_offset(pos, offset)
            return SentiSynset(pos_score, neg_score, synset)
        else:
            synset = wn.synset(vals[0])
            pos = synset.pos()
            if pos == "s":
                pos = "a"
            offset = synset.offset()
            if (pos, offset) in self._db:
                pos_score, neg_score = self._db[(pos, offset)]
                return SentiSynset(pos_score, neg_score, synset)
            else:
                return None

    def senti_synsets(self, string, pos=None):
        """
        Return an iterator of SentiSynsets for all WordNet synsets matching
        *string* (optionally restricted to part of speech *pos*).
        """
        from nltk.corpus import wordnet as wn

        sentis = []
        synset_list = wn.synsets(string, pos)
        for synset in synset_list:
            sentis.append(self.senti_synset(synset.name()))
        # Drop synsets with no SentiWordNet entry (senti_synset returned None).
        sentis = filter(lambda x: x, sentis)
        return sentis

    def all_senti_synsets(self):
        """Yield a SentiSynset for every entry in the database."""
        from nltk.corpus import wordnet as wn

        for key, fields in self._db.items():
            pos, offset = key
            pos_score, neg_score = fields
            synset = wn.synset_from_pos_and_offset(pos, offset)
            yield SentiSynset(pos_score, neg_score, synset)
|
||||
|
||||
|
||||
class SentiSynset:
    """A WordNet synset paired with SentiWordNet sentiment scores."""

    def __init__(self, pos_score, neg_score, synset):
        self._pos_score = pos_score
        self._neg_score = neg_score
        # Objectivity is the probability mass left over after the
        # positive and negative scores are accounted for.
        self._obj_score = 1.0 - (self._pos_score + self._neg_score)
        self.synset = synset

    def pos_score(self):
        """Return the positivity score."""
        return self._pos_score

    def neg_score(self):
        """Return the negativity score."""
        return self._neg_score

    def obj_score(self):
        """Return the objectivity score, i.e. ``1 - pos - neg``."""
        return self._obj_score

    def __str__(self):
        """Prints just the Pos/Neg scores for now."""
        parts = [
            "<",
            self.synset.name(),
            ": ",
            "PosScore=%s " % self._pos_score,
            "NegScore=%s" % self._neg_score,
            ">",
        ]
        return "".join(parts)

    def __repr__(self):
        return "Senti" + repr(self.synset)
|
||||
@@ -0,0 +1,75 @@
|
||||
# Natural Language Toolkit: Sinica Treebank Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Sinica Treebank Corpus Sample
|
||||
|
||||
http://rocling.iis.sinica.edu.tw/CKIP/engversion/treebank.htm
|
||||
|
||||
10,000 parsed sentences, drawn from the Academia Sinica Balanced
|
||||
Corpus of Modern Chinese. Parse tree notation is based on
|
||||
Information-based Case Grammar. Tagset documentation is available
|
||||
at https://www.sinica.edu.tw/SinicaCorpus/modern_e_wordtype.html
|
||||
|
||||
Language and Knowledge Processing Group, Institute of Information
|
||||
Science, Academia Sinica
|
||||
|
||||
The data is distributed with the Natural Language Toolkit under the terms of
|
||||
the Creative Commons Attribution-NonCommercial-ShareAlike License
|
||||
[https://creativecommons.org/licenses/by-nc-sa/2.5/].
|
||||
|
||||
References:
|
||||
|
||||
Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999)
|
||||
The Construction of Sinica Treebank. Computational Linguistics and
|
||||
Chinese Language Processing, 4, pp 87-104.
|
||||
|
||||
Huang Chu-Ren, Keh-Jiann Chen, Feng-Yi Chen, Keh-Jiann Chen, Zhao-Ming
|
||||
Gao, and Kuang-Yu Chen. 2000. Sinica Treebank: Design Criteria,
|
||||
Annotation Guidelines, and On-line Interface. Proceedings of 2nd
|
||||
Chinese Language Processing Workshop, Association for Computational
|
||||
Linguistics.
|
||||
|
||||
Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar
|
||||
Extraction, Proceedings of IJCNLP-04, pp560-565.
|
||||
"""
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.tag import map_tag
|
||||
from nltk.tree import sinica_parse
|
||||
|
||||
# Strips the leading "#<id> " sentence identifier from each corpus line.
IDENTIFIER = re.compile(r"^#\S+\s")
# Strips a trailing "#..." appendix that follows the final ")".
APPENDIX = re.compile(r"(?<=\))#.*$")
# Matches a ":tag:word" pair; group 1 is the tag, group 2 is the word.
TAGWORD = re.compile(r":([^:()|]+):([^:()|]+)")
# Like TAGWORD, but captures only the word.
WORD = re.compile(r":[^:()|]+:([^:()|]+)")
|
||||
|
||||
|
||||
class SinicaTreebankCorpusReader(SyntaxCorpusReader):
    """
    Reader for the sinica treebank.
    """

    def _read_block(self, stream):
        # One sentence per line: strip the "#id" prefix and any trailing
        # appendix before handing the sentence on.
        line = stream.readline()
        line = APPENDIX.sub("", IDENTIFIER.sub("", line))
        return [line]

    def _parse(self, sent):
        # Bracketed-string parsing is delegated to nltk.tree.
        return sinica_parse(sent)

    def _tag(self, sent, tagset=None):
        # TAGWORD yields (tag, word); flip to the (word, tag) convention.
        result = [(word, tag) for (tag, word) in TAGWORD.findall(sent)]
        if tagset and tagset != self._tagset:
            result = [
                (word, map_tag(self._tagset, tagset, tag)) for (word, tag) in result
            ]
        return result

    def _word(self, sent):
        return WORD.findall(sent)
|
||||
@@ -0,0 +1,56 @@
|
||||
# Natural Language Toolkit: String Category Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Read tuples from a corpus consisting of categorized strings.
|
||||
For example, from the question classification corpus:
|
||||
|
||||
NUM:dist How far is it from Denver to Aspen ?
|
||||
LOC:city What county is Modesto , California in ?
|
||||
HUM:desc Who was Galileo ?
|
||||
DESC:def What is an atom ?
|
||||
NUM:date When did Hawaii become a state ?
|
||||
"""
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
# based on PPAttachmentCorpusReader
|
||||
from nltk.corpus.reader.util import *
|
||||
|
||||
|
||||
# [xx] Should the order of the tuple be reversed -- in most other places
|
||||
# in nltk, we use the form (data, tag) -- e.g., tagged words and
|
||||
# labeled texts for classifiers.
|
||||
class StringCategoryCorpusReader(CorpusReader):
    def __init__(self, root, fileids, delimiter=" ", encoding="utf8"):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param delimiter: Field delimiter
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._delimiter = delimiter

    def tuples(self, fileids=None):
        """Return the given file(s) as ``(category, text)`` tuples."""
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        views = [
            StreamBackedCorpusView(path, self._read_tuple_block, encoding=enc)
            for (path, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)

    def _read_tuple_block(self, stream):
        # One record per line: split once on the delimiter, giving
        # (category, rest-of-line).
        line = stream.readline().strip()
        return [tuple(line.split(self._delimiter, 1))] if line else []
|
||||
125
backend/venv/Lib/site-packages/nltk/corpus/reader/switchboard.py
Normal file
125
backend/venv/Lib/site-packages/nltk/corpus/reader/switchboard.py
Normal file
@@ -0,0 +1,125 @@
|
||||
# Natural Language Toolkit: Switchboard Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
import re
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.tag import map_tag, str2tuple
|
||||
|
||||
|
||||
class SwitchboardTurn(list):
    """
    A specialized list object used to encode switchboard utterances.
    The elements of the list are the words in the utterance; and two
    attributes, ``speaker`` and ``id``, are provided to retrieve the
    speaker identifier and utterance id.  Note that utterance ids
    are only unique within a given discourse.
    """

    def __init__(self, words, speaker, id):
        """
        :param words: The words of the utterance: plain strings or
            ``(word, tag)`` tuples.
        :param speaker: The speaker identifier (e.g. ``'A'``).
        :param id: The utterance id; coerced to ``int``.
        """
        list.__init__(self, words)
        self.speaker = speaker
        self.id = int(id)

    def __repr__(self):
        if len(self) == 0:
            text = ""
        elif isinstance(self[0], tuple):
            # Tagged utterances render each element as word/tag.
            text = " ".join("%s/%s" % w for w in self)
        else:
            text = " ".join(self)
        return f"<{self.speaker}.{self.id}: {text!r}>"
|
||||
|
||||
|
||||
class SwitchboardCorpusReader(CorpusReader):
    """
    Reader for the switchboard corpus.  All accessors read from the
    single ``tagged`` file (see ``_FILES``).
    """

    _FILES = ["tagged"]
    # Use the "tagged" file even for non-tagged data methods, since
    # it's tokenized.

    def __init__(self, root, tagset=None):
        CorpusReader.__init__(self, root, self._FILES)
        self._tagset = tagset

    def words(self):
        path = self.abspath("tagged")
        return StreamBackedCorpusView(path, self._words_block_reader)

    def tagged_words(self, tagset=None):
        def reader(stream):
            return self._tagged_words_block_reader(stream, tagset)

        return StreamBackedCorpusView(self.abspath("tagged"), reader)

    def turns(self):
        path = self.abspath("tagged")
        return StreamBackedCorpusView(path, self._turns_block_reader)

    def tagged_turns(self, tagset=None):
        def reader(stream):
            return self._tagged_turns_block_reader(stream, tagset)

        return StreamBackedCorpusView(self.abspath("tagged"), reader)

    def discourses(self):
        path = self.abspath("tagged")
        return StreamBackedCorpusView(path, self._discourses_block_reader)

    def tagged_discourses(self, tagset=False):
        def reader(stream):
            return self._tagged_discourses_block_reader(stream, tagset)

        return StreamBackedCorpusView(self.abspath("tagged"), reader)

    def _discourses_block_reader(self, stream):
        # returns at most 1 discourse. (The other methods depend on this.)
        discourse = []
        for block in read_blankline_block(stream):
            for utt in block.split("\n"):
                if utt.strip():
                    discourse.append(self._parse_utterance(utt, include_tag=False))
        return [discourse]

    def _tagged_discourses_block_reader(self, stream, tagset=None):
        # returns at most 1 discourse. (The other methods depend on this.)
        discourse = []
        for block in read_blankline_block(stream):
            for utt in block.split("\n"):
                if utt.strip():
                    discourse.append(
                        self._parse_utterance(utt, include_tag=True, tagset=tagset)
                    )
        return [discourse]

    def _turns_block_reader(self, stream):
        return self._discourses_block_reader(stream)[0]

    def _tagged_turns_block_reader(self, stream, tagset=None):
        return self._tagged_discourses_block_reader(stream, tagset)[0]

    def _words_block_reader(self, stream):
        # Flatten the single discourse's turns into one word list.
        return sum(self._discourses_block_reader(stream)[0], [])

    def _tagged_words_block_reader(self, stream, tagset=None):
        return sum(self._tagged_discourses_block_reader(stream, tagset)[0], [])

    _UTTERANCE_RE = re.compile(r"(\w+)\.(\d+)\:\s*(.*)")
    _SEP = "/"

    def _parse_utterance(self, utterance, include_tag, tagset=None):
        # Lines look like "speaker.id: word/tag word/tag ...".
        match = self._UTTERANCE_RE.match(utterance)
        if match is None:
            raise ValueError("Bad utterance %r" % utterance)
        speaker, id, text = match.groups()
        words = [str2tuple(token, self._SEP) for token in text.split()]
        if not include_tag:
            words = [w for (w, t) in words]
        elif tagset and tagset != self._tagset:
            words = [(w, map_tag(self._tagset, tagset, t)) for (w, t) in words]
        return SwitchboardTurn(words, speaker, id)
|
||||
354
backend/venv/Lib/site-packages/nltk/corpus/reader/tagged.py
Normal file
354
backend/venv/Lib/site-packages/nltk/corpus/reader/tagged.py
Normal file
@@ -0,0 +1,354 @@
|
||||
# Natural Language Toolkit: Tagged Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# Jacob Perkins <japerk@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A reader for corpora whose documents contain part-of-speech-tagged words.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.timit import read_timit_block
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.tag import map_tag, str2tuple
|
||||
from nltk.tokenize import *
|
||||
|
||||
|
||||
class TaggedCorpusReader(CorpusReader):
    """
    Reader for simple part-of-speech tagged corpora.  Paragraphs are
    assumed to be split using blank lines.  Sentences and words can be
    tokenized using the default tokenizers, or by custom tokenizers
    specified as parameters to the constructor.  Words are parsed
    using ``nltk.tag.str2tuple``.  By default, ``'/'`` is used as the
    separator.  I.e., words should have the form::

        word1/tag1 word2/tag2 word3/tag3 ...

    But custom separators may be specified as parameters to the
    constructor.  Part of speech tags are case-normalized to upper
    case.
    """

    def __init__(
        self,
        root,
        fileids,
        sep="/",
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
        encoding="utf8",
        tagset=None,
    ):
        """
        Construct a new Tagged Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = TaggedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._tagset = tagset

    def _views(self, fileids, tagged, group_by_sent, group_by_para, tagset=None):
        """Build one ``TaggedCorpusView`` per file and concatenate them.

        All public accessors below differ only in the three boolean
        flags (and the optional target tagset), so the construction
        logic lives here exactly once.
        """
        if tagged and tagset and tagset != self._tagset:
            tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
        else:
            tag_mapping_function = None
        return concat(
            [
                TaggedCorpusView(
                    fileid,
                    enc,
                    tagged,
                    group_by_sent,
                    group_by_para,
                    self._sep,
                    self._word_tokenizer,
                    self._sent_tokenizer,
                    self._para_block_reader,
                    tag_mapping_function,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._views(fileids, tagged=False, group_by_sent=False, group_by_para=False)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._views(fileids, tagged=False, group_by_sent=True, group_by_para=False)

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        return self._views(fileids, tagged=False, group_by_sent=True, group_by_para=True)

    def tagged_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))
        """
        return self._views(
            fileids, tagged=True, group_by_sent=False, group_by_para=False, tagset=tagset
        )

    def tagged_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.

        :rtype: list(list(tuple(str,str)))
        """
        return self._views(
            fileids, tagged=True, group_by_sent=True, group_by_para=False, tagset=tagset
        )

    def tagged_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of ``(word,tag)`` tuples.
        :rtype: list(list(list(tuple(str,str))))
        """
        return self._views(
            fileids, tagged=True, group_by_sent=True, group_by_para=True, tagset=tagset
        )
|
||||
|
||||
|
||||
class CategorizedTaggedCorpusReader(CategorizedCorpusReader, TaggedCorpusReader):
    """
    A reader for part-of-speech tagged corpora whose documents are
    divided into categories based on their file identifiers.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``TaggedCorpusReader``.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        TaggedCorpusReader.__init__(self, *args, **kwargs)

    def tagged_words(self, fileids=None, categories=None, tagset=None):
        # Resolve categories into fileids, then defer to the base reader.
        fileids = self._resolve(fileids, categories)
        return super().tagged_words(fileids, tagset)

    def tagged_sents(self, fileids=None, categories=None, tagset=None):
        fileids = self._resolve(fileids, categories)
        return super().tagged_sents(fileids, tagset)

    def tagged_paras(self, fileids=None, categories=None, tagset=None):
        fileids = self._resolve(fileids, categories)
        return super().tagged_paras(fileids, tagset)
|
||||
|
||||
|
||||
class TaggedCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for tagged documents.  It can be
    customized via flags to divide the tagged corpus documents up by
    sentence or paragraph, and to include or omit part of speech tags.
    ``TaggedCorpusView`` objects are typically created by
    ``TaggedCorpusReader`` (not directly by nltk users).
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        sep,
        word_tokenizer,
        sent_tokenizer,
        para_block_reader,
        tag_mapping_function=None,
    ):
        # Output-shaping flags.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        # Parsing/tokenization configuration.
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._tag_mapping_function = tag_mapping_function
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Reads one paragraph at a time."""
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                tokens = self._word_tokenizer.tokenize(sent_str)
                sent = [str2tuple(tok, self._sep) for tok in tokens]
                if self._tag_mapping_function:
                    sent = [(w, self._tag_mapping_function(t)) for (w, t) in sent]
                if not self._tagged:
                    sent = [w for (w, t) in sent]
                # Either keep sentence structure or flatten into the paragraph.
                (para.append if self._group_by_sent else para.extend)(sent)
            # Either keep paragraph structure or flatten into the block.
            (block.append if self._group_by_para else block.extend)(para)
        return block
|
||||
|
||||
|
||||
# needs to implement simplified tags
|
||||
# needs to implement simplified tags
class MacMorphoCorpusReader(TaggedCorpusReader):
    """
    A corpus reader for the MAC_MORPHO corpus.  Each line contains a
    single tagged word, using '_' as a separator.  Sentence boundaries
    are based on the end-sentence tag ('_.').  Paragraph information
    is not included in the corpus, so each paragraph returned by
    ``self.paras()`` and ``self.tagged_paras()`` contains a single
    sentence.
    """

    def __init__(self, root, fileids, encoding="utf8", tagset=None):
        # One tagged word per line, so "word tokenization" is line
        # splitting, and sentences are everything up to (and including)
        # a newline within each block.
        TaggedCorpusReader.__init__(
            self,
            root,
            fileids,
            sep="_",
            word_tokenizer=LineTokenizer(),
            sent_tokenizer=RegexpTokenizer(".*\n"),
            para_block_reader=self._read_block,
            encoding=encoding,
            tagset=tagset,
        )

    def _read_block(self, stream):
        # A block (one sentence) runs until the line bearing the
        # end-of-sentence tag "_." is consumed.
        return read_regexp_block(stream, r".*", r".*_\.")
|
||||
|
||||
|
||||
class TimitTaggedCorpusReader(TaggedCorpusReader):
    """
    A corpus reader for tagged sentences that are included in the TIMIT corpus.
    """

    def __init__(self, *args, **kwargs):
        # TIMIT has its own block layout; everything else is inherited
        # from TaggedCorpusReader.
        TaggedCorpusReader.__init__(
            self, para_block_reader=read_timit_block, *args, **kwargs
        )

    def paras(self):
        # TIMIT has no paragraph structure.
        raise NotImplementedError("use sents() instead")

    def tagged_paras(self):
        # TIMIT has no paragraph structure.
        raise NotImplementedError("use tagged_sents() instead")
|
||||
510
backend/venv/Lib/site-packages/nltk/corpus/reader/timit.py
Normal file
510
backend/venv/Lib/site-packages/nltk/corpus/reader/timit.py
Normal file
@@ -0,0 +1,510 @@
|
||||
# Natural Language Toolkit: TIMIT Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2007 NLTK Project
|
||||
# Author: Haejoong Lee <haejoong@ldc.upenn.edu>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# Jacob Perkins <japerk@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
# [xx] this docstring is out-of-date:
|
||||
"""
|
||||
Read tokens, phonemes and audio data from the NLTK TIMIT Corpus.
|
||||
|
||||
This corpus contains selected portion of the TIMIT corpus.
|
||||
|
||||
- 16 speakers from 8 dialect regions
|
||||
- 1 male and 1 female from each dialect region
|
||||
- total 130 sentences (10 sentences per speaker. Note that some
|
||||
sentences are shared among other speakers, especially sa1 and sa2
|
||||
are spoken by all speakers.)
|
||||
- total 160 recordings of sentences (10 recordings per speaker)
|
||||
- audio format: NIST Sphere, single channel, 16kHz sampling,
|
||||
16 bit sample, PCM encoding
|
||||
|
||||
|
||||
Module contents
|
||||
===============
|
||||
|
||||
The timit corpus reader provides 4 functions and 4 data items.
|
||||
|
||||
- utterances
|
||||
|
||||
List of utterances in the corpus. There are total 160 utterances,
|
||||
each of which corresponds to a unique utterance of a speaker.
|
||||
Here's an example of an utterance identifier in the list::
|
||||
|
||||
dr1-fvmh0/sx206
|
||||
- _---- _---
|
||||
| | | | |
|
||||
| | | | |
|
||||
| | | | `--- sentence number
|
||||
| | | `----- sentence type (a:all, i:shared, x:exclusive)
|
||||
| | `--------- speaker ID
|
||||
| `------------ sex (m:male, f:female)
|
||||
`-------------- dialect region (1..8)
|
||||
|
||||
- speakers
|
||||
|
||||
List of speaker IDs. An example of speaker ID::
|
||||
|
||||
dr1-fvmh0
|
||||
|
||||
Note that if you split an item ID with colon and take the first element of
|
||||
the result, you will get a speaker ID.
|
||||
|
||||
>>> itemid = 'dr1-fvmh0/sx206'
|
||||
>>> spkrid , sentid = itemid.split('/')
|
||||
>>> spkrid
|
||||
'dr1-fvmh0'
|
||||
|
||||
The second element of the result is a sentence ID.
|
||||
|
||||
- dictionary()
|
||||
|
||||
Phonetic dictionary of words contained in this corpus. This is a Python
|
||||
dictionary from words to phoneme lists.
|
||||
|
||||
- spkrinfo()
|
||||
|
||||
Speaker information table. It's a Python dictionary from speaker IDs to
|
||||
records of 10 fields. Speaker IDs are the same as the ones in timit.speakers.
|
||||
Each record is a dictionary from field names to values, and the fields are
|
||||
as follows::
|
||||
|
||||
id speaker ID as defined in the original TIMIT speaker info table
|
||||
sex speaker gender (M:male, F:female)
|
||||
dr speaker dialect region (1:new england, 2:northern,
|
||||
3:north midland, 4:south midland, 5:southern, 6:new york city,
|
||||
7:western, 8:army brat (moved around))
|
||||
use corpus type (TRN:training, TST:test)
|
||||
in this sample corpus only TRN is available
|
||||
recdate recording date
|
||||
birthdate speaker birth date
|
||||
ht speaker height
|
||||
race speaker race (WHT:white, BLK:black, AMR:american indian,
|
||||
SPN:spanish-american, ORN:oriental,???:unknown)
|
||||
edu speaker education level (HS:high school, AS:associate degree,
|
||||
BS:bachelor's degree (BS or BA), MS:master's degree (MS or MA),
|
||||
PHD:doctorate degree (PhD,JD,MD), ??:unknown)
|
||||
comments comments by the recorder
|
||||
|
||||
The 4 functions are as follows.
|
||||
|
||||
- tokenized(sentences=items, offset=False)
|
||||
|
||||
Given a list of items, returns an iterator of a list of word lists,
|
||||
each of which corresponds to an item (sentence). If offset is set to True,
|
||||
each element of the word list is a tuple of word(string), start offset and
|
||||
end offset, where offset is represented as a number of 16kHz samples.
|
||||
|
||||
- phonetic(sentences=items, offset=False)
|
||||
|
||||
Given a list of items, returns an iterator of a list of phoneme lists,
|
||||
each of which corresponds to an item (sentence). If offset is set to True,
|
||||
each element of the phoneme list is a tuple of word(string), start offset
|
||||
and end offset, where offset is represented as a number of 16kHz samples.
|
||||
|
||||
- audiodata(item, start=0, end=None)
|
||||
|
||||
Given an item, returns a chunk of audio samples formatted into a string.
|
||||
When the function is called, if start and end are omitted, the entire
|
||||
samples of the recording will be returned. If only end is omitted,
|
||||
samples from the start offset to the end of the recording will be returned.
|
||||
|
||||
- play(data)
|
||||
|
||||
Play the given audio samples. The audio samples can be obtained from the
|
||||
timit.audiodata function.
|
||||
|
||||
"""
|
||||
import sys
|
||||
import time
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.internals import import_from_stdlib
|
||||
from nltk.tree import Tree
|
||||
|
||||
|
||||
class TimitCorpusReader(CorpusReader):
|
||||
"""
|
||||
Reader for the TIMIT corpus (or any other corpus with the same
|
||||
file layout and use of file formats). The corpus root directory
|
||||
should contain the following files:
|
||||
|
||||
- timitdic.txt: dictionary of standard transcriptions
|
||||
- spkrinfo.txt: table of speaker information
|
||||
|
||||
In addition, the root directory should contain one subdirectory
|
||||
for each speaker, containing three files for each utterance:
|
||||
|
||||
- <utterance-id>.txt: text content of utterances
|
||||
- <utterance-id>.wrd: tokenized text content of utterances
|
||||
- <utterance-id>.phn: phonetic transcription of utterances
|
||||
- <utterance-id>.wav: utterance sound file
|
||||
"""
|
||||
|
||||
_FILE_RE = r"(\w+-\w+/\w+\.(phn|txt|wav|wrd))|" + r"timitdic\.txt|spkrinfo\.txt"
|
||||
"""A regexp matching fileids that are used by this corpus reader."""
|
||||
_UTTERANCE_RE = r"\w+-\w+/\w+\.txt"
|
||||
|
||||
    def __init__(self, root, encoding="utf8"):
        """
        Construct a new TIMIT corpus reader in the given directory.
        :param root: The root directory for this corpus.
        """
        # Ensure that wave files don't get treated as unicode data:
        if isinstance(encoding, str):
            encoding = [(r".*\.wav", None), (".*", encoding)]

        CorpusReader.__init__(
            self, root, find_corpus_fileids(root, self._FILE_RE), encoding=encoding
        )

        # name[:-4] strips the ".txt" extension, leaving ids of the
        # form "drN-<spkr>/<sentid>".
        self._utterances = [
            name[:-4] for name in find_corpus_fileids(root, self._UTTERANCE_RE)
        ]
        """A list of the utterance identifiers for all utterances in
        this corpus."""

        # Speaker records are parsed lazily by spkrinfo().
        self._speakerinfo = None
        self._root = root
        # Unique speaker ids, i.e. the part of each utterance id before "/".
        self.speakers = sorted({u.split("/")[0] for u in self._utterances})
|
||||
|
||||
def fileids(self, filetype=None):
|
||||
"""
|
||||
Return a list of file identifiers for the files that make up
|
||||
this corpus.
|
||||
|
||||
:param filetype: If specified, then ``filetype`` indicates that
|
||||
only the files that have the given type should be
|
||||
returned. Accepted values are: ``txt``, ``wrd``, ``phn``,
|
||||
``wav``, or ``metadata``,
|
||||
"""
|
||||
if filetype is None:
|
||||
return CorpusReader.fileids(self)
|
||||
elif filetype in ("txt", "wrd", "phn", "wav"):
|
||||
return [f"{u}.{filetype}" for u in self._utterances]
|
||||
elif filetype == "metadata":
|
||||
return ["timitdic.txt", "spkrinfo.txt"]
|
||||
else:
|
||||
raise ValueError("Bad value for filetype: %r" % filetype)
|
||||
|
||||
def utteranceids(
|
||||
self, dialect=None, sex=None, spkrid=None, sent_type=None, sentid=None
|
||||
):
|
||||
"""
|
||||
:return: A list of the utterance identifiers for all
|
||||
utterances in this corpus, or for the given speaker, dialect
|
||||
region, gender, sentence type, or sentence number, if
|
||||
specified.
|
||||
"""
|
||||
if isinstance(dialect, str):
|
||||
dialect = [dialect]
|
||||
if isinstance(sex, str):
|
||||
sex = [sex]
|
||||
if isinstance(spkrid, str):
|
||||
spkrid = [spkrid]
|
||||
if isinstance(sent_type, str):
|
||||
sent_type = [sent_type]
|
||||
if isinstance(sentid, str):
|
||||
sentid = [sentid]
|
||||
|
||||
utterances = self._utterances[:]
|
||||
if dialect is not None:
|
||||
utterances = [u for u in utterances if u[2] in dialect]
|
||||
if sex is not None:
|
||||
utterances = [u for u in utterances if u[4] in sex]
|
||||
if spkrid is not None:
|
||||
utterances = [u for u in utterances if u[:9] in spkrid]
|
||||
if sent_type is not None:
|
||||
utterances = [u for u in utterances if u[11] in sent_type]
|
||||
if sentid is not None:
|
||||
utterances = [u for u in utterances if u[10:] in spkrid]
|
||||
return utterances
|
||||
|
||||
def transcription_dict(self):
|
||||
"""
|
||||
:return: A dictionary giving the 'standard' transcription for
|
||||
each word.
|
||||
"""
|
||||
_transcriptions = {}
|
||||
with self.open("timitdic.txt") as fp:
|
||||
for line in fp:
|
||||
if not line.strip() or line[0] == ";":
|
||||
continue
|
||||
m = re.match(r"\s*(\S+)\s+/(.*)/\s*$", line)
|
||||
if not m:
|
||||
raise ValueError("Bad line: %r" % line)
|
||||
_transcriptions[m.group(1)] = m.group(2).split()
|
||||
return _transcriptions
|
def spkrid(self, utterance):
    # An utterance id has the form "<speaker>/<sentence>"; the
    # speaker id is everything before the first "/".
    return utterance.split("/")[0]
def sentid(self, utterance):
    # The sentence id is the component after the "/" separator of
    # a "<speaker>/<sentence>" utterance id.
    components = utterance.split("/")
    return components[1]
def utterance(self, spkrid, sentid):
    """Join a speaker id and a sentence id into an utterance id."""
    return "/".join((spkrid, sentid))
def spkrutteranceids(self, speaker):
    """
    :return: A list of all utterances associated with a given
        speaker.
    """
    # Utterance ids are "<speaker>/<sentence>", so a prefix match
    # on "<speaker>/" selects that speaker's utterances.
    prefix = speaker + "/"
    return [utt for utt in self._utterances if utt.startswith(prefix)]
|
||||
def spkrinfo(self, speaker):
|
||||
"""
|
||||
:return: A dictionary mapping .. something.
|
||||
"""
|
||||
if speaker in self._utterances:
|
||||
speaker = self.spkrid(speaker)
|
||||
|
||||
if self._speakerinfo is None:
|
||||
self._speakerinfo = {}
|
||||
with self.open("spkrinfo.txt") as fp:
|
||||
for line in fp:
|
||||
if not line.strip() or line[0] == ";":
|
||||
continue
|
||||
rec = line.strip().split(None, 9)
|
||||
key = f"dr{rec[2]}-{rec[1].lower()}{rec[0].lower()}"
|
||||
self._speakerinfo[key] = SpeakerInfo(*rec)
|
||||
|
||||
return self._speakerinfo[speaker]
|
def phones(self, utterances=None):
    """Return the phone labels for the given utterance(s)."""
    phone_list = []
    for fileid in self._utterance_fileids(utterances, ".phn"):
        with self.open(fileid) as fp:
            # Each non-blank line is "<start> <end> <phone>"; keep
            # only the phone label (the last field).
            phone_list.extend(line.split()[-1] for line in fp if line.strip())
    return phone_list
|
||||
def phone_times(self, utterances=None):
|
||||
"""
|
||||
offset is represented as a number of 16kHz samples!
|
||||
"""
|
||||
results = []
|
||||
for fileid in self._utterance_fileids(utterances, ".phn"):
|
||||
with self.open(fileid) as fp:
|
||||
for line in fp:
|
||||
if line.strip():
|
||||
results.append(
|
||||
(
|
||||
line.split()[2],
|
||||
int(line.split()[0]),
|
||||
int(line.split()[1]),
|
||||
)
|
||||
)
|
||||
return results
|
||||
|
||||
def words(self, utterances=None):
|
||||
results = []
|
||||
for fileid in self._utterance_fileids(utterances, ".wrd"):
|
||||
with self.open(fileid) as fp:
|
||||
for line in fp:
|
||||
if line.strip():
|
||||
results.append(line.split()[-1])
|
||||
return results
|
def word_times(self, utterances=None):
    """
    Return (word, start, end) triples for the given utterance(s),
    with offsets in 16kHz samples.
    """
    results = []
    for fileid in self._utterance_fileids(utterances, ".wrd"):
        with self.open(fileid) as fp:
            for line in fp:
                if not line.strip():
                    continue
                # Each line is "<start> <end> <word>".  Split once
                # per line instead of three times, as before.
                fields = line.split()
                results.append((fields[2], int(fields[0]), int(fields[1])))
    return results
|
||||
def sents(self, utterances=None):
|
||||
results = []
|
||||
for fileid in self._utterance_fileids(utterances, ".wrd"):
|
||||
with self.open(fileid) as fp:
|
||||
results.append([line.split()[-1] for line in fp if line.strip()])
|
||||
return results
|
||||
|
||||
def sent_times(self, utterances=None):
|
||||
# TODO: Check this
|
||||
return [
|
||||
(
|
||||
line.split(None, 2)[-1].strip(),
|
||||
int(line.split()[0]),
|
||||
int(line.split()[1]),
|
||||
)
|
||||
for fileid in self._utterance_fileids(utterances, ".txt")
|
||||
for line in self.open(fileid)
|
||||
if line.strip()
|
||||
]
|
||||
|
||||
def phone_trees(self, utterances=None):
|
||||
if utterances is None:
|
||||
utterances = self._utterances
|
||||
if isinstance(utterances, str):
|
||||
utterances = [utterances]
|
||||
|
||||
trees = []
|
||||
for utterance in utterances:
|
||||
word_times = self.word_times(utterance)
|
||||
phone_times = self.phone_times(utterance)
|
||||
sent_times = self.sent_times(utterance)
|
||||
|
||||
while sent_times:
|
||||
(sent, sent_start, sent_end) = sent_times.pop(0)
|
||||
trees.append(Tree("S", []))
|
||||
while (
|
||||
word_times and phone_times and phone_times[0][2] <= word_times[0][1]
|
||||
):
|
||||
trees[-1].append(phone_times.pop(0)[0])
|
||||
while word_times and word_times[0][2] <= sent_end:
|
||||
(word, word_start, word_end) = word_times.pop(0)
|
||||
trees[-1].append(Tree(word, []))
|
||||
while phone_times and phone_times[0][2] <= word_end:
|
||||
trees[-1][-1].append(phone_times.pop(0)[0])
|
||||
while phone_times and phone_times[0][2] <= sent_end:
|
||||
trees[-1].append(phone_times.pop(0)[0])
|
||||
return trees
|
||||
|
||||
# [xx] NOTE: This is currently broken -- we're assuming that the
|
||||
# fileids are WAV fileids (aka RIFF), but they're actually NIST SPHERE
|
||||
# fileids.
|
||||
def wav(self, utterance, start=0, end=None):
|
||||
# nltk.chunk conflicts with the stdlib module 'chunk'
|
||||
wave = import_from_stdlib("wave")
|
||||
|
||||
w = wave.open(self.open(utterance + ".wav"), "rb")
|
||||
|
||||
if end is None:
|
||||
end = w.getnframes()
|
||||
|
||||
# Skip past frames before start, then read the frames we want
|
||||
w.readframes(start)
|
||||
frames = w.readframes(end - start)
|
||||
|
||||
# Open a new temporary file -- the wave module requires
|
||||
# an actual file, and won't work w/ stringio. :(
|
||||
tf = tempfile.TemporaryFile()
|
||||
out = wave.open(tf, "w")
|
||||
|
||||
# Write the parameters & data to the new file.
|
||||
out.setparams(w.getparams())
|
||||
out.writeframes(frames)
|
||||
out.close()
|
||||
|
||||
# Read the data back from the file, and return it. The
|
||||
# file will automatically be deleted when we return.
|
||||
tf.seek(0)
|
||||
return tf.read()
|
||||
|
||||
def audiodata(self, utterance, start=0, end=None):
|
||||
assert end is None or end > start
|
||||
headersize = 44
|
||||
with self.open(utterance + ".wav") as fp:
|
||||
if end is None:
|
||||
data = fp.read()
|
||||
else:
|
||||
data = fp.read(headersize + end * 2)
|
||||
return data[headersize + start * 2 :]
|
||||
|
||||
def _utterance_fileids(self, utterances, extension):
|
||||
if utterances is None:
|
||||
utterances = self._utterances
|
||||
if isinstance(utterances, str):
|
||||
utterances = [utterances]
|
||||
return [f"{u}{extension}" for u in utterances]
|
||||
|
||||
def play(self, utterance, start=0, end=None):
|
||||
"""
|
||||
Play the given audio sample.
|
||||
|
||||
:param utterance: The utterance id of the sample to play
|
||||
"""
|
||||
# Method 1: os audio dev.
|
||||
try:
|
||||
import ossaudiodev
|
||||
|
||||
try:
|
||||
dsp = ossaudiodev.open("w")
|
||||
dsp.setfmt(ossaudiodev.AFMT_S16_LE)
|
||||
dsp.channels(1)
|
||||
dsp.speed(16000)
|
||||
dsp.write(self.audiodata(utterance, start, end))
|
||||
dsp.close()
|
||||
except OSError as e:
|
||||
print(
|
||||
(
|
||||
"can't acquire the audio device; please "
|
||||
"activate your audio device."
|
||||
),
|
||||
file=sys.stderr,
|
||||
)
|
||||
print("system error message:", str(e), file=sys.stderr)
|
||||
return
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# Method 2: pygame
|
||||
try:
|
||||
# FIXME: this won't work under python 3
|
||||
import pygame.mixer
|
||||
import StringIO
|
||||
|
||||
pygame.mixer.init(16000)
|
||||
f = StringIO.StringIO(self.wav(utterance, start, end))
|
||||
pygame.mixer.Sound(f).play()
|
||||
while pygame.mixer.get_busy():
|
||||
time.sleep(0.01)
|
||||
return
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# Method 3: complain. :)
|
||||
print(
|
||||
("you must install pygame or ossaudiodev " "for audio playback."),
|
||||
file=sys.stderr,
|
||||
)
|
||||
|
||||
|
||||
class SpeakerInfo:
|
||||
def __init__(
|
||||
self, id, sex, dr, use, recdate, birthdate, ht, race, edu, comments=None
|
||||
):
|
||||
self.id = id
|
||||
self.sex = sex
|
||||
self.dr = dr
|
||||
self.use = use
|
||||
self.recdate = recdate
|
||||
self.birthdate = birthdate
|
||||
self.ht = ht
|
||||
self.race = race
|
||||
self.edu = edu
|
||||
self.comments = comments
|
||||
|
||||
def __repr__(self):
|
||||
attribs = "id sex dr use recdate birthdate ht race edu comments"
|
||||
args = [f"{attr}={getattr(self, attr)!r}" for attr in attribs.split()]
|
||||
return "SpeakerInfo(%s)" % (", ".join(args))
|
||||
|
def read_timit_block(stream):
    """
    Block reader for timit tagged sentences, which are preceded by a sentence
    number that will be ignored.
    """
    line = stream.readline()
    if not line:
        return []
    # Drop the leading sentence number; keep the rest of the line verbatim.
    _number, text = line.split(" ", 1)
    return [text]
||||
76
backend/venv/Lib/site-packages/nltk/corpus/reader/toolbox.py
Normal file
76
backend/venv/Lib/site-packages/nltk/corpus/reader/toolbox.py
Normal file
@@ -0,0 +1,76 @@
|
||||
# Natural Language Toolkit: Toolbox Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Greg Aumann <greg_aumann@sil.org>
|
||||
# Stuart Robinson <Stuart.Robinson@mpi.nl>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Module for reading, writing and manipulating
|
||||
Toolbox databases and settings fileids.
|
||||
"""
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.toolbox import ToolboxData
|
||||
|
||||
|
||||
class ToolboxCorpusReader(CorpusReader):
|
||||
def xml(self, fileids, key=None):
|
||||
return concat(
|
||||
[
|
||||
ToolboxData(path, enc).parse(key=key)
|
||||
for (path, enc) in self.abspaths(fileids, True)
|
||||
]
|
||||
)
|
||||
|
||||
def fields(
|
||||
self,
|
||||
fileids,
|
||||
strip=True,
|
||||
unwrap=True,
|
||||
encoding="utf8",
|
||||
errors="strict",
|
||||
unicode_fields=None,
|
||||
):
|
||||
return concat(
|
||||
[
|
||||
list(
|
||||
ToolboxData(fileid, enc).fields(
|
||||
strip, unwrap, encoding, errors, unicode_fields
|
||||
)
|
||||
)
|
||||
for (fileid, enc) in self.abspaths(fileids, include_encoding=True)
|
||||
]
|
||||
)
|
||||
|
||||
# should probably be done lazily:
|
||||
def entries(self, fileids, **kwargs):
|
||||
if "key" in kwargs:
|
||||
key = kwargs["key"]
|
||||
del kwargs["key"]
|
||||
else:
|
||||
key = "lx" # the default key in MDF
|
||||
entries = []
|
||||
for marker, contents in self.fields(fileids, **kwargs):
|
||||
if marker == key:
|
||||
entries.append((contents, []))
|
||||
else:
|
||||
try:
|
||||
entries[-1][-1].append((marker, contents))
|
||||
except IndexError:
|
||||
pass
|
||||
return entries
|
||||
|
||||
def words(self, fileids, key="lx"):
|
||||
return [contents for marker, contents in self.fields(fileids) if marker == key]
|
||||
|
def demo():
    # Placeholder: no demonstration is currently implemented for the
    # Toolbox corpus reader.
    pass


if __name__ == "__main__":
    demo()
||||
136
backend/venv/Lib/site-packages/nltk/corpus/reader/twitter.py
Normal file
136
backend/venv/Lib/site-packages/nltk/corpus/reader/twitter.py
Normal file
@@ -0,0 +1,136 @@
|
||||
# Natural Language Toolkit: Twitter Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A reader for corpora that consist of Tweets. It is assumed that the Tweets
|
||||
have been serialised into line-delimited JSON.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
from nltk.corpus.reader.api import CorpusReader
|
||||
from nltk.corpus.reader.util import StreamBackedCorpusView, ZipFilePathPointer, concat
|
||||
from nltk.tokenize import TweetTokenizer
|
||||
|
||||
|
||||
class TwitterCorpusReader(CorpusReader):
|
||||
r"""
|
||||
Reader for corpora that consist of Tweets represented as a list of line-delimited JSON.
|
||||
|
||||
Individual Tweets can be tokenized using the default tokenizer, or by a
|
||||
custom tokenizer specified as a parameter to the constructor.
|
||||
|
||||
Construct a new Tweet corpus reader for a set of documents
|
||||
located at the given root directory.
|
||||
|
||||
If you made your own tweet collection in a directory called
|
||||
`twitter-files`, then you can initialise the reader as::
|
||||
|
||||
from nltk.corpus import TwitterCorpusReader
|
||||
reader = TwitterCorpusReader(root='/path/to/twitter-files', '.*\.json')
|
||||
|
||||
However, the recommended approach is to set the relevant directory as the
|
||||
value of the environmental variable `TWITTER`, and then invoke the reader
|
||||
as follows::
|
||||
|
||||
root = os.environ['TWITTER']
|
||||
reader = TwitterCorpusReader(root, '.*\.json')
|
||||
|
||||
If you want to work directly with the raw Tweets, the `json` library can
|
||||
be used::
|
||||
|
||||
import json
|
||||
for tweet in reader.docs():
|
||||
print(json.dumps(tweet, indent=1, sort_keys=True))
|
||||
|
||||
"""
|
||||
|
||||
CorpusView = StreamBackedCorpusView
|
||||
"""
|
||||
The corpus view class used by this reader.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding="utf8"
|
||||
):
|
||||
"""
|
||||
:param root: The root directory for this corpus.
|
||||
:param fileids: A list or regexp specifying the fileids in this corpus.
|
||||
:param word_tokenizer: Tokenizer for breaking the text of Tweets into
|
||||
smaller units, including but not limited to words.
|
||||
"""
|
||||
CorpusReader.__init__(self, root, fileids, encoding)
|
||||
|
||||
for path in self.abspaths(self._fileids):
|
||||
if isinstance(path, ZipFilePathPointer):
|
||||
pass
|
||||
elif os.path.getsize(path) == 0:
|
||||
raise ValueError(f"File {path} is empty")
|
||||
"""Check that all user-created corpus files are non-empty."""
|
||||
|
||||
self._word_tokenizer = word_tokenizer
|
||||
|
||||
def docs(self, fileids=None):
|
||||
"""
|
||||
Returns the full Tweet objects, as specified by `Twitter
|
||||
documentation on Tweets
|
||||
<https://dev.twitter.com/docs/platform-objects/tweets>`_
|
||||
|
||||
:return: the given file(s) as a list of dictionaries deserialised
|
||||
from JSON.
|
||||
:rtype: list(dict)
|
||||
"""
|
||||
return concat(
|
||||
[
|
||||
self.CorpusView(path, self._read_tweets, encoding=enc)
|
||||
for (path, enc, fileid) in self.abspaths(fileids, True, True)
|
||||
]
|
||||
)
|
||||
|
||||
def strings(self, fileids=None):
|
||||
"""
|
||||
Returns only the text content of Tweets in the file(s)
|
||||
|
||||
:return: the given file(s) as a list of Tweets.
|
||||
:rtype: list(str)
|
||||
"""
|
||||
fulltweets = self.docs(fileids)
|
||||
tweets = []
|
||||
for jsono in fulltweets:
|
||||
try:
|
||||
text = jsono["text"]
|
||||
if isinstance(text, bytes):
|
||||
text = text.decode(self.encoding)
|
||||
tweets.append(text)
|
||||
except KeyError:
|
||||
pass
|
||||
return tweets
|
||||
|
||||
def tokenized(self, fileids=None):
|
||||
"""
|
||||
:return: the given file(s) as a list of the text content of Tweets as
|
||||
as a list of words, screenanames, hashtags, URLs and punctuation symbols.
|
||||
|
||||
:rtype: list(list(str))
|
||||
"""
|
||||
tweets = self.strings(fileids)
|
||||
tokenizer = self._word_tokenizer
|
||||
return [tokenizer.tokenize(t) for t in tweets]
|
||||
|
||||
def _read_tweets(self, stream):
|
||||
"""
|
||||
Assumes that each line in ``stream`` is a JSON-serialised object.
|
||||
"""
|
||||
tweets = []
|
||||
for i in range(10):
|
||||
line = stream.readline()
|
||||
if not line:
|
||||
return tweets
|
||||
tweet = json.loads(line)
|
||||
tweets.append(tweet)
|
||||
return tweets
|
||||
74
backend/venv/Lib/site-packages/nltk/corpus/reader/udhr.py
Normal file
74
backend/venv/Lib/site-packages/nltk/corpus/reader/udhr.py
Normal file
@@ -0,0 +1,74 @@
|
||||
"""
|
||||
UDHR corpus reader. It mostly deals with encodings.
|
||||
"""
|
||||
|
||||
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
|
||||
from nltk.corpus.reader.util import find_corpus_fileids
|
||||
|
||||
|
||||
class UdhrCorpusReader(PlaintextCorpusReader):
|
||||
ENCODINGS = [
|
||||
(".*-Latin1$", "latin-1"),
|
||||
(".*-Hebrew$", "hebrew"),
|
||||
(".*-Arabic$", "cp1256"),
|
||||
("Czech_Cesky-UTF8", "cp1250"), # yeah
|
||||
("Polish-Latin2", "cp1250"),
|
||||
("Polish_Polski-Latin2", "cp1250"),
|
||||
(".*-Cyrillic$", "cyrillic"),
|
||||
(".*-SJIS$", "SJIS"),
|
||||
(".*-GB2312$", "GB2312"),
|
||||
(".*-Latin2$", "ISO-8859-2"),
|
||||
(".*-Greek$", "greek"),
|
||||
(".*-UTF8$", "utf-8"),
|
||||
("Hungarian_Magyar-Unicode", "utf-16-le"),
|
||||
("Amahuaca", "latin1"),
|
||||
("Turkish_Turkce-Turkish", "latin5"),
|
||||
("Lithuanian_Lietuviskai-Baltic", "latin4"),
|
||||
("Japanese_Nihongo-EUC", "EUC-JP"),
|
||||
("Japanese_Nihongo-JIS", "iso2022_jp"),
|
||||
("Chinese_Mandarin-HZ", "hz"),
|
||||
(r"Abkhaz\-Cyrillic\+Abkh", "cp1251"),
|
||||
]
|
||||
|
||||
SKIP = {
|
||||
# The following files are not fully decodable because they
|
||||
# were truncated at wrong bytes:
|
||||
"Burmese_Myanmar-UTF8",
|
||||
"Japanese_Nihongo-JIS",
|
||||
"Chinese_Mandarin-HZ",
|
||||
"Chinese_Mandarin-UTF8",
|
||||
"Gujarati-UTF8",
|
||||
"Hungarian_Magyar-Unicode",
|
||||
"Lao-UTF8",
|
||||
"Magahi-UTF8",
|
||||
"Marathi-UTF8",
|
||||
"Tamil-UTF8",
|
||||
# Unfortunately, encodings required for reading
|
||||
# the following files are not supported by Python:
|
||||
"Vietnamese-VPS",
|
||||
"Vietnamese-VIQR",
|
||||
"Vietnamese-TCVN",
|
||||
"Magahi-Agra",
|
||||
"Bhojpuri-Agra",
|
||||
"Esperanto-T61", # latin3 raises an exception
|
||||
# The following files are encoded for specific fonts:
|
||||
"Burmese_Myanmar-WinResearcher",
|
||||
"Armenian-DallakHelv",
|
||||
"Tigrinya_Tigrigna-VG2Main",
|
||||
"Amharic-Afenegus6..60375", # ?
|
||||
"Navaho_Dine-Navajo-Navaho-font",
|
||||
# What are these?
|
||||
"Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117",
|
||||
"Azeri_Azerbaijani_Latin-Az.Times.Lat0117",
|
||||
# The following files are unintended:
|
||||
"Czech-Latin2-err",
|
||||
"Russian_Russky-UTF8~",
|
||||
}
|
||||
|
||||
def __init__(self, root="udhr"):
|
||||
fileids = find_corpus_fileids(root, r"(?!README|\.).*")
|
||||
super().__init__(
|
||||
root,
|
||||
[fileid for fileid in fileids if fileid not in self.SKIP],
|
||||
encoding=self.ENCODINGS,
|
||||
)
|
||||
780
backend/venv/Lib/site-packages/nltk/corpus/reader/util.py
Normal file
780
backend/venv/Lib/site-packages/nltk/corpus/reader/util.py
Normal file
@@ -0,0 +1,780 @@
|
||||
# Natural Language Toolkit: Corpus Reader Utilities
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import bisect
|
||||
import os
|
||||
import pickle
|
||||
import re
|
||||
import tempfile
|
||||
from functools import reduce
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from nltk.data import (
|
||||
FileSystemPathPointer,
|
||||
PathPointer,
|
||||
SeekableUnicodeStreamReader,
|
||||
ZipFilePathPointer,
|
||||
)
|
||||
from nltk.internals import slice_bounds
|
||||
from nltk.tokenize import wordpunct_tokenize
|
||||
from nltk.util import AbstractLazySequence, LazyConcatenation, LazySubsequence
|
||||
|
||||
######################################################################
|
||||
# { Corpus View
|
||||
######################################################################
|
||||
|
||||
|
||||
class StreamBackedCorpusView(AbstractLazySequence):
|
||||
"""
|
||||
A 'view' of a corpus file, which acts like a sequence of tokens:
|
||||
it can be accessed by index, iterated over, etc. However, the
|
||||
tokens are only constructed as-needed -- the entire corpus is
|
||||
never stored in memory at once.
|
||||
|
||||
The constructor to ``StreamBackedCorpusView`` takes two arguments:
|
||||
a corpus fileid (specified as a string or as a ``PathPointer``);
|
||||
and a block reader. A "block reader" is a function that reads
|
||||
zero or more tokens from a stream, and returns them as a list. A
|
||||
very simple example of a block reader is:
|
||||
|
||||
>>> def simple_block_reader(stream):
|
||||
... return stream.readline().split()
|
||||
|
||||
This simple block reader reads a single line at a time, and
|
||||
returns a single token (consisting of a string) for each
|
||||
whitespace-separated substring on the line.
|
||||
|
||||
When deciding how to define the block reader for a given
|
||||
corpus, careful consideration should be given to the size of
|
||||
blocks handled by the block reader. Smaller block sizes will
|
||||
increase the memory requirements of the corpus view's internal
|
||||
data structures (by 2 integers per block). On the other hand,
|
||||
larger block sizes may decrease performance for random access to
|
||||
the corpus. (But note that larger block sizes will *not*
|
||||
decrease performance for iteration.)
|
||||
|
||||
Internally, ``CorpusView`` maintains a partial mapping from token
|
||||
index to file position, with one entry per block. When a token
|
||||
with a given index *i* is requested, the ``CorpusView`` constructs
|
||||
it as follows:
|
||||
|
||||
1. First, it searches the toknum/filepos mapping for the token
|
||||
index closest to (but less than or equal to) *i*.
|
||||
|
||||
2. Then, starting at the file position corresponding to that
|
||||
index, it reads one block at a time using the block reader
|
||||
until it reaches the requested token.
|
||||
|
||||
The toknum/filepos mapping is created lazily: it is initially
|
||||
empty, but every time a new block is read, the block's
|
||||
initial token is added to the mapping. (Thus, the toknum/filepos
|
||||
map has one entry per block.)
|
||||
|
||||
In order to increase efficiency for random access patterns that
|
||||
have high degrees of locality, the corpus view may cache one or
|
||||
more blocks.
|
||||
|
||||
:note: Each ``CorpusView`` object internally maintains an open file
|
||||
object for its underlying corpus file. This file should be
|
||||
automatically closed when the ``CorpusView`` is garbage collected,
|
||||
but if you wish to close it manually, use the ``close()``
|
||||
method. If you access a ``CorpusView``'s items after it has been
|
||||
closed, the file object will be automatically re-opened.
|
||||
|
||||
:warning: If the contents of the file are modified during the
|
||||
lifetime of the ``CorpusView``, then the ``CorpusView``'s behavior
|
||||
is undefined.
|
||||
|
||||
:warning: If a unicode encoding is specified when constructing a
|
||||
``CorpusView``, then the block reader may only call
|
||||
``stream.seek()`` with offsets that have been returned by
|
||||
``stream.tell()``; in particular, calling ``stream.seek()`` with
|
||||
relative offsets, or with offsets based on string lengths, may
|
||||
lead to incorrect behavior.
|
||||
|
||||
:ivar _block_reader: The function used to read
|
||||
a single block from the underlying file stream.
|
||||
:ivar _toknum: A list containing the token index of each block
|
||||
that has been processed. In particular, ``_toknum[i]`` is the
|
||||
token index of the first token in block ``i``. Together
|
||||
with ``_filepos``, this forms a partial mapping between token
|
||||
indices and file positions.
|
||||
:ivar _filepos: A list containing the file position of each block
|
||||
that has been processed. In particular, ``_toknum[i]`` is the
|
||||
file position of the first character in block ``i``. Together
|
||||
with ``_toknum``, this forms a partial mapping between token
|
||||
indices and file positions.
|
||||
:ivar _stream: The stream used to access the underlying corpus file.
|
||||
:ivar _len: The total number of tokens in the corpus, if known;
|
||||
or None, if the number of tokens is not yet known.
|
||||
:ivar _eofpos: The character position of the last character in the
|
||||
file. This is calculated when the corpus view is initialized,
|
||||
and is used to decide when the end of file has been reached.
|
||||
:ivar _cache: A cache of the most recently read block. It
|
||||
is encoded as a tuple (start_toknum, end_toknum, tokens), where
|
||||
start_toknum is the token index of the first token in the block;
|
||||
end_toknum is the token index of the first token not in the
|
||||
block; and tokens is a list of the tokens in the block.
|
||||
"""
|
||||
|
||||
def __init__(self, fileid, block_reader=None, startpos=0, encoding="utf8"):
|
||||
"""
|
||||
Create a new corpus view, based on the file ``fileid``, and
|
||||
read with ``block_reader``. See the class documentation
|
||||
for more information.
|
||||
|
||||
:param fileid: The path to the file that is read by this
|
||||
corpus view. ``fileid`` can either be a string or a
|
||||
``PathPointer``.
|
||||
|
||||
:param startpos: The file position at which the view will
|
||||
start reading. This can be used to skip over preface
|
||||
sections.
|
||||
|
||||
:param encoding: The unicode encoding that should be used to
|
||||
read the file's contents. If no encoding is specified,
|
||||
then the file's contents will be read as a non-unicode
|
||||
string (i.e., a str).
|
||||
"""
|
||||
if block_reader:
|
||||
self.read_block = block_reader
|
||||
# Initialize our toknum/filepos mapping.
|
||||
self._toknum = [0]
|
||||
self._filepos = [startpos]
|
||||
self._encoding = encoding
|
||||
# We don't know our length (number of tokens) yet.
|
||||
self._len = None
|
||||
|
||||
self._fileid = fileid
|
||||
self._stream = None
|
||||
|
||||
self._current_toknum = None
|
||||
"""This variable is set to the index of the next token that
|
||||
will be read, immediately before ``self.read_block()`` is
|
||||
called. This is provided for the benefit of the block
|
||||
reader, which under rare circumstances may need to know
|
||||
the current token number."""
|
||||
|
||||
self._current_blocknum = None
|
||||
"""This variable is set to the index of the next block that
|
||||
will be read, immediately before ``self.read_block()`` is
|
||||
called. This is provided for the benefit of the block
|
||||
reader, which under rare circumstances may need to know
|
||||
the current block number."""
|
||||
|
||||
# Find the length of the file.
|
||||
try:
|
||||
if isinstance(self._fileid, PathPointer):
|
||||
self._eofpos = self._fileid.file_size()
|
||||
else:
|
||||
self._eofpos = os.stat(self._fileid).st_size
|
||||
except Exception as exc:
|
||||
raise ValueError(f"Unable to open or access {fileid!r} -- {exc}") from exc
|
||||
|
||||
# Maintain a cache of the most recently read block, to
|
||||
# increase efficiency of random access.
|
||||
self._cache = (-1, -1, None)
|
||||
|
||||
fileid = property(
|
||||
lambda self: self._fileid,
|
||||
doc="""
|
||||
The fileid of the file that is accessed by this view.
|
||||
|
||||
:type: str or PathPointer""",
|
||||
)
|
||||
|
||||
def read_block(self, stream):
|
||||
"""
|
||||
Read a block from the input stream.
|
||||
|
||||
:return: a block of tokens from the input stream
|
||||
:rtype: list(any)
|
||||
:param stream: an input stream
|
||||
:type stream: stream
|
||||
"""
|
||||
raise NotImplementedError("Abstract Method")
|
||||
|
||||
def _open(self):
|
||||
"""
|
||||
Open the file stream associated with this corpus view. This
|
||||
will be called performed if any value is read from the view
|
||||
while its file stream is closed.
|
||||
"""
|
||||
if isinstance(self._fileid, PathPointer):
|
||||
self._stream = self._fileid.open(self._encoding)
|
||||
elif self._encoding:
|
||||
self._stream = SeekableUnicodeStreamReader(
|
||||
open(self._fileid, "rb"), self._encoding
|
||||
)
|
||||
else:
|
||||
self._stream = open(self._fileid, "rb")
|
||||
|
||||
def close(self):
|
||||
"""
|
||||
Close the file stream associated with this corpus view. This
|
||||
can be useful if you are worried about running out of file
|
||||
handles (although the stream should automatically be closed
|
||||
upon garbage collection of the corpus view). If the corpus
|
||||
view is accessed after it is closed, it will be automatically
|
||||
re-opened.
|
||||
"""
|
||||
if self._stream is not None:
|
||||
self._stream.close()
|
||||
self._stream = None
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, type, value, traceback):
|
||||
self.close()
|
||||
|
||||
def __len__(self):
|
||||
if self._len is None:
|
||||
# iterate_from() sets self._len when it reaches the end
|
||||
# of the file:
|
||||
for tok in self.iterate_from(self._toknum[-1]):
|
||||
pass
|
||||
return self._len
|
||||
|
||||
def __getitem__(self, i):
|
||||
if isinstance(i, slice):
|
||||
start, stop = slice_bounds(self, i)
|
||||
# Check if it's in the cache.
|
||||
offset = self._cache[0]
|
||||
if offset <= start and stop <= self._cache[1]:
|
||||
return self._cache[2][start - offset : stop - offset]
|
||||
# Construct & return the result.
|
||||
return LazySubsequence(self, start, stop)
|
||||
else:
|
||||
# Handle negative indices
|
||||
if i < 0:
|
||||
i += len(self)
|
||||
if i < 0:
|
||||
raise IndexError("index out of range")
|
||||
# Check if it's in the cache.
|
||||
offset = self._cache[0]
|
||||
if offset <= i < self._cache[1]:
|
||||
return self._cache[2][i - offset]
|
||||
# Use iterate_from to extract it.
|
||||
try:
|
||||
return next(self.iterate_from(i))
|
||||
except StopIteration as e:
|
||||
raise IndexError("index out of range") from e
|
||||
|
||||
# If we wanted to be thread-safe, then this method would need to
|
||||
# do some locking.
|
||||
def iterate_from(self, start_tok):
|
||||
# Start by feeding from the cache, if possible.
|
||||
if self._cache[0] <= start_tok < self._cache[1]:
|
||||
for tok in self._cache[2][start_tok - self._cache[0] :]:
|
||||
yield tok
|
||||
start_tok += 1
|
||||
|
||||
# Decide where in the file we should start. If `start` is in
|
||||
# our mapping, then we can jump straight to the correct block;
|
||||
# otherwise, start at the last block we've processed.
|
||||
if start_tok < self._toknum[-1]:
|
||||
block_index = bisect.bisect_right(self._toknum, start_tok) - 1
|
||||
toknum = self._toknum[block_index]
|
||||
filepos = self._filepos[block_index]
|
||||
else:
|
||||
block_index = len(self._toknum) - 1
|
||||
toknum = self._toknum[-1]
|
||||
filepos = self._filepos[-1]
|
||||
|
||||
# Open the stream, if it's not open already.
|
||||
if self._stream is None:
|
||||
self._open()
|
||||
|
||||
# If the file is empty, the while loop will never run.
|
||||
# This *seems* to be all the state we need to set:
|
||||
if self._eofpos == 0:
|
||||
self._len = 0
|
||||
|
||||
# Each iteration through this loop, we read a single block
|
||||
# from the stream.
|
||||
while filepos < self._eofpos:
|
||||
# Read the next block.
|
||||
self._stream.seek(filepos)
|
||||
self._current_toknum = toknum
|
||||
self._current_blocknum = block_index
|
||||
tokens = self.read_block(self._stream)
|
||||
assert isinstance(tokens, (tuple, list, AbstractLazySequence)), (
|
||||
"block reader %s() should return list or tuple."
|
||||
% self.read_block.__name__
|
||||
)
|
||||
num_toks = len(tokens)
|
||||
new_filepos = self._stream.tell()
|
||||
assert (
|
||||
new_filepos > filepos
|
||||
), "block reader %s() should consume at least 1 byte (filepos=%d)" % (
|
||||
self.read_block.__name__,
|
||||
filepos,
|
||||
)
|
||||
|
||||
# Update our cache.
|
||||
self._cache = (toknum, toknum + num_toks, list(tokens))
|
||||
|
||||
# Update our mapping.
|
||||
assert toknum <= self._toknum[-1]
|
||||
if num_toks > 0:
|
||||
block_index += 1
|
||||
if toknum == self._toknum[-1]:
|
||||
assert new_filepos > self._filepos[-1] # monotonic!
|
||||
self._filepos.append(new_filepos)
|
||||
self._toknum.append(toknum + num_toks)
|
||||
else:
|
||||
# Check for consistency:
|
||||
assert (
|
||||
new_filepos == self._filepos[block_index]
|
||||
), "inconsistent block reader (num chars read)"
|
||||
assert (
|
||||
toknum + num_toks == self._toknum[block_index]
|
||||
), "inconsistent block reader (num tokens returned)"
|
||||
|
||||
# If we reached the end of the file, then update self._len
|
||||
if new_filepos == self._eofpos:
|
||||
self._len = toknum + num_toks
|
||||
# Generate the tokens in this block (but skip any tokens
|
||||
# before start_tok). Note that between yields, our state
|
||||
# may be modified.
|
||||
for tok in tokens[max(0, start_tok - toknum) :]:
|
||||
yield tok
|
||||
# If we're at the end of the file, then we're done.
|
||||
assert new_filepos <= self._eofpos
|
||||
if new_filepos == self._eofpos:
|
||||
break
|
||||
# Update our indices
|
||||
toknum += num_toks
|
||||
filepos = new_filepos
|
||||
|
||||
# If we reach this point, then we should know our length.
|
||||
assert self._len is not None
|
||||
# Enforce closing of stream once we reached end of file
|
||||
# We should have reached EOF once we're out of the while loop.
|
||||
self.close()
|
||||
|
||||
# Use concat for these, so we can use a ConcatenatedCorpusView
|
||||
# when possible.
|
||||
def __add__(self, other):
|
||||
return concat([self, other])
|
||||
|
||||
def __radd__(self, other):
|
||||
return concat([other, self])
|
||||
|
||||
def __mul__(self, count):
|
||||
return concat([self] * count)
|
||||
|
||||
def __rmul__(self, count):
|
||||
return concat([self] * count)
|
||||
|
||||
|
||||
class ConcatenatedCorpusView(AbstractLazySequence):
    """
    A 'view' of a corpus file that joins together one or more
    ``StreamBackedCorpusViews<StreamBackedCorpusView>``.  At most
    one file handle is left open at any time.
    """

    def __init__(self, corpus_views):
        self._pieces = corpus_views
        """A list of the corpus subviews that make up this
        concatenation."""

        self._offsets = [0]
        """A list of offsets, indicating the index at which each
        subview begins.  In particular::
            offsets[i] = sum([len(p) for p in pieces[:i]])"""

        self._open_piece = None
        """The most recently accessed corpus subview (or None).
        Before a new subview is accessed, this subview will be closed."""

    def __len__(self):
        # _offsets grows lazily as iterate_from() finishes each piece,
        # so the total length is only known once every piece has been
        # visited; drive iteration to the end on first demand.
        if len(self._offsets) <= len(self._pieces):
            # Iterate to the end of the corpus.
            for tok in self.iterate_from(self._offsets[-1]):
                pass

        return self._offsets[-1]

    def close(self):
        # Close the underlying stream of every subview.
        for piece in self._pieces:
            piece.close()

    def iterate_from(self, start_tok):
        # Locate the piece containing token index `start_tok`.  Only
        # offsets discovered so far are in _offsets, which is a prefix
        # of the full table, so bisect never overshoots.
        piecenum = bisect.bisect_right(self._offsets, start_tok) - 1

        while piecenum < len(self._pieces):
            offset = self._offsets[piecenum]
            piece = self._pieces[piecenum]

            # If we've got another piece open, close it first --
            # invariant: at most one file handle is open at a time.
            if self._open_piece is not piece:
                if self._open_piece is not None:
                    self._open_piece.close()
                self._open_piece = piece

            # Get everything we can from this piece.
            yield from piece.iterate_from(max(0, start_tok - offset))

            # Update the offset table now that this piece has been
            # fully iterated (so len(piece) is known).
            if piecenum + 1 == len(self._offsets):
                self._offsets.append(self._offsets[-1] + len(piece))

            # Move on to the next piece.
            piecenum += 1
|
||||
|
||||
|
||||
def concat(docs):
    """
    Concatenate together the contents of multiple documents from a
    single corpus, using an appropriate concatenation function.  This
    utility function is used by corpus readers when the user requests
    more than one document at a time.

    :param docs: A nonempty list of documents (typically all of the
        same type, since they come from a single corpus).
    :raise ValueError: If ``docs`` is empty, or if no concatenation
        method is known for the document types.
    """
    if len(docs) == 1:
        return docs[0]
    if len(docs) == 0:
        raise ValueError("concat() expects at least one object!")

    types = {d.__class__ for d in docs}

    # If they're all strings, use string concatenation.
    if all(isinstance(doc, str) for doc in docs):
        return "".join(docs)

    # If they're all corpus views, then use ConcatenatedCorpusView.
    for typ in types:
        if not issubclass(typ, (StreamBackedCorpusView, ConcatenatedCorpusView)):
            break
    else:
        return ConcatenatedCorpusView(docs)

    # If they're all lazy sequences, use a lazy concatenation
    for typ in types:
        if not issubclass(typ, AbstractLazySequence):
            break
    else:
        return LazyConcatenation(docs)

    # Otherwise, see what we can do:
    if len(types) == 1:
        typ = list(types)[0]

        if issubclass(typ, list):
            return reduce((lambda a, b: a + b), docs, [])

        if issubclass(typ, tuple):
            return reduce((lambda a, b: a + b), docs, ())

        # BUG FIX: the original tested ``ElementTree.iselement(typ)``,
        # but ``typ`` is a *class*, never an element instance, so the
        # XML branch could never be taken.  Test the class itself.
        if issubclass(typ, ElementTree.Element):
            xmltree = ElementTree.Element("documents")
            for doc in docs:
                xmltree.append(doc)
            return xmltree

    # No method found!
    raise ValueError("Don't know how to concatenate types: %r" % types)
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Block Readers
|
||||
######################################################################
|
||||
|
||||
|
||||
def read_whitespace_block(stream):
    """Read up to 20 lines from ``stream`` and return their
    whitespace-separated tokens as a single flat list."""
    return [
        token
        for _ in range(20)  # read 20 lines at a time
        for token in stream.readline().split()
    ]
|
||||
|
||||
|
||||
def read_wordpunct_block(stream):
    """Read up to 20 lines from ``stream`` and return their
    word/punctuation tokens (via ``wordpunct_tokenize``) as a single
    flat list."""
    return [
        token
        for _ in range(20)  # read 20 lines at a time
        for token in wordpunct_tokenize(stream.readline())
    ]
|
||||
|
||||
|
||||
def read_line_block(stream):
    """Read up to 20 lines from ``stream`` and return them with
    trailing newlines stripped; stop early at end of file."""
    collected = []
    for _ in range(20):
        line = stream.readline()
        if not line:  # end of file
            break
        collected.append(line.rstrip("\n"))
    return collected
|
||||
|
||||
|
||||
def read_blankline_block(stream):
    """Read one blank-line-delimited paragraph from ``stream``.

    Leading blank lines are skipped; the delimiting blank line is
    consumed.  Returns a one-element list containing the paragraph
    (newlines included), or [] at end of file.
    """
    para = ""
    while True:
        line = stream.readline()
        if not line:
            # End of file: return whatever paragraph was collected.
            return [para] if para else []
        if not line.strip():
            # Blank line: ends the paragraph, unless none has started.
            if para:
                return [para]
        else:
            # Content line: accumulate it.
            para += line
|
||||
|
||||
|
||||
def read_alignedsent_block(stream):
    """Read one aligned-sentence block from ``stream``.

    Lines beginning with '=' and blank lines are skipped.  Other lines
    are accumulated until an alignment line (one starting with
    ``\\d+-\\d+``) is seen, which terminates the block.

    :return: A one-element list containing the block, or [] at EOF.
    """
    s = ""
    while True:
        line = stream.readline()
        # End of file: must be tested *before* indexing line[0] -- the
        # original indexed first, so the empty string returned at EOF
        # raised IndexError instead of terminating cleanly.
        if not line:
            if s:
                return [s]
            else:
                return []
        # Separator ('=') and blank lines are skipped:
        if line[0] == "=" or line[0] == "\n" or line[:2] == "\r\n":
            continue
        # Other line:
        s += line
        # An alignment line ends the block.
        if re.match(r"^\d+-\d+", line) is not None:
            return [s]
|
||||
|
||||
|
||||
def read_regexp_block(stream, start_re, end_re=None):
    """
    Read a sequence of tokens from a stream, where tokens begin with
    lines that match ``start_re``.  If ``end_re`` is specified, then
    tokens end with lines that match ``end_re``; otherwise, tokens end
    whenever the next line matching ``start_re`` or EOF is found.
    """
    # Scan forward until a line matching the start regexp is found.
    while True:
        line = stream.readline()
        if not line:
            return []  # end of file.
        if re.match(start_re, line):
            break

    # Collect lines until the token ends (end_re match, next start_re
    # match, or EOF).
    collected = [line]
    while True:
        pos = stream.tell()
        line = stream.readline()
        if not line:
            # End of file.
            return ["".join(collected)]
        if end_re is not None and re.match(end_re, line):
            # End-of-token marker (consumed).
            return ["".join(collected)]
        if end_re is None and re.match(start_re, line):
            # Start of the next token: rewind so the next call sees it.
            stream.seek(pos)
            return ["".join(collected)]
        # Anything else is part of the token.
        collected.append(line)
|
||||
|
||||
|
||||
def read_sexpr_block(stream, block_size=16384, comment_char=None):
    """
    Read a sequence of s-expressions from the stream, and leave the
    stream's file position at the end the last complete s-expression
    read.  This function will always return at least one s-expression,
    unless there are no more s-expressions in the file.

    If the file ends in in the middle of an s-expression, then that
    incomplete s-expression is returned when the end of the file is
    reached.

    :param block_size: The default block size for reading.  If an
        s-expression is longer than one block, then more than one
        block will be read.
    :param comment_char: A character that marks comments.  Any lines
        that begin with this character will be stripped out.
        (If spaces or tabs precede the comment character, then the
        line will not be stripped.)
    """
    start = stream.tell()
    block = stream.read(block_size)
    encoding = getattr(stream, "encoding", None)
    assert encoding is not None or isinstance(block, str)
    if encoding not in (None, "utf-8"):
        import warnings

        warnings.warn(
            "Parsing may fail, depending on the properties "
            "of the %s encoding!" % encoding
        )
        # (e.g., the utf-16 encoding does not work because it insists
        # on adding BOMs to the beginning of encoded strings.)

    if comment_char:
        COMMENT = re.compile("(?m)^%s.*$" % re.escape(comment_char))
    while True:
        try:
            # If we're stripping comments, then make sure our block ends
            # on a line boundary; and then replace any comments with
            # space characters.  (We can't just strip them out -- that
            # would make our offset wrong.)
            if comment_char:
                block += stream.readline()
                block = re.sub(COMMENT, _sub_space, block)
            # Read the block.
            tokens, offset = _parse_sexpr_block(block)
            # Skip whitespace
            offset = re.compile(r"\s*").search(block, offset).end()

            # Move to the end position.  For encoded streams seek()
            # expects a byte offset, so re-encode the consumed prefix
            # to measure its byte length.
            if encoding is None:
                stream.seek(start + offset)
            else:
                stream.seek(start + len(block[:offset].encode(encoding)))

            # Return the list of tokens we processed
            return tokens
        except ValueError as e:
            # _parse_sexpr_block signals a truncated first sexpr with
            # this specific message; respond by reading more data.
            if e.args[0] == "Block too small":
                next_block = stream.read(block_size)
                if next_block:
                    block += next_block
                    continue
                else:
                    # The file ended mid-sexpr -- return what we got.
                    return [block.strip()]
            else:
                raise
|
||||
|
||||
|
||||
def _sub_space(m):
|
||||
"""Helper function: given a regexp match, return a string of
|
||||
spaces that's the same length as the matched string."""
|
||||
return " " * (m.end() - m.start())
|
||||
|
||||
|
||||
def _parse_sexpr_block(block):
    """Partition ``block`` into complete s-expression substrings.

    Returns ``(tokens, end)`` where ``tokens`` is the list of complete
    s-expression strings found and ``end`` is the character offset just
    past the last complete token.

    :raise ValueError: "Block too small" when the block ends in the
        middle of its *first* s-expression (the caller reads more data
        and retries).
    """
    tokens = []
    start = end = 0

    while end < len(block):
        # Find the start of the next token (first non-whitespace char).
        m = re.compile(r"\S").search(block, end)
        if not m:
            return tokens, end

        start = m.start()

        # Case 1: sexpr is not parenthesized.
        if m.group() != "(":
            # The token runs until the next whitespace or '('.
            m2 = re.compile(r"[\s(]").search(block, start)
            if m2:
                end = m2.start()
            else:
                # Token runs to the end of the block -- it may be
                # truncated; return what we have, or ask for more data.
                if tokens:
                    return tokens, end
                raise ValueError("Block too small")

        # Case 2: parenthesized sexpr.
        else:
            # Scan parens, tracking nesting depth until it returns to 0.
            nesting = 0
            for m in re.compile(r"[()]").finditer(block, start):
                if m.group() == "(":
                    nesting += 1
                else:
                    nesting -= 1
                if nesting == 0:
                    end = m.end()
                    break
            else:
                # Parens still open at the end of the block.
                if tokens:
                    return tokens, end
                raise ValueError("Block too small")

        tokens.append(block[start:end])

    return tokens, end
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Finding Corpus Items
|
||||
######################################################################
|
||||
|
||||
|
||||
def find_corpus_fileids(root, regexp):
    """Return the sorted list of file identifiers under ``root`` whose
    relative path fully matches ``regexp``.

    :param root: A ``PathPointer`` -- either a ``ZipFilePathPointer``
        or a ``FileSystemPathPointer``.
    :param regexp: A regular expression that must match the *entire*
        relative path (a ``$`` anchor is appended below).
    :raise TypeError: If ``root`` is not a ``PathPointer``.
    """
    if not isinstance(root, PathPointer):
        raise TypeError("find_corpus_fileids: expected a PathPointer")
    # Anchor the pattern so it must match the whole fileid.
    regexp += "$"

    # Find fileids in a zipfile: scan the zipfile's namelist.  Filter
    # out entries that end in '/' -- they're directories.
    if isinstance(root, ZipFilePathPointer):
        fileids = [
            name[len(root.entry) :]
            for name in root.zipfile.namelist()
            if not name.endswith("/")
        ]
        items = [name for name in fileids if re.match(regexp, name)]
        return sorted(items)

    # Find fileids in a directory: use os.walk to search all (proper
    # or symlinked) subdirectories, and match paths against the regexp.
    elif isinstance(root, FileSystemPathPointer):
        items = []
        for dirname, subdirs, fileids in os.walk(root.path):
            # Path of this directory relative to the corpus root,
            # rendered with '/' separators (fileids are '/'-delimited).
            prefix = "".join("%s/" % p for p in _path_from(root.path, dirname))
            items += [
                prefix + fileid
                for fileid in fileids
                if re.match(regexp, prefix + fileid)
            ]
            # Don't visit svn directories:
            if ".svn" in subdirs:
                subdirs.remove(".svn")
        return sorted(items)

    else:
        raise AssertionError("Don't know how to handle %r" % root)
|
||||
|
||||
|
||||
def _path_from(parent, child):
|
||||
if os.path.split(parent)[1] == "":
|
||||
parent = os.path.split(parent)[0]
|
||||
path = []
|
||||
while parent != child:
|
||||
child, dirname = os.path.split(child)
|
||||
path.insert(0, dirname)
|
||||
assert os.path.split(child)[0] != child
|
||||
return path
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Paragraph structure in Treebank files
|
||||
######################################################################
|
||||
|
||||
|
||||
def tagged_treebank_para_block_reader(stream):
    """Read one paragraph from a Treebank-style file, where paragraphs
    are delimited by lines of six or more '=' characters.  Returns a
    one-element list containing the paragraph, or [] at end of file."""
    para = ""
    while True:
        line = stream.readline()
        if re.match(r"======+\s*$", line):
            # Separator line: emit the paragraph, if one was collected.
            if para.strip():
                return [para]
        elif line == "":
            # End of file.
            return [para] if para.strip() else []
        else:
            # Content line.
            para += line
|
||||
629
backend/venv/Lib/site-packages/nltk/corpus/reader/verbnet.py
Normal file
629
backend/venv/Lib/site-packages/nltk/corpus/reader/verbnet.py
Normal file
@@ -0,0 +1,629 @@
|
||||
# Natural Language Toolkit: Verbnet Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
An NLTK interface to the VerbNet verb lexicon
|
||||
|
||||
For details about VerbNet see:
|
||||
https://verbs.colorado.edu/~mpalmer/projects/verbnet.html
|
||||
"""
|
||||
|
||||
import re
|
||||
import textwrap
|
||||
from collections import defaultdict
|
||||
|
||||
from nltk.corpus.reader.xmldocs import XMLCorpusReader
|
||||
|
||||
|
||||
class VerbnetCorpusReader(XMLCorpusReader):
|
||||
"""
|
||||
An NLTK interface to the VerbNet verb lexicon.
|
||||
|
||||
From the VerbNet site: "VerbNet (VN) (Kipper-Schuler 2006) is the largest
|
||||
on-line verb lexicon currently available for English. It is a hierarchical
|
||||
domain-independent, broad-coverage verb lexicon with mappings to other
|
||||
lexical resources such as WordNet (Miller, 1990; Fellbaum, 1998), XTAG
|
||||
(XTAG Research Group, 2001), and FrameNet (Baker et al., 1998)."
|
||||
|
||||
For details about VerbNet see:
|
||||
https://verbs.colorado.edu/~mpalmer/projects/verbnet.html
|
||||
"""
|
||||
|
||||
    # No unicode encoding param, since the data files are all XML.
    def __init__(self, root, fileids, wrap_etree=False):
        """Open the VerbNet corpus rooted at ``root`` and build the
        lemma/wordnet/class lookup indexes."""
        XMLCorpusReader.__init__(self, root, fileids, wrap_etree)

        self._lemma_to_class = defaultdict(list)
        """A dictionary mapping from verb lemma strings to lists of
        VerbNet class identifiers."""

        self._wordnet_to_class = defaultdict(list)
        """A dictionary mapping from wordnet identifier strings to
        lists of VerbNet class identifiers."""

        self._class_to_fileid = {}
        """A dictionary mapping from class identifiers to
        corresponding file identifiers.  The keys of this dictionary
        provide a complete list of all classes and subclasses."""

        # Maps short class ids (e.g. '9.1') to long ids ('put-9.1').
        self._shortid_to_longid = {}

        # Initialize the dictionaries.  Use the quick (regexp-based)
        # method instead of the slow (xml-based) method, because it
        # runs 2-30 times faster.
        self._quick_index()
|
||||
|
||||
    # Long id = "<lemma>-<number>", e.g. 'put-9.1'; group 1 is the
    # lemma, group 2 the numeric short id.
    _LONGID_RE = re.compile(r"([^\-\.]*)-([\d+.\-]+)$")
    """Regular expression that matches (and decomposes) longids"""

    # Short id = just the numeric part, e.g. '9.1'.
    _SHORTID_RE = re.compile(r"[\d+.\-]+$")
    """Regular expression that matches shortids"""

    # Matches either a <MEMBER> element (capturing name and wn ids)
    # or the opening of a <VNSUBCLASS> element (capturing its ID).
    _INDEX_RE = re.compile(
        r'<MEMBER name="\??([^"]+)" wn="([^"]*)"[^>]+>|' r'<VNSUBCLASS ID="([^"]+)"/?>'
    )
    """Regular expression used by ``_index()`` to quickly scan the corpus
    for basic information."""
|
||||
|
||||
def lemmas(self, vnclass=None):
|
||||
"""
|
||||
Return a list of all verb lemmas that appear in any class, or
|
||||
in the ``classid`` if specified.
|
||||
"""
|
||||
if vnclass is None:
|
||||
return sorted(self._lemma_to_class.keys())
|
||||
else:
|
||||
# [xx] should this include subclass members?
|
||||
if isinstance(vnclass, str):
|
||||
vnclass = self.vnclass(vnclass)
|
||||
return [member.get("name") for member in vnclass.findall("MEMBERS/MEMBER")]
|
||||
|
||||
def wordnetids(self, vnclass=None):
|
||||
"""
|
||||
Return a list of all wordnet identifiers that appear in any
|
||||
class, or in ``classid`` if specified.
|
||||
"""
|
||||
if vnclass is None:
|
||||
return sorted(self._wordnet_to_class.keys())
|
||||
else:
|
||||
# [xx] should this include subclass members?
|
||||
if isinstance(vnclass, str):
|
||||
vnclass = self.vnclass(vnclass)
|
||||
return sum(
|
||||
(
|
||||
member.get("wn", "").split()
|
||||
for member in vnclass.findall("MEMBERS/MEMBER")
|
||||
),
|
||||
[],
|
||||
)
|
||||
|
||||
def classids(self, lemma=None, wordnetid=None, fileid=None, classid=None):
|
||||
"""
|
||||
Return a list of the VerbNet class identifiers. If a file
|
||||
identifier is specified, then return only the VerbNet class
|
||||
identifiers for classes (and subclasses) defined by that file.
|
||||
If a lemma is specified, then return only VerbNet class
|
||||
identifiers for classes that contain that lemma as a member.
|
||||
If a wordnetid is specified, then return only identifiers for
|
||||
classes that contain that wordnetid as a member. If a classid
|
||||
is specified, then return only identifiers for subclasses of
|
||||
the specified VerbNet class.
|
||||
If nothing is specified, return all classids within VerbNet
|
||||
"""
|
||||
if fileid is not None:
|
||||
return [c for (c, f) in self._class_to_fileid.items() if f == fileid]
|
||||
elif lemma is not None:
|
||||
return self._lemma_to_class[lemma]
|
||||
elif wordnetid is not None:
|
||||
return self._wordnet_to_class[wordnetid]
|
||||
elif classid is not None:
|
||||
xmltree = self.vnclass(classid)
|
||||
return [
|
||||
subclass.get("ID")
|
||||
for subclass in xmltree.findall("SUBCLASSES/VNSUBCLASS")
|
||||
]
|
||||
else:
|
||||
return sorted(self._class_to_fileid.keys())
|
||||
|
||||
def vnclass(self, fileid_or_classid):
|
||||
"""Returns VerbNet class ElementTree
|
||||
|
||||
Return an ElementTree containing the xml for the specified
|
||||
VerbNet class.
|
||||
|
||||
:param fileid_or_classid: An identifier specifying which class
|
||||
should be returned. Can be a file identifier (such as
|
||||
``'put-9.1.xml'``), or a VerbNet class identifier (such as
|
||||
``'put-9.1'``) or a short VerbNet class identifier (such as
|
||||
``'9.1'``).
|
||||
"""
|
||||
# File identifier: just return the xml.
|
||||
if fileid_or_classid in self._fileids:
|
||||
return self.xml(fileid_or_classid)
|
||||
|
||||
# Class identifier: get the xml, and find the right elt.
|
||||
classid = self.longid(fileid_or_classid)
|
||||
if classid in self._class_to_fileid:
|
||||
fileid = self._class_to_fileid[self.longid(classid)]
|
||||
tree = self.xml(fileid)
|
||||
if classid == tree.get("ID"):
|
||||
return tree
|
||||
else:
|
||||
for subclass in tree.findall(".//VNSUBCLASS"):
|
||||
if classid == subclass.get("ID"):
|
||||
return subclass
|
||||
else:
|
||||
assert False # we saw it during _index()!
|
||||
|
||||
else:
|
||||
raise ValueError(f"Unknown identifier {fileid_or_classid}")
|
||||
|
||||
def fileids(self, vnclass_ids=None):
|
||||
"""
|
||||
Return a list of fileids that make up this corpus. If
|
||||
``vnclass_ids`` is specified, then return the fileids that make
|
||||
up the specified VerbNet class(es).
|
||||
"""
|
||||
if vnclass_ids is None:
|
||||
return self._fileids
|
||||
elif isinstance(vnclass_ids, str):
|
||||
return [self._class_to_fileid[self.longid(vnclass_ids)]]
|
||||
else:
|
||||
return [
|
||||
self._class_to_fileid[self.longid(vnclass_id)]
|
||||
for vnclass_id in vnclass_ids
|
||||
]
|
||||
|
||||
def frames(self, vnclass):
|
||||
"""Given a VerbNet class, this method returns VerbNet frames
|
||||
|
||||
The members returned are:
|
||||
1) Example
|
||||
2) Description
|
||||
3) Syntax
|
||||
4) Semantics
|
||||
|
||||
:param vnclass: A VerbNet class identifier; or an ElementTree
|
||||
containing the xml contents of a VerbNet class.
|
||||
:return: frames - a list of frame dictionaries
|
||||
"""
|
||||
if isinstance(vnclass, str):
|
||||
vnclass = self.vnclass(vnclass)
|
||||
frames = []
|
||||
vnframes = vnclass.findall("FRAMES/FRAME")
|
||||
for vnframe in vnframes:
|
||||
frames.append(
|
||||
{
|
||||
"example": self._get_example_within_frame(vnframe),
|
||||
"description": self._get_description_within_frame(vnframe),
|
||||
"syntax": self._get_syntactic_list_within_frame(vnframe),
|
||||
"semantics": self._get_semantics_within_frame(vnframe),
|
||||
}
|
||||
)
|
||||
return frames
|
||||
|
||||
def subclasses(self, vnclass):
|
||||
"""Returns subclass ids, if any exist
|
||||
|
||||
Given a VerbNet class, this method returns subclass ids (if they exist)
|
||||
in a list of strings.
|
||||
|
||||
:param vnclass: A VerbNet class identifier; or an ElementTree
|
||||
containing the xml contents of a VerbNet class.
|
||||
:return: list of subclasses
|
||||
"""
|
||||
if isinstance(vnclass, str):
|
||||
vnclass = self.vnclass(vnclass)
|
||||
|
||||
subclasses = [
|
||||
subclass.get("ID") for subclass in vnclass.findall("SUBCLASSES/VNSUBCLASS")
|
||||
]
|
||||
return subclasses
|
||||
|
||||
def themroles(self, vnclass):
|
||||
"""Returns thematic roles participating in a VerbNet class
|
||||
|
||||
Members returned as part of roles are-
|
||||
1) Type
|
||||
2) Modifiers
|
||||
|
||||
:param vnclass: A VerbNet class identifier; or an ElementTree
|
||||
containing the xml contents of a VerbNet class.
|
||||
:return: themroles: A list of thematic roles in the VerbNet class
|
||||
"""
|
||||
if isinstance(vnclass, str):
|
||||
vnclass = self.vnclass(vnclass)
|
||||
|
||||
themroles = []
|
||||
for trole in vnclass.findall("THEMROLES/THEMROLE"):
|
||||
themroles.append(
|
||||
{
|
||||
"type": trole.get("type"),
|
||||
"modifiers": [
|
||||
{"value": restr.get("Value"), "type": restr.get("type")}
|
||||
for restr in trole.findall("SELRESTRS/SELRESTR")
|
||||
],
|
||||
}
|
||||
)
|
||||
return themroles
|
||||
|
||||
######################################################################
|
||||
# { Index Initialization
|
||||
######################################################################
|
||||
|
||||
    def _index(self):
        """
        Initialize the indexes ``_lemma_to_class``,
        ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning
        through the corpus fileids.  This is fast if ElementTree
        uses the C implementation (<0.1 secs), but quite slow (>10 secs)
        if only the python implementation is available.
        """
        # Full XML parse of every file; ``_quick_index()`` is the
        # faster regexp-based alternative used by the constructor.
        for fileid in self._fileids:
            self._index_helper(self.xml(fileid), fileid)
|
||||
|
||||
    def _index_helper(self, xmltree, fileid):
        """Helper for ``_index()``: index one class element (and,
        recursively, all of its subclasses) against ``fileid``."""
        vnclass = xmltree.get("ID")
        self._class_to_fileid[vnclass] = fileid
        self._shortid_to_longid[self.shortid(vnclass)] = vnclass
        # Record every member verb under this class id.
        for member in xmltree.findall("MEMBERS/MEMBER"):
            self._lemma_to_class[member.get("name")].append(vnclass)
            for wn in member.get("wn", "").split():
                self._wordnet_to_class[wn].append(vnclass)
        # Subclasses live in the same file as their parent class.
        for subclass in xmltree.findall("SUBCLASSES/VNSUBCLASS"):
            self._index_helper(subclass, fileid)
|
||||
|
||||
    def _quick_index(self):
        """
        Initialize the indexes ``_lemma_to_class``,
        ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning
        through the corpus fileids.  This doesn't do proper xml parsing,
        but is good enough to find everything in the standard VerbNet
        corpus -- and it runs about 30 times faster than xml parsing
        (with the python ElementTree; only 2-3 times faster
        if ElementTree uses the C implementation).
        """
        # nb: if we got rid of wordnet_to_class, this would run 2-3
        # times faster.
        for fileid in self._fileids:
            vnclass = fileid[:-4]  # strip the '.xml'
            self._class_to_fileid[vnclass] = fileid
            self._shortid_to_longid[self.shortid(vnclass)] = vnclass
            with self.open(fileid) as fp:
                # _INDEX_RE matches either a <MEMBER> (groups 0-1) or a
                # <VNSUBCLASS> opening tag (group 2).
                for m in self._INDEX_RE.finditer(fp.read()):
                    groups = m.groups()
                    if groups[0] is not None:
                        # <MEMBER name=... wn=...>: index the lemma and
                        # its wordnet ids under the *current* class.
                        self._lemma_to_class[groups[0]].append(vnclass)
                        for wn in groups[1].split():
                            self._wordnet_to_class[wn].append(vnclass)
                    elif groups[2] is not None:
                        # <VNSUBCLASS ID=...>: subsequent members
                        # belong to this subclass.
                        self._class_to_fileid[groups[2]] = fileid
                        vnclass = groups[2]  # for <MEMBER> elts.
                        self._shortid_to_longid[self.shortid(vnclass)] = vnclass
                    else:
                        assert False, "unexpected match condition"
|
||||
|
||||
######################################################################
|
||||
# { Identifier conversion
|
||||
######################################################################
|
||||
|
||||
def longid(self, shortid):
|
||||
"""Returns longid of a VerbNet class
|
||||
|
||||
Given a short VerbNet class identifier (eg '37.10'), map it
|
||||
to a long id (eg 'confess-37.10'). If ``shortid`` is already a
|
||||
long id, then return it as-is"""
|
||||
if self._LONGID_RE.match(shortid):
|
||||
return shortid # it's already a longid.
|
||||
elif not self._SHORTID_RE.match(shortid):
|
||||
raise ValueError("vnclass identifier %r not found" % shortid)
|
||||
try:
|
||||
return self._shortid_to_longid[shortid]
|
||||
except KeyError as e:
|
||||
raise ValueError("vnclass identifier %r not found" % shortid) from e
|
||||
|
||||
def shortid(self, longid):
|
||||
"""Returns shortid of a VerbNet class
|
||||
|
||||
Given a long VerbNet class identifier (eg 'confess-37.10'),
|
||||
map it to a short id (eg '37.10'). If ``longid`` is already a
|
||||
short id, then return it as-is."""
|
||||
if self._SHORTID_RE.match(longid):
|
||||
return longid # it's already a shortid.
|
||||
m = self._LONGID_RE.match(longid)
|
||||
if m:
|
||||
return m.group(2)
|
||||
else:
|
||||
raise ValueError("vnclass identifier %r not found" % longid)
|
||||
|
||||
######################################################################
|
||||
# { Frame access utility functions
|
||||
######################################################################
|
||||
|
||||
def _get_semantics_within_frame(self, vnframe):
|
||||
"""Returns semantics within a single frame
|
||||
|
||||
A utility function to retrieve semantics within a frame in VerbNet
|
||||
Members of the semantics dictionary:
|
||||
1) Predicate value
|
||||
2) Arguments
|
||||
|
||||
:param vnframe: An ElementTree containing the xml contents of
|
||||
a VerbNet frame.
|
||||
:return: semantics: semantics dictionary
|
||||
"""
|
||||
semantics_within_single_frame = []
|
||||
for pred in vnframe.findall("SEMANTICS/PRED"):
|
||||
arguments = [
|
||||
{"type": arg.get("type"), "value": arg.get("value")}
|
||||
for arg in pred.findall("ARGS/ARG")
|
||||
]
|
||||
semantics_within_single_frame.append(
|
||||
{
|
||||
"predicate_value": pred.get("value"),
|
||||
"arguments": arguments,
|
||||
"negated": pred.get("bool") == "!",
|
||||
}
|
||||
)
|
||||
return semantics_within_single_frame
|
||||
|
||||
def _get_example_within_frame(self, vnframe):
|
||||
"""Returns example within a frame
|
||||
|
||||
A utility function to retrieve an example within a frame in VerbNet.
|
||||
|
||||
:param vnframe: An ElementTree containing the xml contents of
|
||||
a VerbNet frame.
|
||||
:return: example_text: The example sentence for this particular frame
|
||||
"""
|
||||
example_element = vnframe.find("EXAMPLES/EXAMPLE")
|
||||
if example_element is not None:
|
||||
example_text = example_element.text
|
||||
else:
|
||||
example_text = ""
|
||||
return example_text
|
||||
|
||||
def _get_description_within_frame(self, vnframe):
|
||||
"""Returns member description within frame
|
||||
|
||||
A utility function to retrieve a description of participating members
|
||||
within a frame in VerbNet.
|
||||
|
||||
:param vnframe: An ElementTree containing the xml contents of
|
||||
a VerbNet frame.
|
||||
:return: description: a description dictionary with members - primary and secondary
|
||||
"""
|
||||
description_element = vnframe.find("DESCRIPTION")
|
||||
return {
|
||||
"primary": description_element.attrib["primary"],
|
||||
"secondary": description_element.get("secondary", ""),
|
||||
}
|
||||
|
||||
def _get_syntactic_list_within_frame(self, vnframe):
|
||||
"""Returns semantics within a frame
|
||||
|
||||
A utility function to retrieve semantics within a frame in VerbNet.
|
||||
Members of the syntactic dictionary:
|
||||
1) POS Tag
|
||||
2) Modifiers
|
||||
|
||||
:param vnframe: An ElementTree containing the xml contents of
|
||||
a VerbNet frame.
|
||||
:return: syntax_within_single_frame
|
||||
"""
|
||||
syntax_within_single_frame = []
|
||||
for elt in vnframe.find("SYNTAX"):
|
||||
pos_tag = elt.tag
|
||||
modifiers = dict()
|
||||
modifiers["value"] = elt.get("value") if "value" in elt.attrib else ""
|
||||
modifiers["selrestrs"] = [
|
||||
{"value": restr.get("Value"), "type": restr.get("type")}
|
||||
for restr in elt.findall("SELRESTRS/SELRESTR")
|
||||
]
|
||||
modifiers["synrestrs"] = [
|
||||
{"value": restr.get("Value"), "type": restr.get("type")}
|
||||
for restr in elt.findall("SYNRESTRS/SYNRESTR")
|
||||
]
|
||||
syntax_within_single_frame.append(
|
||||
{"pos_tag": pos_tag, "modifiers": modifiers}
|
||||
)
|
||||
return syntax_within_single_frame
|
||||
|
||||
######################################################################
|
||||
# { Pretty Printing
|
||||
######################################################################
|
||||
|
||||
def pprint(self, vnclass):
|
||||
"""Returns pretty printed version of a VerbNet class
|
||||
|
||||
Return a string containing a pretty-printed representation of
|
||||
the given VerbNet class.
|
||||
|
||||
:param vnclass: A VerbNet class identifier; or an ElementTree
|
||||
containing the xml contents of a VerbNet class.
|
||||
"""
|
||||
if isinstance(vnclass, str):
|
||||
vnclass = self.vnclass(vnclass)
|
||||
|
||||
s = vnclass.get("ID") + "\n"
|
||||
s += self.pprint_subclasses(vnclass, indent=" ") + "\n"
|
||||
s += self.pprint_members(vnclass, indent=" ") + "\n"
|
||||
s += " Thematic roles:\n"
|
||||
s += self.pprint_themroles(vnclass, indent=" ") + "\n"
|
||||
s += " Frames:\n"
|
||||
s += self.pprint_frames(vnclass, indent=" ")
|
||||
return s
|
||||
|
||||
def pprint_subclasses(self, vnclass, indent=""):
|
||||
"""Returns pretty printed version of subclasses of VerbNet class
|
||||
|
||||
Return a string containing a pretty-printed representation of
|
||||
the given VerbNet class's subclasses.
|
||||
|
||||
:param vnclass: A VerbNet class identifier; or an ElementTree
|
||||
containing the xml contents of a VerbNet class.
|
||||
"""
|
||||
if isinstance(vnclass, str):
|
||||
vnclass = self.vnclass(vnclass)
|
||||
|
||||
subclasses = self.subclasses(vnclass)
|
||||
if not subclasses:
|
||||
subclasses = ["(none)"]
|
||||
s = "Subclasses: " + " ".join(subclasses)
|
||||
return textwrap.fill(
|
||||
s, 70, initial_indent=indent, subsequent_indent=indent + " "
|
||||
)
|
||||
|
||||
def pprint_members(self, vnclass, indent=""):
|
||||
"""Returns pretty printed version of members in a VerbNet class
|
||||
|
||||
Return a string containing a pretty-printed representation of
|
||||
the given VerbNet class's member verbs.
|
||||
|
||||
:param vnclass: A VerbNet class identifier; or an ElementTree
|
||||
containing the xml contents of a VerbNet class.
|
||||
"""
|
||||
if isinstance(vnclass, str):
|
||||
vnclass = self.vnclass(vnclass)
|
||||
|
||||
members = self.lemmas(vnclass)
|
||||
if not members:
|
||||
members = ["(none)"]
|
||||
s = "Members: " + " ".join(members)
|
||||
return textwrap.fill(
|
||||
s, 70, initial_indent=indent, subsequent_indent=indent + " "
|
||||
)
|
||||
|
||||
def pprint_themroles(self, vnclass, indent=""):
    """Return a pretty-printed listing of a VerbNet class's thematic roles.

    Each role is rendered on its own line as ``* Type[+mod -mod ...]``,
    with the modifier bracket omitted when the role has no modifiers.

    :param vnclass: A VerbNet class identifier; or an ElementTree
        containing the xml contents of a VerbNet class.
    :param indent: String prepended to every output line.
    """
    if isinstance(vnclass, str):
        vnclass = self.vnclass(vnclass)

    lines = []
    for role in self.themroles(vnclass):
        entry = indent + "* " + role.get("type")
        mods = [m["value"] + m["type"] for m in role["modifiers"]]
        if mods:
            entry += "[{}]".format(" ".join(mods))
        lines.append(entry)
    return "\n".join(lines)
|
||||
|
||||
def pprint_frames(self, vnclass, indent=""):
    """Return a pretty-printed listing of all frames in a VerbNet class.

    :param vnclass: A VerbNet class identifier; or an ElementTree
        containing the xml contents of a VerbNet class.
    :param indent: String prepended to every output line.
    """
    if isinstance(vnclass, str):
        vnclass = self.vnclass(vnclass)
    return "\n".join(
        self._pprint_single_frame(frame, indent) for frame in self.frames(vnclass)
    )
|
||||
|
||||
def _pprint_single_frame(self, vnframe, indent=""):
    """Return a pretty-printed representation of a single VerbNet frame.

    The output stacks the frame's description, example, syntax, and
    semantics sections, each rendered by its dedicated helper.

    :param vnframe: An ElementTree containing the xml contents of
        a VerbNet frame.
    :param indent: String prepended to every output line.
    """
    sections = [
        self._pprint_description_within_frame(vnframe, indent),
        self._pprint_example_within_frame(vnframe, indent + " "),
        self._pprint_syntax_within_frame(vnframe, indent + "  Syntax: "),
        indent + "  Semantics:",
        self._pprint_semantics_within_frame(vnframe, indent + "    "),
    ]
    return "\n".join(sections)
|
||||
|
||||
def _pprint_example_within_frame(self, vnframe, indent=""):
|
||||
"""Returns pretty printed version of example within frame in a VerbNet class
|
||||
|
||||
Return a string containing a pretty-printed representation of
|
||||
the given VerbNet frame example.
|
||||
|
||||
:param vnframe: An ElementTree containing the xml contents of
|
||||
a Verbnet frame.
|
||||
"""
|
||||
if vnframe["example"]:
|
||||
return indent + " Example: " + vnframe["example"]
|
||||
|
||||
def _pprint_description_within_frame(self, vnframe, indent=""):
|
||||
"""Returns pretty printed version of a VerbNet frame description
|
||||
|
||||
Return a string containing a pretty-printed representation of
|
||||
the given VerbNet frame description.
|
||||
|
||||
:param vnframe: An ElementTree containing the xml contents of
|
||||
a VerbNet frame.
|
||||
"""
|
||||
description = indent + vnframe["description"]["primary"]
|
||||
if vnframe["description"]["secondary"]:
|
||||
description += " ({})".format(vnframe["description"]["secondary"])
|
||||
return description
|
||||
|
||||
def _pprint_syntax_within_frame(self, vnframe, indent=""):
|
||||
"""Returns pretty printed version of syntax within a frame in a VerbNet class
|
||||
|
||||
Return a string containing a pretty-printed representation of
|
||||
the given VerbNet frame syntax.
|
||||
|
||||
:param vnframe: An ElementTree containing the xml contents of
|
||||
a VerbNet frame.
|
||||
"""
|
||||
pieces = []
|
||||
for element in vnframe["syntax"]:
|
||||
piece = element["pos_tag"]
|
||||
modifier_list = []
|
||||
if "value" in element["modifiers"] and element["modifiers"]["value"]:
|
||||
modifier_list.append(element["modifiers"]["value"])
|
||||
modifier_list += [
|
||||
"{}{}".format(restr["value"], restr["type"])
|
||||
for restr in (
|
||||
element["modifiers"]["selrestrs"]
|
||||
+ element["modifiers"]["synrestrs"]
|
||||
)
|
||||
]
|
||||
if modifier_list:
|
||||
piece += "[{}]".format(" ".join(modifier_list))
|
||||
pieces.append(piece)
|
||||
|
||||
return indent + " ".join(pieces)
|
||||
|
||||
def _pprint_semantics_within_frame(self, vnframe, indent=""):
|
||||
"""Returns a pretty printed version of semantics within frame in a VerbNet class
|
||||
|
||||
Return a string containing a pretty-printed representation of
|
||||
the given VerbNet frame semantics.
|
||||
|
||||
:param vnframe: An ElementTree containing the xml contents of
|
||||
a VerbNet frame.
|
||||
"""
|
||||
pieces = []
|
||||
for predicate in vnframe["semantics"]:
|
||||
arguments = [argument["value"] for argument in predicate["arguments"]]
|
||||
pieces.append(
|
||||
f"{'¬' if predicate['negated'] else ''}{predicate['predicate_value']}({', '.join(arguments)})"
|
||||
)
|
||||
return "\n".join(f"{indent}* {piece}" for piece in pieces)
|
||||
166
backend/venv/Lib/site-packages/nltk/corpus/reader/wordlist.py
Normal file
166
backend/venv/Lib/site-packages/nltk/corpus/reader/wordlist.py
Normal file
@@ -0,0 +1,166 @@
|
||||
# Natural Language Toolkit: Word List Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.tokenize import line_tokenize
|
||||
|
||||
|
||||
class WordListCorpusReader(CorpusReader):
    """
    Reader for corpora that consist of one word per line.  Blank lines
    are ignored.
    """

    def words(self, fileids=None, ignore_lines_startswith="\n"):
        """Return the corpus contents as a flat list of words, skipping
        any line that starts with *ignore_lines_startswith*."""
        selected = []
        for line in line_tokenize(self.raw(fileids)):
            if not line.startswith(ignore_lines_startswith):
                selected.append(line)
        return selected
|
||||
|
||||
|
||||
class SwadeshCorpusReader(WordListCorpusReader):
    def entries(self, fileids=None):
        """
        Return aligned entries across the given word lists.

        :return: a tuple of words for the specified fileids.
        """
        # No fileids given: align across every list in the corpus.
        fileids = fileids or self.fileids()
        columns = (self.words(f) for f in fileids)
        return list(zip(*columns))
|
||||
|
||||
|
||||
class NonbreakingPrefixesCorpusReader(WordListCorpusReader):
    """
    Reader for the nonbreaking-prefix text files that ship with the Moses
    Machine Translation toolkit.  These lists are used in the Python port
    of the Moses word tokenizer.
    """

    available_langs = {
        "catalan": "ca",
        "czech": "cs",
        "german": "de",
        "greek": "el",
        "english": "en",
        "spanish": "es",
        "finnish": "fi",
        "french": "fr",
        "hungarian": "hu",
        "icelandic": "is",
        "italian": "it",
        "latvian": "lv",
        "dutch": "nl",
        "polish": "pl",
        "portuguese": "pt",
        "romanian": "ro",
        "russian": "ru",
        "slovak": "sk",
        "slovenian": "sl",
        "swedish": "sv",
        "tamil": "ta",
    }
    # Also, add the lang IDs as the keys.
    available_langs.update({v: v for v in available_langs.values()})

    def words(self, lang=None, fileids=None, ignore_lines_startswith="#"):
        """
        Return the list of nonbreaking prefixes for the specified
        language(s).

        >>> from nltk.corpus import nonbreaking_prefixes as nbp
        >>> nbp.words('en')[:10] == [u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H', u'I', u'J']
        True
        >>> nbp.words('ta')[:5] == [u'\u0b85', u'\u0b86', u'\u0b87', u'\u0b88', u'\u0b89']
        True

        :return: a list words for the specified language(s).
        """
        # A recognized language (name or ID) selects exactly one fileid;
        # otherwise fall back to *fileids* (all languages when None).
        if lang in self.available_langs:
            lang = self.available_langs[lang]
            fileids = ["nonbreaking_prefix." + lang]
        selected = []
        for line in line_tokenize(self.raw(fileids)):
            if not line.startswith(ignore_lines_startswith):
                selected.append(line)
        return selected
|
||||
|
||||
|
||||
class UnicharsCorpusReader(WordListCorpusReader):
    """
    Reader for lists of characters from the Perl Unicode Properties
    (see https://perldoc.perl.org/perluniprops.html).
    The files in the perluniprop.zip are extracted using the Unicode::Tussle
    module from https://search.cpan.org/~bdfoy/Unicode-Tussle-1.11/lib/Unicode/Tussle.pm
    """

    # Categories mirroring the Perl Unicode Properties.
    available_categories = [
        "Close_Punctuation",
        "Currency_Symbol",
        "IsAlnum",
        "IsAlpha",
        "IsLower",
        "IsN",
        "IsSc",
        "IsSo",
        "IsUpper",
        "Line_Separator",
        "Number",
        "Open_Punctuation",
        "Punctuation",
        "Separator",
        "Symbol",
    ]

    def chars(self, category=None, fileids=None):
        """
        Return a list of characters from the Perl Unicode Properties.
        They are very useful when porting Perl tokenizers to Python.

        >>> from nltk.corpus import perluniprops as pup
        >>> pup.chars('Open_Punctuation')[:5] == [u'(', u'[', u'{', u'\u0f3a', u'\u0f3c']
        True
        >>> pup.chars('Currency_Symbol')[:5] == [u'$', u'\xa2', u'\xa3', u'\xa4', u'\xa5']
        True
        >>> pup.available_categories
        ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc', 'IsSo', 'IsUpper', 'Line_Separator', 'Number', 'Open_Punctuation', 'Punctuation', 'Separator', 'Symbol']

        :return: a list of characters given the specific unicode character category
        """
        # A recognized category selects its single ".txt" file.
        if category in self.available_categories:
            fileids = [f"{category}.txt"]
        raw_text = self.raw(fileids)
        return list(raw_text.strip())
|
||||
|
||||
|
||||
class MWAPPDBCorpusReader(WordListCorpusReader):
    """
    Reader for the list of word pairs from the subset of lexical
    pairs of The Paraphrase Database (PPDB) XXXL used in the Monolingual Word
    Alignment (MWA) algorithm described in Sultan et al. (2014a, 2014b, 2015):

    - http://acl2014.org/acl2014/Q14/pdf/Q14-1017
    - https://www.aclweb.org/anthology/S14-2039
    - https://www.aclweb.org/anthology/S15-2027

    The original source of the full PPDB corpus can be found on
    https://www.cis.upenn.edu/~ccb/ppdb/

    :return: a list of tuples of similar lexical terms.
    """

    mwa_ppdb_xxxl_file = "ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs"

    def entries(self, fileids=mwa_ppdb_xxxl_file):
        """
        :return: a tuple of synonym word pairs.
        """
        pairs = []
        for line in line_tokenize(self.raw(fileids)):
            pairs.append(tuple(line.split("\t")))
        return pairs
|
||||
2558
backend/venv/Lib/site-packages/nltk/corpus/reader/wordnet.py
Normal file
2558
backend/venv/Lib/site-packages/nltk/corpus/reader/wordnet.py
Normal file
File diff suppressed because it is too large
Load Diff
397
backend/venv/Lib/site-packages/nltk/corpus/reader/xmldocs.py
Normal file
397
backend/venv/Lib/site-packages/nltk/corpus/reader/xmldocs.py
Normal file
@@ -0,0 +1,397 @@
|
||||
# Natural Language Toolkit: XML Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Corpus reader for corpora whose documents are xml files.
|
||||
|
||||
(note -- not named 'xml' to avoid conflicting w/ standard xml package)
|
||||
"""
|
||||
|
||||
import codecs
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from nltk.corpus.reader.api import CorpusReader
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.data import SeekableUnicodeStreamReader
|
||||
from nltk.internals import ElementWrapper
|
||||
from nltk.tokenize import WordPunctTokenizer
|
||||
|
||||
|
||||
class XMLCorpusReader(CorpusReader):
    """
    Corpus reader for corpora whose documents are xml files.

    Note that the ``XMLCorpusReader`` constructor does not take an
    ``encoding`` argument, because the unicode encoding is specified by
    the XML files themselves.  See the XML specs for more info.
    """

    def __init__(self, root, fileids, wrap_etree=False):
        # When wrap_etree is true, xml() wraps the parsed root element in
        # an ElementWrapper before returning it.
        self._wrap_etree = wrap_etree
        CorpusReader.__init__(self, root, fileids)

    def xml(self, fileid=None):
        """
        Parse the XML document stored in *fileid* and return its root
        element (wrapped in an ``ElementWrapper`` if requested at
        construction time).

        :raises TypeError: if *fileid* does not resolve to exactly one file.
        """
        # Make sure we have exactly one file -- no concatenating XML.
        if fileid is None and len(self._fileids) == 1:
            fileid = self._fileids[0]
        if not isinstance(fileid, str):
            raise TypeError("Expected a single file identifier string")
        # Read the XML in using ElementTree.
        with self.abspath(fileid).open() as fp:
            elt = ElementTree.parse(fp).getroot()
        # If requested, wrap it.
        if self._wrap_etree:
            elt = ElementWrapper(elt)
        # Return the ElementTree element.
        return elt

    def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- ie, tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        """
        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        word_tokenizer = WordPunctTokenizer()
        # ``Element.getiterator`` was removed in Python 3.9; fall back to
        # ``Element.iter``.  (Bug fix: the original used a bare ``except:``,
        # which would also have masked unrelated errors raised while
        # iterating.)
        try:
            iterator = elt.getiterator()
        except AttributeError:
            iterator = elt.iter()
        out = []

        for node in iterator:
            text = node.text
            if text is not None:
                # Decode byte text nodes with the file's declared encoding.
                if isinstance(text, bytes):
                    text = text.decode(encoding)
                toks = word_tokenizer.tokenize(text)
                out.extend(toks)
        return out
|
||||
|
||||
|
||||
class XMLCorpusView(StreamBackedCorpusView):
    """
    A corpus view that selects out specified elements from an XML
    file, and provides a flat list-like interface for accessing them.
    (Note: ``XMLCorpusView`` is not used by ``XMLCorpusReader`` itself,
    but may be used by subclasses of ``XMLCorpusReader``.)

    Every XML corpus view has a "tag specification", indicating what
    XML elements should be included in the view; and each (non-nested)
    element that matches this specification corresponds to one item in
    the view.  Tag specifications are regular expressions over tag
    paths, where a tag path is a list of element tag names, separated
    by '/', indicating the ancestry of the element.  Some examples:

    - ``'foo'``: A top-level element whose tag is ``foo``.
    - ``'foo/bar'``: An element whose tag is ``bar`` and whose parent
      is a top-level element whose tag is ``foo``.
    - ``'.*/foo'``: An element whose tag is ``foo``, appearing anywhere
      in the xml tree.
    - ``'.*/(foo|bar)'``: An element whose tag is ``foo`` or ``bar``,
      appearing anywhere in the xml tree.

    The view items are generated from the selected XML elements via
    the method ``handle_elt()``.  By default, this method returns the
    element as-is (i.e., as an ElementTree object); but it can be
    overridden, either via subclassing or via the ``elt_handler``
    constructor parameter.
    """

    #: If true, then display debugging output to stdout when reading
    #: blocks.
    _DEBUG = False

    #: The number of characters read at a time by this corpus reader.
    _BLOCK_SIZE = 1024

    def __init__(self, fileid, tagspec, elt_handler=None):
        """
        Create a new corpus view based on a specified XML file.

        Note that the ``XMLCorpusView`` constructor does not take an
        ``encoding`` argument, because the unicode encoding is
        specified by the XML files themselves.

        :type tagspec: str
        :param tagspec: A tag specification, indicating what XML
            elements should be included in the view.  Each non-nested
            element that matches this specification corresponds to one
            item in the view.

        :param elt_handler: A function used to transform each element
            to a value for the view.  If no handler is specified, then
            ``self.handle_elt()`` is called, which returns the element
            as an ElementTree object.  The signature of elt_handler is::

                elt_handler(elt, tagspec) -> value
        """
        # Shadow the bound method with the caller-supplied handler, if any.
        if elt_handler:
            self.handle_elt = elt_handler

        # Anchor the spec at the end so it must match the whole tag path.
        self._tagspec = re.compile(tagspec + r"\Z")
        """The tag specification for this corpus view."""

        self._tag_context = {0: ()}
        """A dictionary mapping from file positions (as returned by
        ``stream.seek()`` to XML contexts.  An XML context is a
        tuple of XML tag names, indicating which tags have not yet
        been closed."""

        encoding = self._detect_encoding(fileid)
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)

    def _detect_encoding(self, fileid):
        # Peek at the first raw line only: both a BOM and the XML
        # declaration's encoding attribute must appear there.
        if isinstance(fileid, PathPointer):
            try:
                infile = fileid.open()
                s = infile.readline()
            finally:
                infile.close()
        else:
            with open(fileid, "rb") as infile:
                s = infile.readline()
        # Byte-order marks take precedence over the declared encoding.
        if s.startswith(codecs.BOM_UTF16_BE):
            return "utf-16-be"
        if s.startswith(codecs.BOM_UTF16_LE):
            return "utf-16-le"
        if s.startswith(codecs.BOM_UTF32_BE):
            return "utf-32-be"
        if s.startswith(codecs.BOM_UTF32_LE):
            return "utf-32-le"
        if s.startswith(codecs.BOM_UTF8):
            return "utf-8"
        # Fall back to the encoding declared in <?xml ... encoding="..."?>.
        m = re.match(rb'\s*<\?xml\b.*\bencoding="([^"]+)"', s)
        if m:
            return m.group(1).decode()
        m = re.match(rb"\s*<\?xml\b.*\bencoding='([^']+)'", s)
        if m:
            return m.group(1).decode()
        # No encoding found -- what should the default be?
        return "utf-8"

    def handle_elt(self, elt, context):
        """
        Convert an element into an appropriate value for inclusion in
        the view.  Unless overridden by a subclass or by the
        ``elt_handler`` constructor argument, this method simply
        returns ``elt``.

        :return: The view value corresponding to ``elt``.

        :type elt: ElementTree
        :param elt: The element that should be converted.

        :type context: str
        :param context: A string composed of element tags separated by
            forward slashes, indicating the XML context of the given
            element.  For example, the string ``'foo/bar/baz'``
            indicates that the element is a ``baz`` element whose
            parent is a ``bar`` element and whose grandparent is a
            top-level ``foo`` element.
        """
        return elt

    #: A regular expression that matches XML fragments that do not
    #: contain any un-closed tags.
    #: NOTE(review): ``<![CDATA[.*?]]`` contains an *unescaped* character
    #: class ``[CDATA[.*?]``, so this alternative matches more than literal
    #: CDATA sections -- confirm whether that is intentional.
    _VALID_XML_RE = re.compile(
        r"""
        [^<]*
        (
          ((<!--.*?-->)                         |  # comment
           (<![CDATA[.*?]])                     |  # raw character data
           (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) |  # doctype decl
           (<[^!>][^>]*>))                         # tag or PI
          [^<]*)*
        \Z""",
        re.DOTALL | re.VERBOSE,
    )

    #: A regular expression used to extract the tag name from a start tag,
    #: end tag, or empty-elt tag string.
    _XML_TAG_NAME = re.compile(r"<\s*(?:/\s*)?([^\s>]+)")

    #: A regular expression used to find all start-tags, end-tags, and
    #: empty-elt tags in an XML file.  This regexp is more lenient than
    #: the XML spec -- e.g., it allows spaces in some places where the
    #: spec does not.
    _XML_PIECE = re.compile(
        r"""
        # Include these so we can skip them:
        (?P<COMMENT>        <!--.*?--> )|
        (?P<CDATA>          <![CDATA[.*?]]> )|
        (?P<PI>             <\?.*?\?> )|
        (?P<DOCTYPE>        <!DOCTYPE\s+[^\[^>]*(\[[^\]]*])?\s*>)|
        # These are the ones we actually care about:
        (?P<EMPTY_ELT_TAG>  <\s*[^>/\?!\s][^>]*/\s*> )|
        (?P<START_TAG>      <\s*[^>/\?!\s][^>]*> )|
        (?P<END_TAG>        <\s*/[^>/\?!\s][^>]*> )""",
        re.DOTALL | re.VERBOSE,
    )

    def _read_xml_fragment(self, stream):
        """
        Read a string from the given stream that does not contain any
        un-closed tags.  In particular, this function first reads a
        block from the stream of size ``self._BLOCK_SIZE``.  It then
        checks if that block contains an un-closed tag.  If it does,
        then this function either backtracks to the last '<', or reads
        another block.
        """
        fragment = ""

        # Remember where we started so we can char_seek_forward later;
        # plain streams are repositioned with a relative seek instead.
        if isinstance(stream, SeekableUnicodeStreamReader):
            startpos = stream.tell()
        while True:
            # Read a block and add it to the fragment.
            xml_block = stream.read(self._BLOCK_SIZE)
            fragment += xml_block

            # Do we have a well-formed xml fragment?
            if self._VALID_XML_RE.match(fragment):
                return fragment

            # Do we have a fragment that will never be well-formed?
            # (A '>' appearing before any '<' cannot be part of a tag.)
            if re.search("[<>]", fragment).group(0) == ">":
                pos = stream.tell() - (
                    len(fragment) - re.search("[<>]", fragment).end()
                )
                raise ValueError('Unexpected ">" near char %s' % pos)

            # End of file?
            if not xml_block:
                raise ValueError("Unexpected end of file: tag not closed")

            # If not, then we must be in the middle of a <..tag..>.
            # If appropriate, backtrack to the most recent '<'
            # character.
            last_open_bracket = fragment.rfind("<")
            if last_open_bracket > 0:
                if self._VALID_XML_RE.match(fragment[:last_open_bracket]):
                    if isinstance(stream, SeekableUnicodeStreamReader):
                        stream.seek(startpos)
                        stream.char_seek_forward(last_open_bracket)
                    else:
                        stream.seek(-(len(fragment) - last_open_bracket), 1)
                    return fragment[:last_open_bracket]

            # Otherwise, read another block. (i.e., return to the
            # top of the loop.)

    def read_block(self, stream, tagspec=None, elt_handler=None):
        """
        Read from ``stream`` until we find at least one element that
        matches ``tagspec``, and return the result of applying
        ``elt_handler`` to each element found.
        """
        if tagspec is None:
            tagspec = self._tagspec
        if elt_handler is None:
            elt_handler = self.handle_elt

        # Use a stack of strings to keep track of our context:
        context = list(self._tag_context.get(stream.tell()))
        assert context is not None  # check this -- could it ever happen?

        elts = []

        elt_start = None  # where does the elt start
        elt_depth = None  # what context depth
        elt_text = ""

        # Keep reading until we have at least one complete element and are
        # not in the middle of another.
        while elts == [] or elt_start is not None:
            if isinstance(stream, SeekableUnicodeStreamReader):
                startpos = stream.tell()
            xml_fragment = self._read_xml_fragment(stream)

            # End of file.
            if not xml_fragment:
                if elt_start is None:
                    break
                else:
                    raise ValueError("Unexpected end of file")

            # Process each <tag> in the xml fragment.
            for piece in self._XML_PIECE.finditer(xml_fragment):
                if self._DEBUG:
                    print("{:>25} {}".format("/".join(context)[-20:], piece.group()))

                if piece.group("START_TAG"):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    # Keep context up-to-date.
                    context.append(name)
                    # Is this one of the elts we're looking for?
                    if elt_start is None:
                        if re.match(tagspec, "/".join(context)):
                            elt_start = piece.start()
                            elt_depth = len(context)

                elif piece.group("END_TAG"):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    # sanity checks:
                    if not context:
                        raise ValueError("Unmatched tag </%s>" % name)
                    if name != context[-1]:
                        raise ValueError(f"Unmatched tag <{context[-1]}>...</{name}>")
                    # Is this the end of an element?
                    if elt_start is not None and elt_depth == len(context):
                        elt_text += xml_fragment[elt_start : piece.end()]
                        elts.append((elt_text, "/".join(context)))
                        elt_start = elt_depth = None
                        elt_text = ""
                    # Keep context up-to-date
                    context.pop()

                elif piece.group("EMPTY_ELT_TAG"):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    if elt_start is None:
                        if re.match(tagspec, "/".join(context) + "/" + name):
                            elts.append((piece.group(), "/".join(context) + "/" + name))

            if elt_start is not None:
                # If we haven't found any elements yet, then keep
                # looping until we do.
                if elts == []:
                    elt_text += xml_fragment[elt_start:]
                    elt_start = 0

                # If we've found at least one element, then try
                # backtracking to the start of the element that we're
                # inside of.
                else:
                    # take back the last start-tag, and return what
                    # we've gotten so far (elts is non-empty).
                    if self._DEBUG:
                        print(" " * 36 + "(backtrack)")
                    if isinstance(stream, SeekableUnicodeStreamReader):
                        stream.seek(startpos)
                        stream.char_seek_forward(elt_start)
                    else:
                        stream.seek(-(len(xml_fragment) - elt_start), 1)
                    context = context[: elt_depth - 1]
                    elt_start = elt_depth = None
                    elt_text = ""

        # Update the _tag_context dict.
        pos = stream.tell()
        if pos in self._tag_context:
            assert tuple(context) == self._tag_context[pos]
        else:
            self._tag_context[pos] = tuple(context)

        # Parse each collected element text and hand it to the handler.
        # Non-ASCII characters are escaped as character references so the
        # text can be parsed as ASCII bytes.
        return [
            elt_handler(
                ElementTree.fromstring(elt.encode("ascii", "xmlcharrefreplace")),
                context,
            )
            for (elt, context) in elts
        ]
|
||||
256
backend/venv/Lib/site-packages/nltk/corpus/reader/ycoe.py
Normal file
256
backend/venv/Lib/site-packages/nltk/corpus/reader/ycoe.py
Normal file
@@ -0,0 +1,256 @@
|
||||
# Natural Language Toolkit: York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE)
|
||||
#
|
||||
# Copyright (C) 2001-2015 NLTK Project
|
||||
# Author: Selina Dennis <selina@tranzfusion.net>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
|
||||
English Prose (YCOE), a 1.5 million word syntactically-annotated
|
||||
corpus of Old English prose texts. The corpus is distributed by the
|
||||
Oxford Text Archive: http://www.ota.ahds.ac.uk/ It is not included
|
||||
with NLTK.
|
||||
|
||||
The YCOE corpus is divided into 100 files, each representing
|
||||
an Old English prose text. Tags used within each text complies
|
||||
to the YCOE standard: https://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
|
||||
from nltk.corpus.reader.tagged import TaggedCorpusReader
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.tokenize import RegexpTokenizer
|
||||
|
||||
|
||||
class YCOECorpusReader(CorpusReader):
    """
    Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
    English Prose (YCOE), a 1.5 million word syntactically-annotated
    corpus of Old English prose texts.
    """

    def __init__(self, root, encoding="utf8"):
        # Initialize with an empty fileid list first, so that ``self.root``
        # is usable while the two sub-readers are constructed below.
        CorpusReader.__init__(self, root, [], encoding)

        self._psd_reader = YCOEParseCorpusReader(
            self.root.join("psd"), ".*", ".psd", encoding=encoding
        )
        self._pos_reader = YCOETaggedCorpusReader(self.root.join("pos"), ".*", ".pos")

        # Make sure we have a consistent set of items:
        documents = {f[:-4] for f in self._psd_reader.fileids()}
        if {f[:-4] for f in self._pos_reader.fileids()} != documents:
            raise ValueError('Items in "psd" and "pos" ' "subdirectories do not match.")

        fileids = sorted(
            ["%s.psd" % doc for doc in documents]
            + ["%s.pos" % doc for doc in documents]
        )
        # Re-initialize with the real fileid list now that it is known.
        CorpusReader.__init__(self, root, fileids, encoding)
        self._documents = sorted(documents)

    def documents(self, fileids=None):
        """
        Return a list of document identifiers for all documents in
        this corpus, or for the documents with the given file(s) if
        specified.

        :raises KeyError: if any given fileid is not part of this corpus.
        """
        if fileids is None:
            return self._documents
        if isinstance(fileids, str):
            fileids = [fileids]
        for f in fileids:
            if f not in self._fileids:
                # Bug fix: the original interpolated the whole ``fileids``
                # list here instead of the offending id ``f``.
                raise KeyError("File id %s not found" % f)
        # Strip off the '.pos' and '.psd' extensions.
        return sorted({f[:-4] for f in fileids})

    def fileids(self, documents=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that store the given document(s) if specified.
        """
        if documents is None:
            return self._fileids
        elif isinstance(documents, str):
            documents = [documents]
        # Each document is stored as one '.pos' and one '.psd' file.
        return sorted(
            set(
                ["%s.pos" % doc for doc in documents]
                + ["%s.psd" % doc for doc in documents]
            )
        )

    def _getfileids(self, documents, subcorpus):
        """
        Helper that selects the appropriate fileids for a given set of
        documents from a given subcorpus (pos or psd).
        """
        if documents is None:
            documents = self._documents
        else:
            if isinstance(documents, str):
                documents = [documents]
            for document in documents:
                if document not in self._documents:
                    if document[-4:] in (".pos", ".psd"):
                        raise ValueError(
                            "Expected a document identifier, not a file "
                            "identifier.  (Use corpus.documents() to get "
                            "a list of document identifiers."
                        )
                    else:
                        raise ValueError("Document identifier %s not found" % document)
        return [f"{d}.{subcorpus}" for d in documents]

    # Delegate to one of our two sub-readers:
    def words(self, documents=None):
        return self._pos_reader.words(self._getfileids(documents, "pos"))

    def sents(self, documents=None):
        return self._pos_reader.sents(self._getfileids(documents, "pos"))

    def paras(self, documents=None):
        return self._pos_reader.paras(self._getfileids(documents, "pos"))

    def tagged_words(self, documents=None):
        return self._pos_reader.tagged_words(self._getfileids(documents, "pos"))

    def tagged_sents(self, documents=None):
        return self._pos_reader.tagged_sents(self._getfileids(documents, "pos"))

    def tagged_paras(self, documents=None):
        return self._pos_reader.tagged_paras(self._getfileids(documents, "pos"))

    def parsed_sents(self, documents=None):
        return self._psd_reader.parsed_sents(self._getfileids(documents, "psd"))
|
||||
|
||||
|
||||
class YCOEParseCorpusReader(BracketParseCorpusReader):
    """Specialized version of the standard bracket parse corpus reader
    that strips out (CODE ...) and (ID ...) nodes."""

    def _parse(self, t):
        # Drop (CODE ...) and (ID ...) annotation nodes before parsing.
        stripped = re.sub(r"(?u)\((CODE|ID)[^\)]*\)", "", t)
        # A tree reduced to an empty pair of parentheses yields no parse.
        if re.match(r"\s*\(\s*\)\s*$", stripped):
            return None
        return BracketParseCorpusReader._parse(self, stripped)
|
||||
|
||||
|
||||
class YCOETaggedCorpusReader(TaggedCorpusReader):
    """Tagged corpus reader for YCOE .pos files.

    Tokens use ``_`` as the word/tag separator.  The sentence tokenizer
    splits on whitespace that follows a ``/.`` sequence and discards
    ``*_CODE`` and ``*_ID`` tokens (corpus metadata, not text).

    :param root: corpus root directory
    :param items: file identifiers to read
    :param encoding: file encoding (defaults to ``"utf8"``)
    """

    def __init__(self, root, items, encoding="utf8"):
        # Gap pattern: a sentence break after "/.", or any *_CODE / *_ID
        # metadata token swallowed together with surrounding whitespace.
        # NOTE(review): the lookbehind expects "/." although sep is "_" --
        # confirm against the actual corpus token format.
        gaps_re = r"(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*"
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        # Bug fix: `encoding` was previously accepted but never forwarded,
        # so a non-default encoding was silently ignored.  Forwarding it is
        # backward-compatible (the base default is also "utf8").
        TaggedCorpusReader.__init__(
            self,
            root,
            items,
            sep="_",
            sent_tokenizer=sent_tokenizer,
            encoding=encoding,
        )
|
||||
|
||||
|
||||
#: A list of all documents and their titles in ycoe.
#: Keys are YCOE document identifiers (with manuscript-period suffixes such
#: as ``.o3``); values are human-readable titles.
#: Bug fix: "coeluc2" previously repeated the title of "coeluc1"
#: ("Elucidarium 1"); it is the second part, "Elucidarium 2".
documents = {
    "coadrian.o34": "Adrian and Ritheus",
    "coaelhom.o3": "Ælfric, Supplemental Homilies",
    "coaelive.o3": "Ælfric's Lives of Saints",
    "coalcuin": "Alcuin De virtutibus et vitiis",
    "coalex.o23": "Alexander's Letter to Aristotle",
    "coapollo.o3": "Apollonius of Tyre",
    "coaugust": "Augustine",
    "cobede.o2": "Bede's History of the English Church",
    "cobenrul.o3": "Benedictine Rule",
    "coblick.o23": "Blickling Homilies",
    "coboeth.o2": "Boethius' Consolation of Philosophy",
    "cobyrhtf.o3": "Byrhtferth's Manual",
    "cocanedgD": "Canons of Edgar (D)",
    "cocanedgX": "Canons of Edgar (X)",
    "cocathom1.o3": "Ælfric's Catholic Homilies I",
    "cocathom2.o3": "Ælfric's Catholic Homilies II",
    "cochad.o24": "Saint Chad",
    "cochdrul": "Chrodegang of Metz, Rule",
    "cochristoph": "Saint Christopher",
    "cochronA.o23": "Anglo-Saxon Chronicle A",
    "cochronC": "Anglo-Saxon Chronicle C",
    "cochronD": "Anglo-Saxon Chronicle D",
    "cochronE.o34": "Anglo-Saxon Chronicle E",
    "cocura.o2": "Cura Pastoralis",
    "cocuraC": "Cura Pastoralis (Cotton)",
    "codicts.o34": "Dicts of Cato",
    "codocu1.o1": "Documents 1 (O1)",
    "codocu2.o12": "Documents 2 (O1/O2)",
    "codocu2.o2": "Documents 2 (O2)",
    "codocu3.o23": "Documents 3 (O2/O3)",
    "codocu3.o3": "Documents 3 (O3)",
    "codocu4.o24": "Documents 4 (O2/O4)",
    "coeluc1": "Honorius of Autun, Elucidarium 1",
    "coeluc2": "Honorius of Autun, Elucidarium 2",
    "coepigen.o3": "Ælfric's Epilogue to Genesis",
    "coeuphr": "Saint Euphrosyne",
    "coeust": "Saint Eustace and his companions",
    "coexodusP": "Exodus (P)",
    "cogenesiC": "Genesis (C)",
    "cogregdC.o24": "Gregory's Dialogues (C)",
    "cogregdH.o23": "Gregory's Dialogues (H)",
    "coherbar": "Pseudo-Apuleius, Herbarium",
    "coinspolD.o34": "Wulfstan's Institute of Polity (D)",
    "coinspolX": "Wulfstan's Institute of Polity (X)",
    "cojames": "Saint James",
    "colacnu.o23": "Lacnunga",
    "colaece.o2": "Leechdoms",
    "colaw1cn.o3": "Laws, Cnut I",
    "colaw2cn.o3": "Laws, Cnut II",
    "colaw5atr.o3": "Laws, Æthelred V",
    "colaw6atr.o3": "Laws, Æthelred VI",
    "colawaf.o2": "Laws, Alfred",
    "colawafint.o2": "Alfred's Introduction to Laws",
    "colawger.o34": "Laws, Gerefa",
    "colawine.ox2": "Laws, Ine",
    "colawnorthu.o3": "Northumbra Preosta Lagu",
    "colawwllad.o4": "Laws, William I, Lad",
    "coleofri.o4": "Leofric",
    "colsigef.o3": "Ælfric's Letter to Sigefyrth",
    "colsigewB": "Ælfric's Letter to Sigeweard (B)",
    "colsigewZ.o34": "Ælfric's Letter to Sigeweard (Z)",
    "colwgeat": "Ælfric's Letter to Wulfgeat",
    "colwsigeT": "Ælfric's Letter to Wulfsige (T)",
    "colwsigeXa.o34": "Ælfric's Letter to Wulfsige (Xa)",
    "colwstan1.o3": "Ælfric's Letter to Wulfstan I",
    "colwstan2.o3": "Ælfric's Letter to Wulfstan II",
    "comargaC.o34": "Saint Margaret (C)",
    "comargaT": "Saint Margaret (T)",
    "comart1": "Martyrology, I",
    "comart2": "Martyrology, II",
    "comart3.o23": "Martyrology, III",
    "comarvel.o23": "Marvels of the East",
    "comary": "Mary of Egypt",
    "coneot": "Saint Neot",
    "conicodA": "Gospel of Nicodemus (A)",
    "conicodC": "Gospel of Nicodemus (C)",
    "conicodD": "Gospel of Nicodemus (D)",
    "conicodE": "Gospel of Nicodemus (E)",
    "coorosiu.o2": "Orosius",
    "cootest.o3": "Heptateuch",
    "coprefcath1.o3": "Ælfric's Preface to Catholic Homilies I",
    "coprefcath2.o3": "Ælfric's Preface to Catholic Homilies II",
    "coprefcura.o2": "Preface to the Cura Pastoralis",
    "coprefgen.o3": "Ælfric's Preface to Genesis",
    "copreflives.o3": "Ælfric's Preface to Lives of Saints",
    "coprefsolilo": "Preface to Augustine's Soliloquies",
    "coquadru.o23": "Pseudo-Apuleius, Medicina de quadrupedibus",
    "corood": "History of the Holy Rood-Tree",
    "cosevensl": "Seven Sleepers",
    "cosolilo": "St. Augustine's Soliloquies",
    "cosolsat1.o4": "Solomon and Saturn I",
    "cosolsat2": "Solomon and Saturn II",
    "cotempo.o3": "Ælfric's De Temporibus Anni",
    "coverhom": "Vercelli Homilies",
    "coverhomE": "Vercelli Homilies (E)",
    "coverhomL": "Vercelli Homilies (L)",
    "covinceB": "Saint Vincent (Bodley 343)",
    "covinsal": "Vindicta Salvatoris",
    "cowsgosp.o3": "West-Saxon Gospels",
    "cowulf.o34": "Wulfstan's Homilies",
}
|
||||
153
backend/venv/Lib/site-packages/nltk/corpus/util.py
Normal file
153
backend/venv/Lib/site-packages/nltk/corpus/util.py
Normal file
@@ -0,0 +1,153 @@
|
||||
# Natural Language Toolkit: Corpus Reader Utility Functions
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
######################################################################
|
||||
# { Lazy Corpus Loader
|
||||
######################################################################
|
||||
|
||||
import gc
|
||||
import re
|
||||
|
||||
import nltk
|
||||
|
||||
# Lookup-order switch for LazyCorpusLoader.__load(): when True, the zipped
# copy of a corpus is tried before the unzipped directory; when False
# (default), the directory is tried first and the zip is the fallback.
TRY_ZIPFILE_FIRST = False
|
||||
|
||||
|
||||
class LazyCorpusLoader:
    """
    To see the API documentation for this lazily loaded corpus, first
    run corpus.ensure_loaded(), and then run help(this_corpus).

    LazyCorpusLoader is a proxy object which is used to stand in for a
    corpus object before the corpus is loaded.  This allows NLTK to
    create an object for each corpus, but defer the costs associated
    with loading those corpora until the first time that they're
    actually accessed.

    The first time this object is accessed in any way, it will load
    the corresponding corpus, and transform itself into that corpus
    (by modifying its own ``__class__`` and ``__dict__`` attributes).

    If the corpus can not be found, then accessing this object will
    raise an exception, displaying installation instructions for the
    NLTK data package.  Once they've properly installed the data
    package (or modified ``nltk.data.path`` to point to its location),
    they can then use the corpus object without restarting python.

    :param name: The name of the corpus
    :type name: str
    :param reader_cls: The specific CorpusReader class, e.g. PlaintextCorpusReader, WordListCorpusReader
    :type reader: nltk.corpus.reader.api.CorpusReader
    :param nltk_data_subdir: The subdirectory where the corpus is stored.
    :type nltk_data_subdir: str
    :param `*args`: Any other non-keywords arguments that `reader_cls` might need.
    :param `**kwargs`: Any other keywords arguments that `reader_cls` might need.
    """

    def __init__(self, name, reader_cls, *args, **kwargs):
        """Record the construction arguments; no corpus data is read yet."""
        from nltk.corpus.reader.api import CorpusReader

        assert issubclass(reader_cls, CorpusReader)
        # Double-underscore attributes are name-mangled, which keeps them
        # from colliding with attributes of the corpus this proxy becomes.
        self.__name = self.__name__ = name
        self.__reader_cls = reader_cls
        # If nltk_data_subdir is set explicitly
        if "nltk_data_subdir" in kwargs:
            # Use the specified subdirectory path
            self.subdir = kwargs["nltk_data_subdir"]
            # Pops the `nltk_data_subdir` argument, we don't need it anymore.
            kwargs.pop("nltk_data_subdir", None)
        else:  # Otherwise use 'nltk_data/corpora'
            self.subdir = "corpora"
        # Remaining positional/keyword args are forwarded to reader_cls
        # verbatim when the corpus is eventually loaded.
        self.__args = args
        self.__kwargs = kwargs

    def __load(self):
        """Locate the corpus, build the reader, and morph this proxy into it.

        Tries the unzipped directory and the zipped archive in the order
        dictated by ``TRY_ZIPFILE_FIRST``; if both lookups fail, re-raises
        the error from the first (preferred) lookup.
        """
        # Find the corpus root directory.
        # Maps e.g. "abc" -> "abc.zip/abc/" so the corpus can be found
        # inside its own zip archive.
        zip_name = re.sub(r"(([^/]+)(/.*)?)", r"\2.zip/\1/", self.__name)
        if TRY_ZIPFILE_FIRST:
            try:
                root = nltk.data.find(f"{self.subdir}/{zip_name}")
            except LookupError as e:
                try:
                    root = nltk.data.find(f"{self.subdir}/{self.__name}")
                except LookupError:
                    # Report the zip lookup's error, since zip was preferred.
                    raise e
        else:
            try:
                root = nltk.data.find(f"{self.subdir}/{self.__name}")
            except LookupError as e:
                try:
                    root = nltk.data.find(f"{self.subdir}/{zip_name}")
                except LookupError:
                    # Report the directory lookup's error, the preferred form.
                    raise e

        # Load the corpus.
        corpus = self.__reader_cls(root, *self.__args, **self.__kwargs)

        # This is where the magic happens!  Transform ourselves into
        # the corpus by modifying our own __dict__ and __class__ to
        # match that of the corpus.

        # Capture the construction state in locals first: once __dict__ is
        # replaced below, the mangled attributes are gone, and the _unload
        # closure must still be able to rebuild a fresh lazy loader.
        args, kwargs = self.__args, self.__kwargs
        name, reader_cls = self.__name, self.__reader_cls

        self.__dict__ = corpus.__dict__
        self.__class__ = corpus.__class__

        # _unload support: assign __dict__ and __class__ back, then do GC.
        # after reassigning __dict__ there shouldn't be any references to
        # corpus data so the memory should be deallocated after gc.collect()
        def _unload(self):
            # Revert to a fresh, unloaded proxy built from the saved args.
            lazy_reader = LazyCorpusLoader(name, reader_cls, *args, **kwargs)
            self.__dict__ = lazy_reader.__dict__
            self.__class__ = lazy_reader.__class__
            gc.collect()

        self._unload = _make_bound_method(_unload, self)

    def __getattr__(self, attr):
        """Trigger the lazy load on first attribute access, then delegate."""
        # Fix for inspect.isclass under Python 2.6
        # (see https://bugs.python.org/issue1225107).
        # Without this fix tests may take extra 1.5GB RAM
        # because all corpora gets loaded during test collection.
        if attr == "__bases__":
            raise AttributeError("LazyCorpusLoader object has no attribute '__bases__'")

        self.__load()
        # This looks circular, but its not, since __load() changes our
        # __class__ to something new:
        return getattr(self, attr)

    def __repr__(self):
        # Note: reached only while still unloaded; after __load() the
        # corpus class's own __repr__ takes over.
        return "<{} in {!r} (not loaded yet)>".format(
            self.__reader_cls.__name__,
            ".../corpora/" + self.__name,
        )

    def _unload(self):
        # If an exception occurs during corpus loading then
        # '_unload' method may be unattached, so __getattr__ can be called;
        # we shouldn't trigger corpus loading again in this case.
        pass
|
||||
|
||||
|
||||
def _make_bound_method(func, self):
|
||||
"""
|
||||
Magic for creating bound methods (used for _unload).
|
||||
"""
|
||||
|
||||
class Foo:
|
||||
def meth(self):
|
||||
pass
|
||||
|
||||
f = Foo()
|
||||
bound_method = type(f.meth)
|
||||
|
||||
try:
|
||||
return bound_method(func, self, self.__class__)
|
||||
except TypeError: # python3
|
||||
return bound_method(func, self)
|
||||
Reference in New Issue
Block a user