Initial commit

This commit is contained in:
2026-02-01 09:31:38 +01:00
commit e02db93960
4396 changed files with 1511612 additions and 0 deletions

View File

@@ -0,0 +1,551 @@
# Natural Language Toolkit: Corpus Readers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
# TODO this docstring isn't up-to-date!
"""
NLTK corpus readers. The modules in this package provide functions
that can be used to read corpus files in a variety of formats. These
functions can be used to read both the corpus files that are
distributed in the NLTK corpus package, and corpus files that are part
of external corpora.
Available Corpora
=================
Please see https://www.nltk.org/nltk_data/ for a complete list.
Install corpora using nltk.download().
Corpus Reader Functions
=======================
Each corpus module defines one or more "corpus reader functions",
which can be used to read documents from that corpus. These functions
take an argument, ``item``, which is used to indicate which document
should be read from the corpus:
- If ``item`` is one of the unique identifiers listed in the corpus
module's ``items`` variable, then the corresponding document will
be loaded from the NLTK corpus package.
- If ``item`` is a filename, then that file will be read.
Additionally, corpus reader functions can be given lists of item
names; in which case, they will return a concatenation of the
corresponding documents.
Corpus reader functions are named based on the type of information
they return. Some common examples, and their return types, are:
- words(): list of str
- sents(): list of (list of str)
- paras(): list of (list of (list of str))
- tagged_words(): list of (str,str) tuple
- tagged_sents(): list of (list of (str,str))
- tagged_paras(): list of (list of (list of (str,str)))
- chunked_sents(): list of (Tree w/ (str,str) leaves)
- parsed_sents(): list of (Tree with str leaves)
- parsed_paras(): list of (list of (Tree with str leaves))
- xml(): A single xml ElementTree
- raw(): unprocessed corpus contents
For example, to read a list of the words in the Brown Corpus, use
``nltk.corpus.brown.words()``:
>>> from nltk.corpus import brown
>>> print(", ".join(brown.words())) # doctest: +ELLIPSIS
The, Fulton, County, Grand, Jury, said, ...
"""
import re
from nltk.corpus.reader import *
from nltk.corpus.util import LazyCorpusLoader
from nltk.tokenize import RegexpTokenizer
# Module-level corpus objects.  Each is a LazyCorpusLoader proxy: the real
# reader is only constructed (and the corpus data only located/downloaded)
# on first attribute access.  The recurring fileid pattern r"(?!\.).*\.txt"
# matches *.txt while excluding dot-files.
abc: PlaintextCorpusReader = LazyCorpusLoader(
    "abc",
    PlaintextCorpusReader,
    r"(?!\.).*\.txt",
    # Per-file encodings: regexp over fileids -> codec name.
    encoding=[("science", "latin_1"), ("rural", "utf8")],
)
alpino: AlpinoCorpusReader = LazyCorpusLoader(
    "alpino", AlpinoCorpusReader, tagset="alpino"
)
bcp47: BCP47CorpusReader = LazyCorpusLoader(
    "bcp47", BCP47CorpusReader, r"(cldr|iana)/*"
)
brown: CategorizedTaggedCorpusReader = LazyCorpusLoader(
    "brown",
    CategorizedTaggedCorpusReader,
    r"c[a-z]\d\d",
    cat_file="cats.txt",
    tagset="brown",
    encoding="ascii",
)
cess_cat: BracketParseCorpusReader = LazyCorpusLoader(
    "cess_cat",
    BracketParseCorpusReader,
    r"(?!\.).*\.tbf",
    tagset="unknown",
    encoding="ISO-8859-15",
)
cess_esp: BracketParseCorpusReader = LazyCorpusLoader(
    "cess_esp",
    BracketParseCorpusReader,
    r"(?!\.).*\.tbf",
    tagset="unknown",
    encoding="ISO-8859-15",
)
cmudict: CMUDictCorpusReader = LazyCorpusLoader(
    "cmudict", CMUDictCorpusReader, ["cmudict"]
)
comtrans: AlignedCorpusReader = LazyCorpusLoader(
    "comtrans", AlignedCorpusReader, r"(?!\.).*\.txt"
)
comparative_sentences: ComparativeSentencesCorpusReader = LazyCorpusLoader(
    "comparative_sentences",
    ComparativeSentencesCorpusReader,
    r"labeledSentences\.txt",
    encoding="latin-1",
)
conll2000: ConllChunkCorpusReader = LazyCorpusLoader(
    "conll2000",
    ConllChunkCorpusReader,
    ["train.txt", "test.txt"],
    ("NP", "VP", "PP"),
    tagset="wsj",
    encoding="ascii",
)
conll2002: ConllChunkCorpusReader = LazyCorpusLoader(
    "conll2002",
    ConllChunkCorpusReader,
    r".*\.(test|train).*",
    ("LOC", "PER", "ORG", "MISC"),
    encoding="utf-8",
)
conll2007: DependencyCorpusReader = LazyCorpusLoader(
    "conll2007",
    DependencyCorpusReader,
    r".*\.(test|train).*",
    encoding=[("eus", "ISO-8859-2"), ("esp", "utf8")],
)
crubadan: CrubadanCorpusReader = LazyCorpusLoader(
    "crubadan", CrubadanCorpusReader, r".*\.txt"
)
dependency_treebank: DependencyCorpusReader = LazyCorpusLoader(
    "dependency_treebank", DependencyCorpusReader, r".*\.dp", encoding="ascii"
)
extended_omw: CorpusReader = LazyCorpusLoader(
    "extended_omw", CorpusReader, r".*/wn-[a-z\-]*\.tab", encoding="utf8"
)
floresta: BracketParseCorpusReader = LazyCorpusLoader(
    "floresta",
    BracketParseCorpusReader,
    r"(?!\.).*\.ptb",
    "#",
    tagset="unknown",
    encoding="ISO-8859-15",
)
framenet15: FramenetCorpusReader = LazyCorpusLoader(
    "framenet_v15",
    FramenetCorpusReader,
    [
        "frRelation.xml",
        "frameIndex.xml",
        "fulltextIndex.xml",
        "luIndex.xml",
        "semTypes.xml",
    ],
)
framenet: FramenetCorpusReader = LazyCorpusLoader(
    "framenet_v17",
    FramenetCorpusReader,
    [
        "frRelation.xml",
        "frameIndex.xml",
        "fulltextIndex.xml",
        "luIndex.xml",
        "semTypes.xml",
    ],
)
gazetteers: WordListCorpusReader = LazyCorpusLoader(
    "gazetteers", WordListCorpusReader, r"(?!LICENSE|\.).*\.txt", encoding="ISO-8859-2"
)
genesis: PlaintextCorpusReader = LazyCorpusLoader(
    "genesis",
    PlaintextCorpusReader,
    r"(?!\.).*\.txt",
    encoding=[
        ("finnish|french|german", "latin_1"),
        ("swedish", "cp865"),
        (".*", "utf_8"),
    ],
)
gutenberg: PlaintextCorpusReader = LazyCorpusLoader(
    "gutenberg", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
)
ieer: IEERCorpusReader = LazyCorpusLoader("ieer", IEERCorpusReader, r"(?!README|\.).*")
inaugural: PlaintextCorpusReader = LazyCorpusLoader(
    "inaugural", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
)
# [XX] This should probably just use TaggedCorpusReader:
indian: IndianCorpusReader = LazyCorpusLoader(
    "indian", IndianCorpusReader, r"(?!\.).*\.pos", tagset="unknown", encoding="utf8"
)
jeita: ChasenCorpusReader = LazyCorpusLoader(
    "jeita", ChasenCorpusReader, r".*\.chasen", encoding="utf-8"
)
knbc: KNBCorpusReader = LazyCorpusLoader(
    "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
)
lin_thesaurus: LinThesaurusCorpusReader = LazyCorpusLoader(
    "lin_thesaurus", LinThesaurusCorpusReader, r".*\.lsp"
)
mac_morpho: MacMorphoCorpusReader = LazyCorpusLoader(
    "mac_morpho",
    MacMorphoCorpusReader,
    r"(?!\.).*\.txt",
    tagset="unknown",
    encoding="latin-1",
)
machado: PortugueseCategorizedPlaintextCorpusReader = LazyCorpusLoader(
    "machado",
    PortugueseCategorizedPlaintextCorpusReader,
    r"(?!\.).*\.txt",
    cat_pattern=r"([a-z]*)/.*",
    encoding="latin-1",
)
masc_tagged: CategorizedTaggedCorpusReader = LazyCorpusLoader(
    "masc_tagged",
    CategorizedTaggedCorpusReader,
    r"(spoken|written)/.*\.txt",
    cat_file="categories.txt",
    tagset="wsj",
    encoding="utf-8",
    sep="_",
)
movie_reviews: CategorizedPlaintextCorpusReader = LazyCorpusLoader(
    "movie_reviews",
    CategorizedPlaintextCorpusReader,
    r"(?!\.).*\.txt",
    cat_pattern=r"(neg|pos)/.*",
    encoding="ascii",
)
multext_east: MTECorpusReader = LazyCorpusLoader(
    "mte_teip5", MTECorpusReader, r"(oana).*\.xml", encoding="utf-8"
)
names: WordListCorpusReader = LazyCorpusLoader(
    "names", WordListCorpusReader, r"(?!\.).*\.txt", encoding="ascii"
)
nps_chat: NPSChatCorpusReader = LazyCorpusLoader(
    "nps_chat", NPSChatCorpusReader, r"(?!README|\.).*\.xml", tagset="wsj"
)
opinion_lexicon: OpinionLexiconCorpusReader = LazyCorpusLoader(
    "opinion_lexicon",
    OpinionLexiconCorpusReader,
    r"(\w+)\-words\.txt",
    encoding="ISO-8859-2",
)
ppattach: PPAttachmentCorpusReader = LazyCorpusLoader(
    "ppattach", PPAttachmentCorpusReader, ["training", "test", "devset"]
)
product_reviews_1: ReviewsCorpusReader = LazyCorpusLoader(
    "product_reviews_1", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
)
product_reviews_2: ReviewsCorpusReader = LazyCorpusLoader(
    "product_reviews_2", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
)
pros_cons: ProsConsCorpusReader = LazyCorpusLoader(
    "pros_cons",
    ProsConsCorpusReader,
    r"Integrated(Cons|Pros)\.txt",
    cat_pattern=r"Integrated(Cons|Pros)\.txt",
    encoding="ISO-8859-2",
)
ptb: CategorizedBracketParseCorpusReader = (
    LazyCorpusLoader(  # Penn Treebank v3: WSJ and Brown portions
        "ptb",
        CategorizedBracketParseCorpusReader,
        r"(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG",
        cat_file="allcats.txt",
        tagset="wsj",
    )
)
qc: StringCategoryCorpusReader = LazyCorpusLoader(
    "qc", StringCategoryCorpusReader, ["train.txt", "test.txt"], encoding="ISO-8859-2"
)
reuters: CategorizedPlaintextCorpusReader = LazyCorpusLoader(
    "reuters",
    CategorizedPlaintextCorpusReader,
    "(training|test).*",
    cat_file="cats.txt",
    encoding="ISO-8859-2",
)
rte: RTECorpusReader = LazyCorpusLoader("rte", RTECorpusReader, r"(?!\.).*\.xml")
senseval: SensevalCorpusReader = LazyCorpusLoader(
    "senseval", SensevalCorpusReader, r"(?!\.).*\.pos"
)
sentence_polarity: CategorizedSentencesCorpusReader = LazyCorpusLoader(
    "sentence_polarity",
    CategorizedSentencesCorpusReader,
    r"rt-polarity\.(neg|pos)",
    cat_pattern=r"rt-polarity\.(neg|pos)",
    encoding="utf-8",
)
sentiwordnet: SentiWordNetCorpusReader = LazyCorpusLoader(
    "sentiwordnet", SentiWordNetCorpusReader, "SentiWordNet_3.0.0.txt", encoding="utf-8"
)
shakespeare: XMLCorpusReader = LazyCorpusLoader(
    "shakespeare", XMLCorpusReader, r"(?!\.).*\.xml"
)
sinica_treebank: SinicaTreebankCorpusReader = LazyCorpusLoader(
    "sinica_treebank",
    SinicaTreebankCorpusReader,
    ["parsed"],
    tagset="unknown",
    encoding="utf-8",
)
state_union: PlaintextCorpusReader = LazyCorpusLoader(
    "state_union", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="ISO-8859-2"
)
stopwords: WordListCorpusReader = LazyCorpusLoader(
    "stopwords", WordListCorpusReader, r"(?!README|\.).*", encoding="utf8"
)
subjectivity: CategorizedSentencesCorpusReader = LazyCorpusLoader(
    "subjectivity",
    CategorizedSentencesCorpusReader,
    r"(quote.tok.gt9|plot.tok.gt9)\.5000",
    cat_map={"quote.tok.gt9.5000": ["subj"], "plot.tok.gt9.5000": ["obj"]},
    encoding="latin-1",
)
swadesh: SwadeshCorpusReader = LazyCorpusLoader(
    "swadesh", SwadeshCorpusReader, r"(?!README|\.).*", encoding="utf8"
)
swadesh110: PanlexSwadeshCorpusReader = LazyCorpusLoader(
    "panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh110/.*\.txt", encoding="utf8"
)
swadesh207: PanlexSwadeshCorpusReader = LazyCorpusLoader(
    "panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh207/.*\.txt", encoding="utf8"
)
switchboard: SwitchboardCorpusReader = LazyCorpusLoader(
    "switchboard", SwitchboardCorpusReader, tagset="wsj"
)
timit: TimitCorpusReader = LazyCorpusLoader("timit", TimitCorpusReader)
timit_tagged: TimitTaggedCorpusReader = LazyCorpusLoader(
    "timit", TimitTaggedCorpusReader, r".+\.tags", tagset="wsj", encoding="ascii"
)
toolbox: ToolboxCorpusReader = LazyCorpusLoader(
    "toolbox", ToolboxCorpusReader, r"(?!.*(README|\.)).*\.(dic|txt)"
)
treebank: BracketParseCorpusReader = LazyCorpusLoader(
    "treebank/combined",
    BracketParseCorpusReader,
    r"wsj_.*\.mrg",
    tagset="wsj",
    encoding="ascii",
)
treebank_chunk: ChunkedCorpusReader = LazyCorpusLoader(
    "treebank/tagged",
    ChunkedCorpusReader,
    r"wsj_.*\.pos",
    # Sentence boundary: whitespace following a "/." tag, but not inside
    # a chunk bracket.
    sent_tokenizer=RegexpTokenizer(r"(?<=/\.)\s*(?![^\[]*\])", gaps=True),
    para_block_reader=tagged_treebank_para_block_reader,
    tagset="wsj",
    encoding="ascii",
)
treebank_raw: PlaintextCorpusReader = LazyCorpusLoader(
    "treebank/raw", PlaintextCorpusReader, r"wsj_.*", encoding="ISO-8859-2"
)
twitter_samples: TwitterCorpusReader = LazyCorpusLoader(
    "twitter_samples", TwitterCorpusReader, r".*\.json"
)
udhr: UdhrCorpusReader = LazyCorpusLoader("udhr", UdhrCorpusReader)
udhr2: PlaintextCorpusReader = LazyCorpusLoader(
    "udhr2", PlaintextCorpusReader, r".*\.txt", encoding="utf8"
)
universal_treebanks: ConllCorpusReader = LazyCorpusLoader(
    "universal_treebanks_v20",
    ConllCorpusReader,
    r".*\.conll",
    # Only the word and POS columns of the CoNLL rows are used.
    columntypes=(
        "ignore",
        "words",
        "ignore",
        "ignore",
        "pos",
        "ignore",
        "ignore",
        "ignore",
        "ignore",
        "ignore",
    ),
)
verbnet: VerbnetCorpusReader = LazyCorpusLoader(
    "verbnet", VerbnetCorpusReader, r"(?!\.).*\.xml"
)
webtext: PlaintextCorpusReader = LazyCorpusLoader(
    "webtext", PlaintextCorpusReader, r"(?!README|\.).*\.txt", encoding="ISO-8859-2"
)
# The wordnet readers take a nested LazyCorpusLoader for the Open
# Multilingual Wordnet (omw-1.4) translation tables.
wordnet: WordNetCorpusReader = LazyCorpusLoader(
    "wordnet",
    WordNetCorpusReader,
    LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
)
## Use the following template to add a custom Wordnet package.
## Just uncomment, and replace the identifier (my_wordnet) in two places:
##
# my_wordnet: WordNetCorpusReader = LazyCorpusLoader(
#     "my_wordnet",
#     WordNetCorpusReader,
#     LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
# )
wordnet31: WordNetCorpusReader = LazyCorpusLoader(
    "wordnet31",
    WordNetCorpusReader,
    LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
)
wordnet2021: WordNetCorpusReader = LazyCorpusLoader(
    # Obsolete, use english_wordnet instead.
    "wordnet2021",
    WordNetCorpusReader,
    LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
)
wordnet2022: WordNetCorpusReader = LazyCorpusLoader(
    # Obsolete, use english_wordnet instead.
    "wordnet2022",
    WordNetCorpusReader,
    LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
)
english_wordnet: WordNetCorpusReader = LazyCorpusLoader(
    # Latest Open English Wordnet
    "english_wordnet",
    WordNetCorpusReader,
    LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
)
wordnet_ic: WordNetICCorpusReader = LazyCorpusLoader(
    "wordnet_ic", WordNetICCorpusReader, r".*\.dat"
)
words: WordListCorpusReader = LazyCorpusLoader(
    "words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii"
)
# defined after treebank
propbank: PropbankCorpusReader = LazyCorpusLoader(
    "propbank",
    PropbankCorpusReader,
    "prop.txt",
    r"frames/.*\.xml",
    "verbs.txt",
    lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
    treebank,
)  # Must be defined *after* treebank corpus.
nombank: NombankCorpusReader = LazyCorpusLoader(
    "nombank.1.0",
    NombankCorpusReader,
    "nombank.1.0",
    r"frames/.*\.xml",
    "nombank.1.0.words",
    lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
    treebank,
)  # Must be defined *after* treebank corpus.
propbank_ptb: PropbankCorpusReader = LazyCorpusLoader(
    "propbank",
    PropbankCorpusReader,
    "prop.txt",
    r"frames/.*\.xml",
    "verbs.txt",
    lambda filename: filename.upper(),
    ptb,
)  # Must be defined *after* ptb corpus.
nombank_ptb: NombankCorpusReader = LazyCorpusLoader(
    "nombank.1.0",
    NombankCorpusReader,
    "nombank.1.0",
    r"frames/.*\.xml",
    "nombank.1.0.words",
    lambda filename: filename.upper(),
    ptb,
)  # Must be defined *after* ptb corpus.
semcor: SemcorCorpusReader = LazyCorpusLoader(
    "semcor", SemcorCorpusReader, r"brown./tagfiles/br-.*\.xml", wordnet
)  # Must be defined *after* wordnet corpus.
nonbreaking_prefixes: NonbreakingPrefixesCorpusReader = LazyCorpusLoader(
    "nonbreaking_prefixes",
    NonbreakingPrefixesCorpusReader,
    r"(?!README|\.).*",
    encoding="utf8",
)
perluniprops: UnicharsCorpusReader = LazyCorpusLoader(
    "perluniprops",
    UnicharsCorpusReader,
    r"(?!README|\.).*",
    nltk_data_subdir="misc",
    encoding="utf8",
)
# mwa_ppdb = LazyCorpusLoader(
#     'mwa_ppdb', MWAPPDBCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')
# See https://github.com/nltk/nltk/issues/1579
# and https://github.com/nltk/nltk/issues/1716
#
# pl196x = LazyCorpusLoader(
#     'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml',
#     cat_file='cats.txt', textid_file='textids.txt', encoding='utf8')
#
# ipipan = LazyCorpusLoader(
#     'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml')
#
# nkjp = LazyCorpusLoader(
#     'nkjp', NKJPCorpusReader, r'', encoding='utf8')
#
# panlex_lite = LazyCorpusLoader(
#     'panlex_lite', PanLexLiteCorpusReader)
#
# ycoe = LazyCorpusLoader(
#     'ycoe', YCOECorpusReader)
#
# corpus not available with NLTK; these lines caused help(nltk.corpus) to break
# hebrew_treebank = LazyCorpusLoader(
#     'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')
# FIXME: override any imported demo from various corpora, see https://github.com/nltk/nltk/issues/2116
def demo():
    # Out-of-date smoke test: invoke each corpus reader's own demo()
    # in turn.  Entries whose demos are unavailable stay commented out.
    demo_corpora = (
        abc,
        brown,
        # chat80,
        cmudict,
        conll2000,
        conll2002,
        genesis,
        gutenberg,
        ieer,
        inaugural,
        indian,
        names,
        ppattach,
        senseval,
        shakespeare,
        sinica_treebank,
        state_union,
        stopwords,
        timit,
        toolbox,
        treebank,
        udhr,
        webtext,
        words,
        # ycoe,
    )
    for corpus in demo_corpora:
        corpus.demo()
if __name__ == "__main__":
    # The demo is intentionally disabled by default: it requires many
    # corpora to be downloaded before it can run.
    # demo()
    pass

View File

@@ -0,0 +1,56 @@
# Natural Language Toolkit: Europarl Corpus Readers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Nitin Madnani <nmadnani@umiacs.umd.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re
from nltk.corpus.reader import *
from nltk.corpus.util import LazyCorpusLoader
# Create one lazy corpus reader per European language in the raw Europarl
# corpus.  Each fileid pattern selects the "ep-*" files carrying that
# language's ISO 639-1 suffix (e.g. ".da" for Danish).
danish: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/danish", EuroparlCorpusReader, r"ep-.*\.da", encoding="utf-8"
)
dutch: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/dutch", EuroparlCorpusReader, r"ep-.*\.nl", encoding="utf-8"
)
english: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/english", EuroparlCorpusReader, r"ep-.*\.en", encoding="utf-8"
)
finnish: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/finnish", EuroparlCorpusReader, r"ep-.*\.fi", encoding="utf-8"
)
french: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/french", EuroparlCorpusReader, r"ep-.*\.fr", encoding="utf-8"
)
german: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/german", EuroparlCorpusReader, r"ep-.*\.de", encoding="utf-8"
)
greek: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/greek", EuroparlCorpusReader, r"ep-.*\.el", encoding="utf-8"
)
italian: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/italian", EuroparlCorpusReader, r"ep-.*\.it", encoding="utf-8"
)
portuguese: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/portuguese", EuroparlCorpusReader, r"ep-.*\.pt", encoding="utf-8"
)
spanish: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/spanish", EuroparlCorpusReader, r"ep-.*\.es", encoding="utf-8"
)
swedish: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/swedish", EuroparlCorpusReader, r"ep-.*\.sv", encoding="utf-8"
)

View File

@@ -0,0 +1,186 @@
# Natural Language Toolkit: Corpus Readers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
NLTK corpus readers. The modules in this package provide functions
that can be used to read corpus fileids in a variety of formats. These
functions can be used to read both the corpus fileids that are
distributed in the NLTK corpus package, and corpus fileids that are part
of external corpora.
Corpus Reader Functions
=======================
Each corpus module defines one or more "corpus reader functions",
which can be used to read documents from that corpus. These functions
take an argument, ``item``, which is used to indicate which document
should be read from the corpus:
- If ``item`` is one of the unique identifiers listed in the corpus
module's ``items`` variable, then the corresponding document will
be loaded from the NLTK corpus package.
- If ``item`` is a fileid, then that file will be read.
Additionally, corpus reader functions can be given lists of item
names; in which case, they will return a concatenation of the
corresponding documents.
Corpus reader functions are named based on the type of information
they return. Some common examples, and their return types, are:
- words(): list of str
- sents(): list of (list of str)
- paras(): list of (list of (list of str))
- tagged_words(): list of (str,str) tuple
- tagged_sents(): list of (list of (str,str))
- tagged_paras(): list of (list of (list of (str,str)))
- chunked_sents(): list of (Tree w/ (str,str) leaves)
- parsed_sents(): list of (Tree with str leaves)
- parsed_paras(): list of (list of (Tree with str leaves))
- xml(): A single xml ElementTree
- raw(): unprocessed corpus contents
For example, to read a list of the words in the Brown Corpus, use
``nltk.corpus.brown.words()``:
>>> from nltk.corpus import brown
>>> print(", ".join(brown.words()[:6])) # only first 6 words
The, Fulton, County, Grand, Jury, said
isort:skip_file
"""
from nltk.corpus.reader.plaintext import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
from nltk.corpus.reader.tagged import *
from nltk.corpus.reader.cmudict import *
from nltk.corpus.reader.conll import *
from nltk.corpus.reader.chunked import *
from nltk.corpus.reader.wordlist import *
from nltk.corpus.reader.xmldocs import *
from nltk.corpus.reader.ppattach import *
from nltk.corpus.reader.senseval import *
from nltk.corpus.reader.ieer import *
from nltk.corpus.reader.sinica_treebank import *
from nltk.corpus.reader.bracket_parse import *
from nltk.corpus.reader.indian import *
from nltk.corpus.reader.toolbox import *
from nltk.corpus.reader.timit import *
from nltk.corpus.reader.ycoe import *
from nltk.corpus.reader.rte import *
from nltk.corpus.reader.string_category import *
from nltk.corpus.reader.propbank import *
from nltk.corpus.reader.verbnet import *
from nltk.corpus.reader.bnc import *
from nltk.corpus.reader.nps_chat import *
from nltk.corpus.reader.wordnet import *
from nltk.corpus.reader.switchboard import *
from nltk.corpus.reader.dependency import *
from nltk.corpus.reader.nombank import *
from nltk.corpus.reader.ipipan import *
from nltk.corpus.reader.pl196x import *
from nltk.corpus.reader.knbc import *
from nltk.corpus.reader.chasen import *
from nltk.corpus.reader.childes import *
from nltk.corpus.reader.aligned import *
from nltk.corpus.reader.lin import *
from nltk.corpus.reader.semcor import *
from nltk.corpus.reader.framenet import *
from nltk.corpus.reader.udhr import *
from nltk.corpus.reader.bnc import *
from nltk.corpus.reader.sentiwordnet import *
from nltk.corpus.reader.twitter import *
from nltk.corpus.reader.nkjp import *
from nltk.corpus.reader.crubadan import *
from nltk.corpus.reader.mte import *
from nltk.corpus.reader.reviews import *
from nltk.corpus.reader.opinion_lexicon import *
from nltk.corpus.reader.pros_cons import *
from nltk.corpus.reader.categorized_sents import *
from nltk.corpus.reader.comparative_sents import *
from nltk.corpus.reader.panlex_lite import *
from nltk.corpus.reader.panlex_swadesh import *
from nltk.corpus.reader.bcp47 import *
# Make sure that nltk.corpus.reader.bracket_parse gives the module, not
# the function bracket_parse() defined in nltk.tree:
from nltk.corpus.reader import bracket_parse
# Public re-export list for nltk.corpus.reader.  Fix: "BNCCorpusReader"
# was listed twice (mirroring the duplicated bnc import above); the second
# occurrence has been removed so every entry is unique.
__all__ = [
    "CorpusReader",
    "CategorizedCorpusReader",
    "PlaintextCorpusReader",
    "find_corpus_fileids",
    "TaggedCorpusReader",
    "CMUDictCorpusReader",
    "ConllChunkCorpusReader",
    "WordListCorpusReader",
    "PPAttachmentCorpusReader",
    "SensevalCorpusReader",
    "IEERCorpusReader",
    "ChunkedCorpusReader",
    "SinicaTreebankCorpusReader",
    "BracketParseCorpusReader",
    "IndianCorpusReader",
    "ToolboxCorpusReader",
    "TimitCorpusReader",
    "YCOECorpusReader",
    "MacMorphoCorpusReader",
    "SyntaxCorpusReader",
    "AlpinoCorpusReader",
    "RTECorpusReader",
    "StringCategoryCorpusReader",
    "EuroparlCorpusReader",
    "CategorizedBracketParseCorpusReader",
    "CategorizedTaggedCorpusReader",
    "CategorizedPlaintextCorpusReader",
    "PortugueseCategorizedPlaintextCorpusReader",
    "tagged_treebank_para_block_reader",
    "PropbankCorpusReader",
    "VerbnetCorpusReader",
    "BNCCorpusReader",
    "ConllCorpusReader",
    "XMLCorpusReader",
    "NPSChatCorpusReader",
    "SwadeshCorpusReader",
    "WordNetCorpusReader",
    "WordNetICCorpusReader",
    "SwitchboardCorpusReader",
    "DependencyCorpusReader",
    "NombankCorpusReader",
    "IPIPANCorpusReader",
    "Pl196xCorpusReader",
    "TEICorpusView",
    "KNBCorpusReader",
    "ChasenCorpusReader",
    "CHILDESCorpusReader",
    "AlignedCorpusReader",
    "TimitTaggedCorpusReader",
    "LinThesaurusCorpusReader",
    "SemcorCorpusReader",
    "FramenetCorpusReader",
    "UdhrCorpusReader",
    "SentiWordNetCorpusReader",
    "SentiSynset",
    "TwitterCorpusReader",
    "NKJPCorpusReader",
    "CrubadanCorpusReader",
    "MTECorpusReader",
    "ReviewsCorpusReader",
    "OpinionLexiconCorpusReader",
    "ProsConsCorpusReader",
    "CategorizedSentencesCorpusReader",
    "ComparativeSentencesCorpusReader",
    "PanLexLiteCorpusReader",
    "NonbreakingPrefixesCorpusReader",
    "UnicharsCorpusReader",
    "MWAPPDBCorpusReader",
    "PanlexSwadeshCorpusReader",
    "BCP47CorpusReader",
]

View File

@@ -0,0 +1,154 @@
# Natural Language Toolkit: Aligned Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# URL: <https://www.nltk.org/>
# Author: Steven Bird <stevenbird1@gmail.com>
# For license information, see LICENSE.TXT
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import (
StreamBackedCorpusView,
concat,
read_alignedsent_block,
)
from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer
from nltk.translate import AlignedSent, Alignment
class AlignedCorpusReader(CorpusReader):
    """
    Reader for corpora of word-aligned sentences.  Tokens are assumed
    to be separated by whitespace.  Sentences begin on separate lines.
    """

    def __init__(
        self,
        root,
        fileids,
        sep="/",
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        alignedsent_block_reader=read_alignedsent_block,
        encoding="latin1",
    ):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param sep: Word/tag separator; stored for API compatibility.
        :param word_tokenizer: Tokenizer used to split each sentence line
            into tokens (whitespace by default).
        :param sent_tokenizer: Tokenizer used to split a block into
            sentence lines (newline-separated by default).
        :param alignedsent_block_reader: Callable that reads one
            aligned-sentence block from a stream.
        :param encoding: Default character encoding for the corpus files.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader

    def _views(self, fileids, aligned, group_by_sent):
        """
        Build one ``AlignedSentCorpusView`` per requested file and lazily
        concatenate them.  ``aligned`` and ``group_by_sent`` select the
        shape of the items the view yields.  (Shared by ``words()``,
        ``sents()`` and ``aligned_sents()``, which previously triplicated
        this construction.)
        """
        return concat(
            [
                AlignedSentCorpusView(
                    fileid,
                    enc,
                    aligned,
                    group_by_sent,
                    self._word_tokenizer,
                    self._sent_tokenizer,
                    self._alignedsent_block_reader,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._views(fileids, aligned=False, group_by_sent=False)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._views(fileids, aligned=False, group_by_sent=True)

    def aligned_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of AlignedSent objects.
        :rtype: list(AlignedSent)
        """
        return self._views(fileids, aligned=True, group_by_sent=True)
class AlignedSentCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for aligned sentences.
    ``AlignedSentCorpusView`` objects are typically created by
    ``AlignedCorpusReader`` (not directly by nltk users).
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        aligned,
        group_by_sent,
        word_tokenizer,
        sent_tokenizer,
        alignedsent_block_reader,
    ):
        # Record the output shape and tokenization strategy first, then
        # hand the file off to the stream-backed base class.
        self._aligned = aligned
        self._group_by_sent = group_by_sent
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        # Split the next aligned-sentence block into lines, then tokenize
        # each line into words.
        tokenized = [
            self._word_tokenizer.tokenize(line)
            for raw_block in self._alignedsent_block_reader(stream)
            for line in self._sent_tokenizer.tokenize(raw_block)
        ]
        if self._aligned:
            # The third line is an alignment spec, not a sentence: undo its
            # tokenization and parse it as an Alignment (kludge inherited
            # from the original implementation).
            tokenized[2] = Alignment.fromstring(" ".join(tokenized[2]))
            return [AlignedSent(*tokenized)]
        if self._group_by_sent:
            return [tokenized[0]]
        return tokenized[0]

View File

@@ -0,0 +1,517 @@
# Natural Language Toolkit: API for Corpus Readers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
API for corpus readers.
"""
import os
import re
from collections import defaultdict
from itertools import chain
from nltk.corpus.reader.util import *
from nltk.data import FileSystemPathPointer, PathPointer, ZipFilePathPointer
class CorpusReader:
"""
A base class for "corpus reader" classes, each of which can be
used to read a specific corpus format. Each individual corpus
reader instance is used to read a specific corpus, consisting of
one or more files under a common root directory. Each file is
identified by its ``file identifier``, which is the relative path
to the file from the root directory.
A separate subclass is defined for each corpus format. These
subclasses define one or more methods that provide 'views' on the
corpus contents, such as ``words()`` (for a list of words) and
``parsed_sents()`` (for a list of parsed sentences). Called with
no arguments, these methods will return the contents of the entire
corpus. For most corpora, these methods define one or more
selection arguments, such as ``fileids`` or ``categories``, which can
be used to select which portion of the corpus should be returned.
"""
    def __init__(self, root, fileids, encoding="utf8", tagset=None):
        """
        :type root: PathPointer or str
        :param root: A path pointer identifying the root directory for
            this corpus.  If a string is specified, then it will be
            converted to a ``PathPointer`` automatically.
        :param fileids: A list of the files that make up this corpus.
            This list can either be specified explicitly, as a list of
            strings; or implicitly, as a regular expression over file
            paths.  The absolute path for each file will be constructed
            by joining the reader's root to each file name.
        :param encoding: The default unicode encoding for the files
            that make up the corpus.  The value of ``encoding`` can be any
            of the following:

            - A string: ``encoding`` is the encoding name for all files.
            - A dictionary: ``encoding[file_id]`` is the encoding
              name for the file whose identifier is ``file_id``.  If
              ``file_id`` is not in ``encoding``, then the file
              contents will be processed using non-unicode byte strings.
            - A list: ``encoding`` should be a list of ``(regexp, encoding)``
              tuples.  The encoding for a file whose identifier is ``file_id``
              will be the ``encoding`` value for the first tuple whose
              ``regexp`` matches the ``file_id``.  If no tuple's ``regexp``
              matches the ``file_id``, the file contents will be processed
              using non-unicode byte strings.
            - None: the file contents of all files will be
              processed using non-unicode byte strings.
        :param tagset: The name of the tagset used by this corpus, to be used
            for normalizing or converting the POS tags returned by the
            ``tagged_...()`` methods.
        """
        # Convert the root to a path pointer, if necessary.
        if isinstance(root, str) and not isinstance(root, PathPointer):
            # The trailing "|" (empty alternative) guarantees this match
            # always succeeds, so m is never None; the groups are simply
            # both None when `root` is not a zip path.
            m = re.match(r"(.*\.zip)/?(.*)$|", root)
            zipfile, zipentry = m.groups()
            if zipfile:
                root = ZipFilePathPointer(zipfile, zipentry)
            else:
                root = FileSystemPathPointer(root)
        elif not isinstance(root, PathPointer):
            raise TypeError("CorpusReader: expected a string or a PathPointer")
        # If `fileids` is a regexp, then expand it.
        if isinstance(fileids, str):
            fileids = find_corpus_fileids(root, fileids)
        self._fileids = fileids
        """A list of the relative paths for the fileids that make up
        this corpus."""
        self._root = root
        """The root directory for this corpus."""
        # Default locations of the metadata files surfaced by readme(),
        # license() and citation().
        self._readme = "README"
        self._license = "LICENSE"
        self._citation = "citation.bib"
        # If encoding was specified as a list of regexps, then convert
        # it to a dictionary.  First matching regexp wins; fileids that
        # match no regexp are left out (i.e. read as byte strings).
        if isinstance(encoding, list):
            encoding_dict = {}
            for fileid in self._fileids:
                for x in encoding:
                    (regexp, enc) = x
                    if re.match(regexp, fileid):
                        encoding_dict[fileid] = enc
                        break
            encoding = encoding_dict
        self._encoding = encoding
        """The default unicode encoding for the fileids that make up
        this corpus.  If ``encoding`` is None, then the file
        contents are processed using byte strings."""
        self._tagset = tagset
def __repr__(self):
if isinstance(self._root, ZipFilePathPointer):
path = f"{self._root.zipfile.filename}/{self._root.entry}"
else:
path = "%s" % self._root.path
return f"<{self.__class__.__name__} in {path!r}>"
def ensure_loaded(self):
"""
Load this corpus (if it has not already been loaded). This is
used by LazyCorpusLoader as a simple method that can be used to
make sure a corpus is loaded -- e.g., in case a user wants to
do help(some_corpus).
"""
pass # no need to actually do anything.
def readme(self):
"""
Return the contents of the corpus README file, if it exists.
"""
with self.open(self._readme) as f:
return f.read()
def license(self):
"""
Return the contents of the corpus LICENSE file, if it exists.
"""
with self.open(self._license) as f:
return f.read()
def citation(self):
"""
Return the contents of the corpus citation.bib file, if it exists.
"""
with self.open(self._citation) as f:
return f.read()
def fileids(self):
"""
Return a list of file identifiers for the fileids that make up
this corpus.
"""
return self._fileids
def abspath(self, fileid):
"""
Return the absolute path for the given file.
:type fileid: str
:param fileid: The file identifier for the file whose path
should be returned.
:rtype: PathPointer
"""
return self._root.join(fileid)
def abspaths(self, fileids=None, include_encoding=False, include_fileid=False):
"""
Return a list of the absolute paths for all fileids in this corpus;
or for the given list of fileids, if specified.
:type fileids: None or str or list
:param fileids: Specifies the set of fileids for which paths should
be returned. Can be None, for all fileids; a list of
file identifiers, for a specified set of fileids; or a single
file identifier, for a single file. Note that the return
value is always a list of paths, even if ``fileids`` is a
single file identifier.
:param include_encoding: If true, then return a list of
``(path_pointer, encoding)`` tuples.
:rtype: list(PathPointer)
"""
if fileids is None:
fileids = self._fileids
elif isinstance(fileids, str):
fileids = [fileids]
paths = [self._root.join(f) for f in fileids]
if include_encoding and include_fileid:
return list(zip(paths, [self.encoding(f) for f in fileids], fileids))
elif include_fileid:
return list(zip(paths, fileids))
elif include_encoding:
return list(zip(paths, [self.encoding(f) for f in fileids]))
else:
return paths
def raw(self, fileids=None):
"""
:param fileids: A list specifying the fileids that should be used.
:return: the given file(s) as a single string.
:rtype: str
"""
if fileids is None:
fileids = self._fileids
elif isinstance(fileids, str):
fileids = [fileids]
contents = []
for f in fileids:
with self.open(f) as fp:
contents.append(fp.read())
return concat(contents)
def open(self, file):
"""
Return an open stream that can be used to read the given file.
If the file's encoding is not None, then the stream will
automatically decode the file's contents into unicode.
:param file: The file identifier of the file to read.
"""
encoding = self.encoding(file)
stream = self._root.join(file).open(encoding)
return stream
def encoding(self, file):
"""
Return the unicode encoding for the given corpus file, if known.
If the encoding is unknown, or if the given file should be
processed using byte strings (str), then return None.
"""
if isinstance(self._encoding, dict):
return self._encoding.get(file)
else:
return self._encoding
    def _get_root(self):
        # Read accessor backing the read-only ``root`` property just below.
        return self._root
    root = property(
        _get_root,
        doc="""
        The directory where this corpus is stored.
        :type: PathPointer""",
    )
######################################################################
# { Corpora containing categorized items
######################################################################
class CategorizedCorpusReader:
    """
    A mixin class used to aid in the implementation of corpus readers
    for categorized corpora.  This class defines the method
    ``categories()``, which returns a list of the categories for the
    corpus or for a specified set of fileids; and overrides ``fileids()``
    to take a ``categories`` argument, restricting the set of fileids to
    be returned.

    Subclasses are expected to:

      - Call ``__init__()`` to set up the mapping.
      - Override all view methods to accept a ``categories`` parameter,
        which can be used *instead* of the ``fileids`` parameter, to
        select which fileids should be included in the returned view.

    NOTE(review): this mixin assumes the concrete reader also inherits
    from ``CorpusReader`` so that ``self._fileids``, ``self.open()`` and
    the base ``fileids()`` are available -- confirm for new subclasses.
    """
    def __init__(self, kwargs):
        """
        Initialize this mapping based on keyword arguments, as
        follows:

          - cat_pattern: A regular expression pattern used to find the
            category for each file identifier.  The pattern will be
            applied to each file identifier, and the first matching
            group will be used as the category label for that file.
          - cat_map: A dictionary, mapping from file identifiers to
            category labels.
          - cat_file: The name of a file that contains the mapping
            from file identifiers to categories.  The argument
            ``cat_delimiter`` can be used to specify a delimiter.

        The corresponding argument will be deleted from ``kwargs``.  If
        more than one argument is specified, an exception will be
        raised.

        Note: ``kwargs`` is the caller's own dict (not ``**kwargs``);
        the recognized keys are removed in place so the remaining
        arguments can be forwarded to the other base-class constructor.
        """
        # The f2c/c2f maps are built lazily by _init() on first use,
        # because building them may require reading corpus files.
        self._f2c = None  #: file-to-category mapping
        self._c2f = None  #: category-to-file mapping
        self._pattern = None  #: regexp specifying the mapping
        self._map = None  #: dict specifying the mapping
        self._file = None  #: fileid of file containing the mapping
        self._delimiter = None  #: delimiter for ``self._file``
        if "cat_pattern" in kwargs:
            self._pattern = kwargs["cat_pattern"]
            del kwargs["cat_pattern"]
        elif "cat_map" in kwargs:
            self._map = kwargs["cat_map"]
            del kwargs["cat_map"]
        elif "cat_file" in kwargs:
            self._file = kwargs["cat_file"]
            del kwargs["cat_file"]
            # cat_delimiter is only meaningful together with cat_file.
            if "cat_delimiter" in kwargs:
                self._delimiter = kwargs["cat_delimiter"]
                del kwargs["cat_delimiter"]
        else:
            raise ValueError(
                "Expected keyword argument cat_pattern or " "cat_map or cat_file."
            )
        # Only the first recognized key was consumed above, so any key
        # still present here means the caller passed more than one.
        if "cat_pattern" in kwargs or "cat_map" in kwargs or "cat_file" in kwargs:
            raise ValueError(
                "Specify exactly one of: cat_pattern, " "cat_map, cat_file."
            )
    def _init(self):
        # Build the file<->category maps from whichever source was
        # configured in __init__ (pattern, dict, or mapping file).
        self._f2c = defaultdict(set)
        self._c2f = defaultdict(set)
        if self._pattern is not None:
            for file_id in self._fileids:
                # First regexp group of the pattern is the category label.
                category = re.match(self._pattern, file_id).group(1)
                self._add(file_id, category)
        elif self._map is not None:
            for file_id, categories in self._map.items():
                for category in categories:
                    self._add(file_id, category)
        elif self._file is not None:
            # Mapping file format: one line per file, fields separated by
            # self._delimiter; the first field is the fileid, the rest are
            # its categories.
            with self.open(self._file) as f:
                for line in f.readlines():
                    line = line.strip()
                    file_id, categories = line.split(self._delimiter, 1)
                    if file_id not in self.fileids():
                        raise ValueError(
                            "In category mapping file %s: %s "
                            "not found" % (self._file, file_id)
                        )
                    for category in categories.split(self._delimiter):
                        self._add(file_id, category)
    def _add(self, file_id, category):
        # Record one (file, category) association in both directions.
        self._f2c[file_id].add(category)
        self._c2f[category].add(file_id)
    def categories(self, fileids=None):
        """
        Return a list of the categories that are defined for this corpus,
        or for the file(s) if it is given.
        """
        if self._f2c is None:
            self._init()
        if fileids is None:
            return sorted(self._c2f)
        if isinstance(fileids, str):
            fileids = [fileids]
        # NOTE(review): an empty ``fileids`` list reaches set.union() with
        # no arguments and raises TypeError -- confirm callers never pass [].
        return sorted(set.union(*(self._f2c[d] for d in fileids)))
    def fileids(self, categories=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that make up the given category(s) if specified.
        """
        if categories is None:
            return super().fileids()
        elif isinstance(categories, str):
            if self._f2c is None:
                self._init()
            if categories in self._c2f:
                return sorted(self._c2f[categories])
            else:
                raise ValueError("Category %s not found" % categories)
        else:
            if self._f2c is None:
                self._init()
            return sorted(set.union(*(self._c2f[c] for c in categories)))
    def _resolve(self, fileids, categories):
        # Translate a ``categories`` selection into the equivalent
        # ``fileids`` list; the two parameters are mutually exclusive.
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")
        if categories is not None:
            return self.fileids(categories)
        else:
            return fileids
    # The view methods below simply map a category selection to fileids
    # and delegate to the concrete reader's implementation.
    def raw(self, fileids=None, categories=None):
        return super().raw(self._resolve(fileids, categories))
    def words(self, fileids=None, categories=None):
        return super().words(self._resolve(fileids, categories))
    def sents(self, fileids=None, categories=None):
        return super().sents(self._resolve(fileids, categories))
    def paras(self, fileids=None, categories=None):
        return super().paras(self._resolve(fileids, categories))
######################################################################
# { Treebank readers
######################################################################
# [xx] is it worth it to factor this out?
class SyntaxCorpusReader(CorpusReader):
    """
    An abstract base class for reading corpora consisting of
    syntactically parsed text.  Subclasses should define:

      - ``__init__``, which specifies the location of the corpus
        and a method for detecting the sentence blocks in corpus files.
      - ``_read_block``, which reads a block from the input stream.
      - ``_word``, which takes a block and returns a list of list of words.
      - ``_tag``, which takes a block and returns a list of list of tagged
        words.
      - ``_parse``, which takes a block and returns a list of parsed
        sentences.
    """
    # --- hooks that concrete subclasses must provide -------------------
    def _parse(self, s):
        raise NotImplementedError()
    def _word(self, s):
        raise NotImplementedError()
    def _tag(self, s):
        raise NotImplementedError()
    def _read_block(self, stream):
        raise NotImplementedError()
    # --- public views --------------------------------------------------
    def _view_concat(self, reader, fileids):
        # Build one lazy StreamBackedCorpusView per corpus file, all
        # driven by the given block reader, and concatenate them.
        return concat(
            [
                StreamBackedCorpusView(fileid, reader, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )
    def parsed_sents(self, fileids=None):
        return self._view_concat(self._read_parsed_sent_block, fileids)
    def tagged_sents(self, fileids=None, tagset=None):
        def reader(stream):
            return self._read_tagged_sent_block(stream, tagset)
        return self._view_concat(reader, fileids)
    def sents(self, fileids=None):
        return self._view_concat(self._read_sent_block, fileids)
    def tagged_words(self, fileids=None, tagset=None):
        def reader(stream):
            return self._read_tagged_word_block(stream, tagset)
        return self._view_concat(reader, fileids)
    def words(self, fileids=None):
        return self._view_concat(self._read_word_block, fileids)
    # ------------------------------------------------------------
    # { Block Readers
    def _read_word_block(self, stream):
        # Flatten one block of sentences into a single word list.
        return list(chain.from_iterable(self._read_sent_block(stream)))
    def _read_tagged_word_block(self, stream, tagset=None):
        return list(chain.from_iterable(self._read_tagged_sent_block(stream, tagset)))
    def _read_sent_block(self, stream):
        # Empty sentences (falsy results of _word) are dropped.
        return [sent for sent in (self._word(b) for b in self._read_block(stream)) if sent]
    def _read_tagged_sent_block(self, stream, tagset=None):
        return [
            sent for sent in (self._tag(b, tagset) for b in self._read_block(stream)) if sent
        ]
    def _read_parsed_sent_block(self, stream):
        return [
            tree for tree in (self._parse(b) for b in self._read_block(stream)) if tree
        ]
# } End of Block Readers
# ------------------------------------------------------------

View File

@@ -0,0 +1,218 @@
# Natural Language Toolkit: BCP-47 language tags
#
# Copyright (C) 2022-2023 NLTK Project
# Author: Eric Kafe <kafe.eric@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re
from warnings import warn
from xml.etree import ElementTree as et
from nltk.corpus.reader import CorpusReader
class BCP47CorpusReader(CorpusReader):
    """
    Parse BCP-47 composite language tags

    Supports all the main subtags, and the 'u-sd' extension:

    >>> from nltk.corpus import bcp47
    >>> bcp47.name('oc-gascon-u-sd-fr64')
    'Occitan (post 1500): Gascon: Pyrénées-Atlantiques'

    Can load a conversion table to Wikidata Q-codes:

    >>> bcp47.load_wiki_q()
    >>> bcp47.wiki_q['en-GI-spanglis']
    'Q79388'
    """
    def __init__(self, root, fileids):
        """Read the BCP-47 database."""
        super().__init__(root, fileids)
        self.langcode = {}  # language description -> BCP-47 tag (filled by data_dict)
        with self.open("iana/language-subtag-registry.txt") as fp:
            self.db = self.data_dict(fp.read().split("%%\n"))
        with self.open("cldr/common-subdivisions-en.xml") as fp:
            self.subdiv = self.subdiv_dict(
                et.parse(fp).iterfind("localeDisplayNames/subdivisions/subdivision")
            )
        self.morphology()
    def load_wiki_q(self):
        """Load conversion table to Wikidata Q-codes (only if needed)."""
        with self.open("cldr/tools-cldr-rdf-external-entityToCode.tsv") as fp:
            self.wiki_q = self.wiki_dict(fp.read().strip().split("\n")[1:])
    def wiki_dict(self, lines):
        """Convert Wikidata list of Q-codes to a BCP-47 dictionary.

        Each line is "<entity-URL>\\t<tag>"; the Q-code is the final
        component of the URL.
        """
        return {
            pair[1]: pair[0].split("/")[-1]
            for pair in [line.strip().split("\t") for line in lines]
        }
    def subdiv_dict(self, subdivs):
        """Convert the CLDR subdivisions list to a dictionary."""
        return {sub.attrib["type"]: sub.text for sub in subdivs}
    def morphology(self):
        """Build the per-label casing and format tables used to
        normalize and validate subtags in ``parse_tag``."""
        self.casing = {
            "language": str.lower,
            "extlang": str.lower,
            "script": str.title,
            "region": str.upper,
            "variant": str.lower,
        }
        dig = "[0-9]"
        low = "[a-z]"
        up = "[A-Z]"
        alnum = "[a-zA-Z0-9]"
        # Each pattern is checked against a whole subtag with fullmatch().
        self.format = {
            "language": re.compile(f"{low*3}?"),
            "extlang": re.compile(f"{low*3}"),
            "script": re.compile(f"{up}{low*3}"),
            "region": re.compile(f"({up*2})|({dig*3})"),
            "variant": re.compile(f"{alnum*4}{(alnum+'?')*4}"),
            "singleton": re.compile(f"{low}"),
        }
    def data_dict(self, records):
        """Convert the BCP-47 language subtag registry to a dictionary.

        ``records`` are the "%%"-separated entries of the IANA registry;
        the first record carries only the File-Date header.  Deprecated
        entries are filed under ``dic["deprecated"][type]`` instead of
        ``dic[type]``.
        """
        self.version = records[0].replace("File-Date:", "").strip()
        dic = {}
        dic["deprecated"] = {}
        for label in [
            "language",
            "extlang",
            "script",
            "region",
            "variant",
            "redundant",
            "grandfathered",
        ]:
            dic["deprecated"][label] = {}
        for record in records[1:]:
            fields = [field.split(": ") for field in record.strip().split("\n")]
            typ = fields[0][1]  # record "Type:" value
            tag = fields[1][1]  # record "Subtag:"/"Tag:" value
            if typ not in dic:
                dic[typ] = {}
            subfields = {}
            for field in fields[2:]:
                if len(field) == 2:
                    [key, val] = field
                    if key not in subfields:
                        subfields[key] = [val]
                    else:  # multiple value
                        subfields[key].append(val)
                else:  # multiline field: continuation of the previous value
                    subfields[key][-1] += " " + field[0].strip()
                # Index current (non-deprecated) language names by description.
                if (
                    "Deprecated" not in record
                    and typ == "language"
                    and key == "Description"
                ):
                    self.langcode[subfields[key][-1]] = tag
            for key in subfields:
                if len(subfields[key]) == 1:  # single value
                    subfields[key] = subfields[key][0]
            if "Deprecated" in record:
                dic["deprecated"][typ][tag] = subfields
            else:
                dic[typ][tag] = subfields
        return dic
    def val2str(self, val):
        """Return only the first value of a possibly multi-valued field."""
        if isinstance(val, list):
            # val = "/".join(val) # Concatenate all values
            val = val[0]
        return val
    def lang2str(self, lg_record):
        """Concatenate subtag values into a colon-separated name."""
        name = f"{lg_record['language']}"
        for label in ["extlang", "script", "region", "variant", "extension"]:
            if label in lg_record:
                name += f": {lg_record[label]}"
        return name
    def parse_tag(self, tag):
        """Convert a BCP-47 tag to a dictionary of labelled subtags."""
        subtags = tag.split("-")
        lang = {}
        # "variant" may occur twice, hence its duplication in the label list.
        labels = ["language", "extlang", "script", "region", "variant", "variant"]
        while subtags and labels:
            subtag = subtags.pop(0)
            found = False
            while labels:
                label = labels.pop(0)
                subtag = self.casing[label](subtag)
                if self.format[label].fullmatch(subtag):
                    if subtag in self.db[label]:
                        found = True
                        valstr = self.val2str(self.db[label][subtag]["Description"])
                        if label == "variant" and label in lang:
                            lang[label] += ": " + valstr
                        else:
                            lang[label] = valstr
                        break
                    elif subtag in self.db["deprecated"][label]:
                        found = True
                        note = f"The {subtag!r} {label} code is deprecated"
                        if "Preferred-Value" in self.db["deprecated"][label][subtag]:
                            prefer = self.db["deprecated"][label][subtag][
                                "Preferred-Value"
                            ]
                            # BUGFIX: message previously contained unbalanced
                            # quotes ("...deprecated', prefer 'x'"); use the
                            # same format as name().
                            note += f", prefer {self.val2str(prefer)!r}"
                        lang[label] = self.val2str(
                            self.db["deprecated"][label][subtag]["Description"]
                        )
                        warn(note)
                        break
            if not found:
                # Guard against tags that end in "-u" or "-u-sd" with no
                # subdivision code (previously an IndexError).
                if subtag == "u" and len(subtags) >= 2 and subtags[0] == "sd":
                    # CLDR regional subdivisions
                    sd = subtags[1]
                    if sd in self.subdiv:
                        ext = self.subdiv[sd]
                    else:
                        # BUGFIX: previously formatted ``ext`` before it was
                        # assigned (UnboundLocalError); report the unknown
                        # subdivision code itself.
                        ext = f"<Unknown subdivision: {sd}>"
                else:  # other extension subtags are not supported yet
                    ext = f"{subtag}{''.join(['-'+ext for ext in subtags])}".lower()
                    if not self.format["singleton"].fullmatch(subtag):
                        ext = f"<Invalid extension: {ext}>"
                    warn(ext)
                lang["extension"] = ext
                subtags = []
        return lang
    def name(self, tag):
        """
        Convert a BCP-47 tag to a colon-separated string of subtag names

        >>> from nltk.corpus import bcp47
        >>> bcp47.name('ca-Latn-ES-valencia')
        'Catalan: Latin: Spain: Valencian'
        """
        # Whole-tag entries (redundant/grandfathered) take precedence over
        # subtag-by-subtag parsing.
        for label in ["redundant", "grandfathered"]:
            val = None
            if tag in self.db[label]:
                val = f"{self.db[label][tag]['Description']}"
                note = f"The {tag!r} code is {label}"
            elif tag in self.db["deprecated"][label]:
                val = f"{self.db['deprecated'][label][tag]['Description']}"
                note = f"The {tag!r} code is {label} and deprecated"
                if "Preferred-Value" in self.db["deprecated"][label][tag]:
                    prefer = self.db["deprecated"][label][tag]["Preferred-Value"]
                    note += f", prefer {self.val2str(prefer)!r}"
            if val:
                warn(note)
                return val
        try:
            return self.lang2str(self.parse_tag(tag))
        except Exception:  # was a bare except, which also caught SystemExit etc.
            warn(f"Tag {tag!r} was not recognized")
            return None

View File

@@ -0,0 +1,265 @@
# Natural Language Toolkit: British National Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""Corpus reader for the XML version of the British National Corpus."""
from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader, XMLCorpusView
class BNCCorpusReader(XMLCorpusReader):
    r"""Corpus reader for the XML version of the British National Corpus.

    For access to the complete XML data structure, use the ``xml()``
    method. For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.

    You can obtain the full version of the BNC corpus at
    https://www.ota.ox.ac.uk/desc/2554

    If you extracted the archive to a directory called `BNC`, then you can
    instantiate the reader as::

        BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
    """
    def __init__(self, root, fileids, lazy=True):
        XMLCorpusReader.__init__(self, root, fileids)
        # When lazy, views are backed by streaming BNCWordView objects;
        # otherwise each file is parsed eagerly into plain lists.
        self._lazy = lazy
    def words(self, fileids=None, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)

        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        return self._views(fileids, False, None, strip_space, stem)
    def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        return self._views(fileids, False, "c5" if c5 else "pos", strip_space, stem)
    def sents(self, fileids=None, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))

        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        return self._views(fileids, True, None, strip_space, stem)
    def tagged_sents(self, fileids=None, c5=False, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        return self._views(
            fileids, sent=True, tag="c5" if c5 else "pos", strip_space=strip_space, stem=stem
        )
    def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False):
        """Build one view (lazy BNCWordView, or eager word/sentence list)
        per file and concatenate them."""
        make_view = BNCWordView if self._lazy else self._words
        views = [
            make_view(path, sent, tag, strip_space, stem)
            for path in self.abspaths(fileids)
        ]
        return concat(views)
    def _words(self, fileid, bracket_sent, tag, strip_space, stem):
        """
        Helper used to implement the view methods -- returns a list of
        words or a list of sentences, optionally tagged.

        :param fileid: The name of the underlying file.
        :param bracket_sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        out = []
        doc_root = ElementTree.parse(fileid).getroot()
        for xmlsent in doc_root.findall(".//s"):
            tokens = []
            for xmlword in _all_xmlwords_in(xmlsent):
                token = xmlword.text or ""  # missing text -> "" (issue 337)
                if strip_space or stem:
                    token = token.strip()
                if stem:
                    token = xmlword.get("hw", token)
                if tag == "c5":
                    token = (token, xmlword.get("c5"))
                elif tag == "pos":
                    token = (token, xmlword.get("pos", xmlword.get("c5")))
                tokens.append(token)
            if bracket_sent:
                out.append(BNCSentence(xmlsent.attrib["n"], tokens))
            else:
                out.extend(tokens)
        assert None not in out
        return out
def _all_xmlwords_in(elt, result=None):
if result is None:
result = []
for child in elt:
if child.tag in ("c", "w"):
result.append(child)
else:
_all_xmlwords_in(child, result)
return result
class BNCSentence(list):
    """
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (the ``n`` attribute from the XML).
    """
    def __init__(self, num, items):
        super().__init__(items)
        self.num = num
class BNCWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the BNC corpus.

    Yields either sentences (``BNCSentence``) or individual tokens,
    depending on the ``sent`` flag given at construction; tokens may be
    plain strings or ``(word, tag)`` tuples depending on ``tag``.
    """

    tags_to_ignore = {
        "pb",
        "gap",
        "vocal",
        "event",
        "unclear",
        "shift",
        "pause",
        "align",
    }
    """These tags are ignored. For their description refer to the
    technical documentation, for example,
    http://www.natcorp.ox.ac.uk/docs/URG/ref-vocal.html
    """

    def __init__(self, fileid, sent, tag, strip_space, stem):
        """
        :param fileid: The name of the underlying file.
        :param sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        # Select which XML elements this view yields: whole <s> sentence
        # elements, or the individual <c>/<w> tokens inside them.
        if sent:
            tagspec = ".*/s"
        else:
            tagspec = ".*/s/(.*/)?(c|w)"
        self._sent = sent
        self._tag = tag
        self._strip_space = strip_space
        self._stem = stem
        self.title = None  #: Title of the document.
        self.author = None  #: Author of the document.
        self.editor = None  #: Editor
        self.resps = None  #: Statement of responsibility
        XMLCorpusView.__init__(self, fileid, tagspec)
        # Read in a tasty header: open the stream once just to extract the
        # teiHeader metadata, then close it again.
        self._open()
        self.read_block(self._stream, ".*/teiHeader$", self.handle_header)
        self.close()
        # Reset tag context, so iteration starts from the beginning of the
        # file rather than after the header block we just consumed.
        self._tag_context = {0: ()}

    def handle_header(self, elt, context):
        # Extract document metadata from the TEI header; multiple entries
        # of the same kind are joined with newlines.
        titles = elt.findall("titleStmt/title")
        if titles:
            self.title = "\n".join(title.text.strip() for title in titles)
        authors = elt.findall("titleStmt/author")
        if authors:
            self.author = "\n".join(author.text.strip() for author in authors)
        editors = elt.findall("titleStmt/editor")
        if editors:
            self.editor = "\n".join(editor.text.strip() for editor in editors)
        resps = elt.findall("titleStmt/respStmt")
        if resps:
            self.resps = "\n\n".join(
                "\n".join(resp_elt.text.strip() for resp_elt in resp) for resp in resps
            )

    def handle_elt(self, elt, context):
        # Dispatch per the mode chosen in __init__ (sentences vs tokens).
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        # Convert one <w>/<c> element into a token, applying the
        # stripping/stemming/tagging options from __init__.
        word = elt.text
        if not word:
            word = ""  # fixes issue 337?
        if self._strip_space or self._stem:
            word = word.strip()
        if self._stem:
            word = elt.get("hw", word)
        if self._tag == "c5":
            word = (word, elt.get("c5"))
        elif self._tag == "pos":
            # Fall back to the c5 tag when no simplified pos tag is present.
            word = (word, elt.get("pos", elt.get("c5")))
        return word

    def handle_sent(self, elt):
        sent = []
        for child in elt:
            # Wrapper elements (multiwords, highlighting, corrections,
            # truncations) contain the actual <w>/<c> children.
            if child.tag in ("mw", "hi", "corr", "trunc"):
                sent += [self.handle_word(w) for w in child]
            elif child.tag in ("w", "c"):
                sent.append(self.handle_word(child))
            elif child.tag not in self.tags_to_ignore:
                raise ValueError("Unexpected element %s" % child.tag)
        return BNCSentence(elt.attrib["n"], sent)

View File

@@ -0,0 +1,237 @@
# Natural Language Toolkit: Penn Treebank Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for corpora that consist of parenthesis-delineated parse trees.
"""
import sys
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tag import map_tag
from nltk.tree import Tree
# we use [^\s()]+ instead of \S+? to avoid matching ()
# (position pos word) triples produced by AlpinoCorpusReader._normalize(ordered=True)
SORTTAGWRD = re.compile(r"\((\d+) ([^\s()]+) ([^\s()]+)\)")
# (tag word) leaf pairs, e.g. "(NN dog)"
TAGWORD = re.compile(r"\(([^\s()]+) ([^\s()]+)\)")
# the word part of a (tag word) leaf pair
WORD = re.compile(r"\([^\s()]+ ([^\s()]+)\)")
# a tree string that opens with an extra, label-less bracket, e.g. "( (S ..."
EMPTY_BRACKETS = re.compile(r"\s*\(\s*\(")
class BracketParseCorpusReader(SyntaxCorpusReader):
    """
    Reader for corpora that consist of parenthesis-delineated parse trees,
    like those found in the "combined" section of the Penn Treebank,
    e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))".
    """
    def __init__(
        self,
        root,
        fileids,
        comment_char=None,
        detect_blocks="unindented_paren",
        encoding="utf8",
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param comment_char: The character which can appear at the start of
            a line to indicate that the rest of the line is a comment.
        :param detect_blocks: The method that is used to find blocks
            in the corpus; can be 'unindented_paren' (every unindented
            parenthesis starts a new parse), 'sexpr' (brackets are
            matched), or 'blankline' (blocks are separated by blank lines).
        :param tagset: The name of the tagset used by this corpus, to be used
            for normalizing or converting the POS tags returned by the
            ``tagged_...()`` methods.
        """
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)
        self._comment_char = comment_char
        self._detect_blocks = detect_blocks
        self._tagset = tagset
    def _read_block(self, stream):
        # Dispatch on the block-detection strategy chosen at construction.
        if self._detect_blocks == "sexpr":
            return read_sexpr_block(stream, comment_char=self._comment_char)
        elif self._detect_blocks == "blankline":
            return read_blankline_block(stream)
        elif self._detect_blocks == "unindented_paren":
            # Tokens start with unindented left parens.
            toks = read_regexp_block(stream, start_re=r"^\(")
            # Strip any comments out of the tokens.
            if self._comment_char:
                toks = [
                    re.sub("(?m)^%s.*" % re.escape(self._comment_char), "", tok)
                    for tok in toks
                ]
            return toks
        else:
            # Was ``assert 0`` -- an assert is stripped under -O and gives a
            # less informative error.
            raise ValueError("bad block type: %r" % (self._detect_blocks,))
    def _normalize(self, t):
        # Replace leaves of the form (!), (,), with (! !), (, ,)
        t = re.sub(r"\((.)\)", r"(\1 \1)", t)
        # Replace leaves of the form (tag word root) with (tag word)
        t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
        return t
    def _parse(self, t):
        """Parse one block into a Tree, attempting paren-recovery and
        finally falling back to a flat parse on malformed input."""
        try:
            tree = Tree.fromstring(self._normalize(t))
            # If there's an empty node at the top, strip it off
            if tree.label() == "" and len(tree) == 1:
                return tree[0]
            else:
                return tree
        except ValueError as e:
            sys.stderr.write("Bad tree detected; trying to recover...\n")
            # Try to recover, if we can:
            if e.args == ("mismatched parens",):
                for n in range(1, 5):
                    try:
                        # BUGFIX: must use Tree.fromstring here -- calling
                        # Tree(...) with a single string raises TypeError,
                        # which escapes the ValueError handler above, so the
                        # recovery path could never succeed.
                        v = Tree.fromstring(self._normalize(t + ")" * n))
                        sys.stderr.write(
                            " Recovered by adding %d close " "paren(s)\n" % n
                        )
                        return v
                    except ValueError:
                        pass
            # Try something else:
            sys.stderr.write(" Recovered by returning a flat parse.\n")
            # sys.stderr.write(' '.join(t.split())+'\n')
            return Tree("S", self._tag(t))
    def _tag(self, t, tagset=None):
        # Extract (word, tag) pairs; TAGWORD captures (tag word), so swap.
        tagged_sent = [(w, p) for (p, w) in TAGWORD.findall(self._normalize(t))]
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (w, p) in tagged_sent
            ]
        return tagged_sent
    def _word(self, t):
        return WORD.findall(self._normalize(t))
class CategorizedBracketParseCorpusReader(
    CategorizedCorpusReader, BracketParseCorpusReader
):
    """
    A reader for parsed corpora whose documents are
    divided into categories based on their file identifiers.

    @author: Nathan Schneider <nschneid@cs.cmu.edu>
    """
    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are consumed by
        the ``CategorizedCorpusReader`` constructor; the remaining
        arguments are passed on to the ``BracketParseCorpusReader``
        constructor.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        BracketParseCorpusReader.__init__(self, *args, **kwargs)
    # Each view method accepts ``categories`` as an alternative to
    # ``fileids``; ``_resolve`` (from CategorizedCorpusReader) translates
    # the category selection into the corresponding fileids.
    def tagged_words(self, fileids=None, categories=None, tagset=None):
        return super().tagged_words(self._resolve(fileids, categories), tagset)
    def tagged_sents(self, fileids=None, categories=None, tagset=None):
        return super().tagged_sents(self._resolve(fileids, categories), tagset)
    def tagged_paras(self, fileids=None, categories=None, tagset=None):
        return super().tagged_paras(self._resolve(fileids, categories), tagset)
    def parsed_words(self, fileids=None, categories=None):
        return super().parsed_words(self._resolve(fileids, categories))
    def parsed_sents(self, fileids=None, categories=None):
        return super().parsed_sents(self._resolve(fileids, categories))
    def parsed_paras(self, fileids=None, categories=None):
        return super().parsed_paras(self._resolve(fileids, categories))
class AlpinoCorpusReader(BracketParseCorpusReader):
"""
Reader for the Alpino Dutch Treebank.
This corpus has a lexical breakdown structure embedded, as read by `_parse`
Unfortunately this puts punctuation and some other words out of the sentence
order in the xml element tree. This is no good for `tag_` and `word_`
`_tag` and `_word` will be overridden to use a non-default new parameter 'ordered'
to the overridden _normalize function. The _parse function can then remain
untouched.
"""
def __init__(self, root, encoding="ISO-8859-1", tagset=None):
BracketParseCorpusReader.__init__(
self,
root,
r"alpino\.xml",
detect_blocks="blankline",
encoding=encoding,
tagset=tagset,
)
def _normalize(self, t, ordered=False):
"""Normalize the xml sentence element in t.
The sentence elements <alpino_ds>, although embedded in a few overall
xml elements, are separated by blank lines. That's how the reader can
deliver them one at a time.
Each sentence has a few category subnodes that are of no use to us.
The remaining word nodes may or may not appear in the proper order.
Each word node has attributes, among which:
- begin : the position of the word in the sentence
- pos : Part of Speech: the Tag
- word : the actual word
The return value is a string with all xml elementes replaced by
clauses: either a cat clause with nested clauses, or a word clause.
The order of the bracket clauses closely follows the xml.
If ordered == True, the word clauses include an order sequence number.
If ordered == False, the word clauses only have pos and word parts.
"""
if t[:10] != "<alpino_ds":
return ""
# convert XML to sexpr notation
t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t)
if ordered:
t = re.sub(
r' <node. *?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>',
r"(\1 \2 \3)",
t,
)
else:
t = re.sub(r' <node .*?pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2)", t)
t = re.sub(r" </node>", r")", t)
t = re.sub(r"<sentence>.*</sentence>", r"", t)
t = re.sub(r"</?alpino_ds.*>", r"", t)
return t
def _tag(self, t, tagset=None):
tagged_sent = [
(int(o), w, p)
for (o, p, w) in SORTTAGWRD.findall(self._normalize(t, ordered=True))
]
tagged_sent.sort()
if tagset and tagset != self._tagset:
tagged_sent = [
(w, map_tag(self._tagset, tagset, p)) for (o, w, p) in tagged_sent
]
else:
tagged_sent = [(w, p) for (o, w, p) in tagged_sent]
return tagged_sent
def _word(self, t):
    """Return a correctly ordered list of words from the sentence in *t*."""
    tagged_sent = self._tag(t)
    return [w for (w, p) in tagged_sent]

View File

@@ -0,0 +1,168 @@
# Natural Language Toolkit: Categorized Sentences Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader structured for corpora that contain one instance on each row.
This CorpusReader is specifically used for the Subjectivity Dataset and the
Sentence Polarity Dataset.
- Subjectivity Dataset information -
Authors: Bo Pang and Lillian Lee.
Url: https://www.cs.cornell.edu/people/pabo/movie-review-data
Distributed with permission.
Related papers:
- Bo Pang and Lillian Lee. "A Sentimental Education: Sentiment Analysis Using
Subjectivity Summarization Based on Minimum Cuts". Proceedings of the ACL,
2004.
- Sentence Polarity Dataset information -
Authors: Bo Pang and Lillian Lee.
Url: https://www.cs.cornell.edu/people/pabo/movie-review-data
Related papers:
- Bo Pang and Lillian Lee. "Seeing stars: Exploiting class relationships for
sentiment categorization with respect to rating scales". Proceedings of the
ACL, 2005.
"""
from nltk.corpus.reader.api import *
from nltk.tokenize import *
class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    A reader for corpora in which each row represents a single instance, mainly
    a sentence. Instances are divided into categories based on their file
    identifiers (see CategorizedCorpusReader).
    Since many corpora allow rows that contain more than one sentence, it is
    possible to specify a sentence tokenizer to retrieve all sentences instead
    of all rows.

    Examples using the Subjectivity Dataset:

    >>> from nltk.corpus import subjectivity
    >>> subjectivity.sents()[23] # doctest: +NORMALIZE_WHITESPACE
    ['television', 'made', 'him', 'famous', ',', 'but', 'his', 'biggest', 'hits',
    'happened', 'off', 'screen', '.']
    >>> subjectivity.categories()
    ['obj', 'subj']
    >>> subjectivity.words(categories='subj')
    ['smart', 'and', 'alert', ',', 'thirteen', ...]

    Examples using the Sentence Polarity Dataset:

    >>> from nltk.corpus import sentence_polarity
    >>> sentence_polarity.sents() # doctest: +NORMALIZE_WHITESPACE
    [['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish',
    'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find',
    'it', 'funny', '.'], ...]
    >>> sentence_polarity.categories()
    ['neg', 'pos']
    """

    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=None,
        encoding="utf8",
        **kwargs
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param sent_tokenizer: a tokenizer for breaking paragraphs into sentences.
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer

    def _resolve_fileids(self, fileids, categories):
        """Map *categories* onto fileids and normalize the result to a list."""
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            return self._fileids
        if isinstance(fileids, str):
            return [fileids]
        return fileids

    def sents(self, fileids=None, categories=None):
        """
        Return all sentences in the corpus or in the specified file(s).

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences have
            to be returned.
        :return: the given file(s) as a list of sentences.
            Each sentence is tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        """
        fileids = self._resolve_fileids(fileids, categories)
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None, categories=None):
        """
        Return all words and punctuation symbols in the corpus or in the
        specified file(s).

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have to
            be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        fileids = self._resolve_fileids(fileids, categories)
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_sent_block(self, stream):
        """Read up to 20 lines from *stream* and return them as tokenized
        sentences (splitting each line with the sentence tokenizer first,
        when one was configured)."""
        sents = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # End of file: stop instead of spinning through the
                # remaining iterations on empty reads.
                break
            if self._sent_tokenizer:
                sents.extend(
                    [
                        self._word_tokenizer.tokenize(sent)
                        for sent in self._sent_tokenizer.tokenize(line)
                    ]
                )
            else:
                sents.append(self._word_tokenizer.tokenize(line))
        return sents

    def _read_word_block(self, stream):
        """Read a block of sentences and flatten it into one word list."""
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words

View File

@@ -0,0 +1,154 @@
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Masato Hagiwara <hagisan@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import sys
from nltk.corpus.reader import util
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
class ChasenCorpusReader(CorpusReader):
    """Reader for corpora in ChaSen format (morphologically analysed
    Japanese text, one token per line with tab-separated analysis fields)."""

    def __init__(self, root, fileids, encoding="utf8", sent_splitter=None):
        self._sent_splitter = sent_splitter
        CorpusReader.__init__(self, root, fileids, encoding)

    def _views(self, fileids, tagged, group_by_sent, group_by_para):
        """Build one ChasenCorpusView per requested file and concatenate."""
        return concat(
            [
                ChasenCorpusView(
                    fileid,
                    enc,
                    tagged,
                    group_by_sent,
                    group_by_para,
                    self._sent_splitter,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        return self._views(fileids, False, False, False)

    def tagged_words(self, fileids=None):
        return self._views(fileids, True, False, False)

    def sents(self, fileids=None):
        return self._views(fileids, False, True, False)

    def tagged_sents(self, fileids=None):
        return self._views(fileids, True, True, False)

    def paras(self, fileids=None):
        return self._views(fileids, False, True, True)

    def tagged_paras(self, fileids=None):
        return self._views(fileids, True, True, True)
class ChasenCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for ChasenReader. Similar to ``TaggedCorpusView``,
    but this'll use fixed sets of word and sentence tokenizer.
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        sent_splitter=None,
    ):
        # Output-shape flags: keep tags, nest words into sentences,
        # nest sentences into paragraphs.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        # Optional predicate on a (word, tag) pair marking an extra
        # sentence boundary in addition to the explicit EOS lines.
        self._sent_splitter = sent_splitter
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Reads one paragraph at a time."""
        block = []
        # A paragraph block runs up to (and including) a line "EOS".
        for para_str in read_regexp_block(stream, r".", r"^EOS\n"):
            para = []
            sent = []
            for line in para_str.splitlines():
                _eos = line.strip() == "EOS"  # ChaSen end-of-sentence marker
                _cells = line.split("\t")
                # (surface form, remaining tab-separated analysis fields)
                w = (_cells[0], "\t".join(_cells[1:]))
                if not _eos:
                    sent.append(w)
                if _eos or (self._sent_splitter and self._sent_splitter(w)):
                    # Flush the finished sentence into the paragraph.
                    if not self._tagged:
                        sent = [w for (w, t) in sent]
                    if self._group_by_sent:
                        para.append(sent)
                    else:
                        para.extend(sent)
                    sent = []
            if len(sent) > 0:
                # Flush a trailing sentence that had no explicit terminator.
                if not self._tagged:
                    sent = [w for (w, t) in sent]
                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)
            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)
        return block
def demo():
    # Sample the JEITA corpus: print a slice of words, then a few tagged
    # sentences rendered as word/POS pairs (POS is the third ChaSen field).
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
    print("/".join(jeita.words()[22100:22140]))
    print(
        "\nEOS\n".join(
            "\n".join("{}/{}".format(w[0], w[1].split("\t")[2]) for w in sent)
            for sent in jeita.tagged_sents()[2170:2173]
        )
    )


def test():
    # Smoke test: tag payloads should be decoded text, not bytes.
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
    assert isinstance(jeita.tagged_words()[0][1], str)


if __name__ == "__main__":
    demo()
    test()

View File

@@ -0,0 +1,630 @@
# CHILDES XML Corpus Reader
# Copyright (C) 2001-2025 NLTK Project
# Author: Tomonori Nagano <tnagano@gc.cuny.edu>
# Alexis Dimitriadis <A.Dimitriadis@uu.nl>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for the XML version of the CHILDES corpus.
"""
__docformat__ = "epytext en"
import re
from collections import defaultdict
from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader
from nltk.util import LazyConcatenation, LazyMap, flatten
# to resolve the namespace issue
NS = "http://www.talkbank.org/ns/talkbank"
class CHILDESCorpusReader(XMLCorpusReader):
"""
Corpus reader for the XML version of the CHILDES corpus.
The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML
version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``.
Copy the needed parts of the CHILDES XML corpus into the NLTK data directory
(``nltk_data/corpora/CHILDES/``).
For access to the file text use the usual nltk functions,
``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``.
"""
def __init__(self, root, fileids, lazy=True):
    """
    :param root: the corpus root directory.
    :param fileids: a list or regexp selecting the XML files in the corpus.
    :param lazy: if true, accessors return lazy sequences (LazyMap /
        LazyConcatenation) instead of fully materialized lists.
    """
    XMLCorpusReader.__init__(self, root, fileids)
    self._lazy = lazy
def words(
self,
fileids=None,
speaker="ALL",
stem=False,
relation=False,
strip_space=True,
replace=False,
):
"""
:return: the given file(s) as a list of words
:rtype: list(str)
:param speaker: If specified, select specific speaker(s) defined
in the corpus. Default is 'ALL' (all participants). Common choices
are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
researchers)
:param stem: If true, then use word stems instead of word strings.
:param relation: If true, then return tuples of (stem, index,
dependent_index)
:param strip_space: If true, then strip trailing spaces from word
tokens. Otherwise, leave the spaces on the tokens.
:param replace: If true, then use the replaced (intended) word instead
of the original word (e.g., 'wat' will be replaced with 'watch')
"""
sent = None
pos = False
if not self._lazy:
return [
self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
for fileid in self.abspaths(fileids)
]
get_words = lambda fileid: self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
def tagged_words(
self,
fileids=None,
speaker="ALL",
stem=False,
relation=False,
strip_space=True,
replace=False,
):
"""
:return: the given file(s) as a list of tagged
words and punctuation symbols, encoded as tuples
``(word,tag)``.
:rtype: list(tuple(str,str))
:param speaker: If specified, select specific speaker(s) defined
in the corpus. Default is 'ALL' (all participants). Common choices
are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
researchers)
:param stem: If true, then use word stems instead of word strings.
:param relation: If true, then return tuples of (stem, index,
dependent_index)
:param strip_space: If true, then strip trailing spaces from word
tokens. Otherwise, leave the spaces on the tokens.
:param replace: If true, then use the replaced (intended) word instead
of the original word (e.g., 'wat' will be replaced with 'watch')
"""
sent = None
pos = True
if not self._lazy:
return [
self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
for fileid in self.abspaths(fileids)
]
get_words = lambda fileid: self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
def sents(
self,
fileids=None,
speaker="ALL",
stem=False,
relation=None,
strip_space=True,
replace=False,
):
"""
:return: the given file(s) as a list of sentences or utterances, each
encoded as a list of word strings.
:rtype: list(list(str))
:param speaker: If specified, select specific speaker(s) defined
in the corpus. Default is 'ALL' (all participants). Common choices
are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
researchers)
:param stem: If true, then use word stems instead of word strings.
:param relation: If true, then return tuples of ``(str,pos,relation_list)``.
If there is manually-annotated relation info, it will return
tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
:param strip_space: If true, then strip trailing spaces from word
tokens. Otherwise, leave the spaces on the tokens.
:param replace: If true, then use the replaced (intended) word instead
of the original word (e.g., 'wat' will be replaced with 'watch')
"""
sent = True
pos = False
if not self._lazy:
return [
self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
for fileid in self.abspaths(fileids)
]
get_words = lambda fileid: self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
def tagged_sents(
self,
fileids=None,
speaker="ALL",
stem=False,
relation=None,
strip_space=True,
replace=False,
):
"""
:return: the given file(s) as a list of
sentences, each encoded as a list of ``(word,tag)`` tuples.
:rtype: list(list(tuple(str,str)))
:param speaker: If specified, select specific speaker(s) defined
in the corpus. Default is 'ALL' (all participants). Common choices
are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
researchers)
:param stem: If true, then use word stems instead of word strings.
:param relation: If true, then return tuples of ``(str,pos,relation_list)``.
If there is manually-annotated relation info, it will return
tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
:param strip_space: If true, then strip trailing spaces from word
tokens. Otherwise, leave the spaces on the tokens.
:param replace: If true, then use the replaced (intended) word instead
of the original word (e.g., 'wat' will be replaced with 'watch')
"""
sent = True
pos = True
if not self._lazy:
return [
self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
for fileid in self.abspaths(fileids)
]
get_words = lambda fileid: self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
def corpus(self, fileids=None):
    """
    :return: the given file(s) as a dict of ``(corpus_property_key, value)``
    :rtype: list(dict)
    """
    if not self._lazy:
        return [self._get_corpus(fileid) for fileid in self.abspaths(fileids)]
    # Lazy path: parse each file only when its element is accessed.
    return LazyMap(self._get_corpus, self.abspaths(fileids))
def _get_corpus(self, fileid):
results = dict()
xmldoc = ElementTree.parse(fileid).getroot()
for key, value in xmldoc.items():
results[key] = value
return results
def participants(self, fileids=None):
    """
    :return: the given file(s) as a dict of
        ``(participant_property_key, value)``
    :rtype: list(dict)
    """
    if not self._lazy:
        return [self._get_participants(fileid) for fileid in self.abspaths(fileids)]
    return LazyMap(self._get_participants, self.abspaths(fileids))


def _get_participants(self, fileid):
    # multidimensional dicts: nested defaultdicts of arbitrary depth
    def dictOfDicts():
        return defaultdict(dictOfDicts)

    xmldoc = ElementTree.parse(fileid).getroot()
    # getting participants' data: one nested dict per participant id,
    # keyed by the <participant> element's XML attributes.
    pat = dictOfDicts()
    for participant in xmldoc.findall(
        f".//{{{NS}}}Participants/{{{NS}}}participant"
    ):
        for key, value in participant.items():
            pat[participant.get("id")][key] = value
    return pat
def age(self, fileids=None, speaker="CHI", month=False):
    """
    :return: the given file(s) as string or int
    :rtype: list or int

    :param speaker: participant id whose age to report (default 'CHI').
    :param month: If true, return months instead of year-month-date
    """
    if not self._lazy:
        return [
            self._get_age(fileid, speaker, month)
            for fileid in self.abspaths(fileids)
        ]
    get_age = lambda fileid: self._get_age(fileid, speaker, month)
    return LazyMap(get_age, self.abspaths(fileids))
def _get_age(self, fileid, speaker, month):
    # Scan the participant entries for *speaker* and return its "age"
    # attribute, converted to months when requested. Returns None (by
    # falling off the loop) when the speaker is not found.
    xmldoc = ElementTree.parse(fileid).getroot()
    for pat in xmldoc.findall(f".//{{{NS}}}Participants/{{{NS}}}participant"):
        try:
            if pat.get("id") == speaker:
                age = pat.get("age")
                if month:
                    age = self.convert_age(age)
                return age
        # some files don't have age data
        # NOTE(review): deliberate best-effort — a missing/malformed age
        # yields None rather than an exception.
        except (TypeError, AttributeError) as e:
            return None
def convert_age(self, age_year):
    """Calculate age in months from a string in CHILDES format.

    :param age_year: an ISO-8601-style CHILDES age string, e.g.
        ``"P2Y6M15D"`` (2 years, 6 months, 15 days).
    :return: the age in whole months, rounded up when more than 15 days
        into the next month, or None if the string cannot be parsed.
    """
    m = re.match(r"P(\d+)Y(\d+)M?(\d?\d?)D?", age_year)
    if m is None:
        # Unparsable age string (some corpora store ages in other
        # formats, or not at all): report "unknown" instead of raising.
        return None
    age_month = int(m.group(1)) * 12 + int(m.group(2))
    try:
        # more than half a month into the next month: round up
        if int(m.group(3)) > 15:
            age_month += 1
    # some corpora don't have age information?
    except ValueError as e:
        pass
    return age_month
def MLU(self, fileids=None, speaker="CHI"):
    """
    Mean length of utterance, in morphemes, for the given speaker.

    :return: the given file(s) as a floating number
    :rtype: list(float)
    """
    if not self._lazy:
        return [
            self._getMLU(fileid, speaker=speaker)
            for fileid in self.abspaths(fileids)
        ]
    get_MLU = lambda fileid: self._getMLU(fileid, speaker=speaker)
    return LazyMap(get_MLU, self.abspaths(fileids))
def _getMLU(self, fileid, speaker):
    """Compute the mean length of utterance (in morphemes) for *speaker*,
    skipping unintelligible, empty, and immediately-repeated sentences,
    and discounting filler tokens."""
    sents = self._get_words(
        fileid,
        speaker=speaker,
        sent=True,
        stem=True,
        relation=False,
        pos=True,
        strip_space=True,
        replace=True,
    )
    results = []
    lastSent = []
    numFillers = 0
    sentDiscount = 0
    for sent in sents:
        posList = [pos for (word, pos) in sent]
        # if any part of the sentence is unintelligible ("unk" tag), skip it
        if any(pos == "unk" for pos in posList):
            continue
        # if the sentence is null
        elif sent == []:
            continue
        # if the sentence is the same as the last sent
        elif sent == lastSent:
            continue
        else:
            results.append([word for (word, pos) in sent])
            # count number of fillers ("co" tag or missing tag)
            if len({"co", None}.intersection(posList)) > 0:
                numFillers += posList.count("co")
                numFillers += posList.count(None)
                sentDiscount += 1
        lastSent = sent
    try:
        thisWordList = flatten(results)
        # count number of morphemes
        # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
        numWords = (
            len(flatten([word.split("-") for word in thisWordList])) - numFillers
        )
        numSents = len(results) - sentDiscount
        mlu = numWords / numSents
    except ZeroDivisionError:
        # no countable sentences at all
        mlu = 0
    # return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents}
    return mlu
def _get_words(
self, fileid, speaker, sent, stem, relation, pos, strip_space, replace
):
if (
isinstance(speaker, str) and speaker != "ALL"
): # ensure we have a list of speakers
speaker = [speaker]
xmldoc = ElementTree.parse(fileid).getroot()
# processing each xml doc
results = []
for xmlsent in xmldoc.findall(".//{%s}u" % NS):
sents = []
# select speakers
if speaker == "ALL" or xmlsent.get("who") in speaker:
for xmlword in xmlsent.findall(".//{%s}w" % NS):
infl = None
suffixStem = None
suffixTag = None
# getting replaced words
if replace and xmlsent.find(f".//{{{NS}}}w/{{{NS}}}replacement"):
xmlword = xmlsent.find(
f".//{{{NS}}}w/{{{NS}}}replacement/{{{NS}}}w"
)
elif replace and xmlsent.find(f".//{{{NS}}}w/{{{NS}}}wk"):
xmlword = xmlsent.find(f".//{{{NS}}}w/{{{NS}}}wk")
# get text
if xmlword.text:
word = xmlword.text
else:
word = ""
# strip tailing space
if strip_space:
word = word.strip()
# stem
if relation or stem:
try:
xmlstem = xmlword.find(".//{%s}stem" % NS)
word = xmlstem.text
except AttributeError as e:
pass
# if there is an inflection
try:
xmlinfl = xmlword.find(
f".//{{{NS}}}mor/{{{NS}}}mw/{{{NS}}}mk"
)
word += "-" + xmlinfl.text
except:
pass
# if there is a suffix
try:
xmlsuffix = xmlword.find(
".//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem"
% (NS, NS, NS, NS)
)
suffixStem = xmlsuffix.text
except AttributeError:
suffixStem = ""
if suffixStem:
word += "~" + suffixStem
# pos
if relation or pos:
try:
xmlpos = xmlword.findall(".//{%s}c" % NS)
xmlpos2 = xmlword.findall(".//{%s}s" % NS)
if xmlpos2 != []:
tag = xmlpos[0].text + ":" + xmlpos2[0].text
else:
tag = xmlpos[0].text
except (AttributeError, IndexError) as e:
tag = ""
try:
xmlsuffixpos = xmlword.findall(
".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c"
% (NS, NS, NS, NS, NS)
)
xmlsuffixpos2 = xmlword.findall(
".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s"
% (NS, NS, NS, NS, NS)
)
if xmlsuffixpos2:
suffixTag = (
xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text
)
else:
suffixTag = xmlsuffixpos[0].text
except:
pass
if suffixTag:
tag += "~" + suffixTag
word = (word, tag)
# relational
# the gold standard is stored in
# <mor></mor><mor type="trn"><gra type="grt">
if relation == True:
for xmlstem_rel in xmlword.findall(
f".//{{{NS}}}mor/{{{NS}}}gra"
):
if not xmlstem_rel.get("type") == "grt":
word = (
word[0],
word[1],
xmlstem_rel.get("index")
+ "|"
+ xmlstem_rel.get("head")
+ "|"
+ xmlstem_rel.get("relation"),
)
else:
word = (
word[0],
word[1],
word[2],
word[0],
word[1],
xmlstem_rel.get("index")
+ "|"
+ xmlstem_rel.get("head")
+ "|"
+ xmlstem_rel.get("relation"),
)
try:
for xmlpost_rel in xmlword.findall(
f".//{{{NS}}}mor/{{{NS}}}mor-post/{{{NS}}}gra"
):
if not xmlpost_rel.get("type") == "grt":
suffixStem = (
suffixStem[0],
suffixStem[1],
xmlpost_rel.get("index")
+ "|"
+ xmlpost_rel.get("head")
+ "|"
+ xmlpost_rel.get("relation"),
)
else:
suffixStem = (
suffixStem[0],
suffixStem[1],
suffixStem[2],
suffixStem[0],
suffixStem[1],
xmlpost_rel.get("index")
+ "|"
+ xmlpost_rel.get("head")
+ "|"
+ xmlpost_rel.get("relation"),
)
except:
pass
sents.append(word)
if sent or relation:
results.append(sents)
else:
results.extend(sents)
return LazyMap(lambda x: x, results)
# Ready-to-use browser opener
"""
The base URL for viewing files on the childes website. This
shouldn't need to be changed, unless CHILDES changes the configuration
of their server or unless the user sets up their own corpus webserver.
"""
childes_url_base = r"https://childes.talkbank.org/browser/index.php?url="


def webview_file(self, fileid, urlbase=None):
    """Map a corpus file to its web version on the CHILDES website,
    and open it in a web browser.

    The complete URL to be used is:
    childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha')

    If no urlbase is passed, we try to calculate it. This
    requires that the childes corpus was set up to mirror the
    folder hierarchy under childes.psy.cmu.edu/data-xml/, e.g.:
    nltk_data/corpora/childes/Eng-USA/Cornell/??? or
    nltk_data/corpora/childes/Romance/Spanish/Aguirre/???

    The function first looks (as a special case) if "Eng-USA" is
    on the path consisting of <corpus root>+fileid; then if
    "childes", possibly followed by "data-xml", appears. If neither
    one is found, we use the unmodified fileid and hope for the best.
    If this is not right, specify urlbase explicitly, e.g., if the
    corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'.
    """
    import webbrowser

    if urlbase:
        path = urlbase + "/" + fileid
    else:
        full = self.root + "/" + fileid
        # normalize Windows path separators before matching
        full = re.sub(r"\\", "/", full)
        if "/childes/" in full.lower():
            # Discard /data-xml/ if present
            path = re.findall(r"(?i)/childes(?:/data-xml)?/(.*)\.xml", full)[0]
        elif "eng-usa" in full.lower():
            path = "Eng-USA/" + re.findall(r"/(?i)Eng-USA/(.*)\.xml", full)[0]
        else:
            path = fileid
    # Strip ".xml" and add ".cha", as necessary:
    if path.endswith(".xml"):
        path = path[:-4]
    if not path.endswith(".cha"):
        path = path + ".cha"
    url = self.childes_url_base + path
    webbrowser.open_new_tab(url)
    print("Opening in browser:", url)
    # Pausing is a good idea, but it's up to the user...
    # raw_input("Hit Return to continue")
def demo(corpus_root=None):
"""
The CHILDES corpus should be manually downloaded and saved
to ``[NLTK_Data_Dir]/corpora/childes/``
"""
if not corpus_root:
from nltk.data import find
corpus_root = find("corpora/childes/data-xml/Eng-USA/")
try:
childes = CHILDESCorpusReader(corpus_root, ".*.xml")
# describe all corpus
for file in childes.fileids()[:5]:
corpus = ""
corpus_id = ""
for key, value in childes.corpus(file)[0].items():
if key == "Corpus":
corpus = value
if key == "Id":
corpus_id = value
print("Reading", corpus, corpus_id, " .....")
print("words:", childes.words(file)[:7], "...")
print(
"words with replaced words:",
childes.words(file, replace=True)[:7],
" ...",
)
print("words with pos tags:", childes.tagged_words(file)[:7], " ...")
print("words (only MOT):", childes.words(file, speaker="MOT")[:7], "...")
print("words (only CHI):", childes.words(file, speaker="CHI")[:7], "...")
print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
print(
"words with relations and pos-tag:",
childes.words(file, relation=True)[:5],
" ...",
)
print("sentence:", childes.sents(file)[:2], " ...")
for participant, values in childes.participants(file)[0].items():
for key, value in values.items():
print("\tparticipant", participant, key, ":", value)
print("num of sent:", len(childes.sents(file)))
print("num of morphemes:", len(childes.words(file, stem=True)))
print("age:", childes.age(file))
print("age in month:", childes.age(file, month=True))
print("MLU:", childes.MLU(file))
print()
except LookupError as e:
print(
"""The CHILDES corpus, or the parts you need, should be manually
downloaded from https://childes.talkbank.org/data-xml/ and saved at
[NLTK_Data_Dir]/corpora/childes/
Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
demo('/path/to/childes/data-xml/Eng-USA/")
"""
)
# corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
# corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
##this fails
# childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())
if __name__ == "__main__":
demo()

View File

@@ -0,0 +1,273 @@
# Natural Language Toolkit: Chunked Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A reader for corpora that contain chunked (and optionally tagged)
documents.
"""
import codecs
import os.path
import nltk
from nltk.chunk import tagstr2tree
from nltk.corpus.reader.api import *
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
from nltk.corpus.reader.util import *
from nltk.tokenize import *
from nltk.tree import Tree
class ChunkedCorpusReader(CorpusReader):
    """
    Reader for chunked (and optionally tagged) corpora. Paragraphs
    are split using a block reader, tokenized into sentences with a
    sentence tokenizer, and finally parsed into chunk trees with a
    string-to-chunktree conversion function. Each step can use a
    default or a custom function. By default, paragraphs are split on
    blank lines; sentences are listed one per line; and sentences are
    parsed into chunk trees using ``nltk.chunk.tagstr2tree``.
    """

    def __init__(
        self,
        root,
        fileids,
        extension="",
        str2chunktree=tagstr2tree,
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
        encoding="utf8",
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        # Arguments for corpus views generated by this corpus: a tuple
        # (str2chunktree, sent_tokenizer, para_block_reader, source_tagset)
        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader, tagset)

    def _view(self, fileids, tagged, group_by_sent, group_by_para, chunked, tagset=None):
        """One ChunkedCorpusView per file, concatenated into one sequence."""
        return concat(
            [
                ChunkedCorpusView(
                    f,
                    enc,
                    tagged,
                    group_by_sent,
                    group_by_para,
                    chunked,
                    *self._cv_args,
                    target_tagset=tagset,
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return self._view(fileids, 0, 0, 0, 0)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences or utterances,
            each encoded as a list of word strings.
        :rtype: list(list(str))
        """
        return self._view(fileids, 0, 1, 0, 0)

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of paragraphs, each encoded as
            a list of sentences, which are in turn encoded as lists of
            word strings.
        :rtype: list(list(list(str)))
        """
        return self._view(fileids, 0, 1, 1, 0)

    def tagged_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged words and punctuation
            symbols, encoded as tuples ``(word,tag)``.
        :rtype: list(tuple(str,str))
        """
        return self._view(fileids, 1, 0, 0, 0, tagset)

    def tagged_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of sentences, each encoded as
            a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))
        """
        return self._view(fileids, 1, 1, 0, 0, tagset)

    def tagged_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of paragraphs, each encoded as
            a list of sentences, which are in turn encoded as lists of
            ``(word,tag)`` tuples.
        :rtype: list(list(list(tuple(str,str))))
        """
        return self._view(fileids, 1, 1, 1, 0, tagset)

    def chunked_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged words and chunks.
            Words are encoded as ``(word, tag)`` tuples (if the corpus has
            tags) or word strings (if the corpus has no tags). Chunks are
            encoded as depth-one trees over ``(word,tag)`` tuples or word
            strings.
        :rtype: list(tuple(str,str) and Tree)
        """
        return self._view(fileids, 1, 0, 0, 1, tagset)

    def chunked_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of sentences, each encoded as
            a shallow Tree. The leaves of these trees are encoded as
            ``(word, tag)`` tuples (if the corpus has tags) or word
            strings (if the corpus has no tags).
        :rtype: list(Tree)
        """
        return self._view(fileids, 1, 1, 0, 1, tagset)

    def chunked_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of paragraphs, each encoded as
            a list of sentences, which are in turn encoded as a shallow
            Tree. The leaves of these trees are encoded as ``(word, tag)``
            tuples (if the corpus has tags) or word strings (if the corpus
            has no tags).
        :rtype: list(list(Tree))
        """
        return self._view(fileids, 1, 1, 1, 1, tagset)

    def _read_block(self, stream):
        return [tagstr2tree(t) for t in read_blankline_block(stream)]
class ChunkedCorpusView(StreamBackedCorpusView):
    """
    A corpus view for chunked corpora: reads one paragraph block at a
    time and converts it into words, sentences, paragraphs, or chunk
    trees according to the constructor flags.
    """

    def __init__(
        self,
        fileid,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        chunked,
        str2chunktree,
        sent_tokenizer,
        para_block_reader,
        source_tagset=None,
        target_tagset=None,
    ):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
        # Output-shape flags: keep tags, nest into sentences/paragraphs,
        # and whether to keep chunk (Tree) structure.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._chunked = chunked
        # Pipeline pieces: paragraph block reader -> sentence tokenizer
        # -> string-to-chunktree parser (with optional tagset mapping).
        self._str2chunktree = str2chunktree
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._source_tagset = source_tagset
        self._target_tagset = target_tagset

    def read_block(self, stream):
        """Read one paragraph block and convert it to the requested shape."""
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                sent = self._str2chunktree(
                    sent_str,
                    source_tagset=self._source_tagset,
                    target_tagset=self._target_tagset,
                )
                # If requested, throw away the tags.
                if not self._tagged:
                    sent = self._untag(sent)
                # If requested, throw away the chunks.
                if not self._chunked:
                    sent = sent.leaves()
                # Add the sentence to `para`.
                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)
            # Add the paragraph to `block`.
            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)
        # Return the block
        return block

    def _untag(self, tree):
        """Replace every (word, tag) leaf of *tree* with the bare word,
        recursing into nested chunks; mutates and returns *tree*."""
        for i, child in enumerate(tree):
            if isinstance(child, Tree):
                self._untag(child)
            elif isinstance(child, tuple):
                tree[i] = child[0]
            else:
                raise ValueError("expected child to be Tree or tuple")
        return tree

View File

@@ -0,0 +1,88 @@
# Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6]
ftp://ftp.cs.cmu.edu/project/speech/dict/
Copyright 1998 Carnegie Mellon University
File Format: Each line consists of an uppercased word, a counter
(for alternative pronunciations), and a transcription. Vowels are
marked for stress (1=primary, 2=secondary, 0=no stress). E.g.:
NATURAL 1 N AE1 CH ER0 AH0 L
The dictionary contains 127069 entries. Of these, 119400 words are assigned
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
three or more pronunciations. Many of these are fast-speech variants.
Phonemes: There are 39 phonemes, as shown below:
Phoneme Example Translation Phoneme Example Translation
------- ------- ----------- ------- ------- -----------
AA odd AA D AE at AE T
AH hut HH AH T AO ought AO T
AW cow K AW AY hide HH AY D
B be B IY CH cheese CH IY Z
D dee D IY DH thee DH IY
EH Ed EH D ER hurt HH ER T
EY ate EY T F fee F IY
G green G R IY N HH he HH IY
IH it IH T IY eat IY T
JH gee JH IY K key K IY
L lee L IY M me M IY
N knee N IY NG ping P IH NG
OW oat OW T OY toy T OY
P pee P IY R read R IY D
S sea S IY SH she SH IY
T tea T IY TH theta TH EY T AH
UH hood HH UH D UW two T UW
V vee V IY W we W IY
Y yield Y IY L D Z zee Z IY
ZH seizure S IY ZH ER
"""
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.util import Index
class CMUDictCorpusReader(CorpusReader):
    def entries(self):
        """
        :return: the cmudict lexicon as a list of entries
            containing (word, transcriptions) tuples.
        """
        views = [
            StreamBackedCorpusView(path, read_cmudict_block, encoding=enc)
            for path, enc in self.abspaths(None, True)
        ]
        return concat(views)

    def words(self):
        """
        :return: a list of all words defined in the cmudict lexicon.
        """
        return [entry[0].lower() for entry in self.entries()]

    def dict(self):
        """
        :return: the cmudict lexicon as a dictionary, whose keys are
            lowercase words and whose values are lists of pronunciations.
        """
        index = Index(self.entries())
        return {word: prons for word, prons in index.items()}
def read_cmudict_block(stream):
    """
    Read up to 100 cmudict entries from *stream*, returning a list of
    ``(word, transcription)`` tuples.  The word is lowercased and the
    counter field (second column) is discarded.  Returns a short (or
    empty) list at end of file.
    """
    entries = []
    for _ in range(100):  # cap each block at 100 entries
        line = stream.readline()
        if not line:
            break  # end of file.
        fields = line.split()
        entries.append((fields[0].lower(), fields[2:]))
    return entries

View File

@@ -0,0 +1,309 @@
# Natural Language Toolkit: Comparative Sentence Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for the Comparative Sentence Dataset.
- Comparative Sentence Dataset information -
Annotated by: Nitin Jindal and Bing Liu, 2006.
Department of Computer Science
University of Illinois at Chicago
Contact: Nitin Jindal, njindal@cs.uic.edu
Bing Liu, liub@cs.uic.edu (https://www.cs.uic.edu/~liub)
Distributed with permission.
Related papers:
- Nitin Jindal and Bing Liu. "Identifying Comparative Sentences in Text Documents".
Proceedings of the ACM SIGIR International Conference on Information Retrieval
(SIGIR-06), 2006.
- Nitin Jindal and Bing Liu. "Mining Comparative Sentences and Relations".
Proceedings of Twenty First National Conference on Artificial Intelligence
(AAAI-2006), 2006.
- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
Proceedings of the 22nd International Conference on Computational Linguistics
(Coling-2008), Manchester, 18-22 August, 2008.
"""
import re
from nltk.corpus.reader.api import *
from nltk.tokenize import *
# Regular expressions for dataset components
STARS = re.compile(r"^\*+$")  # separator line made entirely of asterisks
COMPARISON = re.compile(r"<cs-[1234]>")  # opening tag for any comparison type (1-4)
CLOSE_COMPARISON = re.compile(r"</cs-[1234]>")  # matching closing tag
GRAD_COMPARISON = re.compile(r"<cs-[123]>")  # gradable comparison types (1-3)
NON_GRAD_COMPARISON = re.compile(r"<cs-4>")  # non-gradable comparison (type 4)
ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)")  # "<code>_<entity/feature text>" annotations
KEYWORD = re.compile(r"\(([^\(]*)\)$")  # trailing "(keyword)" at end of a relation line
class Comparison:
    """
    A Comparison represents a comparative sentence and its constituents.
    """

    def __init__(
        self,
        text=None,
        comp_type=None,
        entity_1=None,
        entity_2=None,
        feature=None,
        keyword=None,
    ):
        """
        :param text: a string (optionally tokenized) containing a comparison.
        :param comp_type: an integer defining the type of comparison expressed.
            Values can be: 1 (Non-equal gradable), 2 (Equative), 3 (Superlative),
            4 (Non-gradable).
        :param entity_1: the first entity considered in the comparison relation.
        :param entity_2: the second entity considered in the comparison relation.
        :param feature: the feature considered in the comparison relation.
        :param keyword: the word or phrase which is used for that comparative relation.
        """
        self.text = text
        self.comp_type = comp_type
        self.entity_1 = entity_1
        self.entity_2 = entity_2
        self.feature = feature
        self.keyword = keyword

    def __repr__(self):
        # Same rendering as str.format on the original template string.
        return (
            f'Comparison(text="{self.text}", comp_type={self.comp_type}, '
            f'entity_1="{self.entity_1}", entity_2="{self.entity_2}", '
            f'feature="{self.feature}", keyword="{self.keyword}")'
        )
class ComparativeSentencesCorpusReader(CorpusReader):
    """
    Reader for the Comparative Sentence Dataset by Jindal and Liu (2006).
    >>> from nltk.corpus import comparative_sentences
    >>> comparison = comparative_sentences.comparisons()[0]
    >>> comparison.text # doctest: +NORMALIZE_WHITESPACE
    ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly',
    'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve",
    'had', '.']
    >>> comparison.entity_2
    'models'
    >>> (comparison.feature, comparison.keyword)
    ('rewind', 'more')
    >>> len(comparative_sentences.comparisons())
    853
    """

    # Corpus-view class used to build lazy, stream-backed views.
    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: a list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param sent_tokenizer: tokenizer for breaking paragraphs into sentences.
        :param encoding: the encoding that should be used to read the corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._readme = "README.txt"

    def comparisons(self, fileids=None):
        """
        Return all comparisons in the corpus.
        :param fileids: a list or regexp specifying the ids of the files whose
            comparisons have to be returned.
        :return: the given file(s) as a list of Comparison objects.
        :rtype: list(Comparison)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_comparison_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def keywords(self, fileids=None):
        """
        Return a set of all keywords used in the corpus.
        :param fileids: a list or regexp specifying the ids of the files whose
            keywords have to be returned.
        :return: the set of keywords and comparative phrases used in the corpus.
        :rtype: set(str)
        """
        all_keywords = concat(
            [
                self.CorpusView(path, self._read_keyword_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )
        keywords_set = {keyword.lower() for keyword in all_keywords if keyword}
        return keywords_set

    def keywords_readme(self):
        """
        Return the list of words and constituents considered as clues of a
        comparison (from listOfkeywords.txt).
        """
        keywords = []
        with self.open("listOfkeywords.txt") as fp:
            raw_text = fp.read()
        for line in raw_text.split("\n"):
            # Skip blank lines and "//" comment lines in the keyword file.
            if not line or line.startswith("//"):
                continue
            keywords.append(line.strip())
        return keywords

    def sents(self, fileids=None):
        """
        Return all sentences in the corpus.
        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: all sentences of the corpus as lists of tokens (or as plain
            strings, if no word tokenizer is specified).
        :rtype: list(list(str)) or list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None):
        """
        Return all words and punctuation symbols in the corpus.
        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_comparison_block(self, stream):
        """
        Read one annotated comparison from *stream* and return its
        Comparison objects; returns an empty list at end of file.
        Annotation layout: a tag line, then the sentence line, then a
        closing-tag line, then one relation line per gradable tag.
        """
        while True:
            line = stream.readline()
            if not line:
                return []  # end of file.
            comparison_tags = re.findall(COMPARISON, line)
            if comparison_tags:
                grad_comparisons = re.findall(GRAD_COMPARISON, line)
                non_grad_comparisons = re.findall(NON_GRAD_COMPARISON, line)
                # Advance to the next line (it contains the comparative sentence)
                comparison_text = stream.readline().strip()
                if self._word_tokenizer:
                    comparison_text = self._word_tokenizer.tokenize(comparison_text)
                # Skip the next line (it contains closing comparison tags)
                stream.readline()
                # If gradable comparisons are found, create Comparison instances
                # and populate their fields
                comparison_bundle = []
                if grad_comparisons:
                    # Each comparison tag has its own relations on a separate line
                    for comp in grad_comparisons:
                        comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
                        comparison = Comparison(
                            text=comparison_text, comp_type=comp_type
                        )
                        line = stream.readline()
                        entities_feats = ENTITIES_FEATS.findall(line)
                        if entities_feats:
                            for code, entity_feat in entities_feats:
                                if code == "1":
                                    comparison.entity_1 = entity_feat.strip()
                                elif code == "2":
                                    comparison.entity_2 = entity_feat.strip()
                                elif code == "3":
                                    comparison.feature = entity_feat.strip()
                        keyword = KEYWORD.findall(line)
                        if keyword:
                            comparison.keyword = keyword[0]
                        comparison_bundle.append(comparison)
                # If non-gradable comparisons are found, create a simple Comparison
                # instance for each one
                if non_grad_comparisons:
                    for comp in non_grad_comparisons:
                        # comp_type in this case should always be 4.
                        comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
                        comparison = Comparison(
                            text=comparison_text, comp_type=comp_type
                        )
                        comparison_bundle.append(comparison)
                # Flatten the list of comparisons before returning them
                # return concat([comparison_bundle])
                return comparison_bundle

    def _read_keyword_block(self, stream):
        """Collect the keyword of each comparison in the next block."""
        keywords = []
        for comparison in self._read_comparison_block(stream):
            keywords.append(comparison.keyword)
        return keywords

    def _read_sent_block(self, stream):
        """
        Read and tokenize the next plain (non-annotation) sentence line,
        skipping "***" delimited sections and annotation lines.
        """
        while True:
            line = stream.readline()
            if re.match(STARS, line):
                while True:
                    line = stream.readline()
                    if re.match(STARS, line):
                        break
                continue
            # A line is a sentence only if it carries no comparison markup.
            if (
                not re.findall(COMPARISON, line)
                and not ENTITIES_FEATS.findall(line)
                and not re.findall(CLOSE_COMPARISON, line)
            ):
                if self._sent_tokenizer:
                    return [
                        self._word_tokenizer.tokenize(sent)
                        for sent in self._sent_tokenizer.tokenize(line)
                    ]
                else:
                    return [self._word_tokenizer.tokenize(line)]

    def _read_word_block(self, stream):
        """Flatten the next sentence block into a list of tokens."""
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words

View File

@@ -0,0 +1,579 @@
# Natural Language Toolkit: CONLL Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Read CoNLL-style chunk fileids.
"""
import textwrap
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tag import map_tag
from nltk.tree import Tree
from nltk.util import LazyConcatenation, LazyMap
class ConllCorpusReader(CorpusReader):
    """
    A corpus reader for CoNLL-style files. These files consist of a
    series of sentences, separated by blank lines. Each sentence is
    encoded using a table (or "grid") of values, where each line
    corresponds to a single word, and each column corresponds to an
    annotation type. The set of columns used by CoNLL-style files can
    vary from corpus to corpus; the ``ConllCorpusReader`` constructor
    therefore takes an argument, ``columntypes``, which is used to
    specify the columns that are used by a given corpus. By default
    columns are split by consecutive whitespaces, with the
    ``separator`` argument you can set a string to split by (e.g.
    ``\'\t\'``).
    @todo: Add support for reading from corpora where different
        parallel files contain different columns.
    @todo: Possibly add caching of the grid corpus view? This would
        allow the same grid view to be used by different data access
        methods (eg words() and parsed_sents() could both share the
        same grid corpus view object).
    @todo: Better support for -DOCSTART-. Currently, we just ignore
        it, but it could be used to define methods that retrieve a
        document at a time (eg parsed_documents()).
    """

    # /////////////////////////////////////////////////////////////////
    # Column Types
    # /////////////////////////////////////////////////////////////////

    WORDS = "words"  #: column type for words
    POS = "pos"  #: column type for part-of-speech tags
    TREE = "tree"  #: column type for parse trees
    CHUNK = "chunk"  #: column type for chunk structures
    NE = "ne"  #: column type for named entities
    SRL = "srl"  #: column type for semantic role labels
    IGNORE = "ignore"  #: column type for column that should be ignored

    #: A list of all column types supported by the conll corpus reader.
    COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE)

    # /////////////////////////////////////////////////////////////////
    # Constructor
    # /////////////////////////////////////////////////////////////////

    def __init__(
        self,
        root,
        fileids,
        columntypes,
        chunk_types=None,
        root_label="S",
        pos_in_tree=False,
        srl_includes_roleset=True,
        encoding="utf8",
        tree_class=Tree,
        tagset=None,
        separator=None,
    ):
        """
        :param root: the root directory of the corpus.
        :param fileids: a list or regexp specifying the fileids in this corpus.
        :param columntypes: an ordered sequence of values from ``COLUMN_TYPES``,
            naming the annotation carried by each grid column.
        :param chunk_types: the chunk labels to keep (others treated as "O");
            a single string is accepted and wrapped in a list.
        :param root_label: node label for the root of chunk trees.
        :param pos_in_tree: if true, leave POS tags as preterminal nodes in
            parse trees instead of folding them into ``(word, tag)`` leaves.
        :param srl_includes_roleset: if true, the SRL column is followed by a
            roleset-id column.
        :param tree_class: class used to construct parse trees.
        :param tagset: the corpus' native tagset (used by tag mapping).
        :param separator: column separator string (default: any whitespace).
        """
        for columntype in columntypes:
            if columntype not in self.COLUMN_TYPES:
                raise ValueError("Bad column type %r" % columntype)
        if isinstance(chunk_types, str):
            chunk_types = [chunk_types]
        self._chunk_types = chunk_types
        self._colmap = {c: i for (i, c) in enumerate(columntypes)}
        self._pos_in_tree = pos_in_tree
        self._root_label = root_label  # for chunks
        self._srl_includes_roleset = srl_includes_roleset
        self._tree_class = tree_class
        CorpusReader.__init__(self, root, fileids, encoding)
        self._tagset = tagset
        self.sep = separator

    # /////////////////////////////////////////////////////////////////
    # Data Access Methods
    # /////////////////////////////////////////////////////////////////

    def words(self, fileids=None):
        """Return a flat list of word strings."""
        self._require(self.WORDS)
        return LazyConcatenation(LazyMap(self._get_words, self._grids(fileids)))

    def sents(self, fileids=None):
        """Return a list of sentences, each a list of word strings."""
        self._require(self.WORDS)
        return LazyMap(self._get_words, self._grids(fileids))

    def tagged_words(self, fileids=None, tagset=None):
        """Return a flat list of ``(word, tag)`` tuples."""
        self._require(self.WORDS, self.POS)

        def get_tagged_words(grid):
            return self._get_tagged_words(grid, tagset)

        return LazyConcatenation(LazyMap(get_tagged_words, self._grids(fileids)))

    def tagged_sents(self, fileids=None, tagset=None):
        """Return a list of sentences, each a list of ``(word, tag)`` tuples."""
        self._require(self.WORDS, self.POS)

        def get_tagged_words(grid):
            return self._get_tagged_words(grid, tagset)

        return LazyMap(get_tagged_words, self._grids(fileids))

    def chunked_words(self, fileids=None, chunk_types=None, tagset=None):
        """Return a flat list of tagged words and chunk Trees."""
        self._require(self.WORDS, self.POS, self.CHUNK)
        if chunk_types is None:
            chunk_types = self._chunk_types

        def get_chunked_words(grid):  # capture chunk_types as local var
            return self._get_chunked_words(grid, chunk_types, tagset)

        return LazyConcatenation(LazyMap(get_chunked_words, self._grids(fileids)))

    def chunked_sents(self, fileids=None, chunk_types=None, tagset=None):
        """Return a list of sentences as shallow chunk Trees."""
        self._require(self.WORDS, self.POS, self.CHUNK)
        if chunk_types is None:
            chunk_types = self._chunk_types

        def get_chunked_words(grid):  # capture chunk_types as local var
            return self._get_chunked_words(grid, chunk_types, tagset)

        return LazyMap(get_chunked_words, self._grids(fileids))

    def parsed_sents(self, fileids=None, pos_in_tree=None, tagset=None):
        """Return a list of parse Trees, one per sentence."""
        self._require(self.WORDS, self.POS, self.TREE)
        if pos_in_tree is None:
            pos_in_tree = self._pos_in_tree

        def get_parsed_sent(grid):  # capture pos_in_tree as local var
            return self._get_parsed_sent(grid, pos_in_tree, tagset)

        return LazyMap(get_parsed_sent, self._grids(fileids))

    def srl_spans(self, fileids=None):
        """Return, per sentence, the list of SRL span lists."""
        self._require(self.SRL)
        return LazyMap(self._get_srl_spans, self._grids(fileids))

    def srl_instances(self, fileids=None, pos_in_tree=None, flatten=True):
        """Return SRL instances, flattened across sentences unless
        ``flatten`` is false."""
        self._require(self.WORDS, self.POS, self.TREE, self.SRL)
        if pos_in_tree is None:
            pos_in_tree = self._pos_in_tree

        def get_srl_instances(grid):  # capture pos_in_tree as local var
            return self._get_srl_instances(grid, pos_in_tree)

        result = LazyMap(get_srl_instances, self._grids(fileids))
        if flatten:
            result = LazyConcatenation(result)
        return result

    def iob_words(self, fileids=None, tagset=None):
        """
        :return: a list of word/tag/IOB tuples
        :rtype: list(tuple)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        """
        self._require(self.WORDS, self.POS, self.CHUNK)

        def get_iob_words(grid):
            return self._get_iob_words(grid, tagset)

        return LazyConcatenation(LazyMap(get_iob_words, self._grids(fileids)))

    def iob_sents(self, fileids=None, tagset=None):
        """
        :return: a list of lists of word/tag/IOB tuples
        :rtype: list(list)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        """
        self._require(self.WORDS, self.POS, self.CHUNK)

        def get_iob_words(grid):
            return self._get_iob_words(grid, tagset)

        return LazyMap(get_iob_words, self._grids(fileids))

    # /////////////////////////////////////////////////////////////////
    # Grid Reading
    # /////////////////////////////////////////////////////////////////

    def _grids(self, fileids=None):
        # n.b.: we could cache the object returned here (keyed on
        # fileids), which would let us reuse the same corpus view for
        # different things (eg srl and parse trees).
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_grid_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def _read_grid_block(self, stream):
        """Read one blank-line-delimited block and split it into word grids."""
        grids = []
        for block in read_blankline_block(stream):
            block = block.strip()
            if not block:
                continue
            grid = [line.split(self.sep) for line in block.split("\n")]
            # If there's a docstart row, then discard. ([xx] eventually it
            # would be good to actually use it)
            if grid[0][self._colmap.get("words", 0)] == "-DOCSTART-":
                del grid[0]
            # Check that the grid is consistent.
            for row in grid:
                if len(row) != len(grid[0]):
                    raise ValueError("Inconsistent number of columns:\n%s" % block)
            grids.append(grid)
        return grids

    # /////////////////////////////////////////////////////////////////
    # Transforms
    # /////////////////////////////////////////////////////////////////
    # given a grid, transform it into some representation (e.g.,
    # a list of words or a parse tree).

    def _get_words(self, grid):
        return self._get_column(grid, self._colmap["words"])

    def _get_tagged_words(self, grid, tagset=None):
        pos_tags = self._get_column(grid, self._colmap["pos"])
        # Map tags to the requested tagset when it differs from the native one.
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        return list(zip(self._get_column(grid, self._colmap["words"]), pos_tags))

    def _get_iob_words(self, grid, tagset=None):
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        return list(
            zip(
                self._get_column(grid, self._colmap["words"]),
                pos_tags,
                self._get_column(grid, self._colmap["chunk"]),
            )
        )

    def _get_chunked_words(self, grid, chunk_types, tagset=None):
        # n.b.: this method is very similar to conllstr2tree.
        words = self._get_column(grid, self._colmap["words"])
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        chunk_tags = self._get_column(grid, self._colmap["chunk"])

        # stack[0] is the root; stack[-1] is the currently open chunk (if any).
        stack = [Tree(self._root_label, [])]

        for word, pos_tag, chunk_tag in zip(words, pos_tags, chunk_tags):
            if chunk_tag == "O":
                state, chunk_type = "O", ""
            else:
                (state, chunk_type) = chunk_tag.split("-")
            # If it's a chunk we don't care about, treat it as O.
            if chunk_types is not None and chunk_type not in chunk_types:
                state = "O"
            # Treat a mismatching I like a B.
            if state == "I" and chunk_type != stack[-1].label():
                state = "B"
            # For B or I: close any open chunks
            if state in "BO" and len(stack) == 2:
                stack.pop()
            # For B: start a new chunk.
            if state == "B":
                new_chunk = Tree(chunk_type, [])
                stack[-1].append(new_chunk)
                stack.append(new_chunk)
            # Add the word token.
            stack[-1].append((word, pos_tag))

        return stack[0]

    def _get_parsed_sent(self, grid, pos_in_tree, tagset=None):
        words = self._get_column(grid, self._colmap["words"])
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        parse_tags = self._get_column(grid, self._colmap["tree"])

        # Rebuild the bracketed s-expression from the per-word tree column.
        treestr = ""
        for word, pos_tag, parse_tag in zip(words, pos_tags, parse_tags):
            # Escape parens so they don't break the bracketed parse string.
            if word == "(":
                word = "-LRB-"
            if word == ")":
                word = "-RRB-"
            if pos_tag == "(":
                pos_tag = "-LRB-"
            if pos_tag == ")":
                pos_tag = "-RRB-"
            (left, right) = parse_tag.split("*")
            right = right.count(")") * ")"  # only keep ')'.
            treestr += f"{left} ({pos_tag} {word}) {right}"
        try:
            tree = self._tree_class.fromstring(treestr)
        except (ValueError, IndexError):
            # Fall back to wrapping the fragment in an explicit root node.
            tree = self._tree_class.fromstring(f"({self._root_label} {treestr})")

        if not pos_in_tree:
            # Fold each preterminal (POS word) into a (word, tag) leaf tuple.
            for subtree in tree.subtrees():
                for i, child in enumerate(subtree):
                    if (
                        isinstance(child, Tree)
                        and len(child) == 1
                        and isinstance(child[0], str)
                    ):
                        subtree[i] = (child[0], child.label())

        return tree

    def _get_srl_spans(self, grid):
        """
        list of list of (start, end), tag) tuples
        """
        if self._srl_includes_roleset:
            predicates = self._get_column(grid, self._colmap["srl"] + 1)
            start_col = self._colmap["srl"] + 2
        else:
            predicates = self._get_column(grid, self._colmap["srl"])
            start_col = self._colmap["srl"] + 1

        # Count how many predicates there are. This tells us how many
        # columns to expect for SRL data.
        num_preds = len([p for p in predicates if p != "-"])

        spanlists = []
        for i in range(num_preds):
            col = self._get_column(grid, start_col + i)
            spanlist = []
            # stack holds (tag, start_wordnum) for spans not yet closed.
            stack = []
            for wordnum, srl_tag in enumerate(col):
                (left, right) = srl_tag.split("*")
                for tag in left.split("("):
                    if tag:
                        stack.append((tag, wordnum))
                for i in range(right.count(")")):
                    (tag, start) = stack.pop()
                    spanlist.append(((start, wordnum + 1), tag))
            spanlists.append(spanlist)

        return spanlists

    def _get_srl_instances(self, grid, pos_in_tree):
        tree = self._get_parsed_sent(grid, pos_in_tree)
        spanlists = self._get_srl_spans(grid)
        if self._srl_includes_roleset:
            predicates = self._get_column(grid, self._colmap["srl"] + 1)
            rolesets = self._get_column(grid, self._colmap["srl"])
        else:
            predicates = self._get_column(grid, self._colmap["srl"])
            rolesets = [None] * len(predicates)

        instances = ConllSRLInstanceList(tree)
        for wordnum, predicate in enumerate(predicates):
            if predicate == "-":
                continue
            # Decide which spanlist to use. Don't assume that they're
            # sorted in the same order as the predicates (even though
            # they usually are).
            for spanlist in spanlists:
                for (start, end), tag in spanlist:
                    if wordnum in range(start, end) and tag in ("V", "C-V"):
                        break
                else:
                    continue
                break
            else:
                raise ValueError("No srl column found for %r" % predicate)
            instances.append(
                ConllSRLInstance(tree, wordnum, predicate, rolesets[wordnum], spanlist)
            )

        return instances

    # /////////////////////////////////////////////////////////////////
    # Helper Methods
    # /////////////////////////////////////////////////////////////////

    def _require(self, *columntypes):
        """Raise ValueError unless all *columntypes* are in this corpus."""
        for columntype in columntypes:
            if columntype not in self._colmap:
                raise ValueError(
                    "This corpus does not contain a %s " "column." % columntype
                )

    @staticmethod
    def _get_column(grid, column_index):
        """Return one column of the grid as a list (one value per word)."""
        return [grid[i][column_index] for i in range(len(grid))]
class ConllSRLInstance:
    """
    An SRL instance from a CoNLL corpus, which identifies and
    provides labels for the arguments of a single verb.
    """

    # [xx] add inst.core_arguments, inst.argm_arguments?

    def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans):
        """
        :param tree: the parse tree for the sentence.
        :param verb_head: word index of the verb's head word.
        :param verb_stem: the verb's stem.
        :param roleset: the verb's roleset identifier (may be None).
        :param tagged_spans: list of ``((start, end), tag)`` tuples
            covering the verb pieces and the arguments.
        """
        self.verb = []
        """A list of the word indices of the words that compose the
           verb whose arguments are identified by this instance.
           This will contain multiple word indices when multi-word
           verbs are used (e.g. 'turn on')."""

        self.verb_head = verb_head
        """The word index of the head word of the verb whose arguments
           are identified by this instance. E.g., for a sentence that
           uses the verb 'turn on,' ``verb_head`` will be the word index
           of the word 'turn'."""

        self.verb_stem = verb_stem

        self.roleset = roleset

        self.arguments = []
        """A list of ``(argspan, argid)`` tuples, specifying the location
           and type for each of the arguments identified by this
           instance. ``argspan`` is a tuple ``start, end``, indicating
           that the argument consists of the ``words[start:end]``."""

        self.tagged_spans = tagged_spans
        """A list of ``(span, id)`` tuples, specifying the location and
           type for each of the arguments, as well as the verb pieces,
           that make up this instance."""

        self.tree = tree
        """The parse tree for the sentence containing this instance."""

        self.words = tree.leaves()
        """A list of the words in the sentence containing this
           instance."""

        # Fill in the self.verb and self.arguments values.
        # Spans tagged V or C-V are verb pieces; everything else is an argument.
        for (start, end), tag in tagged_spans:
            if tag in ("V", "C-V"):
                self.verb += list(range(start, end))
            else:
                self.arguments.append(((start, end), tag))

    def __repr__(self):
        # Originally, its:
        ##plural = 's' if len(self.arguments) != 1 else ''
        plural = "s" if len(self.arguments) != 1 else ""
        return "<ConllSRLInstance for %r with %d argument%s>" % (
            (self.verb_stem, len(self.arguments), plural)
        )

    def pprint(self):
        """Render the instance: arguments bracketed, verb in <<...>>."""
        verbstr = " ".join(self.words[i][0] for i in self.verb)
        hdr = f"SRL for {verbstr!r} (stem={self.verb_stem!r}):\n"
        s = ""
        for i, word in enumerate(self.words):
            if isinstance(word, tuple):
                word = word[0]
            for (start, end), argid in self.arguments:
                if i == start:
                    s += "[%s " % argid
                if i == end:
                    s += "] "
            if i in self.verb:
                word = "<<%s>>" % word
            s += word + " "
        return hdr + textwrap.fill(
            s.replace(" ]", "]"), initial_indent="    ", subsequent_indent="    "
        )
class ConllSRLInstanceList(list):
    """
    Set of SRL instances for a single sentence, all sharing one parse tree.
    """

    def __init__(self, tree, instances=()):
        """
        :param tree: the parse tree shared by every instance in the list.
        :param instances: initial ``ConllSRLInstance`` items.
        """
        self.tree = tree
        list.__init__(self, instances)

    def __str__(self):
        return self.pprint()

    def pprint(self, include_tree=False):
        """
        Render the instances as CoNLL-style columns, one row per word.

        :param include_tree: if true, prepend word, POS and syntax
            columns reconstructed from ``self.tree``.
        """
        # Sanity check: trees should be the same
        for inst in self:
            if inst.tree != self.tree:
                raise ValueError("Tree mismatch!")

        # Bug fix: `words` is needed for the row loop below even when
        # include_tree is false (previously it was only bound inside the
        # `if include_tree:` branch, so pprint() raised NameError).
        words = self.tree.leaves()

        # If desired, build the POS and syntax columns from the tree:
        if include_tree:
            pos = [None] * len(words)
            synt = ["*"] * len(words)
            self._tree2conll(self.tree, 0, words, pos, synt)

        s = ""
        for i in range(len(words)):
            # optional tree columns
            if include_tree:
                s += "%-20s " % words[i]
                s += "%-8s " % pos[i]
                s += "%15s*%-8s " % tuple(synt[i].split("*"))

            # verb head column
            for inst in self:
                if i == inst.verb_head:
                    s += "%-20s " % inst.verb_stem
                    break
            else:
                s += "%-20s " % "-"
            # Remaining columns: self
            for inst in self:
                argstr = "*"
                for (start, end), argid in inst.tagged_spans:
                    if i == start:
                        argstr = f"({argid}{argstr}"
                    if i == (end - 1):
                        argstr += ")"
                s += "%-12s " % argstr
            s += "\n"
        return s

    def _tree2conll(self, tree, wordnum, words, pos, synt):
        """
        Walk *tree* depth-first, filling the ``pos`` and ``synt`` columns
        (and, for tuple leaves, the ``words`` column) starting at index
        ``wordnum``; returns the next unused word index.
        """
        assert isinstance(tree, Tree)
        if len(tree) == 1 and isinstance(tree[0], str):
            pos[wordnum] = tree.label()
            assert words[wordnum] == tree[0]
            return wordnum + 1
        elif len(tree) == 1 and isinstance(tree[0], tuple):
            assert len(tree[0]) == 2
            # Bug fix: was `pos[wordnum], pos[wordnum] = tree[0]`, which
            # discarded the word and assigned pos[wordnum] twice.
            words[wordnum], pos[wordnum] = tree[0]
            return wordnum + 1
        else:
            synt[wordnum] = f"({tree.label()}{synt[wordnum]}"
            for child in tree:
                wordnum = self._tree2conll(child, wordnum, words, pos, synt)
            synt[wordnum - 1] += ")"
            return wordnum
class ConllChunkCorpusReader(ConllCorpusReader):
    """
    A ConllCorpusReader whose data file contains three columns: words,
    pos, and chunk.
    """

    def __init__(
        self, root, fileids, chunk_types, encoding="utf8", tagset=None, separator=None
    ):
        # Delegate to the generic reader with the fixed three-column layout.
        super().__init__(
            root,
            fileids,
            ("words", "pos", "chunk"),
            chunk_types=chunk_types,
            encoding=encoding,
            tagset=tagset,
            separator=separator,
        )

View File

@@ -0,0 +1,106 @@
# Natural Language Toolkit: An Crubadan N-grams Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
An NLTK interface for the n-gram statistics gathered from
the corpora for each language using An Crubadan.
There are multiple potential applications for the data but
this reader was created with the goal of using it in the
context of language identification.
For details about An Crubadan, this data, and its potential uses, see:
http://borel.slu.edu/crubadan/index.html
"""
import re
from os import path
from nltk.corpus.reader import CorpusReader
from nltk.data import ZipFilePathPointer
from nltk.probability import FreqDist
class CrubadanCorpusReader(CorpusReader):
    """
    A corpus reader used to access language An Crubadan n-gram files.
    """

    _LANG_MAPPER_FILE = "table.txt"
    # Kept for backward compatibility; each instance gets its own cache
    # in __init__ (a shared class-level dict would leak data between
    # independent reader instances).
    _all_lang_freq = {}

    def __init__(self, root, fileids, encoding="utf8", tagset=None):
        """
        :param root: the root directory of the corpus.
        :param fileids: a list or regexp specifying the fileids in this corpus.
        :param encoding: the encoding used to read the corpus files.
        :param tagset: accepted for API compatibility; not used by this reader.
        """
        # Bug fix: previously the encoding argument was ignored and
        # "utf8" was always passed to the base class.
        super().__init__(root, fileids, encoding=encoding)
        # Per-instance cache of ISO 639-3 code -> FreqDist of 3-grams.
        self._all_lang_freq = {}
        self._lang_mapping_data = []
        self._load_lang_mapping_data()

    def lang_freq(self, lang):
        """Return n-gram FreqDist for a specific language
        given ISO 639-3 language code"""
        if lang not in self._all_lang_freq:
            self._all_lang_freq[lang] = self._load_lang_ngrams(lang)
        return self._all_lang_freq[lang]

    def langs(self):
        """Return a list of supported languages as ISO 639-3 codes"""
        return [row[1] for row in self._lang_mapping_data]

    def iso_to_crubadan(self, lang):
        """Return internal Crubadan code based on ISO 639-3 code
        (None if the code is unknown)."""
        for i in self._lang_mapping_data:
            if i[1].lower() == lang.lower():
                return i[0]

    def crubadan_to_iso(self, lang):
        """Return ISO 639-3 code given internal Crubadan code
        (None if the code is unknown)."""
        for i in self._lang_mapping_data:
            if i[0].lower() == lang.lower():
                return i[1]

    def _load_lang_mapping_data(self):
        """Load language mappings between codes and description from table.txt"""
        if isinstance(self.root, ZipFilePathPointer):
            raise RuntimeError(
                "Please install the 'crubadan' corpus first, use nltk.download()"
            )

        mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
        if self._LANG_MAPPER_FILE not in self.fileids():
            raise RuntimeError("Could not find language mapper file: " + mapper_file)

        with open(mapper_file, encoding="utf-8") as raw:
            strip_raw = raw.read().strip()

        # One "<crubadan_code>\t<iso_code>" row per line.
        self._lang_mapping_data = [row.split("\t") for row in strip_raw.split("\n")]

    def _load_lang_ngrams(self, lang):
        """Load single n-gram language file given the ISO 639-3 language code
        and return its FreqDist"""
        if lang not in self.langs():
            raise RuntimeError("Unsupported language.")

        crubadan_code = self.iso_to_crubadan(lang)
        ngram_file = path.join(self.root, crubadan_code + "-3grams.txt")

        if not path.isfile(ngram_file):
            raise RuntimeError("No N-gram file found for requested language.")

        counts = FreqDist()
        with open(ngram_file, encoding="utf-8") as f:
            # Each line: "<frequency> <ngram>".
            for line in f:
                data = line.split(" ")
                ngram = data[1].strip("\n")
                freq = int(data[0])
                counts[ngram] = freq
        return counts

View File

@@ -0,0 +1,115 @@
# Natural Language Toolkit: Dependency Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Kepa Sarasola <kepa.sarasola@ehu.es>
# Iker Manterola <returntothehangar@hotmail.com>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.parse import DependencyGraph
from nltk.tokenize import *
class DependencyCorpusReader(SyntaxCorpusReader):
    """
    Reader for dependency-parsed corpora, offering plain, tagged and
    dependency-graph views of the data via DependencyCorpusView.
    """
    def __init__(
        self,
        root,
        fileids,
        encoding="utf8",
        word_tokenizer=TabTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
    ):
        # NOTE(review): the word_tokenizer, sent_tokenizer and
        # para_block_reader arguments are accepted but never stored or
        # forwarded -- confirm whether SyntaxCorpusReader is expected to
        # receive them.  Also note the tokenizer instances used as default
        # argument values are shared across all instances.
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)
    #########################################################
    def words(self, fileids=None):
        """Return a flat list of untagged words from the given files."""
        return concat(
            [
                DependencyCorpusView(fileid, False, False, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )
    def tagged_words(self, fileids=None):
        """Return a flat list of (word, tag) tuples from the given files."""
        return concat(
            [
                DependencyCorpusView(fileid, True, False, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )
    def sents(self, fileids=None):
        """Return a list of sentences, each a list of words."""
        return concat(
            [
                DependencyCorpusView(fileid, False, True, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )
    def tagged_sents(self, fileids=None):
        """Return a list of sentences, each a list of (word, tag) tuples."""
        return concat(
            [
                DependencyCorpusView(fileid, True, True, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )
    def parsed_sents(self, fileids=None):
        """Return a list of DependencyGraph objects, one per sentence."""
        sents = concat(
            [
                DependencyCorpusView(fileid, False, True, True, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )
        return [DependencyGraph(sent) for sent in sents]
class DependencyCorpusView(StreamBackedCorpusView):
    """Stream-backed view over a dependency corpus file, yielding one
    blankline-delimited sentence block at a time."""
    _DOCSTART = "-DOCSTART- -DOCSTART- O\n"  # marks the start of a document
    def __init__(
        self,
        corpus_file,
        tagged,
        group_by_sent,
        dependencies,
        chunk_types=None,
        encoding="utf8",
    ):
        # Flags controlling the shape of the values read_block() yields.
        self._tagged = tagged
        self._dependencies = dependencies
        self._group_by_sent = group_by_sent
        self._chunk_types = chunk_types
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
    def read_block(self, stream):
        """Read the next sentence block and convert it to the requested form."""
        # Read the next sentence.
        sent = read_blankline_block(stream)[0].strip()
        # Strip off the docstart marker, if present.
        if sent.startswith(self._DOCSTART):
            sent = sent[len(self._DOCSTART) :].lstrip()
        # extract word and tag from any of the formats
        if not self._dependencies:
            lines = [line.split("\t") for line in sent.split("\n")]
            if len(lines[0]) == 3 or len(lines[0]) == 4:
                # 3/4-column format: word and tag are the first two fields.
                sent = [(line[0], line[1]) for line in lines]
            elif len(lines[0]) == 10:
                # 10-column format (presumably CoNLL): word is field 2,
                # tag is field 5 -- confirm against the corpus files.
                sent = [(line[1], line[4]) for line in lines]
            else:
                raise ValueError("Unexpected number of fields in dependency tree file")
            # discard tags if they weren't requested
            if not self._tagged:
                sent = [word for (word, tag) in sent]
        # Return the result.
        if self._group_by_sent:
            return [sent]
        else:
            return list(sent)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,116 @@
# Natural Language Toolkit: IEER Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for the Information Extraction and Entity Recognition Corpus.
NIST 1999 Information Extraction: Entity Recognition Evaluation
https://www.itl.nist.gov/iad/894.01/tests/ie-er/er_99/er_99.htm
This corpus contains the NEWSWIRE development test data for the
NIST 1999 IE-ER Evaluation. The files were taken from the
subdirectory: ``/ie_er_99/english/devtest/newswire/*.ref.nwt``
and filenames were shortened.
The corpus contains the following files: APW_19980314, APW_19980424,
APW_19980429, NYT_19980315, NYT_19980403, and NYT_19980407.
"""
import nltk
from nltk.corpus.reader.api import *
#: A dictionary whose keys are the names of documents in this corpus;
#: and whose values are descriptions of those documents' contents.
titles = {
    "APW_19980314": "Associated Press Weekly, 14 March 1998",
    "APW_19980424": "Associated Press Weekly, 24 April 1998",
    "APW_19980429": "Associated Press Weekly, 29 April 1998",
    "NYT_19980315": "New York Times, 15 March 1998",
    "NYT_19980403": "New York Times, 3 April 1998",
    "NYT_19980407": "New York Times, 7 April 1998",
}
#: A list of all documents in this corpus.
# Sorted document ids, derived from the keys of ``titles``.
documents = sorted(titles)
class IEERDocument:
    """A single IEER document: its text plus optional metadata fields."""

    def __init__(self, text, docno=None, doctype=None, date_time=None, headline=""):
        self.text = text
        self.docno = docno
        self.doctype = doctype
        self.date_time = date_time
        self.headline = headline

    def __repr__(self):
        # Summarize with the headline when available, otherwise with the
        # first dozen non-markup words of the body text.
        if self.headline:
            summary = " ".join(self.headline.leaves())
        else:
            body_words = [w for w in self.text.leaves() if w[:1] != "<"]
            summary = " ".join(body_words[:12]) + "..."
        if self.docno is None:
            return "<IEERDocument: %r>" % summary
        return f"<IEERDocument {self.docno}: {summary!r}>"
class IEERCorpusReader(CorpusReader):
    """
    Corpus reader for the NIST IE-ER newswire documents, providing both
    raw (``docs``) and chunk-parsed (``parsed_docs``) views of each file.
    """

    def docs(self, fileids=None):
        """Return the raw text of each document in the given files."""
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def parsed_docs(self, fileids=None):
        """Return each document as an IEERDocument with a chunk tree."""
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_parsed_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def _read_parsed_block(self, stream):
        # TODO: figure out why empty documents are being returned
        # Fix: parse each raw document exactly once (the previous version
        # parsed every document twice -- once for the docno filter and
        # once more for the returned value).
        parsed = (self._parse(doc) for doc in self._read_block(stream))
        return [doc for doc in parsed if doc.docno is not None]

    def _parse(self, doc):
        """Chunk-parse a raw document string into an IEERDocument."""
        val = nltk.chunk.ieerstr2tree(doc, root_label="DOCUMENT")
        if isinstance(val, dict):
            # ieerstr2tree extracted structured fields (text, docno, ...).
            return IEERDocument(**val)
        else:
            return IEERDocument(val)

    def _read_block(self, stream):
        """Read one <DOC>...</DOC> block from the stream, returning it as a
        single string (any preamble lines are included)."""
        out = []
        # Skip any preamble.
        while True:
            line = stream.readline()
            if not line:
                break
            if line.strip() == "<DOC>":
                break
            out.append(line)
        # Read the document
        while True:
            line = stream.readline()
            if not line:
                break
            out.append(line)
            if line.strip() == "</DOC>":
                break
        # Return the document
        return ["\n".join(out)]

View File

@@ -0,0 +1,93 @@
# Natural Language Toolkit: Indian Language POS-Tagged Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Indian Language POS-Tagged Corpus
Collected by A Kumaran, Microsoft Research, India
Distributed with permission
Contents:
- Bangla: IIT Kharagpur
- Hindi: Microsoft Research India
- Marathi: IIT Bombay
- Telugu: IIIT Hyderabad
"""
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tag import map_tag, str2tuple
class IndianCorpusReader(CorpusReader):
    """
    List of words, one per line. Blank lines are ignored.
    """
    def words(self, fileids=None):
        """Return a flat list of untagged words from the given files."""
        return concat(
            [
                IndianCorpusView(fileid, enc, False, False)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )
    def tagged_words(self, fileids=None, tagset=None):
        """Return (word, tag) tuples, optionally mapped to another tagset.

        NOTE(review): ``self._tagset`` is not assigned in this class --
        presumably it is set by the corpus loader; verify against callers.
        """
        if tagset and tagset != self._tagset:
            tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
        else:
            tag_mapping_function = None
        return concat(
            [
                IndianCorpusView(fileid, enc, True, False, tag_mapping_function)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )
    def sents(self, fileids=None):
        """Return sentences, each a list of untagged words."""
        return concat(
            [
                IndianCorpusView(fileid, enc, False, True)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )
    def tagged_sents(self, fileids=None, tagset=None):
        """Return sentences, each a list of (word, tag) tuples."""
        if tagset and tagset != self._tagset:
            tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
        else:
            tag_mapping_function = None
        return concat(
            [
                IndianCorpusView(fileid, enc, True, True, tag_mapping_function)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )
class IndianCorpusView(StreamBackedCorpusView):
    """Stream-backed view: each line holds one sentence of "word_TAG" tokens."""
    def __init__(
        self, corpus_file, encoding, tagged, group_by_sent, tag_mapping_function=None
    ):
        # Flags controlling the shape of the values read_block() yields.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._tag_mapping_function = tag_mapping_function
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
    def read_block(self, stream):
        """Read one line and convert it to the requested representation."""
        line = stream.readline()
        # Lines starting with "<" are markup/header lines, not token data.
        if line.startswith("<"):
            return []
        # Tokens are "word_TAG"; str2tuple splits on the final separator.
        sent = [str2tuple(word, sep="_") for word in line.split()]
        if self._tag_mapping_function:
            sent = [(w, self._tag_mapping_function(t)) for (w, t) in sent]
        if not self._tagged:
            sent = [w for (w, t) in sent]
        if self._group_by_sent:
            return [sent]
        else:
            return sent

View File

@@ -0,0 +1,354 @@
# Natural Language Toolkit: IPI PAN Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Konrad Goluchowski <kodie@mimuw.edu.pl>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import functools
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import StreamBackedCorpusView, concat
def _parse_args(fun):
@functools.wraps(fun)
def decorator(self, fileids=None, **kwargs):
kwargs.pop("tags", None)
if not fileids:
fileids = self.fileids()
return fun(self, fileids, **kwargs)
return decorator
class IPIPANCorpusReader(CorpusReader):
    """
    Corpus reader designed to work with corpus created by IPI PAN.
    See http://korpus.pl/en/ for more details about IPI PAN corpus.

    The corpus includes information about text domain, channel and categories.
    You can access possible values using ``domains()``, ``channels()`` and
    ``categories()``. You can use also this metadata to filter files, e.g.:
    ``fileids(channel='prasa')``, ``fileids(categories='publicystyczny')``.

    The reader supports methods: words, sents, paras and their tagged versions.
    You can get part of speech instead of full tag by giving "simplify_tags=True"
    parameter, e.g.: ``tagged_sents(simplify_tags=True)``.

    Also you can get all tags disambiguated tags specifying parameter
    "one_tag=False", e.g.: ``tagged_paras(one_tag=False)``.

    You can get all tags that were assigned by a morphological analyzer specifying
    parameter "disamb_only=False", e.g. ``tagged_words(disamb_only=False)``.

    The IPIPAN Corpus contains tags indicating if there is a space between two
    tokens. To add special "no space" markers, you should specify parameter
    "append_no_space=True", e.g. ``tagged_words(append_no_space=True)``.
    As a result in place where there should be no space between two tokens new
    pair ('', 'no-space') will be inserted (for tagged data) and just '' for
    methods without tags.

    The corpus reader can also try to append spaces between words. To enable this
    option, specify parameter "append_space=True", e.g. ``words(append_space=True)``.
    As a result either ' ' or (' ', 'space') will be inserted between tokens.

    By default, xml entities like &quot; and &amp; are replaced by corresponding
    characters. You can turn off this feature, specifying parameter
    "replace_xmlentities=False", e.g. ``words(replace_xmlentities=False)``.
    """

    def __init__(self, root, fileids):
        CorpusReader.__init__(self, root, fileids, None, None)

    def channels(self, fileids=None):
        """Return all channel values found in the given files' headers."""
        if not fileids:
            fileids = self.fileids()
        return self._parse_header(fileids, "channel")

    def domains(self, fileids=None):
        """Return all domain values found in the given files' headers."""
        if not fileids:
            fileids = self.fileids()
        return self._parse_header(fileids, "domain")

    def categories(self, fileids=None):
        """Return all category (keyTerm) values from the files' headers."""
        if not fileids:
            fileids = self.fileids()
        return [
            self._map_category(cat) for cat in self._parse_header(fileids, "keyTerm")
        ]

    def fileids(self, channels=None, domains=None, categories=None):
        """Return fileids, optionally filtered by exactly one metadata facet.

        :raises ValueError: if more than one of ``channels``, ``domains``
            and ``categories`` is given.
        """
        # Fix: the original check only raised when *all three* filters were
        # supplied at once; the documented contract is "only one at once".
        filters_given = [
            f for f in (channels, domains, categories) if f is not None
        ]
        if len(filters_given) > 1:
            raise ValueError(
                "You can specify only one of channels, domains "
                "and categories parameter at once"
            )
        if not filters_given:
            return CorpusReader.fileids(self)
        # A single string is treated as a one-element list.
        if isinstance(channels, str):
            channels = [channels]
        if isinstance(domains, str):
            domains = [domains]
        if isinstance(categories, str):
            categories = [categories]
        if channels:
            return self._list_morph_files_by("channel", channels)
        elif domains:
            return self._list_morph_files_by("domain", domains)
        else:
            return self._list_morph_files_by(
                "keyTerm", categories, map=self._map_category
            )

    @_parse_args
    def sents(self, fileids=None, **kwargs):
        """Return sentences, each a list of untagged words."""
        return concat(
            [
                self._view(
                    fileid, mode=IPIPANCorpusView.SENTS_MODE, tags=False, **kwargs
                )
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def paras(self, fileids=None, **kwargs):
        """Return paragraphs, each a list of sentences of untagged words."""
        return concat(
            [
                self._view(
                    fileid, mode=IPIPANCorpusView.PARAS_MODE, tags=False, **kwargs
                )
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def words(self, fileids=None, **kwargs):
        """Return a flat list of untagged words."""
        return concat(
            [
                self._view(fileid, tags=False, **kwargs)
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def tagged_sents(self, fileids=None, **kwargs):
        """Return sentences, each a list of (word, tag) tuples."""
        return concat(
            [
                self._view(fileid, mode=IPIPANCorpusView.SENTS_MODE, **kwargs)
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def tagged_paras(self, fileids=None, **kwargs):
        """Return paragraphs, each a list of sentences of (word, tag) tuples."""
        return concat(
            [
                self._view(fileid, mode=IPIPANCorpusView.PARAS_MODE, **kwargs)
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def tagged_words(self, fileids=None, **kwargs):
        """Return a flat list of (word, tag) tuples."""
        return concat(
            [self._view(fileid, **kwargs) for fileid in self._list_morph_files(fileids)]
        )

    def _list_morph_files(self, fileids):
        """Absolute paths of the morph.xml files for the given fileids."""
        return [f for f in self.abspaths(fileids)]

    def _list_header_files(self, fileids):
        """Header file paths paired with the given morph files."""
        return [
            f.replace("morph.xml", "header.xml")
            for f in self._list_morph_files(fileids)
        ]

    def _parse_header(self, fileids, tag):
        """Collect the distinct values of *tag* across the files' headers."""
        values = set()
        for f in self._list_header_files(fileids):
            values_list = self._get_tag(f, tag)
            for v in values_list:
                values.add(v)
        return list(values)

    def _list_morph_files_by(self, tag, values, map=None):
        """Return fileids whose header *tag* value matches any of *values*."""
        fileids = self.fileids()
        ret_fileids = set()
        for f in fileids:
            fp = self.abspath(f).replace("morph.xml", "header.xml")
            values_list = self._get_tag(fp, tag)
            for value in values_list:
                if map is not None:
                    value = map(value)
                if value in values:
                    ret_fileids.add(f)
        return list(ret_fileids)

    def _get_tag(self, f, tag):
        """Extract the text of every <tag>...</tag> element in file *f*.

        NOTE(review): the file is opened with the platform default encoding
        and the slice assumes the opening tag has no attributes -- confirm
        both against the corpus header files.
        """
        tags = []
        with open(f) as infile:
            header = infile.read()
        tag_end = 0
        while True:
            tag_pos = header.find("<" + tag, tag_end)
            if tag_pos < 0:
                return tags
            tag_end = header.find("</" + tag + ">", tag_pos)
            # Skip "<" + tag + ">" (len(tag) + 2 characters).
            tags.append(header[tag_pos + len(tag) + 2 : tag_end])

    def _map_category(self, cat):
        """Strip a leading "...>" prefix from a keyTerm value, if present."""
        pos = cat.find(">")
        if pos == -1:
            return cat
        else:
            return cat[pos + 1 :]

    def _view(self, filename, **kwargs):
        """Build an IPIPANCorpusView after validating keyword arguments."""
        tags = kwargs.pop("tags", True)
        mode = kwargs.pop("mode", 0)
        simplify_tags = kwargs.pop("simplify_tags", False)
        one_tag = kwargs.pop("one_tag", True)
        disamb_only = kwargs.pop("disamb_only", True)
        append_no_space = kwargs.pop("append_no_space", False)
        append_space = kwargs.pop("append_space", False)
        replace_xmlentities = kwargs.pop("replace_xmlentities", True)
        if len(kwargs) > 0:
            raise ValueError("Unexpected arguments: %s" % kwargs.keys())
        if not one_tag and not disamb_only:
            raise ValueError(
                "You cannot specify both one_tag=False and " "disamb_only=False"
            )
        if not tags and (simplify_tags or not one_tag or not disamb_only):
            raise ValueError(
                "You cannot specify simplify_tags, one_tag or "
                "disamb_only with functions other than tagged_*"
            )
        return IPIPANCorpusView(
            filename,
            tags=tags,
            mode=mode,
            simplify_tags=simplify_tags,
            one_tag=one_tag,
            disamb_only=disamb_only,
            append_no_space=append_no_space,
            append_space=append_space,
            replace_xmlentities=replace_xmlentities,
        )
class IPIPANCorpusView(StreamBackedCorpusView):
    """Stream-backed view over an IPI PAN morph.xml file.

    Scans the XML line by line and yields words, sentences or paragraphs
    depending on ``mode``.
    """
    # Granularity of the values returned by read_block().
    WORDS_MODE = 0
    SENTS_MODE = 1
    PARAS_MODE = 2
    def __init__(self, filename, startpos=0, **kwargs):
        StreamBackedCorpusView.__init__(self, filename, None, startpos, None)
        # True while the scanner is inside a <chunk type="s"> element.
        self.in_sentence = False
        # Stream offset used by _seek() to rewind to the next unconsumed line.
        self.position = 0
        self.show_tags = kwargs.pop("tags", True)
        self.disamb_only = kwargs.pop("disamb_only", True)
        self.mode = kwargs.pop("mode", IPIPANCorpusView.WORDS_MODE)
        self.simplify_tags = kwargs.pop("simplify_tags", False)
        self.one_tag = kwargs.pop("one_tag", True)
        self.append_no_space = kwargs.pop("append_no_space", False)
        self.append_space = kwargs.pop("append_space", False)
        self.replace_xmlentities = kwargs.pop("replace_xmlentities", True)
    def read_block(self, stream):
        """Read and return the next block (word list, sentence or paragraph)."""
        sentence = []
        sentences = []
        space = False
        no_space = False
        tags = set()
        lines = self._read_data(stream)
        while True:
            # we may have only part of last line
            if len(lines) <= 1:
                self._seek(stream)
                lines = self._read_data(stream)
            if lines == [""]:
                # End of file reached.
                assert not sentences
                return []
            line = lines.pop()
            self.position += len(line) + 1
            if line.startswith('<chunk type="s"'):
                self.in_sentence = True
            elif line.startswith('<chunk type="p"'):
                pass
            elif line.startswith("<tok"):
                # New token: emit a pending separator unless suppressed by <ns/>.
                if self.append_space and space and not no_space:
                    self._append_space(sentence)
                space = True
                no_space = False
                orth = ""
                tags = set()
            elif line.startswith("</chunk"):
                if self.in_sentence:
                    self.in_sentence = False
                    self._seek(stream)
                    if self.mode == self.SENTS_MODE:
                        return [sentence]
                    elif self.mode == self.WORDS_MODE:
                        if self.append_space:
                            self._append_space(sentence)
                        return sentence
                    else:
                        # NOTE(review): ``sentence`` is not reset after being
                        # appended here -- confirm paragraphs are assembled
                        # correctly in PARAS_MODE.
                        sentences.append(sentence)
                elif self.mode == self.PARAS_MODE:
                    self._seek(stream)
                    return [sentences]
            elif line.startswith("<orth"):
                # Surface form: strip the "<orth>"/"</orth>" wrappers.
                orth = line[6:-7]
                if self.replace_xmlentities:
                    orth = orth.replace("&quot;", '"').replace("&amp;", "&")
            elif line.startswith("<lex"):
                # Collect ctag values, restricted to disamb readings by default.
                if not self.disamb_only or line.find("disamb=") != -1:
                    tag = line[line.index("<ctag") + 6 : line.index("</ctag")]
                    tags.add(tag)
            elif line.startswith("</tok"):
                if self.show_tags:
                    if self.simplify_tags:
                        # Keep only the part of speech (first ":" field).
                        tags = [t.split(":")[0] for t in tags]
                    if not self.one_tag or not self.disamb_only:
                        sentence.append((orth, tuple(tags)))
                    else:
                        sentence.append((orth, tags.pop()))
                else:
                    sentence.append(orth)
            elif line.startswith("<ns/>"):
                # "No space" marker between the surrounding tokens.
                if self.append_space:
                    no_space = True
                if self.append_no_space:
                    if self.show_tags:
                        sentence.append(("", "no-space"))
                    else:
                        sentence.append("")
            elif line.startswith("</cesAna"):
                pass
    def _read_data(self, stream):
        # Remember where this buffer began so _seek() can rewind precisely.
        self.position = stream.tell()
        buff = stream.read(4096)
        lines = buff.split("\n")
        lines.reverse()
        return lines
    def _seek(self, stream):
        # Rewind the stream to the first unconsumed position.
        stream.seek(self.position)
    def _append_space(self, sentence):
        if self.show_tags:
            sentence.append((" ", "space"))
        else:
            sentence.append(" ")

View File

@@ -0,0 +1,186 @@
#! /usr/bin/env python
# KNB Corpus reader
# Copyright (C) 2001-2025 NLTK Project
# Author: Masato Hagiwara <hagisan@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
import re
from nltk.corpus.reader.api import CorpusReader, SyntaxCorpusReader
from nltk.corpus.reader.util import (
FileSystemPathPointer,
find_corpus_fileids,
read_blankline_block,
)
from nltk.parse import DependencyGraph
# default function to convert morphlist to str for tree representation
_morphs2str_default = lambda morphs: "/".join(m[0] for m in morphs if m[0] != "EOS")
class KNBCorpusReader(SyntaxCorpusReader):
    """
    This class implements:
      - ``__init__``, which specifies the location of the corpus
        and a method for detecting the sentence blocks in corpus files.
      - ``_read_block``, which reads a block from the input stream.
      - ``_word``, which takes a block and returns a list of list of words.
      - ``_tag``, which takes a block and returns a list of list of tagged
        words.
      - ``_parse``, which takes a block and returns a list of parsed
        sentences.

    The structure of tagged words:
        tagged_word = (word(str), tags(tuple))
        tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...)

    Usage example

    >>> from nltk.corpus.util import LazyCorpusLoader
    >>> knbc = LazyCorpusLoader(
    ...     'knbc/corpus1',
    ...     KNBCorpusReader,
    ...     r'.*/KN.*',
    ...     encoding='euc-jp',
    ... )
    >>> len(knbc.sents()[0])
    9
    """
    def __init__(self, root, fileids, encoding="utf8", morphs2str=_morphs2str_default):
        """
        Initialize KNBCorpusReader
        morphs2str is a function to convert morphlist to str for tree representation
        for _parse()
        """
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)
        self.morphs2str = morphs2str
    def _read_block(self, stream):
        # blocks are split by blankline (or EOF) - default
        return read_blankline_block(stream)
    def _word(self, t):
        """Return the surface words of block *t*, skipping header lines."""
        res = []
        for line in t.splitlines():
            # ignore the Bunsets headers
            if not re.match(r"EOS|\*|\#|\+", line):
                cells = line.strip().split(" ")
                res.append(cells[0])
        return res
    # ignores tagset argument
    def _tag(self, t, tagset=None):
        """Return (word, feature-string) pairs for block *t*."""
        res = []
        for line in t.splitlines():
            # ignore the Bunsets headers
            if not re.match(r"EOS|\*|\#|\+", line):
                cells = line.strip().split(" ")
                # convert cells to morph tuples
                res.append((cells[0], " ".join(cells[1:])))
        return res
    def _parse(self, t):
        """Build a dependency tree for block *t*.

        Lines beginning with '*' or '+' open a new node whose second field
        encodes "<parent-index><relation>"; subsequent morph lines attach
        to the most recently opened node.
        """
        dg = DependencyGraph()
        i = 0
        for line in t.splitlines():
            if line[0] in "*+":
                # start of bunsetsu or tag
                cells = line.strip().split(" ", 3)
                m = re.match(r"([\-0-9]*)([ADIP])", cells[1])
                assert m is not None
                node = dg.nodes[i]
                node.update({"address": i, "rel": m.group(2), "word": []})
                dep_parent = int(m.group(1))
                # A parent index of -1 marks the root node.
                if dep_parent == -1:
                    dg.root = node
                else:
                    dg.nodes[dep_parent]["deps"].append(i)
                i += 1
            elif line[0] != "#":
                # normal morph
                cells = line.strip().split(" ")
                # convert cells to morph tuples
                morph = cells[0], " ".join(cells[1:])
                dg.nodes[i - 1]["word"].append(morph)
        if self.morphs2str:
            # Collapse each node's morph list into a display string.
            for node in dg.nodes.values():
                node["word"] = self.morphs2str(node["word"])
        return dg.tree()
######################################################################
# Demo
######################################################################
def demo():
    """Load the KNBC corpus and print sample words, trees and tagged sents."""
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    corpus_root = nltk.data.find("corpora/knbc/corpus1")
    matching = [
        fid
        for fid in find_corpus_fileids(FileSystemPathPointer(corpus_root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", fid)
    ]

    def sort_key(fileid):
        # Order by corpus id, then by the three numeric components.
        parts = fileid.split("-")
        return (parts[0], int(parts[1]), int(parts[2]), int(parts[3]))

    knbc = LazyCorpusLoader(
        "knbc/corpus1",
        KNBCorpusReader,
        sorted(matching, key=sort_key),
        encoding="euc-jp",
    )

    print(knbc.fileids()[:10])
    print("".join(knbc.words()[:100]))
    print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2]))

    # Switch to a morph formatter that also shows the part of speech.
    knbc.morphs2str = lambda morphs: "/".join(
        "{}({})".format(m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
    ).encode("utf-8")

    print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))
    print(
        "\n".join(
            " ".join("{}/{}".format(w[0], w[1].split(" ")[2]) for w in sent)
            for sent in knbc.tagged_sents()[0:2]
        )
    )
def test():
    """Smoke test: the KNBC reader yields str tokens and (word, tag) tuples."""
    from nltk.corpus.util import LazyCorpusLoader

    reader = LazyCorpusLoader(
        "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
    )
    assert isinstance(reader.words()[0], str)
    assert isinstance(reader.sents()[0][0], str)
    assert isinstance(reader.tagged_words()[0], tuple)
    assert isinstance(reader.tagged_sents()[0][0], tuple)
if __name__ == "__main__":
    # Run the interactive demo when this module is executed as a script.
    demo()

View File

@@ -0,0 +1,183 @@
# Natural Language Toolkit: Lin's Thesaurus
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Dan Blanchard <dblanchard@ets.org>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.txt
import re
from collections import defaultdict
from functools import reduce
from nltk.corpus.reader import CorpusReader
class LinThesaurusCorpusReader(CorpusReader):
    """Wrapper for the LISP-formatted thesauruses distributed by Dekang Lin."""
    # Compiled regular expression for extracting the key from the first line of each
    # thesaurus entry
    _key_re = re.compile(r'\("?([^"]+)"? \(desc [0-9.]+\).+')
    @staticmethod
    def __defaultdict_factory():
        """Factory for creating defaultdict of defaultdict(dict)s"""
        return defaultdict(dict)
    def __init__(self, root, badscore=0.0):
        """
        Initialize the thesaurus.

        :param root: root directory containing thesaurus LISP files
        :type root: C{string}
        :param badscore: the score to give to words which do not appear in each other's sets of synonyms
        :type badscore: C{float}
        """
        super().__init__(root, r"sim[A-Z]\.lsp")
        # _thesaurus[fileid][key][synonym] -> similarity score
        self._thesaurus = defaultdict(LinThesaurusCorpusReader.__defaultdict_factory)
        self._badscore = badscore
        # NOTE(review): files are opened with the platform default encoding --
        # confirm the distributed .lsp files are ASCII/UTF-8.
        for path, encoding, fileid in self.abspaths(
            include_encoding=True, include_fileid=True
        ):
            with open(path) as lin_file:
                first = True
                for line in lin_file:
                    line = line.strip()
                    # Start of entry
                    if first:
                        key = LinThesaurusCorpusReader._key_re.sub(r"\1", line)
                        first = False
                    # End of entry
                    elif line == "))":
                        first = True
                    # Lines with pairs of ngrams and scores
                    else:
                        split_line = line.split("\t")
                        if len(split_line) == 2:
                            ngram, score = split_line
                            self._thesaurus[fileid][key][ngram.strip('"')] = float(
                                score
                            )
    def similarity(self, ngram1, ngram2, fileid=None):
        """
        Returns the similarity score for two ngrams.

        :param ngram1: first ngram to compare
        :type ngram1: C{string}
        :param ngram2: second ngram to compare
        :type ngram2: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, just the score for the two ngrams; otherwise,
                 list of tuples of fileids and scores.
        """
        # Entries don't contain themselves, so make sure similarity between item and itself is 1.0
        if ngram1 == ngram2:
            if fileid:
                return 1.0
            else:
                return [(fid, 1.0) for fid in self._fileids]
        else:
            if fileid:
                # Missing pairs fall back to the configured bad score.
                return (
                    self._thesaurus[fileid][ngram1][ngram2]
                    if ngram2 in self._thesaurus[fileid][ngram1]
                    else self._badscore
                )
            else:
                return [
                    (
                        fid,
                        (
                            self._thesaurus[fid][ngram1][ngram2]
                            if ngram2 in self._thesaurus[fid][ngram1]
                            else self._badscore
                        ),
                    )
                    for fid in self._fileids
                ]
    def scored_synonyms(self, ngram, fileid=None):
        """
        Returns a list of scored synonyms (tuples of synonyms and scores) for the current ngram

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, list of tuples of scores and synonyms; otherwise,
                 list of tuples of fileids and lists, where inner lists consist of tuples of
                 scores and synonyms.
        """
        if fileid:
            return self._thesaurus[fileid][ngram].items()
        else:
            return [
                (fileid, self._thesaurus[fileid][ngram].items())
                for fileid in self._fileids
            ]
    def synonyms(self, ngram, fileid=None):
        """
        Returns a list of synonyms for the current ngram.

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, list of synonyms; otherwise, list of tuples of fileids and
                 lists, where inner lists contain synonyms.
        """
        if fileid:
            return self._thesaurus[fileid][ngram].keys()
        else:
            return [
                (fileid, self._thesaurus[fileid][ngram].keys())
                for fileid in self._fileids
            ]
    def __contains__(self, ngram):
        """
        Determines whether or not the given ngram is in the thesaurus.

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :return: whether the given ngram is in the thesaurus.
        """
        # True if the ngram appears as a key in any of the thesaurus files.
        return reduce(
            lambda accum, fileid: accum or (ngram in self._thesaurus[fileid]),
            self._fileids,
            False,
        )
######################################################################
# Demo
######################################################################
def demo():
    """Print synonym and similarity lookups for a sample word pair."""
    from nltk.corpus import lin_thesaurus as thes

    word1 = "business"
    word2 = "enterprise"
    print("Getting synonyms for " + word1)
    print(thes.synonyms(word1))
    print("Getting scored synonyms for " + word1)
    print(thes.scored_synonyms(word1))
    # Fix: this lookup was accidentally printed twice (copy-paste duplication).
    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))
    print(f"Similarity score for {word1} and {word2}:")
    print(thes.similarity(word1, word2))
if __name__ == "__main__":
    # Run the interactive demo when this module is executed as a script.
    demo()

View File

@@ -0,0 +1,344 @@
from collections import namedtuple
from functools import partial, wraps
from nltk.corpus.reader.api import CategorizedCorpusReader
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus.reader.util import concat, read_blankline_block
from nltk.tokenize import blankline_tokenize, sent_tokenize, word_tokenize
def comma_separated_string_args(func):
    """
    Decorator allowing a function to receive a single comma-separated
    string (or a list) where it expects a collection; the value is
    converted to a set before the call.
    """

    def _split(value):
        # A comma-separated string becomes a set of stripped parts.
        return {part.strip() for part in value.split(",")}

    @wraps(func)
    def wrapper(*args, **kwargs):
        converted = []
        for arg in args:
            if isinstance(arg, str):
                converted.append(_split(arg))
            elif isinstance(arg, list):
                converted.append(set(arg))
            else:
                converted.append(arg)
        # Keyword arguments are converted only when they are strings,
        # mirroring the original behavior (lists are left untouched).
        kwargs = {
            name: _split(value) if isinstance(value, str) else value
            for name, value in kwargs.items()
        }
        return func(*converted, **kwargs)

    return wrapper
def read_parse_blankline_block(stream, parser):
    """Read one blankline-delimited block from *stream* and render it
    to plain text with *parser*; returns [] at end of stream."""
    blocks = read_blankline_block(stream)
    if not blocks:
        return blocks
    return [parser.render(blocks[0])]
class MarkdownBlock:
    """Plain-text content of a markdown element, with tokenized views."""

    def __init__(self, content):
        self.content = content
        # Maximum characters shown by str()/repr() before truncation.
        self.truncate_at = 16

    def __repr__(self):
        return f"{self.__class__.__name__}(content={repr(str(self))})"

    def __str__(self):
        prefix = self.content[: self.truncate_at]
        suffix = "..." if len(self.content) > self.truncate_at else ""
        return f"{prefix}{suffix}"

    @property
    def raw(self):
        """The full, untruncated content string."""
        return self.content

    @property
    def words(self):
        """Word tokens of the content."""
        return word_tokenize(self.content)

    @property
    def sents(self):
        """Sentences, each a list of word tokens."""
        return [word_tokenize(sentence) for sentence in sent_tokenize(self.content)]

    @property
    def paras(self):
        """Paragraphs, each a list of tokenized sentences."""
        tokenized = []
        for para in blankline_tokenize(self.content):
            tokenized.append([word_tokenize(s) for s in sent_tokenize(para)])
        return tokenized
class CodeBlock(MarkdownBlock):
    """A fenced code block; tokenized line-by-line rather than by sentence."""

    def __init__(self, language, *args):
        self.language = language
        super().__init__(*args)

    @property
    def sents(self):
        """Each source line as a list of word tokens."""
        return [word_tokenize(src_line) for src_line in self.content.splitlines()]

    @property
    def lines(self):
        """The raw source lines of the code block."""
        return self.content.splitlines()

    @property
    def paras(self):
        """Blankline-separated chunks, each a list of tokenized lines."""
        return [
            [word_tokenize(src_line) for src_line in chunk.splitlines()]
            for chunk in blankline_tokenize(self.content)
        ]
class MarkdownSection(MarkdownBlock):
    """A document section: heading text, heading level, and body content."""

    def __init__(self, heading, level, *args):
        self.level = level
        self.heading = heading
        super().__init__(*args)
# Lightweight records for inline markdown constructs.
Image = namedtuple("Image", "label, src, title")
Link = namedtuple("Link", "label, href, title")
# is_ordered distinguishes numbered lists from bullet lists.
List = namedtuple("List", "is_ordered, items")
class MarkdownCorpusReader(PlaintextCorpusReader):
    """Plaintext-style corpus reader for markdown files; markup is stripped
    by rendering each paragraph through a markdown parser."""
    def __init__(self, *args, parser=None, **kwargs):
        from markdown_it import MarkdownIt
        from mdit_plain.renderer import RendererPlain
        from mdit_py_plugins.front_matter import front_matter_plugin
        # Default parser renders markdown to plain text and recognizes
        # YAML front-matter blocks.
        self.parser = parser
        if self.parser is None:
            self.parser = MarkdownIt("commonmark", renderer_cls=RendererPlain)
            self.parser.use(front_matter_plugin)
        kwargs.setdefault(
            "para_block_reader", partial(read_parse_blankline_block, parser=self.parser)
        )
        super().__init__(*args, **kwargs)
    # This override takes care of removing markup.
    def _read_word_block(self, stream):
        # Tokenize each rendered (markup-free) paragraph into words.
        words = list()
        for para in self._para_block_reader(stream):
            words.extend(self._word_tokenizer.tokenize(para))
        return words
class CategorizedMarkdownCorpusReader(CategorizedCorpusReader, MarkdownCorpusReader):
"""
A reader for markdown corpora whose documents are divided into
categories based on their file identifiers.
Based on nltk.corpus.reader.plaintext.CategorizedPlaintextCorpusReader:
https://www.nltk.org/_modules/nltk/corpus/reader/api.html#CategorizedCorpusReader
"""
    def __init__(self, *args, cat_field="tags", **kwargs):
        """
        Initialize the corpus reader. Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor. The remaining arguments
        are passed to the ``MarkdownCorpusReader`` constructor.

        :param cat_field: front-matter key whose value supplies a file's
            categories when no explicit categorization argument is given.
        """
        cat_args = ["cat_pattern", "cat_map", "cat_file"]
        if not any(arg in kwargs for arg in cat_args):
            # Initialize with a blank map now,
            # and try to build categories from document metadata later.
            kwargs["cat_map"] = dict()
        CategorizedCorpusReader.__init__(self, kwargs)
        MarkdownCorpusReader.__init__(self, *args, **kwargs)
        # Map file IDs to categories if self._map exists but is still empty:
        if self._map is not None and not self._map:
            for file_id in self._fileids:
                # The first front-matter block of each file supplies the
                # category field (default "tags").
                metadata = self.metadata(file_id)
                if metadata:
                    self._map[file_id] = metadata[0].get(cat_field, [])
### Begin CategorizedCorpusReader Overrides
@comma_separated_string_args
def categories(self, fileids=None):
return super().categories(fileids)
@comma_separated_string_args
def fileids(self, categories=None):
if categories is None:
return self._fileids
return super().fileids(categories)
### End CategorizedCorpusReader Overrides
### Begin MarkdownCorpusReader Overrides
@comma_separated_string_args
def raw(self, fileids=None, categories=None):
return super().raw(self._resolve(fileids, categories))
@comma_separated_string_args
def words(self, fileids=None, categories=None):
return super().words(self._resolve(fileids, categories))
@comma_separated_string_args
def sents(self, fileids=None, categories=None):
return super().sents(self._resolve(fileids, categories))
@comma_separated_string_args
def paras(self, fileids=None, categories=None):
return super().paras(self._resolve(fileids, categories))
### End MarkdownCorpusReader Overrides
def concatenated_view(self, reader, fileids, categories):
return concat(
[
self.CorpusView(path, reader, encoding=enc)
for (path, enc) in self.abspaths(
self._resolve(fileids, categories), include_encoding=True
)
]
)
def metadata_reader(self, stream):
from yaml import safe_load
return [
safe_load(t.content)
for t in self.parser.parse(stream.read())
if t.type == "front_matter"
]
@comma_separated_string_args
def metadata(self, fileids=None, categories=None):
return self.concatenated_view(self.metadata_reader, fileids, categories)
def blockquote_reader(self, stream):
tokens = self.parser.parse(stream.read())
opening_tokens = filter(
lambda t: t.level == 0 and t.type == "blockquote_open", tokens
)
closing_tokens = filter(
lambda t: t.level == 0 and t.type == "blockquote_close", tokens
)
blockquotes = list()
for o, c in zip(opening_tokens, closing_tokens):
opening_index = tokens.index(o)
closing_index = tokens.index(c, opening_index)
blockquotes.append(tokens[opening_index : closing_index + 1])
return [
MarkdownBlock(
self.parser.renderer.render(block, self.parser.options, env=None)
)
for block in blockquotes
]
@comma_separated_string_args
def blockquotes(self, fileids=None, categories=None):
return self.concatenated_view(self.blockquote_reader, fileids, categories)
def code_block_reader(self, stream):
return [
CodeBlock(
t.info,
t.content,
)
for t in self.parser.parse(stream.read())
if t.level == 0 and t.type in ("fence", "code_block")
]
@comma_separated_string_args
def code_blocks(self, fileids=None, categories=None):
return self.concatenated_view(self.code_block_reader, fileids, categories)
def image_reader(self, stream):
return [
Image(
child_token.content,
child_token.attrGet("src"),
child_token.attrGet("title"),
)
for inline_token in filter(
lambda t: t.type == "inline", self.parser.parse(stream.read())
)
for child_token in inline_token.children
if child_token.type == "image"
]
@comma_separated_string_args
def images(self, fileids=None, categories=None):
return self.concatenated_view(self.image_reader, fileids, categories)
def link_reader(self, stream):
return [
Link(
inline_token.children[i + 1].content,
child_token.attrGet("href"),
child_token.attrGet("title"),
)
for inline_token in filter(
lambda t: t.type == "inline", self.parser.parse(stream.read())
)
for i, child_token in enumerate(inline_token.children)
if child_token.type == "link_open"
]
@comma_separated_string_args
def links(self, fileids=None, categories=None):
return self.concatenated_view(self.link_reader, fileids, categories)
def list_reader(self, stream):
tokens = self.parser.parse(stream.read())
opening_types = ("bullet_list_open", "ordered_list_open")
opening_tokens = filter(
lambda t: t.level == 0 and t.type in opening_types, tokens
)
closing_types = ("bullet_list_close", "ordered_list_close")
closing_tokens = filter(
lambda t: t.level == 0 and t.type in closing_types, tokens
)
list_blocks = list()
for o, c in zip(opening_tokens, closing_tokens):
opening_index = tokens.index(o)
closing_index = tokens.index(c, opening_index)
list_blocks.append(tokens[opening_index : closing_index + 1])
return [
List(
tokens[0].type == "ordered_list_open",
[t.content for t in tokens if t.content],
)
for tokens in list_blocks
]
@comma_separated_string_args
def lists(self, fileids=None, categories=None):
return self.concatenated_view(self.list_reader, fileids, categories)
def section_reader(self, stream):
section_blocks, block = list(), list()
for t in self.parser.parse(stream.read()):
if t.level == 0 and t.type == "heading_open":
if not block:
block.append(t)
else:
section_blocks.append(block)
block = [t]
elif block:
block.append(t)
if block:
section_blocks.append(block)
return [
MarkdownSection(
block[1].content,
block[0].markup.count("#"),
self.parser.renderer.render(block, self.parser.options, env=None),
)
for block in section_blocks
]
@comma_separated_string_args
def sections(self, fileids=None, categories=None):
return self.concatenated_view(self.section_reader, fileids, categories)

View File

@@ -0,0 +1,398 @@
"""
A reader for corpora whose documents are in MTE format.
"""
import os
import re
from functools import reduce
from nltk.corpus.reader import TaggedCorpusReader, concat
from nltk.corpus.reader.xmldocs import XMLCorpusView
def xpath(root, path, ns):
    """Return all elements under *root* matching *path*, resolving
    namespace prefixes through the mapping *ns* (thin ``findall`` wrapper)."""
    matches = root.findall(path, ns)
    return matches
class MTECorpusView(XMLCorpusView):
    """
    A lazy, stream-backed view of an MTE corpus file.  Behaves exactly
    like ``XMLCorpusView`` except that entries for which the element
    handler returned ``None`` are dropped from each block.
    """
    def __init__(self, fileid, tagspec, elt_handler=None):
        XMLCorpusView.__init__(self, fileid, tagspec, elt_handler)
    def read_block(self, stream, tagspec=None, elt_handler=None):
        # Drop entries the handler rejected (returned None).
        block = XMLCorpusView.read_block(self, stream, tagspec, elt_handler)
        return [entry for entry in block if entry is not None]
class MTEFileReader:
    """
    Class for loading the content of the multext-east corpus. It
    parses the xml files and does some tag-filtering depending on the
    given method parameters.
    """

    # XML namespace prefixes used by the TEI-P5 documents.  These URIs are
    # namespace *identifiers* and must match the documents byte-for-byte:
    # both the TEI namespace and the W3C "xml" namespace are defined with
    # the http (not https) scheme, so using https here would silently
    # match nothing.
    ns = {
        "tei": "http://www.tei-c.org/ns/1.0",
        "xml": "http://www.w3.org/XML/1998/namespace",
    }
    tag_ns = "{http://www.tei-c.org/ns/1.0}"
    xml_ns = "{http://www.w3.org/XML/1998/namespace}"
    # Tagspecs (consumed by MTECorpusView) addressing words, sentences
    # and paragraphs inside a TEI document.
    word_path = "TEI/text/body/div/div/p/s/(w|c)"
    sent_path = "TEI/text/body/div/div/p/s"
    para_path = "TEI/text/body/div/div/p"

    def __init__(self, file_path):
        self.__file_path = file_path

    @classmethod
    def _word_elt(cls, elt, context):
        # A <w> or <c> element: the token is its text content.
        return elt.text

    @classmethod
    def _sent_elt(cls, elt, context):
        return [cls._word_elt(w, None) for w in xpath(elt, "*", cls.ns)]

    @classmethod
    def _para_elt(cls, elt, context):
        return [cls._sent_elt(s, None) for s in xpath(elt, "*", cls.ns)]

    @classmethod
    def _tagged_word_elt(cls, elt, context):
        """Return (word, tag) for one element, or None when the word is
        filtered out by the currently configured tag filter."""
        # Words with no morphosyntactic annotation get an empty tag.
        if "ana" not in elt.attrib:
            return (elt.text, "")
        if cls.__tags == "" and cls.__tagset == "msd":
            return (elt.text, elt.attrib["ana"])
        elif cls.__tags == "" and cls.__tagset == "universal":
            return (elt.text, MTETagConverter.msd_to_universal(elt.attrib["ana"]))
        else:
            # Build a regex from the requested tag prefix: "-" acts as a
            # wildcard position in an MSD tag, so map it to ".".
            tags = re.compile("^" + re.sub("-", ".", cls.__tags) + ".*$")
            if tags.match(elt.attrib["ana"]):
                if cls.__tagset == "msd":
                    return (elt.text, elt.attrib["ana"])
                else:
                    return (
                        elt.text,
                        MTETagConverter.msd_to_universal(elt.attrib["ana"]),
                    )
            else:
                # Filtered out; MTECorpusView drops None entries.
                return None

    @classmethod
    def _tagged_sent_elt(cls, elt, context):
        return list(
            filter(
                lambda x: x is not None,
                [cls._tagged_word_elt(w, None) for w in xpath(elt, "*", cls.ns)],
            )
        )

    @classmethod
    def _tagged_para_elt(cls, elt, context):
        return list(
            filter(
                lambda x: x is not None,
                [cls._tagged_sent_elt(s, None) for s in xpath(elt, "*", cls.ns)],
            )
        )

    @classmethod
    def _lemma_word_elt(cls, elt, context):
        # Pair each word with its lemma ("" when not annotated).
        if "lemma" not in elt.attrib:
            return (elt.text, "")
        else:
            return (elt.text, elt.attrib["lemma"])

    @classmethod
    def _lemma_sent_elt(cls, elt, context):
        return [cls._lemma_word_elt(w, None) for w in xpath(elt, "*", cls.ns)]

    @classmethod
    def _lemma_para_elt(cls, elt, context):
        return [cls._lemma_sent_elt(s, None) for s in xpath(elt, "*", cls.ns)]

    def words(self):
        """Lazy view of all words/punctuation tokens."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.word_path, MTEFileReader._word_elt
        )

    def sents(self):
        """Lazy view of sentences (lists of words)."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.sent_path, MTEFileReader._sent_elt
        )

    def paras(self):
        """Lazy view of paragraphs (lists of sentences)."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.para_path, MTEFileReader._para_elt
        )

    def lemma_words(self):
        """Lazy view of (word, lemma) tuples."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.word_path, MTEFileReader._lemma_word_elt
        )

    # NOTE: the tagged_* methods configure the filter on the *class*, so
    # concurrently consumed views with different tagset/tags settings would
    # interfere with each other.  Kept for backward compatibility.
    def tagged_words(self, tagset, tags):
        MTEFileReader.__tagset = tagset
        MTEFileReader.__tags = tags
        return MTECorpusView(
            self.__file_path, MTEFileReader.word_path, MTEFileReader._tagged_word_elt
        )

    def lemma_sents(self):
        """Lazy view of sentences of (word, lemma) tuples."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.sent_path, MTEFileReader._lemma_sent_elt
        )

    def tagged_sents(self, tagset, tags):
        MTEFileReader.__tagset = tagset
        MTEFileReader.__tags = tags
        return MTECorpusView(
            self.__file_path, MTEFileReader.sent_path, MTEFileReader._tagged_sent_elt
        )

    def lemma_paras(self):
        """Lazy view of paragraphs of sentences of (word, lemma) tuples."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.para_path, MTEFileReader._lemma_para_elt
        )

    def tagged_paras(self, tagset, tags):
        MTEFileReader.__tagset = tagset
        MTEFileReader.__tags = tags
        return MTECorpusView(
            self.__file_path, MTEFileReader.para_path, MTEFileReader._tagged_para_elt
        )
class MTETagConverter:
    """
    Class for converting msd tags to universal tags, more conversion
    options are currently not implemented.
    """

    # First character of an MSD annotation -> universal POS category.
    mapping_msd_universal = {
        "A": "ADJ",
        "S": "ADP",
        "R": "ADV",
        "C": "CONJ",
        "D": "DET",
        "N": "NOUN",
        "M": "NUM",
        "Q": "PRT",
        "P": "PRON",
        "V": "VERB",
        ".": ".",
        "-": "X",
    }

    @staticmethod
    def msd_to_universal(tag):
        """
        Convert a Multext-East MSD annotation to the universal tagset as
        described in Chapter 5 of the NLTK book.  The category is selected
        by the first character of the tag (skipping a leading ``#``);
        unknown categories are mapped to ``X``.
        """
        indicator = tag[0] if not tag[0] == "#" else tag[1]
        # `not in` idiom; unknown indicators fall back to "-" -> "X".
        if indicator not in MTETagConverter.mapping_msd_universal:
            indicator = "-"
        return MTETagConverter.mapping_msd_universal[indicator]
class MTECorpusReader(TaggedCorpusReader):
    """
    Reader for corpora following the TEI-p5 xml scheme, such as MULTEXT-East.
    MULTEXT-East contains part-of-speech-tagged words with a quite precise tagging
    scheme. These tags can be converted to the Universal tagset
    """

    def __init__(self, root=None, fileids=None, encoding="utf8"):
        """
        Construct a new MTECorpusreader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP

        :param root: The root directory for this corpus. (default points to location in multext config file)
        :param fileids: A list or regexp specifying the fileids in this corpus. (default is oana-en.xml)
        :param encoding: The encoding of the given files (default is utf8)
        """
        TaggedCorpusReader.__init__(self, root, fileids, encoding)
        self._readme = "00README.txt"

    def __fileids(self, fileids):
        """
        Normalize and validate *fileids*: default to the whole corpus,
        accept a single string, and drop ids that are unknown or not
        TEI-P5 compatible.  Returns a (possibly empty) list.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        # filter wrong userinput
        fileids = [f for f in fileids if f in self._fileids]
        # filter multext-east sourcefiles that are not compatible to the teip5 specification
        fileids = [f for f in fileids if f not in ("oana-bg.xml", "oana-mk.xml")]
        # Bug fix: the previous implementation tested ``if not fileids``
        # on lazy ``filter`` objects, which are always truthy, so this
        # warning could never fire.  Materializing to lists makes the
        # emptiness check work (and the result reusable).
        if not fileids:
            print("No valid multext-east file specified")
        return fileids

    def words(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).words()
                for f in self.__fileids(fileids)
            ]
        )

    def sents(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of sentences or utterances,
            each encoded as a list of word strings
        :rtype: list(list(str))
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).sents()
                for f in self.__fileids(fileids)
            ]
        )

    def paras(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of paragraphs, each encoded as a list
            of sentences, which are in turn encoded as lists of word string
        :rtype: list(list(list(str)))
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).paras()
                for f in self.__fileids(fileids)
            ]
        )

    def lemma_words(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of words, the corresponding lemmas
            and punctuation symbols, encoded as tuples (word, lemma)
        :rtype: list(tuple(str,str))
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).lemma_words()
                for f in self.__fileids(fileids)
            ]
        )

    def tagged_words(self, fileids=None, tagset="msd", tags=""):
        """
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
            either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
            that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of tagged words and punctuation symbols
            encoded as tuples (word, tag)
        :rtype: list(tuple(str, str))
        """
        if tagset == "universal" or tagset == "msd":
            return concat(
                [
                    MTEFileReader(os.path.join(self._root, f)).tagged_words(
                        tagset, tags
                    )
                    for f in self.__fileids(fileids)
                ]
            )
        else:
            # Historical behaviour: warn and return None for unknown tagsets.
            print("Unknown tagset specified.")

    def lemma_sents(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of sentences or utterances, each
            encoded as a list of tuples of the word and the corresponding
            lemma (word, lemma)
        :rtype: list(list(tuple(str, str)))
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).lemma_sents()
                for f in self.__fileids(fileids)
            ]
        )

    def tagged_sents(self, fileids=None, tagset="msd", tags=""):
        """
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
            either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
            that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of sentences or utterances, each
            each encoded as a list of (word,tag) tuples
        :rtype: list(list(tuple(str, str)))
        """
        if tagset == "universal" or tagset == "msd":
            return concat(
                [
                    MTEFileReader(os.path.join(self._root, f)).tagged_sents(
                        tagset, tags
                    )
                    for f in self.__fileids(fileids)
                ]
            )
        else:
            print("Unknown tagset specified.")

    def lemma_paras(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of paragraphs, each encoded as a
            list of sentences, which are in turn encoded as a list of
            tuples of the word and the corresponding lemma (word, lemma)
        :rtype: list(List(List(tuple(str, str))))
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).lemma_paras()
                for f in self.__fileids(fileids)
            ]
        )

    def tagged_paras(self, fileids=None, tagset="msd", tags=""):
        """
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
            either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
            that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of paragraphs, each encoded as a
            list of sentences, which are in turn encoded as a list
            of (word,tag) tuples
        :rtype: list(list(list(tuple(str, str))))
        """
        if tagset == "universal" or tagset == "msd":
            return concat(
                [
                    MTEFileReader(os.path.join(self._root, f)).tagged_paras(
                        tagset, tags
                    )
                    for f in self.__fileids(fileids)
                ]
            )
        else:
            print("Unknown tagset specified.")

View File

@@ -0,0 +1,486 @@
# Natural Language Toolkit: NKJP Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Gabriela Kaczka
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import functools
import os
import re
import tempfile
from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
def _parse_args(fun):
"""
Wraps function arguments:
if fileids not specified then function set NKJPCorpusReader paths.
"""
@functools.wraps(fun)
def decorator(self, fileids=None, **kwargs):
if not fileids:
fileids = self._paths
return fun(self, fileids, **kwargs)
return decorator
class NKJPCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the National Corpus of Polish (NKJP).  Each document
    is a directory containing ``header.xml``, ``text.xml``,
    ``ann_segmentation.xml`` and ``ann_morphosyntax.xml``; the mode
    constants below select which of those files a view reads.
    """
    WORDS_MODE = 0
    SENTS_MODE = 1
    HEADER_MODE = 2
    RAW_MODE = 3
    def __init__(self, root, fileids=".*"):
        """
        Corpus reader designed to work with National Corpus of Polish.
        See http://nkjp.pl/ for more details about NKJP.
        use example:
        import nltk
        import nkjp
        from nkjp import NKJPCorpusReader
        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='') # obtain the whole corpus
        x.header()
        x.raw()
        x.words()
        x.tagged_words(tags=['subst', 'comp']) #Link to find more tags: nkjp.pl/poliqarp/help/ense2.html
        x.sents()
        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='Wilk*') # obtain particular file(s)
        x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'])
        x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp'])
        """
        # Every NKJP document directory contains a header.xml; matching on
        # it yields exactly one fileid per document.
        if isinstance(fileids, str):
            XMLCorpusReader.__init__(self, root, fileids + ".*/header.xml")
        else:
            XMLCorpusReader.__init__(
                self, root, [fileid + "/header.xml" for fileid in fileids]
            )
        self._paths = self.get_paths()
    def get_paths(self):
        # Absolute directory path of each document (the fileid minus its
        # trailing "header.xml").
        return [
            os.path.join(str(self._root), f.split("header.xml")[0])
            for f in self._fileids
        ]
    def fileids(self):
        """
        Returns a list of file identifiers for the fileids that make up
        this corpus.
        """
        return [f.split("header.xml")[0] for f in self._fileids]
    def _view(self, filename, tags=None, **kwargs):
        """
        Returns a view specialised for use with particular corpus file.
        """
        # Dispatch on the requested mode; default is WORDS_MODE.
        mode = kwargs.pop("mode", NKJPCorpusReader.WORDS_MODE)
        if mode is NKJPCorpusReader.WORDS_MODE:
            return NKJPCorpus_Morph_View(filename, tags=tags)
        elif mode is NKJPCorpusReader.SENTS_MODE:
            return NKJPCorpus_Segmentation_View(filename, tags=tags)
        elif mode is NKJPCorpusReader.HEADER_MODE:
            return NKJPCorpus_Header_View(filename, tags=tags)
        elif mode is NKJPCorpusReader.RAW_MODE:
            return NKJPCorpus_Text_View(
                filename, tags=tags, mode=NKJPCorpus_Text_View.RAW_MODE
            )
        else:
            raise NameError("No such mode!")
    def add_root(self, fileid):
        """
        Add root if necessary to specified fileid.
        """
        # NOTE(review): assumes ``self.root`` supports ``in`` and ``+``
        # with a str fileid (i.e. behaves like a path string) — confirm
        # for non-filesystem corpus roots.
        if self.root in fileid:
            return fileid
        return self.root + fileid
    @_parse_args
    def header(self, fileids=None, **kwargs):
        """
        Returns header(s) of specified fileids.
        """
        return concat(
            [
                self._view(
                    self.add_root(fileid), mode=NKJPCorpusReader.HEADER_MODE, **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )
    @_parse_args
    def sents(self, fileids=None, **kwargs):
        """
        Returns sentences in specified fileids.
        """
        return concat(
            [
                self._view(
                    self.add_root(fileid), mode=NKJPCorpusReader.SENTS_MODE, **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )
    @_parse_args
    def words(self, fileids=None, **kwargs):
        """
        Returns words in specified fileids.
        """
        return concat(
            [
                self._view(
                    self.add_root(fileid), mode=NKJPCorpusReader.WORDS_MODE, **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )
    @_parse_args
    def tagged_words(self, fileids=None, **kwargs):
        """
        Call with specified tags as a list, e.g. tags=['subst', 'comp'].
        Returns tagged words in specified fileids.
        """
        tags = kwargs.pop("tags", [])
        return concat(
            [
                self._view(
                    self.add_root(fileid),
                    mode=NKJPCorpusReader.WORDS_MODE,
                    tags=tags,
                    **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )
    @_parse_args
    def raw(self, fileids=None, **kwargs):
        """
        Returns words in specified fileids.
        """
        return concat(
            [
                self._view(
                    self.add_root(fileid), mode=NKJPCorpusReader.RAW_MODE, **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )
class NKJPCorpus_Header_View(XMLCorpusView):
    """
    HEADER_MODE: a stream backed corpus view specialized for use with
    header.xml files in NKJP corpus.  Yields one dict of bibliographic
    fields per <sourceDesc> element.
    """

    def __init__(self, filename, **kwargs):
        """
        :param filename: path of the document directory; "header.xml" is
            appended to locate the actual file.
        """
        self.tagspec = ".*/sourceDesc$"
        XMLCorpusView.__init__(self, filename + "header.xml", self.tagspec)

    def handle_query(self):
        """Read the whole header file and return the accumulated entries."""
        self._open()
        header = []
        while True:
            segm = XMLCorpusView.read_block(self, self._stream)
            if len(segm) == 0:
                break
            header.extend(segm)
        self.close()
        return header

    @staticmethod
    def _bibl_field(elt, path):
        # Join the stripped text of all elements matching *path* with
        # newlines.  An empty *list* is returned when nothing matches,
        # preserving the historical return type for missing fields.
        found = elt.findall(path)
        if not found:
            return []
        return "\n".join(node.text.strip() for node in found)

    def handle_elt(self, elt, context):
        """Turn one <sourceDesc> element into a dict of <bibl> fields."""
        return {
            "title": self._bibl_field(elt, "bibl/title"),
            "author": self._bibl_field(elt, "bibl/author"),
            "date": self._bibl_field(elt, "bibl/date"),
            "publisher": self._bibl_field(elt, "bibl/publisher"),
            "idno": self._bibl_field(elt, "bibl/idno"),
            "note": self._bibl_field(elt, "bibl/note"),
        }
class XML_Tool:
    """
    Helper class creating xml file to one without references to nkjp: namespace.
    That's needed because the XMLCorpusView assumes that one can find short substrings
    of XML that are valid XML, which is not true if a namespace is declared at top level
    """

    # Literal markup stripped from ann_segmentation.xml.
    _LITERAL_TAGS = ("<nkjp:paren>", "</nkjp:paren>", "<choice>", "</choice>")

    def __init__(self, root, filename):
        self.read_file = os.path.join(root, filename)
        # Open the temp file in *text* mode: build_preprocessed_file writes
        # decoded str lines.  (The previous default binary mode made every
        # write fail with TypeError.)  delete=False so the file survives
        # until remove_preprocessed_file() is called.
        self.write_file = tempfile.NamedTemporaryFile("w", delete=False)

    def build_preprocessed_file(self):
        """
        Copy ``read_file`` into the temporary file with all nkjp: namespace
        references removed, and return the temporary file's name.  On any
        failure the temporary file is removed and a wrapped exception is
        raised.
        """
        try:
            with open(self.read_file) as src, self.write_file as dst:
                for line in src:
                    # nkjp:attr="..." attributes (present in all files);
                    # each match is replaced by a single space, exactly as
                    # the old re.split + " ".join pair did.
                    line = re.sub(r"nkjp:[^ ]* ", " ", line)
                    # Literal tags found in ann_segmentation.xml.
                    for tag in self._LITERAL_TAGS:
                        line = line.replace(tag, " ")
                    dst.write(line)
            return self.write_file.name
        except Exception as e:
            self.remove_preprocessed_file()
            raise Exception from e

    def remove_preprocessed_file(self):
        # Delete the temporary copy (delete=False means we own cleanup).
        os.remove(self.write_file.name)
class NKJPCorpus_Segmentation_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    ann_segmentation.xml files in NKJP corpus.
    """
    def __init__(self, filename, **kwargs):
        self.tagspec = ".*p/.*s"
        # intersperse NKJPCorpus_Text_View
        # (reads text.xml once so sentence offsets can be resolved below)
        self.text_view = NKJPCorpus_Text_View(
            filename, mode=NKJPCorpus_Text_View.SENTS_MODE
        )
        self.text_view.handle_query()
        # xml preprocessing
        self.xml_tool = XML_Tool(filename, "ann_segmentation.xml")
        # base class init
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )
    # The helpers below pick apart a <seg> ``corresp`` attribute, which
    # (judging from the parsing here) has the shape
    # ``target(segment_id,offset,length)`` — id of the text segment, the
    # start offset, and the length within that segment.
    def get_segm_id(self, example_word):
        # Text-segment id: first comma-field inside the parentheses.
        return example_word.split("(")[1].split(",")[0]
    def get_sent_beg(self, beg_word):
        # returns index of beginning letter in sentence
        return int(beg_word.split(",")[1])
    def get_sent_end(self, end_word):
        # returns index of end letter in sentence (offset + length)
        splitted = end_word.split(")")[0].split(",")
        return int(splitted[1]) + int(splitted[2])
    def get_sentences(self, sent_segm):
        # returns one sentence: slice the referenced text segment from the
        # first word's start offset to the last word's end offset
        id = self.get_segm_id(sent_segm[0])
        segm = self.text_view.segm_dict[id]  # text segment
        beg = self.get_sent_beg(sent_segm[0])
        end = self.get_sent_end(sent_segm[len(sent_segm) - 1])
        return segm[beg:end]
    def remove_choice(self, segm):
        # Keep only one alternative where the annotation offers a choice:
        # words whose start offset does not move forward (and that stay in
        # the same text segment) are overlapping alternatives and dropped.
        ret = []
        prev_txt_end = -1
        prev_txt_nr = -1
        for word in segm:
            txt_nr = self.get_segm_id(word)
            # get increasing sequence of ids: in case of choice get first possibility
            if self.get_sent_beg(word) > prev_txt_end - 1 or prev_txt_nr != txt_nr:
                ret.append(word)
                prev_txt_end = self.get_sent_end(word)
                prev_txt_nr = txt_nr
        return ret
    def handle_query(self):
        """Read all sentences, clean up the preprocessed temp file, and
        return the list of sentence strings."""
        try:
            self._open()
            sentences = []
            while True:
                sent_segm = XMLCorpusView.read_block(self, self._stream)
                if len(sent_segm) == 0:
                    break
                for segm in sent_segm:
                    segm = self.remove_choice(segm)
                    sentences.append(self.get_sentences(segm))
            self.close()
            self.xml_tool.remove_preprocessed_file()
            return sentences
        except Exception as e:
            # Ensure the temp file is removed even on failure.
            self.xml_tool.remove_preprocessed_file()
            raise Exception from e
    def handle_elt(self, elt, context):
        # One sentence element: collect the ``corresp`` pointer of each
        # contained <seg>.
        ret = []
        for seg in elt:
            ret.append(seg.get("corresp"))
        return ret
class NKJPCorpus_Text_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    text.xml files in NKJP corpus.

    ``mode`` selects between SENTS_MODE (additionally record each <ab>
    segment in ``segm_dict`` for later offset lookups by the segmentation
    view) and RAW_MODE (just return the raw text).
    """

    SENTS_MODE = 0
    RAW_MODE = 1

    def __init__(self, filename, **kwargs):
        # Use the named constant instead of a magic 0 for the default mode.
        self.mode = kwargs.pop("mode", NKJPCorpus_Text_View.SENTS_MODE)
        self.tagspec = ".*/div/ab"
        # Maps segment id -> segment text; filled by handle_elt in SENTS_MODE.
        self.segm_dict = dict()
        # xml preprocessing
        self.xml_tool = XML_Tool(filename, "text.xml")
        # base class init
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )

    def handle_query(self):
        """Read the whole document, remove the preprocessed temp file, and
        return the result of :meth:`read_block`."""
        try:
            self._open()
            text = self.read_block(self._stream)
            self.close()
            self.xml_tool.remove_preprocessed_file()
            return text
        except Exception as e:
            self.xml_tool.remove_preprocessed_file()
            raise Exception from e

    def read_block(self, stream, tagspec=None, elt_handler=None):
        """
        Returns text as a list of sentences.
        """
        txt = []
        while True:
            segm = XMLCorpusView.read_block(self, stream)
            if len(segm) == 0:
                break
            txt.extend(segm)
        # Single-element list: all segments joined into one string.
        # (Was ``" ".join([segm for segm in txt])`` — a redundant identity
        # comprehension.)
        return [" ".join(txt)]

    def get_segm_id(self, elt):
        # The id attribute is namespace-qualified, so match on its suffix.
        for attr in elt.attrib:
            if attr.endswith("id"):
                return elt.get(attr)

    def handle_elt(self, elt, context):
        # fill dictionary to use later in sents mode
        if self.mode is NKJPCorpus_Text_View.SENTS_MODE:
            self.segm_dict[self.get_segm_id(elt)] = elt.text
        return elt.text
class NKJPCorpus_Morph_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    ann_morphosyntax.xml files in NKJP corpus.
    """
    def __init__(self, filename, **kwargs):
        # ``tags``: optional list of ctag values (e.g. ['subst', 'comp'])
        # used to filter the returned words; None means "return all".
        self.tags = kwargs.pop("tags", None)
        self.tagspec = ".*/seg/fs"
        self.xml_tool = XML_Tool(filename, "ann_morphosyntax.xml")
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )
    def handle_query(self):
        """Read the whole file, remove the preprocessed temp file, and
        return the list of accepted words (None entries are dropped)."""
        try:
            self._open()
            words = []
            while True:
                segm = XMLCorpusView.read_block(self, self._stream)
                if len(segm) == 0:
                    break
                for part in segm:
                    if part is not None:
                        words.append(part)
            self.close()
            self.xml_tool.remove_preprocessed_file()
            return words
        except Exception as e:
            # Ensure the temp file is removed even on failure.
            self.xml_tool.remove_preprocessed_file()
            raise Exception from e
    def handle_elt(self, elt, context):
        """Extract the orthographic form from one <seg>/<fs> element.
        Returns the word when it passes the tag filter and is not an
        "interp" segment; otherwise returns None (implicitly)."""
        word = ""
        flag = False
        is_not_interp = True
        # if tags not specified, then always return word
        if self.tags is None:
            flag = True
        for child in elt:
            # get word
            # (the feature named "orth" carries the orthographic form)
            if "name" in child.keys() and child.attrib["name"] == "orth":
                for symbol in child:
                    if symbol.tag == "string":
                        word = symbol.text
            elif "name" in child.keys() and child.attrib["name"] == "interps":
                for symbol in child:
                    # Elements of type "lex" hold lexical interpretations.
                    if "type" in symbol.keys() and symbol.attrib["type"] == "lex":
                        for symbol2 in symbol:
                            # The feature named "ctag" holds the
                            # grammatical-class tag.
                            if (
                                "name" in symbol2.keys()
                                and symbol2.attrib["name"] == "ctag"
                            ):
                                for symbol3 in symbol2:
                                    # Accept the word when its ctag is one
                                    # of the requested tags...
                                    if (
                                        "value" in symbol3.keys()
                                        and self.tags is not None
                                        and symbol3.attrib["value"] in self.tags
                                    ):
                                        flag = True
                                    # ...but never "interp" segments
                                    # (presumably punctuation in the NKJP
                                    # tagset — verify).
                                    elif (
                                        "value" in symbol3.keys()
                                        and symbol3.attrib["value"] == "interp"
                                    ):
                                        is_not_interp = False
        if flag and is_not_interp:
            return word

View File

@@ -0,0 +1,465 @@
# Natural Language Toolkit: NomBank Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Paul Bedaride <paul.bedaride@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from functools import total_ordering
from xml.etree import ElementTree
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.internals import raise_unorderable_types
from nltk.tree import Tree
class NombankCorpusReader(CorpusReader):
    """
    Corpus reader for the nombank corpus, which augments the Penn
    Treebank with information about the predicate argument structure
    of every noun instance. The corpus consists of two parts: the
    predicate-argument annotations themselves, and a set of "frameset
    files" which define the argument labels used by the annotations,
    on a per-noun basis. Each "frameset file" contains one or more
    predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
    divided into coarse-grained word senses called "rolesets". For
    each "roleset", the frameset file provides descriptions of the
    argument roles, along with examples.
    """

    def __init__(
        self,
        root,
        nomfile,
        framefiles="",
        nounsfile=None,
        parse_fileid_xform=None,
        parse_corpus=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param nomfile: The name of the file containing the predicate-
            argument annotations (relative to ``root``).
        :param framefiles: A list or regexp specifying the frameset
            fileids for this corpus.
        :param parse_fileid_xform: A transform that should be applied
            to the fileids in this corpus. This should be a function
            of one argument (a fileid) that returns a string (the new
            fileid).
        :param parse_corpus: The corpus containing the parse trees
            corresponding to this corpus. These parse trees are
            necessary to resolve the tree pointers used by nombank.
        """
        # If framefiles is specified as a regexp, expand it.
        # (Bug fix: the expansion was previously assigned to
        # ``self._fileids`` and then unconditionally overwritten with
        # ``list(framefiles)``, which for a regexp string produced a
        # list of single characters.)
        if isinstance(framefiles, str):
            framefiles = find_corpus_fileids(root, framefiles)
        self._fileids = list(framefiles)
        # Initialize the corpus reader.
        CorpusReader.__init__(self, root, framefiles, encoding)
        # Record our nom file & nouns file.
        self._nomfile = nomfile
        self._nounsfile = nounsfile
        self._parse_fileid_xform = parse_fileid_xform
        self._parse_corpus = parse_corpus

    def instances(self, baseform=None):
        """
        :return: a corpus view that acts as a list of
            ``NombankInstance`` objects, one for each noun in the corpus.
        """
        kwargs = {}
        if baseform is not None:
            kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
        return StreamBackedCorpusView(
            self.abspath(self._nomfile),
            lambda stream: self._read_instance_block(stream, **kwargs),
            encoding=self.encoding(self._nomfile),
        )

    def lines(self):
        """
        :return: a corpus view that acts as a list of strings, one for
            each line in the predicate-argument annotation file.
        """
        return StreamBackedCorpusView(
            self.abspath(self._nomfile),
            read_line_block,
            encoding=self.encoding(self._nomfile),
        )

    def roleset(self, roleset_id):
        """
        :return: the xml description for the given roleset.
        """
        baseform = roleset_id.split(".")[0]
        baseform = baseform.replace("perc-sign", "%")
        # NOTE(review): the chained replace maps "oneslashonezero" (and
        # any literal "1/10") to "1-slash-10" — confirm against the frame
        # file naming scheme.
        baseform = baseform.replace("oneslashonezero", "1/10").replace(
            "1/10", "1-slash-10"
        )
        framefile = "frames/%s.xml" % baseform
        if framefile not in self.fileids():
            raise ValueError("Frameset file for %s not found" % roleset_id)
        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        with self.abspath(framefile).open() as fp:
            etree = ElementTree.parse(fp).getroot()
        for roleset in etree.findall("predicate/roleset"):
            if roleset.attrib["id"] == roleset_id:
                return roleset
        raise ValueError(f"Roleset {roleset_id} not found in {framefile}")

    def rolesets(self, baseform=None):
        """
        :return: list of xml descriptions for rolesets.
        """
        if baseform is not None:
            framefile = "frames/%s.xml" % baseform
            if framefile not in self.fileids():
                raise ValueError("Frameset file for %s not found" % baseform)
            framefiles = [framefile]
        else:
            framefiles = self.fileids()
        rsets = []
        for framefile in framefiles:
            # n.b.: The encoding for XML fileids is specified by the file
            # itself; so we ignore self._encoding here.
            with self.abspath(framefile).open() as fp:
                etree = ElementTree.parse(fp).getroot()
            rsets.append(etree.findall("predicate/roleset"))
        return LazyConcatenation(rsets)

    def nouns(self):
        """
        :return: a corpus view that acts as a list of all noun lemmas
            in this corpus (from the nombank.1.0.words file).
        """
        return StreamBackedCorpusView(
            self.abspath(self._nounsfile),
            read_line_block,
            encoding=self.encoding(self._nounsfile),
        )

    def _read_instance_block(self, stream, instance_filter=lambda inst: True):
        """Read up to 100 annotation lines from *stream*, parsing each
        non-blank line into a ``NombankInstance`` and keeping those that
        pass *instance_filter*."""
        block = []
        # Read 100 at a time.
        for i in range(100):
            line = stream.readline().strip()
            if line:
                inst = NombankInstance.parse(
                    line, self._parse_fileid_xform, self._parse_corpus
                )
                if instance_filter(inst):
                    block.append(inst)
        return block
######################################################################
# { Nombank Instance & related datatypes
######################################################################
class NombankInstance:
    """
    A single annotated instance from the nombank corpus: the location
    of a noun predicate in a treebank parse tree, together with the
    locations and labels of its arguments.
    """

    def __init__(
        self,
        fileid,
        sentnum,
        wordnum,
        baseform,
        sensenumber,
        predicate,
        predid,
        arguments,
        parse_corpus=None,
    ):
        # The name of the file containing the parse tree for this
        # instance's sentence.
        self.fileid = fileid
        # The sentence number of this sentence within ``fileid``
        # (indexing starts from zero).
        self.sentnum = sentnum
        # The word number of this instance's predicate within its
        # containing sentence.  Word numbers are indexed starting from
        # zero, and include traces and other empty parse elements.
        self.wordnum = wordnum
        # The baseform of the predicate.
        self.baseform = baseform
        # The sense number of the predicate.
        self.sensenumber = sensenumber
        # A ``NombankTreePointer`` indicating the position of this
        # instance's predicate within its containing sentence.
        self.predicate = predicate
        # Identifier of the predicate.
        self.predid = predid
        # A tuple of (argloc, argid) pairs, specifying the location and
        # identifier for each of the predicate's arguments.  Argument
        # identifiers are strings such as ``'ARG0'`` or ``'ARGM-TMP'``.
        # The predicate itself is *not* included.
        self.arguments = tuple(arguments)
        # A corpus reader for the parse trees corresponding to the
        # instances in this nombank corpus (may be None).
        self.parse_corpus = parse_corpus

    @property
    def roleset(self):
        """The name of the roleset used by this instance's predicate.
        Use ``nombank.roleset() <NombankCorpusReader.roleset>`` to
        look up information about the roleset."""
        # Escape characters that cannot appear in frame-file names;
        # this mirrors the inverse substitution performed when locating
        # the frame file for a roleset id.
        r = self.baseform.replace("%", "perc-sign")
        r = r.replace("1/10", "1-slash-10").replace("1-slash-10", "oneslashonezero")
        return f"{r}.{self.sensenumber}"

    def __repr__(self):
        return "<NombankInstance: {}, sent {}, word {}>".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
        )

    def __str__(self):
        # Reproduce the one-line nombank propositions format: header
        # fields followed by the sorted argument (and 'rel') pointers.
        s = "{} {} {} {} {}".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
            self.baseform,
            self.sensenumber,
        )
        items = self.arguments + ((self.predicate, "rel"),)
        for argloc, argid in sorted(items):
            s += f" {argloc}-{argid}"
        return s

    def _get_tree(self):
        # Look the sentence up in the attached parse corpus, if any.
        if self.parse_corpus is None:
            return None
        if self.fileid not in self.parse_corpus.fileids():
            return None
        return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]

    tree = property(
        _get_tree,
        doc="""
        The parse tree corresponding to this instance, or None if
        the corresponding tree is not available.""",
    )

    @staticmethod
    def parse(s, parse_fileid_xform=None, parse_corpus=None):
        """
        Parse one line of a nombank propositions file and return the
        corresponding ``NombankInstance``.

        :param s: the line to parse.
        :param parse_fileid_xform: optional function applied to the
            fileid field (e.g. to normalize paths).
        :param parse_corpus: parse-tree corpus reader to attach to the
            returned instance.
        :raises ValueError: if the line is not well formed.
        """
        pieces = s.split()
        if len(pieces) < 6:
            raise ValueError("Badly formatted nombank line: %r" % s)

        # Divide the line into its basic pieces.
        (fileid, sentnum, wordnum, baseform, sensenumber) = pieces[:5]
        args = pieces[5:]

        # Separate the predicate ('-rel') entry from the arguments.
        # Bug fix: the previous version popped elements from ``args``
        # while enumerating it, which skips the element that follows
        # each removal; filter into two lists instead.
        rel = [p for p in args if "-rel" in p]
        args = [p for p in args if "-rel" not in p]
        if len(rel) != 1:
            raise ValueError("Badly formatted nombank line: %r" % s)

        # Apply the fileid selector, if any.
        if parse_fileid_xform is not None:
            fileid = parse_fileid_xform(fileid)

        # Convert sentence & word numbers to ints.
        sentnum = int(sentnum)
        wordnum = int(wordnum)

        # Parse the predicate location.
        predloc, predid = rel[0].split("-", 1)
        predicate = NombankTreePointer.parse(predloc)

        # Parse the arguments.
        arguments = []
        for arg in args:
            argloc, argid = arg.split("-", 1)
            arguments.append((NombankTreePointer.parse(argloc), argid))

        # Put it all together.
        return NombankInstance(
            fileid,
            sentnum,
            wordnum,
            baseform,
            sensenumber,
            predicate,
            predid,
            arguments,
            parse_corpus,
        )
class NombankPointer:
    """
    Abstract base class for pointers that identify one or more
    constituents in a parse tree.  Three concrete subclasses exist:

    - ``NombankTreePointer``: a single constituent.
    - ``NombankSplitTreePointer``: a 'split' constituent made up of a
      sequence of two or more ``NombankTreePointer`` pointers.
    - ``NombankChainTreePointer``: an entire trace chain in a tree,
      whose pieces may be ``NombankTreePointer`` or
      ``NombankSplitTreePointer`` pointers.
    """

    def __init__(self):
        # The abstract base may not be instantiated directly.
        if type(self) is NombankPointer:
            raise NotImplementedError()
class NombankChainTreePointer(NombankPointer):
    """Pointer to an entire trace chain in a parse tree."""

    def __init__(self, pieces):
        # The pieces making up this chain; elements may be either
        # ``NombankSplitTreePointer`` or ``NombankTreePointer``
        # pointers.
        self.pieces = pieces

    def __str__(self):
        return "*".join("%s" % piece for piece in self.pieces)

    def __repr__(self):
        return "<NombankChainTreePointer: %s>" % self

    def select(self, tree):
        """Return a ``*CHAIN*`` tree combining each piece's selection."""
        if tree is None:
            raise ValueError("Parse tree not available")
        selected = [piece.select(tree) for piece in self.pieces]
        return Tree("*CHAIN*", selected)
class NombankSplitTreePointer(NombankPointer):
    """Pointer to a 'split' constituent: a sequence of tree pointers
    that together make up a single argument."""

    def __init__(self, pieces):
        # The pieces making up this split constituent; all elements
        # are ``NombankTreePointer`` pointers.
        self.pieces = pieces

    def __str__(self):
        return ",".join("%s" % piece for piece in self.pieces)

    def __repr__(self):
        return "<NombankSplitTreePointer: %s>" % self

    def select(self, tree):
        """Return a ``*SPLIT*`` tree combining each piece's selection."""
        if tree is None:
            raise ValueError("Parse tree not available")
        selected = [piece.select(tree) for piece in self.pieces]
        return Tree("*SPLIT*", selected)
@total_ordering
class NombankTreePointer(NombankPointer):
    """
    A pointer to a single constituent in a parse tree, written in
    nombank as ``wordnum:height``.  ``parse()`` also accepts the
    combined forms::

        wordnum:height*wordnum:height*...   (trace chain)
        wordnum:height,wordnum:height,...   (split constituent)

    which it dispatches to ``NombankChainTreePointer`` and
    ``NombankSplitTreePointer`` respectively.
    """

    def __init__(self, wordnum, height):
        # wordnum: index of the word in the sentence (zero-based,
        # counting traces and other empty elements).
        # height: number of levels to climb from the word's leaf node;
        # 0 selects the leaf's immediate parent (see treepos()).
        self.wordnum = wordnum
        self.height = height

    @staticmethod
    def parse(s):
        """Parse a nombank pointer string into the appropriate
        pointer object; raise ValueError for malformed input."""
        # Deal with chains (xx*yy*zz)
        pieces = s.split("*")
        if len(pieces) > 1:
            return NombankChainTreePointer(
                [NombankTreePointer.parse(elt) for elt in pieces]
            )
        # Deal with split args (xx,yy,zz)
        pieces = s.split(",")
        if len(pieces) > 1:
            return NombankSplitTreePointer(
                [NombankTreePointer.parse(elt) for elt in pieces]
            )
        # Deal with normal pointers.
        pieces = s.split(":")
        if len(pieces) != 2:
            raise ValueError("bad nombank pointer %r" % s)
        return NombankTreePointer(int(pieces[0]), int(pieces[1]))

    def __str__(self):
        return f"{self.wordnum}:{self.height}"

    def __repr__(self):
        return "NombankTreePointer(%d, %d)" % (self.wordnum, self.height)

    def __eq__(self, other):
        # A chain or split pointer is compared via its first piece.
        while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)):
            other = other.pieces[0]
        # Fall back to identity for non-pointer operands.
        if not isinstance(other, NombankTreePointer):
            return self is other
        return self.wordnum == other.wordnum and self.height == other.height

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        # Order by word number, breaking ties by *descending* height
        # (higher constituents sort first at the same word).
        while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)):
            other = other.pieces[0]
        if not isinstance(other, NombankTreePointer):
            return id(self) < id(other)
        return (self.wordnum, -self.height) < (other.wordnum, -other.height)

    def select(self, tree):
        """Return the constituent of ``tree`` that this pointer
        identifies."""
        if tree is None:
            raise ValueError("Parse tree not available")
        return tree[self.treepos(tree)]

    def treepos(self, tree):
        """
        Convert this pointer to a standard 'tree position' pointer,
        given that it points to the given tree.
        """
        if tree is None:
            raise ValueError("Parse tree not available")
        # Depth-first traversal: ``stack`` holds the path of nodes from
        # the root to the current node, and ``treepos`` the matching
        # child indices.
        stack = [tree]
        treepos = []
        wordnum = 0
        while True:
            # tree node:
            if isinstance(stack[-1], Tree):
                # Select the next child.
                if len(treepos) < len(stack):
                    treepos.append(0)
                else:
                    treepos[-1] += 1
                # Update the stack.
                if treepos[-1] < len(stack[-1]):
                    stack.append(stack[-1][treepos[-1]])
                else:
                    # End of node's child list: pop up a level.
                    stack.pop()
                    treepos.pop()
            # word node:
            else:
                if wordnum == self.wordnum:
                    # ``treepos`` currently addresses the leaf; dropping
                    # the last height+1 indices climbs up to the target
                    # constituent.
                    return tuple(treepos[: len(treepos) - self.height - 1])
                else:
                    wordnum += 1
                    stack.pop()

View File

@@ -0,0 +1,90 @@
# Natural Language Toolkit: NPS Chat Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re
import textwrap
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.xmldocs import *
from nltk.internals import ElementWrapper
from nltk.tag import map_tag
from nltk.util import LazyConcatenation
class NPSChatCorpusReader(XMLCorpusReader):
    """
    Reader for the NPS Chat corpus: XML files whose posts are located
    at ``Session/Posts/Post``, each carrying tagged terminal tokens
    under its ``terminals`` element.
    """

    def __init__(self, root, fileids, wrap_etree=False, tagset=None):
        """
        :param tagset: name of the tagset used by the corpus files;
            consulted when mapping tags via the ``tagset`` argument of
            the tagged_* methods.
        """
        XMLCorpusReader.__init__(self, root, fileids, wrap_etree)
        self._tagset = tagset

    def xml_posts(self, fileids=None):
        """
        Return the given file(s) as a list of post XML elements
        (wrapped in ``ElementWrapper`` objects if ``wrap_etree`` was
        set at construction time).
        """
        if self._wrap_etree:
            return concat(
                [
                    XMLCorpusView(fileid, "Session/Posts/Post", self._wrap_elt)
                    for fileid in self.abspaths(fileids)
                ]
            )
        else:
            return concat(
                [
                    XMLCorpusView(fileid, "Session/Posts/Post")
                    for fileid in self.abspaths(fileids)
                ]
            )

    def posts(self, fileids=None):
        """
        Return the given file(s) as a list of posts, each encoded as a
        list of word strings.
        """
        return concat(
            [
                XMLCorpusView(
                    fileid, "Session/Posts/Post/terminals", self._elt_to_words
                )
                for fileid in self.abspaths(fileids)
            ]
        )

    def tagged_posts(self, fileids=None, tagset=None):
        """
        Return the given file(s) as a list of posts, each encoded as a
        list of ``(word, tag)`` tuples, optionally mapped to ``tagset``.
        """

        def reader(elt, handler):
            return self._elt_to_tagged_words(elt, handler, tagset)

        return concat(
            [
                XMLCorpusView(fileid, "Session/Posts/Post/terminals", reader)
                for fileid in self.abspaths(fileids)
            ]
        )

    def words(self, fileids=None):
        """Return the given file(s) as a flat list of words."""
        return LazyConcatenation(self.posts(fileids))

    def tagged_words(self, fileids=None, tagset=None):
        """Return the given file(s) as a flat list of (word, tag) tuples."""
        return LazyConcatenation(self.tagged_posts(fileids, tagset))

    def _wrap_elt(self, elt, handler):
        return ElementWrapper(elt)

    def _elt_to_words(self, elt, handler):
        # Each <t> terminal carries its token in the 'word' attribute.
        return [self._simplify_username(t.attrib["word"]) for t in elt.findall("t")]

    def _elt_to_tagged_words(self, elt, handler, tagset=None):
        tagged_post = [
            (self._simplify_username(t.attrib["word"]), t.attrib["pos"])
            for t in elt.findall("t")
        ]
        # Only remap tags when a different target tagset is requested.
        if tagset and tagset != self._tagset:
            tagged_post = [
                (w, map_tag(self._tagset, tagset, t)) for (w, t) in tagged_post
            ]
        return tagged_post

    @staticmethod
    def _simplify_username(word):
        # Bug fix: decode legacy byte strings *before* the substring
        # test -- on Python 3, ``"User" in word`` raises TypeError when
        # ``word`` is bytes, which made the decode branch unreachable.
        if isinstance(word, bytes):
            word = word.decode("ascii")
        # Shorten tokens containing 'User': keep only the suffix after
        # the first occurrence, prefixed with 'U'.
        if "User" in word:
            word = "U" + word.split("User", 1)[1]
        return word

View File

@@ -0,0 +1,125 @@
# Natural Language Toolkit: Opinion Lexicon Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for the Opinion Lexicon.
Opinion Lexicon information
===========================
Authors: Minqing Hu and Bing Liu, 2004.
Department of Computer Science
University of Illinois at Chicago
Contact: Bing Liu, liub@cs.uic.edu
https://www.cs.uic.edu/~liub
Distributed with permission.
Related papers:
- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery
& Data Mining (KDD-04), Aug 22-25, 2004, Seattle, Washington, USA.
- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and
Comparing Opinions on the Web". Proceedings of the 14th International World
Wide Web conference (WWW-2005), May 10-14, 2005, Chiba, Japan.
"""
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus.reader.api import *
class IgnoreReadmeCorpusView(StreamBackedCorpusView):
    """
    A corpus view that skips the readme block at the top of a corpus
    file, so that iteration starts at the first data line.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Open the underlying stream now, consume the leading readme
        # (everything up to the first blank-line boundary), and anchor
        # the view's first block position right after it.
        self._open()
        read_blankline_block(self._stream)
        self._filepos = [self._stream.tell()]
class OpinionLexiconCorpusReader(WordListCorpusReader):
    """
    Reader for Liu and Hu opinion lexicon. Blank lines and readme are ignored.

        >>> from nltk.corpus import opinion_lexicon
        >>> opinion_lexicon.words()
        ['2-faced', '2-faces', 'abnormal', 'abolish', ...]

    The OpinionLexiconCorpusReader provides shortcuts to retrieve positive/negative
    words:

        >>> opinion_lexicon.negative()
        ['2-faced', '2-faces', 'abnormal', 'abolish', ...]

    Note that words from `words()` method are sorted by file id, not alphabetically:

        >>> opinion_lexicon.words()[0:10] # doctest: +NORMALIZE_WHITESPACE
        ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably',
        'abominate', 'abomination', 'abort', 'aborted']
        >>> sorted(opinion_lexicon.words())[0:10] # doctest: +NORMALIZE_WHITESPACE
        ['2-faced', '2-faces', 'a+', 'abnormal', 'abolish', 'abominable', 'abominably',
        'abominate', 'abomination', 'abort']
    """

    # View class that skips the readme header of each lexicon file.
    CorpusView = IgnoreReadmeCorpusView

    def words(self, fileids=None):
        """
        Return all words in the opinion lexicon. Note that these words are not
        sorted in alphabetical order.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def positive(self):
        """
        Return all positive words in alphabetical order.

        :return: a list of positive words.
        :rtype: list(str)
        """
        return self.words("positive-words.txt")

    def negative(self):
        """
        Return all negative words in alphabetical order.

        :return: a list of negative words.
        :rtype: list(str)
        """
        return self.words("negative-words.txt")

    def _read_word_block(self, stream):
        """
        Read one block (up to 20 lines) of words from ``stream``.

        Bug fix: the previous version appended the empty string for
        blank lines (a blank line reads as ``"\\n"``, which is truthy,
        so ``line.strip()`` -- i.e. ``""`` -- was appended), which
        contradicted the documented behavior that blank lines are
        ignored.  It also kept issuing reads after end of file.
        """
        words = []
        for _ in range(20):  # Read (up to) 20 lines at a time.
            line = stream.readline()
            if not line:  # End of file.
                break
            word = line.strip()
            if word:  # Ignore blank lines.
                words.append(word)
        return words

View File

@@ -0,0 +1,174 @@
# Natural Language Toolkit: PanLex Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: David Kamholz <kamholz@panlex.org>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for PanLex Lite, a stripped down version of PanLex distributed
as an SQLite database. See the README.txt in the panlex_lite corpus directory
for more information on PanLex Lite.
"""
import os
import sqlite3
from nltk.corpus.reader.api import CorpusReader
class PanLexLiteCorpusReader(CorpusReader):
    """
    Reader for PanLex Lite, a stripped-down distribution of the PanLex
    database shipped as a single SQLite file (``db.sqlite`` under the
    corpus root).
    """

    # Rows: (meaning id, quality, source id, source group, translated
    # text, translated language variety id) for every other expression
    # sharing a meaning with the given (text, language variety id),
    # best quality first.
    MEANING_Q = """
        SELECT dnx2.mn, dnx2.uq, dnx2.ap, dnx2.ui, ex2.tt, ex2.lv
        FROM dnx
        JOIN ex ON (ex.ex = dnx.ex)
        JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
        JOIN ex ex2 ON (ex2.ex = dnx2.ex)
        WHERE dnx.ex != dnx2.ex AND ex.tt = ? AND ex.lv = ?
        ORDER BY dnx2.uq DESC
    """

    # Rows: (translated text, translation quality) for translating the
    # given (source language variety id, text) into the given target
    # language variety id, best quality first.
    TRANSLATION_Q = """
        SELECT s.tt, sum(s.uq) AS trq FROM (
            SELECT ex2.tt, max(dnx.uq) AS uq
            FROM dnx
            JOIN ex ON (ex.ex = dnx.ex)
            JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
            JOIN ex ex2 ON (ex2.ex = dnx2.ex)
            WHERE dnx.ex != dnx2.ex AND ex.lv = ? AND ex.tt = ? AND ex2.lv = ?
            GROUP BY ex2.tt, dnx.ui
        ) s
        GROUP BY s.tt
        ORDER BY trq DESC, s.tt
    """

    def __init__(self, root):
        # n.b.: CorpusReader.__init__ is not invoked here; this reader
        # operates directly on the SQLite database rather than fileids.
        self._c = sqlite3.connect(os.path.join(root, "db.sqlite")).cursor()

        # Two-way maps between seven-character uniform identifiers
        # (uid) and integer language-variety ids (lv).
        self._uid_lv = {}
        self._lv_uid = {}

        for row in self._c.execute("SELECT uid, lv FROM lv"):
            self._uid_lv[row[0]] = row[1]
            self._lv_uid[row[1]] = row[0]

    def language_varieties(self, lc=None):
        """
        Return a list of PanLex language varieties.

        :param lc: ISO 639 alpha-3 code. If specified, filters returned varieties
            by this code. If unspecified, all varieties are returned.
        :return: the specified language varieties as a list of tuples. The first
            element is the language variety's seven-character uniform identifier,
            and the second element is its default name.
        :rtype: list(tuple)
        """
        if lc is None:
            return self._c.execute("SELECT uid, tt FROM lv ORDER BY uid").fetchall()
        else:
            return self._c.execute(
                "SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid", (lc,)
            ).fetchall()

    def meanings(self, expr_uid, expr_tt):
        """
        Return a list of meanings for an expression.

        :param expr_uid: the expression's language variety, as a seven-character
            uniform identifier.
        :param expr_tt: the expression's text.
        :return: a list of Meaning objects.
        :rtype: list(Meaning)
        """
        expr_lv = self._uid_lv[expr_uid]

        # Accumulate each meaning's attributes and, keyed by uid, the
        # expression texts attested for it.
        mn_info = {}

        for i in self._c.execute(self.MEANING_Q, (expr_tt, expr_lv)):
            mn = i[0]
            uid = self._lv_uid[i[5]]

            if mn not in mn_info:
                mn_info[mn] = {
                    "uq": i[1],
                    "ap": i[2],
                    "ui": i[3],
                    "ex": {expr_uid: [expr_tt]},
                }

            if uid not in mn_info[mn]["ex"]:
                mn_info[mn]["ex"][uid] = []

            mn_info[mn]["ex"][uid].append(i[4])

        return [Meaning(mn, mn_info[mn]) for mn in mn_info]

    def translations(self, from_uid, from_tt, to_uid):
        """
        Return a list of translations for an expression into a single language
        variety.

        :param from_uid: the source expression's language variety, as a
            seven-character uniform identifier.
        :param from_tt: the source expression's text.
        :param to_uid: the target language variety, as a seven-character
            uniform identifier.
        :return: a list of translation tuples. The first element is the expression
            text and the second element is the translation quality.
        :rtype: list(tuple)
        """
        from_lv = self._uid_lv[from_uid]
        to_lv = self._uid_lv[to_uid]

        return self._c.execute(self.TRANSLATION_Q, (from_lv, from_tt, to_lv)).fetchall()
class Meaning(dict):
    """
    Represents a single PanLex meaning: a translation set derived from
    a single source.  Behaves as a dict of the meaning's attributes,
    with the meaning id stored under the ``"mn"`` key.
    """

    def __init__(self, mn, attr):
        # Copy the attribute mapping, then record the meaning id.
        super().__init__(attr)
        self["mn"] = mn

    def id(self):
        """
        :return: the meaning's id.
        :rtype: int
        """
        return self["mn"]

    def quality(self):
        """
        :return: the meaning's source's quality (0=worst, 9=best).
        :rtype: int
        """
        return self["uq"]

    def source(self):
        """
        :return: the meaning's source id.
        :rtype: int
        """
        return self["ap"]

    def source_group(self):
        """
        :return: the meaning's source group id.
        :rtype: int
        """
        return self["ui"]

    def expressions(self):
        """
        :return: the meaning's expressions as a dictionary whose keys are language
            variety uniform identifiers and whose values are lists of expression
            texts.
        :rtype: dict
        """
        return self["ex"]

View File

@@ -0,0 +1,95 @@
# Natural Language Toolkit: Word List Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re
from collections import defaultdict, namedtuple
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.wordlist import WordListCorpusReader
from nltk.tokenize import line_tokenize
# One record per language variety listed in the corpus' ``langs*.txt``
# table; the fields appear in the order of that file's tab-separated
# columns (see PanlexSwadeshCorpusReader.get_languages).
PanlexLanguage = namedtuple(
    "PanlexLanguage",
    [
        "panlex_uid",  # (1) PanLex UID
        "iso639",  # (2) ISO 639 language code
        "iso639_type",  # (3) ISO 639 language type, see README
        "script",  # (4) normal scripts of expressions
        "name",  # (5) PanLex default name
        "langvar_uid",  # (6) UID of the language variety in which the default name is an expression
    ],
)
class PanlexSwadeshCorpusReader(WordListCorpusReader):
    """
    Reader for the PanLex Swadesh lists from

    David Kamholz, Jonathan Pool, and Susan M. Colowick (2014).
    PanLex: Building a Resource for Panlingual Lexical Translation.
    In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf

    License: CC0 1.0 Universal
    https://creativecommons.org/publicdomain/zero/1.0/legalcode
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Infer the Swadesh list size (e.g. "110") from the directory
        # component of the first fileid ("swadesh<size>/...").
        first_fileid = self.fileids()[0]
        self.swadesh_size = re.match(r"swadesh([0-9].*)\/", first_fileid).group(1)
        self._languages = {lang.panlex_uid: lang for lang in self.get_languages()}
        self._macro_langauges = self.get_macrolanguages()

    def license(self):
        """Return the license string for this corpus."""
        return "CC0 1.0 Universal"

    def language_codes(self):
        """Return the PanLex UIDs of all languages in the corpus."""
        return self._languages.keys()

    def get_languages(self):
        """Yield one ``PanlexLanguage`` per row of the langs table."""
        table = self.raw(f"langs{self.swadesh_size}.txt")
        for row in table.split("\n"):
            if not row.strip():  # Skip empty lines.
                continue
            yield PanlexLanguage(*row.strip().split("\t"))

    def get_macrolanguages(self):
        """Group PanLex UIDs by their ISO 639 language code."""
        grouped = defaultdict(list)
        for lang in self._languages.values():
            grouped[lang.iso639].append(lang.panlex_uid)
        return grouped

    def words_by_lang(self, lang_code):
        """
        :return: a list of list(str)
        """
        fileid = f"swadesh{self.swadesh_size}/{lang_code}.txt"
        return [concept.split("\t") for concept in self.words(fileid)]

    def words_by_iso639(self, iso63_code):
        """
        :return: a list of list(str)
        """
        result = []
        for lang_code in self._macro_langauges[iso63_code]:
            fileid = f"swadesh{self.swadesh_size}/{lang_code}.txt"
            result.extend(concept.split("\t") for concept in self.words(fileid))
        return result

    def entries(self, fileids=None):
        """
        :return: a tuple of words for the specified fileids.
        """
        if not fileids:
            fileids = self.fileids()
        word_lists = [self.words(fileid) for fileid in fileids]
        return list(zip(*word_lists))

View File

@@ -0,0 +1,373 @@
# Natural Language Toolkit:
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Piotr Kasprzyk <p.j.kasprzyk@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import XMLCorpusReader
# Regular expressions used to pull apart the corpus' TEI-style markup
# (see TEICorpusView.read_block and TEICorpusView._parse_tag).
PARA = re.compile(r"<p(?: [^>]*){0,1}>(.*?)</p>")  # paragraph contents
SENT = re.compile(r"<s(?: [^>]*){0,1}>(.*?)</s>")  # sentence contents
TAGGEDWORD = re.compile(r"<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>")  # (tag-part, word)
WORD = re.compile(r"<[wc](?: [^>]*){0,1}>(.*?)</[wc]>")  # word/char contents
TYPE = re.compile(r'type="(.*?)"')  # 'type' attribute value (on <c> elements)
ANA = re.compile(r'ana="(.*?)"')  # 'ana' attribute value (on <w> elements)
TEXTID = re.compile(r'text id="(.*?)"')  # text ids occurring in a block
class TEICorpusView(StreamBackedCorpusView):
    """
    A corpus view for the TEI-style XML files of the pl196x corpus.

    Blocks are extracted with regular expressions rather than an XML
    parser; each call to ``read_block`` yields words, sentences or
    paragraphs depending on the ``tagged``/``group_by_sent``/
    ``group_by_para`` flags given at construction time.
    """

    def __init__(
        self,
        corpus_file,
        tagged,
        group_by_sent,
        group_by_para,
        tagset=None,
        head_len=0,
        textids=None,
    ):
        # tagged: if true, yield (word, tag) tuples instead of words.
        self._tagged = tagged
        # textids: if given, only texts with these ids are kept.
        self._textids = textids

        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        # WARNING -- skip header: start reading head_len characters in.
        StreamBackedCorpusView.__init__(self, corpus_file, startpos=head_len)

    # Number of characters requested per readlines() call.
    _pagesize = 4096

    def read_block(self, stream):
        block = stream.readlines(self._pagesize)
        block = concat(block)
        # Keep reading until the block contains at least one complete
        # <text id=...>...</text> element (or the stream is exhausted).
        while (block.count("<text id") > block.count("</text>")) or block.count(
            "<text id"
        ) == 0:
            tmp = stream.readline()
            if len(tmp) <= 0:
                break
            block += tmp

        block = block.replace("\n", "")

        textids = TEXTID.findall(block)
        if self._textids:
            for tid in textids:
                if tid not in self._textids:
                    # Cut the whole unwanted <text>...</text> span out
                    # of the block; beg/end are computed relative to the
                    # position of the text id string.
                    beg = block.find(tid) - 1
                    end = block[beg:].find("</text>") + len("</text>")
                    block = block[:beg] + block[beg + end :]

        output = []
        # Regex-split the block into paragraphs, then sentences, then
        # words, flattening levels that were not requested.
        for para_str in PARA.findall(block):
            para = []
            for sent_str in SENT.findall(para_str):
                if not self._tagged:
                    sent = WORD.findall(sent_str)
                else:
                    sent = list(map(self._parse_tag, TAGGEDWORD.findall(sent_str)))
                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)
            if self._group_by_para:
                output.append(para)
            else:
                output.extend(para)

        return output

    def _parse_tag(self, tag_word_tuple):
        # <w ana="TAG">word</w> carries its tag in the 'ana' attribute;
        # <c type="TAG">char</c> carries it in 'type'.
        (tag, word) = tag_word_tuple
        if tag.startswith("w"):
            tag = ANA.search(tag).group(1)
        else:  # tag.startswith('c')
            tag = TYPE.search(tag).group(1)
        return word, tag
class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
    """
    Reader for the pl196x corpus: categorized TEI-style XML files that
    can additionally be filtered by text id (see ``textids()``).
    """

    # Length (in characters) of the fixed file header that
    # TEICorpusView skips at the start of each file.
    head_len = 2770

    def __init__(self, *args, **kwargs):
        # Optional path to a file mapping fileids to text ids.
        if "textid_file" in kwargs:
            self._textids = kwargs["textid_file"]
        else:
            self._textids = None

        XMLCorpusReader.__init__(self, *args)
        CategorizedCorpusReader.__init__(self, kwargs)

        self._init_textids()

    def _init_textids(self):
        # Build the fileid <-> textid maps from the optional mapping
        # file; each line is "<fileid> <textid-list>".
        self._f2t = defaultdict(list)
        self._t2f = defaultdict(list)
        if self._textids is not None:
            with open(self._textids) as fp:
                for line in fp:
                    line = line.strip()
                    file_id, text_ids = line.split(" ", 1)
                    if file_id not in self.fileids():
                        raise ValueError(
                            "In text_id mapping file %s: %s not found"
                            % (self._textids, file_id)
                        )
                    # NOTE(review): ``self._delimiter`` is not assigned
                    # anywhere in this file -- confirm where it is set
                    # before relying on textid mapping files.
                    for text_id in text_ids.split(self._delimiter):
                        self._add_textids(file_id, text_id)

    def _add_textids(self, file_id, text_id):
        # Record the two-way association between a file and a text id.
        self._f2t[file_id].append(text_id)
        self._t2f[text_id].append(file_id)

    def _resolve(self, fileids, categories, textids=None):
        """
        Resolve the (fileids, categories, textids) selectors into a
        ``(fileids, textid-map)`` pair, where the map (when not None)
        assigns to each fileid the set of requested text ids.

        NOTE(review): the check below requires *exactly one* of the
        three selectors to be None (i.e. two supplied), which looks
        inverted relative to the error message; confirm the intended
        semantics before changing it.
        """
        tmp = None
        if (
            len(
                list(
                    filter(
                        lambda accessor: accessor is None,
                        (fileids, categories, textids),
                    )
                )
            )
            != 1
        ):
            raise ValueError(
                "Specify exactly one of: fileids, " "categories or textids"
            )

        if fileids is not None:
            return fileids, None

        if categories is not None:
            return self.fileids(categories), None

        if textids is not None:
            if isinstance(textids, str):
                textids = [textids]
            # All files containing any of the requested text ids ...
            files = sum((self._t2f[t] for t in textids), [])
            tdict = dict()
            # ... and, per file, which of those text ids it holds.
            for f in files:
                tdict[f] = set(self._f2t[f]) & set(textids)
            return files, tdict

    def decode_tag(self, tag):
        # to be implemented
        return tag

    def textids(self, fileids=None, categories=None):
        """
        In the pl196x corpus each category is stored in single
        file and thus both methods provide identical functionality. In order
        to accommodate finer granularity, a non-standard textids() method was
        implemented. All the main functions can be supplied with a list
        of required chunks---giving much more control to the user.
        """
        fileids, _ = self._resolve(fileids, categories)
        if fileids is None:
            return sorted(self._t2f)

        if isinstance(fileids, str):
            fileids = [fileids]
        return sorted(sum((self._f2t[d] for d in fileids), []))

    def words(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as a flat list of words.

        The positional flags passed to TEICorpusView are
        (tagged, group_by_sent, group_by_para)."""
        fileids, textids = self._resolve(fileids, categories, textids)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]

        if textids:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid),
                        False,
                        False,
                        False,
                        head_len=self.head_len,
                        textids=textids[fileid],
                    )
                    for fileid in fileids
                ]
            )
        else:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid),
                        False,
                        False,
                        False,
                        head_len=self.head_len,
                    )
                    for fileid in fileids
                ]
            )

    def sents(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as a list of sentences, each a
        list of words."""
        fileids, textids = self._resolve(fileids, categories, textids)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]

        if textids:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid),
                        False,
                        True,
                        False,
                        head_len=self.head_len,
                        textids=textids[fileid],
                    )
                    for fileid in fileids
                ]
            )
        else:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid), False, True, False, head_len=self.head_len
                    )
                    for fileid in fileids
                ]
            )

    def paras(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as a list of paragraphs, each a
        list of sentences, each a list of words."""
        fileids, textids = self._resolve(fileids, categories, textids)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]

        if textids:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid),
                        False,
                        True,
                        True,
                        head_len=self.head_len,
                        textids=textids[fileid],
                    )
                    for fileid in fileids
                ]
            )
        else:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid), False, True, True, head_len=self.head_len
                    )
                    for fileid in fileids
                ]
            )

    def tagged_words(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as a flat list of (word, tag)
        tuples."""
        fileids, textids = self._resolve(fileids, categories, textids)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]

        if textids:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid),
                        True,
                        False,
                        False,
                        head_len=self.head_len,
                        textids=textids[fileid],
                    )
                    for fileid in fileids
                ]
            )
        else:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid), True, False, False, head_len=self.head_len
                    )
                    for fileid in fileids
                ]
            )

    def tagged_sents(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as a list of sentences of
        (word, tag) tuples."""
        fileids, textids = self._resolve(fileids, categories, textids)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]

        if textids:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid),
                        True,
                        True,
                        False,
                        head_len=self.head_len,
                        textids=textids[fileid],
                    )
                    for fileid in fileids
                ]
            )
        else:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid), True, True, False, head_len=self.head_len
                    )
                    for fileid in fileids
                ]
            )

    def tagged_paras(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as a list of paragraphs of
        sentences of (word, tag) tuples."""
        fileids, textids = self._resolve(fileids, categories, textids)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]

        if textids:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid),
                        True,
                        True,
                        True,
                        head_len=self.head_len,
                        textids=textids[fileid],
                    )
                    for fileid in fileids
                ]
            )
        else:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid), True, True, True, head_len=self.head_len
                    )
                    for fileid in fileids
                ]
            )

    def xml(self, fileids=None, categories=None):
        """Return the XML tree for the single selected file."""
        fileids, _ = self._resolve(fileids, categories)
        if len(fileids) == 1:
            return XMLCorpusReader.xml(self, fileids[0])
        else:
            raise TypeError("Expected a single file")

View File

@@ -0,0 +1,237 @@
# Natural Language Toolkit: Plaintext Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# Nitin Madnani <nmadnani@umiacs.umd.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A reader for corpora that consist of plaintext documents.
"""
import nltk.data
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tokenize import *
class PlaintextCorpusReader(CorpusReader):
    """
    Reader for corpora that consist of plaintext documents. Paragraphs
    are assumed to be split using blank lines. Sentences and words can
    be tokenized using the default tokenizers, or by custom tokenizers
    specified as parameters to the constructor.

    This corpus reader can be customized (e.g., to skip preface
    sections of specific document formats) by creating a subclass and
    overriding the ``CorpusView`` class variable.
    """

    CorpusView = StreamBackedCorpusView
    """The corpus view class used by this reader. Subclasses of
    ``PlaintextCorpusReader`` may specify alternative corpus view
    classes (e.g., to skip the preface sections of documents.)"""

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WordPunctTokenizer(),
        sent_tokenizer=None,
        para_block_reader=read_blankline_block,
        encoding="utf8",
    ):
        r"""
        Construct a new plaintext corpus reader for a set of documents
        located at the given root directory. Example usage:

            >>> root = '/usr/local/share/nltk_data/corpora/webtext/'
            >>> reader = PlaintextCorpusReader(root, '.*\.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking sentences or
            paragraphs into words.
        :param sent_tokenizer: Tokenizer for breaking paragraphs
            into sentences.  If None, a default Punkt tokenizer is
            created lazily on first use.
        :param para_block_reader: The block reader used to divide the
            corpus into paragraph blocks.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader

    def _ensure_sent_tokenizer(self):
        """Create the default sentence tokenizer on first use.

        Raises ValueError (chained to the underlying error) when no
        tokenizer is available.  Replaces two duplicated bare
        ``except:`` clauses that discarded the original failure and
        would also have swallowed KeyboardInterrupt/SystemExit.
        """
        if self._sent_tokenizer is None:
            try:
                self._sent_tokenizer = PunktTokenizer()
            except Exception as e:
                raise ValueError("No sentence tokenizer for this corpus") from e

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        self._ensure_sent_tokenizer()
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        self._ensure_sent_tokenizer()
        return concat(
            [
                self.CorpusView(path, self._read_para_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_word_block(self, stream):
        # Tokenize (up to) 20 lines at a time; readline() returns ""
        # at EOF, which tokenizes to nothing.
        words = []
        for i in range(20):
            words.extend(self._word_tokenizer.tokenize(stream.readline()))
        return words

    def _read_sent_block(self, stream):
        # Sentence- then word-tokenize each paragraph block, flattening
        # the paragraph level away.
        sents = []
        for para in self._para_block_reader(stream):
            sents.extend(
                [
                    self._word_tokenizer.tokenize(sent)
                    for sent in self._sent_tokenizer.tokenize(para)
                ]
            )
        return sents

    def _read_para_block(self, stream):
        # Like _read_sent_block, but keeps the paragraph level.
        paras = []
        for para in self._para_block_reader(stream):
            paras.append(
                [
                    self._word_tokenizer.tokenize(sent)
                    for sent in self._sent_tokenizer.tokenize(para)
                ]
            )
        return paras
class CategorizedPlaintextCorpusReader(CategorizedCorpusReader, PlaintextCorpusReader):
    """
    A reader for plaintext corpora whose documents are divided into
    categories based on their file identifiers.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader. Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor. The remaining arguments
        are passed to the ``PlaintextCorpusReader`` constructor.
        """
        # NOTE(review): CategorizedCorpusReader receives the kwargs
        # dict itself (not **kwargs); presumably it extracts its
        # categorization options before PlaintextCorpusReader sees the
        # remainder -- confirm in the api module.
        CategorizedCorpusReader.__init__(self, kwargs)
        PlaintextCorpusReader.__init__(self, *args, **kwargs)
class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader):
    """
    This class is identical with CategorizedPlaintextCorpusReader,
    except that it initializes a Portuguese PunktTokenizer:
    >>> from nltk.corpus import machado
    >>> print(machado._sent_tokenizer._lang)
    portuguese
    """
    def __init__(self, *args, **kwargs):
        CategorizedPlaintextCorpusReader.__init__(self, *args, **kwargs)
        # Fixed (@ekaf 2025), new way to invoke Punkt: replace the default
        # sentence tokenizer set up by the base class with a Portuguese one.
        self._sent_tokenizer = PunktTokenizer("portuguese")
class EuroparlCorpusReader(PlaintextCorpusReader):
    """
    Reader for Europarl corpora that consist of plaintext documents.
    Documents are divided into chapters instead of paragraphs as
    for regular plaintext documents. Chapters are separated using blank
    lines. Everything is inherited from ``PlaintextCorpusReader`` except
    that:
    - Since the corpus is pre-processed and pre-tokenized, the
      word tokenizer should just split the line at whitespaces.
    - For the same reason, the sentence tokenizer should just
      split the paragraph at line breaks.
    - There is a new 'chapters()' method that returns chapters
      instead of paragraphs.
    - The 'paras()' method inherited from PlaintextCorpusReader is
      made non-functional to remove any confusion between chapters
      and paragraphs for Europarl.
    """
    def _read_word_block(self, stream):
        # Pre-tokenized corpus: plain whitespace splitting is sufficient.
        words = []
        for i in range(20):  # Read 20 lines at a time.
            words.extend(stream.readline().split())
        return words
    def _read_sent_block(self, stream):
        # Each line of a paragraph block is one pre-tokenized sentence.
        sents = []
        for para in self._para_block_reader(stream):
            sents.extend([sent.split() for sent in para.splitlines()])
        return sents
    def _read_para_block(self, stream):
        # A "paragraph" block here corresponds to a Europarl chapter.
        paras = []
        for para in self._para_block_reader(stream):
            paras.append([sent.split() for sent in para.splitlines()])
        return paras
    def chapters(self, fileids=None):
        """
        :return: the given file(s) as a list of
            chapters, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        return concat(
            [
                self.CorpusView(fileid, self._read_para_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )
    def paras(self, fileids=None):
        """
        :raises NotImplementedError: always; use :meth:`chapters` instead.
        """
        raise NotImplementedError(
            "The Europarl corpus reader does not support paragraphs. Please use chapters() instead."
        )

View File

@@ -0,0 +1,95 @@
# Natural Language Toolkit: PP Attachment Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Read lines from the Prepositional Phrase Attachment Corpus.
The PP Attachment Corpus contains several files having the format:
sentence_id verb noun1 preposition noun2 attachment
For example:
42960 gives authority to administration V
46742 gives inventors of microchip N
The PP attachment is to the verb phrase (V) or noun phrase (N), i.e.:
(VP gives (NP authority) (PP to administration))
(VP gives (NP inventors (PP of microchip)))
The corpus contains the following files:
training: training set
devset: development test set, used for algorithm development.
test: test set, used to report results
bitstrings: word classes derived from Mutual Information Clustering for the Wall Street Journal.
Ratnaparkhi, Adwait (1994). A Maximum Entropy Model for Prepositional
Phrase Attachment. Proceedings of the ARPA Human Language Technology
Conference. [http://www.cis.upenn.edu/~adwait/papers/hlt94.ps]
The PP Attachment Corpus is distributed with NLTK with the permission
of the author.
"""
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
class PPAttachment:
    """
    A single PP-attachment instance: one annotated
    (sentence id, verb, noun1, preposition, noun2, attachment) record.
    """
    def __init__(self, sent, verb, noun1, prep, noun2, attachment):
        self.sent = sent
        self.verb = verb
        self.noun1 = noun1
        self.prep = prep
        self.noun2 = noun2
        self.attachment = attachment
    def __repr__(self):
        return (
            f"PPAttachment(sent={self.sent!r}, verb={self.verb!r}, "
            f"noun1={self.noun1!r}, prep={self.prep!r}, "
            f"noun2={self.noun2!r}, attachment={self.attachment!r})"
        )
class PPAttachmentCorpusReader(CorpusReader):
    """
    Reader for corpus files whose lines have the format::
        sentence_id verb noun1 preposition noun2 attachment
    """
    def attachments(self, fileids):
        """Return a corpus view of ``PPAttachment`` objects, one per line."""
        views = [
            StreamBackedCorpusView(fileid, self._read_obj_block, encoding=enc)
            for (fileid, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)
    def tuples(self, fileids):
        """Return a corpus view of raw field tuples, one per line."""
        views = [
            StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc)
            for (fileid, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)
    def _read_tuple_block(self, stream):
        # One line per block; an empty string from readline() means EOF.
        line = stream.readline()
        return [tuple(line.split())] if line else []
    def _read_obj_block(self, stream):
        line = stream.readline()
        return [PPAttachment(*line.split())] if line else []

View File

@@ -0,0 +1,519 @@
# Natural Language Toolkit: PropBank Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re
from functools import total_ordering
from xml.etree import ElementTree
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.internals import raise_unorderable_types
from nltk.tree import Tree
class PropbankCorpusReader(CorpusReader):
    """
    Corpus reader for the propbank corpus, which augments the Penn
    Treebank with information about the predicate argument structure
    of every verb instance. The corpus consists of two parts: the
    predicate-argument annotations themselves, and a set of "frameset
    files" which define the argument labels used by the annotations,
    on a per-verb basis. Each "frameset file" contains one or more
    predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
    divided into coarse-grained word senses called "rolesets". For
    each "roleset", the frameset file provides descriptions of the
    argument roles, along with examples.
    """
    def __init__(
        self,
        root,
        propfile,
        framefiles="",
        verbsfile=None,
        parse_fileid_xform=None,
        parse_corpus=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param propfile: The name of the file containing the predicate-
            argument annotations (relative to ``root``).
        :param framefiles: A list or regexp specifying the frameset
            fileids for this corpus.
        :param verbsfile: The name of the file listing the verb lemmas
            (relative to ``root``).
        :param parse_fileid_xform: A transform that should be applied
            to the fileids in this corpus. This should be a function
            of one argument (a fileid) that returns a string (the new
            fileid).
        :param parse_corpus: The corpus containing the parse trees
            corresponding to this corpus. These parse trees are
            necessary to resolve the tree pointers used by propbank.
        :param encoding: the encoding used to read the corpus files.
        """
        # If framefiles is specified as a regexp, expand it.
        if isinstance(framefiles, str):
            framefiles = find_corpus_fileids(root, framefiles)
        framefiles = list(framefiles)
        # Initialize the corpus reader.
        CorpusReader.__init__(self, root, [propfile, verbsfile] + framefiles, encoding)
        # Record our frame fileids & prop file.
        self._propfile = propfile
        self._framefiles = framefiles
        self._verbsfile = verbsfile
        self._parse_fileid_xform = parse_fileid_xform
        self._parse_corpus = parse_corpus
    def instances(self, baseform=None):
        """
        :param baseform: if given, restrict the view to instances whose
            predicate has this base form.
        :return: a corpus view that acts as a list of
            ``PropBankInstance`` objects, one for each verb instance in
            the corpus.
        """
        # Only pass an instance_filter when a baseform was requested, so the
        # default filter in _read_instance_block applies otherwise.
        kwargs = {}
        if baseform is not None:
            kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
        return StreamBackedCorpusView(
            self.abspath(self._propfile),
            lambda stream: self._read_instance_block(stream, **kwargs),
            encoding=self.encoding(self._propfile),
        )
    def lines(self):
        """
        :return: a corpus view that acts as a list of strings, one for
            each line in the predicate-argument annotation file.
        """
        return StreamBackedCorpusView(
            self.abspath(self._propfile),
            read_line_block,
            encoding=self.encoding(self._propfile),
        )
    def roleset(self, roleset_id):
        """
        :return: the xml description for the given roleset.
        :raises ValueError: if the frameset file or the roleset itself
            cannot be found.
        """
        # Roleset ids look like "<baseform>.<sensenumber>"; the frameset
        # file is named after the baseform.
        baseform = roleset_id.split(".")[0]
        framefile = "frames/%s.xml" % baseform
        if framefile not in self._framefiles:
            raise ValueError("Frameset file for %s not found" % roleset_id)
        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        with self.abspath(framefile).open() as fp:
            etree = ElementTree.parse(fp).getroot()
        # Linear scan over the file's rolesets for a matching id.
        for roleset in etree.findall("predicate/roleset"):
            if roleset.attrib["id"] == roleset_id:
                return roleset
        raise ValueError(f"Roleset {roleset_id} not found in {framefile}")
    def rolesets(self, baseform=None):
        """
        :param baseform: if given, only return rolesets from the frameset
            file for this base form.
        :return: list of xml descriptions for rolesets.
        """
        if baseform is not None:
            framefile = "frames/%s.xml" % baseform
            if framefile not in self._framefiles:
                raise ValueError("Frameset file for %s not found" % baseform)
            framefiles = [framefile]
        else:
            framefiles = self._framefiles
        rsets = []
        for framefile in framefiles:
            # n.b.: The encoding for XML fileids is specified by the file
            # itself; so we ignore self._encoding here.
            with self.abspath(framefile).open() as fp:
                etree = ElementTree.parse(fp).getroot()
            rsets.append(etree.findall("predicate/roleset"))
        return LazyConcatenation(rsets)
    def verbs(self):
        """
        :return: a corpus view that acts as a list of all verb lemmas
            in this corpus (from the verbs.txt file).
        """
        return StreamBackedCorpusView(
            self.abspath(self._verbsfile),
            read_line_block,
            encoding=self.encoding(self._verbsfile),
        )
    def _read_instance_block(self, stream, instance_filter=lambda inst: True):
        # Parse up to 100 annotation lines from the stream; blank lines are
        # skipped (but still consume one of the 100 iterations).
        block = []
        # Read 100 at a time.
        for i in range(100):
            line = stream.readline().strip()
            if line:
                inst = PropbankInstance.parse(
                    line, self._parse_fileid_xform, self._parse_corpus
                )
                if instance_filter(inst):
                    block.append(inst)
        return block
######################################################################
# { Propbank Instance & related datatypes
######################################################################
class PropbankInstance:
    """
    A single predicate-argument annotation: one line of the propbank
    annotation file, parsed into its component fields.
    """
    def __init__(
        self,
        fileid,
        sentnum,
        wordnum,
        tagger,
        roleset,
        inflection,
        predicate,
        arguments,
        parse_corpus=None,
    ):
        self.fileid = fileid
        """The name of the file containing the parse tree for this
        instance's sentence."""
        self.sentnum = sentnum
        """The sentence number of this sentence within ``fileid``.
        Indexing starts from zero."""
        self.wordnum = wordnum
        """The word number of this instance's predicate within its
        containing sentence.  Word numbers are indexed starting from
        zero, and include traces and other empty parse elements."""
        self.tagger = tagger
        """An identifier for the tagger who tagged this instance; or
        ``'gold'`` if this is an adjuticated instance."""
        self.roleset = roleset
        """The name of the roleset used by this instance's predicate.
        Use ``propbank.roleset() <PropbankCorpusReader.roleset>`` to
        look up information about the roleset."""
        self.inflection = inflection
        """A ``PropbankInflection`` object describing the inflection of
        this instance's predicate."""
        self.predicate = predicate
        """A ``PropbankTreePointer`` indicating the position of this
        instance's predicate within its containing sentence."""
        self.arguments = tuple(arguments)
        """A list of tuples (argloc, argid), specifying the location
        and identifier for each of the predicate's argument in the
        containing sentence.  Argument identifiers are strings such as
        ``'ARG0'`` or ``'ARGM-TMP'``.  This list does *not* contain
        the predicate."""
        self.parse_corpus = parse_corpus
        """A corpus reader for the parse trees corresponding to the
        instances in this propbank corpus."""
    @property
    def baseform(self):
        """The baseform of the predicate."""
        return self.roleset.split(".")[0]
    @property
    def sensenumber(self):
        """The sense number of the predicate."""
        return self.roleset.split(".")[1]
    @property
    def predid(self):
        """Identifier of the predicate."""
        return "rel"
    def __repr__(self):
        return "<PropbankInstance: {}, sent {}, word {}>".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
        )
    def __str__(self):
        # Serialize back into the propbank file format; the predicate is
        # re-inserted among the arguments as a "rel" item and the whole
        # list is emitted in sorted pointer order.
        s = "{} {} {} {} {} {}".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
            self.tagger,
            self.roleset,
            self.inflection,
        )
        items = self.arguments + ((self.predicate, "rel"),)
        for argloc, argid in sorted(items):
            s += f" {argloc}-{argid}"
        return s
    def _get_tree(self):
        # Resolve the parse tree lazily; None when no parse corpus was
        # supplied or the fileid is not part of it.
        if self.parse_corpus is None:
            return None
        if self.fileid not in self.parse_corpus.fileids():
            return None
        return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]
    tree = property(
        _get_tree,
        doc="""
        The parse tree corresponding to this instance, or None if
        the corresponding tree is not available.""",
    )
    @staticmethod
    def parse(s, parse_fileid_xform=None, parse_corpus=None):
        """Parse one annotation line into a ``PropbankInstance``.

        :raises ValueError: if the line does not have at least seven
            fields or does not contain exactly one ``-rel`` item.
        """
        pieces = s.split()
        if len(pieces) < 7:
            raise ValueError("Badly formatted propbank line: %r" % s)
        # Divide the line into its basic pieces.
        (fileid, sentnum, wordnum, tagger, roleset, inflection) = pieces[:6]
        rel = [p for p in pieces[6:] if p.endswith("-rel")]
        args = [p for p in pieces[6:] if not p.endswith("-rel")]
        if len(rel) != 1:
            raise ValueError("Badly formatted propbank line: %r" % s)
        # Apply the fileid selector, if any.
        if parse_fileid_xform is not None:
            fileid = parse_fileid_xform(fileid)
        # Convert sentence & word numbers to ints.
        sentnum = int(sentnum)
        wordnum = int(wordnum)
        # Parse the inflection
        inflection = PropbankInflection.parse(inflection)
        # Parse the predicate location.
        predicate = PropbankTreePointer.parse(rel[0][:-4])
        # Parse the arguments.
        arguments = []
        for arg in args:
            argloc, argid = arg.split("-", 1)
            arguments.append((PropbankTreePointer.parse(argloc), argid))
        # Put it all together.
        return PropbankInstance(
            fileid,
            sentnum,
            wordnum,
            tagger,
            roleset,
            inflection,
            predicate,
            arguments,
            parse_corpus,
        )
class PropbankPointer:
    """
    A pointer used by propbank to identify one or more constituents in
    a parse tree. ``PropbankPointer`` is an abstract base class with
    three concrete subclasses:
    - ``PropbankTreePointer`` points to a single constituent.
    - ``PropbankSplitTreePointer`` points to a 'split' constituent,
      which consists of a sequence of two or more
      ``PropbankTreePointer`` pointers.
    - ``PropbankChainTreePointer`` points to an entire trace chain in
      a tree; it consists of a sequence of pieces, which can be
      ``PropbankTreePointer`` or ``PropbankSplitTreePointer`` pointers.
    """
    def __init__(self):
        # Abstract base class: refuse direct instantiation, but let
        # subclass initializers run normally.
        if type(self) is PropbankPointer:
            raise NotImplementedError()
class PropbankChainTreePointer(PropbankPointer):
    """Pointer to an entire trace chain in a parse tree."""
    def __init__(self, pieces):
        # Elements may be PropbankSplitTreePointer or PropbankTreePointer.
        self.pieces = pieces
    def __str__(self):
        return "*".join(str(piece) for piece in self.pieces)
    def __repr__(self):
        return "<PropbankChainTreePointer: %s>" % self
    def select(self, tree):
        """Return a synthetic ``*CHAIN*`` node covering every piece's subtree."""
        if tree is None:
            raise ValueError("Parse tree not available")
        return Tree("*CHAIN*", [piece.select(tree) for piece in self.pieces])
class PropbankSplitTreePointer(PropbankPointer):
    """Pointer to a 'split' constituent made of several tree pointers."""
    def __init__(self, pieces):
        # Elements are all PropbankTreePointer instances.
        self.pieces = pieces
    def __str__(self):
        return ",".join(str(piece) for piece in self.pieces)
    def __repr__(self):
        return "<PropbankSplitTreePointer: %s>" % self
    def select(self, tree):
        """Return a synthetic ``*SPLIT*`` node covering every piece's subtree."""
        if tree is None:
            raise ValueError("Parse tree not available")
        return Tree("*SPLIT*", [piece.select(tree) for piece in self.pieces])
@total_ordering
class PropbankTreePointer(PropbankPointer):
    """
    A pointer to a single constituent, encoded as ``wordnum:height``.
    Chains are written ``wordnum:height*wordnum:height*...`` and split
    constituents ``wordnum:height,wordnum:height,...``; :meth:`parse`
    returns the appropriate pointer subclass for each form.
    """
    def __init__(self, wordnum, height):
        self.wordnum = wordnum
        self.height = height
    @staticmethod
    def parse(s):
        """Parse a pointer string, returning a ``PropbankTreePointer``,
        ``PropbankChainTreePointer`` or ``PropbankSplitTreePointer``."""
        # Deal with chains (xx*yy*zz)
        pieces = s.split("*")
        if len(pieces) > 1:
            return PropbankChainTreePointer(
                [PropbankTreePointer.parse(elt) for elt in pieces]
            )
        # Deal with split args (xx,yy,zz)
        pieces = s.split(",")
        if len(pieces) > 1:
            return PropbankSplitTreePointer(
                [PropbankTreePointer.parse(elt) for elt in pieces]
            )
        # Deal with normal pointers.
        pieces = s.split(":")
        if len(pieces) != 2:
            raise ValueError("bad propbank pointer %r" % s)
        return PropbankTreePointer(int(pieces[0]), int(pieces[1]))
    def __str__(self):
        return f"{self.wordnum}:{self.height}"
    def __repr__(self):
        return "PropbankTreePointer(%d, %d)" % (self.wordnum, self.height)
    def __eq__(self, other):
        # Chain/split pointers are compared via their first piece.
        while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
            other = other.pieces[0]
        if not isinstance(other, PropbankTreePointer):
            return self is other
        return self.wordnum == other.wordnum and self.height == other.height
    def __ne__(self, other):
        return not self == other
    def __lt__(self, other):
        while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
            other = other.pieces[0]
        if not isinstance(other, PropbankTreePointer):
            # Arbitrary but stable ordering against unrelated types.
            return id(self) < id(other)
        # Order by word number; for equal words, greater height sorts first.
        return (self.wordnum, -self.height) < (other.wordnum, -other.height)
    def select(self, tree):
        """Return the constituent of *tree* that this pointer identifies."""
        if tree is None:
            raise ValueError("Parse tree not available")
        return tree[self.treepos(tree)]
    def treepos(self, tree):
        """
        Convert this pointer to a standard 'tree position' pointer,
        given that it points to the given tree.
        """
        if tree is None:
            raise ValueError("Parse tree not available")
        # Depth-first traversal: count leaves until we reach leaf number
        # ``self.wordnum``; ``treepos`` tracks the path taken so far.
        stack = [tree]
        treepos = []
        wordnum = 0
        while True:
            # tree node:
            if isinstance(stack[-1], Tree):
                # Select the next child.
                if len(treepos) < len(stack):
                    treepos.append(0)
                else:
                    treepos[-1] += 1
                # Update the stack.
                if treepos[-1] < len(stack[-1]):
                    stack.append(stack[-1][treepos[-1]])
                else:
                    # End of node's child list: pop up a level.
                    stack.pop()
                    treepos.pop()
            # word node:
            else:
                if wordnum == self.wordnum:
                    # Found the leaf: drop ``height`` + 1 trailing steps to
                    # address the ancestor this pointer refers to.
                    return tuple(treepos[: len(treepos) - self.height - 1])
                else:
                    wordnum += 1
                    stack.pop()
class PropbankInflection:
    """
    A five-character inflection code describing a predicate's form,
    tense, aspect, person and voice; ``'-'`` marks an unspecified field.
    """
    # Form codes:
    INFINITIVE = "i"
    GERUND = "g"
    PARTICIPLE = "p"
    FINITE = "v"
    # Tense codes:
    FUTURE = "f"
    PAST = "p"
    PRESENT = "n"
    # Aspect codes:
    PERFECT = "p"
    PROGRESSIVE = "o"
    PERFECT_AND_PROGRESSIVE = "b"
    # Person codes:
    THIRD_PERSON = "3"
    # Voice codes:
    ACTIVE = "a"
    PASSIVE = "p"
    # Unspecified field:
    NONE = "-"
    def __init__(self, form="-", tense="-", aspect="-", person="-", voice="-"):
        self.form = form
        self.tense = tense
        self.aspect = aspect
        self.person = person
        self.voice = voice
    def __str__(self):
        return "".join((self.form, self.tense, self.aspect, self.person, self.voice))
    def __repr__(self):
        return "<PropbankInflection: %s>" % self
    _VALIDATE = re.compile(r"[igpv\-][fpn\-][pob\-][3\-][ap\-]$")
    @staticmethod
    def parse(s):
        """Parse a five-character inflection string.

        :raises TypeError: if *s* is not a string.
        :raises ValueError: if *s* is not a valid inflection code.
        """
        if not isinstance(s, str):
            raise TypeError("expected a string")
        if len(s) != 5 or not PropbankInflection._VALIDATE.match(s):
            raise ValueError("Bad propbank inflection string %r" % s)
        return PropbankInflection(*s)

View File

@@ -0,0 +1,133 @@
# Natural Language Toolkit: Pros and Cons Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for the Pros and Cons dataset.
- Pros and Cons dataset information -
Contact: Bing Liu, liub@cs.uic.edu
https://www.cs.uic.edu/~liub
Distributed with permission.
Related papers:
- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
Proceedings of the 22nd International Conference on Computational Linguistics
(Coling-2008), Manchester, 18-22 August, 2008.
- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing
Opinions on the Web". Proceedings of the 14th international World Wide Web
conference (WWW-2005), May 10-14, 2005, in Chiba, Japan.
"""
import re
from nltk.corpus.reader.api import *
from nltk.tokenize import *
class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    Reader for the Pros and Cons sentence dataset.
    >>> from nltk.corpus import pros_cons
    >>> pros_cons.sents(categories='Cons') # doctest: +NORMALIZE_WHITESPACE
    [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy',
    'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'],
    ...]
    >>> pros_cons.words('IntegratedPros.txt')
    ['Easy', 'to', 'use', ',', 'economical', '!', ...]
    """
    CorpusView = StreamBackedCorpusView
    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WordPunctTokenizer(),
        encoding="utf8",
        **kwargs
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
        self._word_tokenizer = word_tokenizer
    def sents(self, fileids=None, categories=None):
        """
        Return all sentences in the corpus or in the specified files/categories.
        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences
            have to be returned.
        :return: the given file(s) as a list of sentences. Each sentence is
            tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        """
        # Map categories to their fileids, then normalize to a list.
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )
    def words(self, fileids=None, categories=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        files/categories.
        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have
            to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )
    def _read_sent_block(self, stream):
        sents = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            # At EOF readline() returns ''; the loop is bounded, so this
            # just burns through the remaining iterations.
            if not line:
                continue
            # Data lines look like "<Pros>...</Pros>" (or Cons); group(2)
            # captures the text between the tags.
            sent = re.match(r"^(?!\n)\s*<(Pros|Cons)>(.*)</(?:Pros|Cons)>", line)
            if sent:
                sents.append(self._word_tokenizer.tokenize(sent.group(2).strip()))
        return sents
    def _read_word_block(self, stream):
        # Flatten one block of sentences into a single word list.
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words

View File

@@ -0,0 +1,331 @@
# Natural Language Toolkit: Product Reviews Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for reviews corpora (syntax based on Customer Review Corpus).
Customer Review Corpus information
==================================
Annotated by: Minqing Hu and Bing Liu, 2004.
Department of Computer Science
University of Illinois at Chicago
Contact: Bing Liu, liub@cs.uic.edu
https://www.cs.uic.edu/~liub
Distributed with permission.
The "product_reviews_1" and "product_reviews_2" datasets respectively contain
annotated customer reviews of 5 and 9 products from amazon.com.
Related papers:
- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
Proceedings of the ACM SIGKDD International Conference on Knowledge
Discovery & Data Mining (KDD-04), 2004.
- Minqing Hu and Bing Liu. "Mining Opinion Features in Customer Reviews".
Proceedings of Nineteeth National Conference on Artificial Intelligence
(AAAI-2004), 2004.
- Xiaowen Ding, Bing Liu and Philip S. Yu. "A Holistic Lexicon-Based Appraoch to
Opinion Mining." Proceedings of First ACM International Conference on Web
Search and Data Mining (WSDM-2008), Feb 11-12, 2008, Stanford University,
Stanford, California, USA.
Symbols used in the annotated reviews:
:[t]: the title of the review: Each [t] tag starts a review.
:xxxx[+|-n]: xxxx is a product feature.
:[+n]: Positive opinion, n is the opinion strength: 3 strongest, and 1 weakest.
Note that the strength is quite subjective.
You may want ignore it, but only considering + and -
:[-n]: Negative opinion
:##: start of each sentence. Each line is a sentence.
:[u]: feature not appeared in the sentence.
:[p]: feature not appeared in the sentence. Pronoun resolution is needed.
:[s]: suggestion or recommendation.
:[cc]: comparison with a competing product from a different brand.
:[cs]: comparison with a competing product from the same brand.
Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not
provide separation between different reviews. This is due to the fact that
the dataset was specifically designed for aspect/feature-based sentiment
analysis, for which sentence-level annotation is sufficient. For document-
level classification and analysis, this peculiarity should be taken into
consideration.
"""
import re
from nltk.corpus.reader.api import *
from nltk.tokenize import *
# Regular expressions for the annotation markup used by the Customer Review
# datasets (see the module docstring for the full symbol legend).
TITLE = re.compile(r"^\[t\](.*)$")  # [t] Title
FEATURES = re.compile(
    r"((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]"
)  # find 'feature' in feature[+3]
NOTES = re.compile(r"\[(?!t)(p|u|s|cc|cs)\]")  # find 'p' in camera[+2][p]
SENT = re.compile(r"##(.*)$")  # find tokenized sentence
class Review:
    """
    A Review is the main block of a ReviewsCorpusReader.
    """
    def __init__(self, title=None, review_lines=None):
        """
        :param title: the title of the review.
        :param review_lines: the list of the ReviewLines that belong to the Review.
        """
        self.title = title
        # Avoid a shared mutable default: each review owns its own list.
        self.review_lines = [] if review_lines is None else review_lines
    def add_line(self, review_line):
        """
        Add a line (ReviewLine) to the review.
        :param review_line: a ReviewLine instance that belongs to the Review.
        """
        assert isinstance(review_line, ReviewLine)
        self.review_lines.append(review_line)
    def features(self):
        """
        Return a list of features in the review. Each feature is a tuple made of
        the specific item feature and the opinion strength about that feature.
        :return: all features of the review as a list of tuples (feat, score).
        :rtype: list(tuple)
        """
        collected = []
        for line in self.review_lines:
            collected.extend(line.features)
        return collected
    def sents(self):
        """
        Return all tokenized sentences in the review.
        :return: all sentences of the review as lists of tokens.
        :rtype: list(list(str))
        """
        return [line.sent for line in self.review_lines]
    def __repr__(self):
        return f'Review(title="{self.title}", review_lines={self.review_lines})'
class ReviewLine:
    """
    A ReviewLine represents a sentence of the review, together with (optional)
    annotations of its features and notes about the reviewed item.
    """
    def __init__(self, sent, features=None, notes=None):
        self.sent = sent
        # None defaults keep each instance's lists independent.
        self.features = [] if features is None else features
        self.notes = [] if notes is None else notes
    def __repr__(self):
        return f"ReviewLine(features={self.features}, notes={self.notes}, sent={self.sent})"
class ReviewsCorpusReader(CorpusReader):
    """
    Reader for the Customer Review Data dataset by Hu, Liu (2004).
    Note: we are not applying any sentence tokenization at the moment, just word
    tokenization.
    >>> from nltk.corpus import product_reviews_1
    >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt')
    >>> review = camera_reviews[0]
    >>> review.sents()[0] # doctest: +NORMALIZE_WHITESPACE
    ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am',
    'extremely', 'satisfied', 'with', 'the', 'purchase', '.']
    >>> review.features() # doctest: +NORMALIZE_WHITESPACE
    [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'),
    ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'),
    ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'),
    ('option', '+1')]
    We can also reach the same information directly from the stream:
    >>> product_reviews_1.features('Canon_G3.txt')
    [('canon powershot g3', '+3'), ('use', '+2'), ...]
    We can compute stats for specific product features:
    >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
    >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
    >>> mean = tot / n_reviews
    >>> print(n_reviews, tot, mean)
    15 24 1.6
    """
    CorpusView = StreamBackedCorpusView
    def __init__(
        self, root, fileids, word_tokenizer=WordPunctTokenizer(), encoding="utf8"
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._readme = "README.txt"
    def features(self, fileids=None):
        """
        Return a list of features. Each feature is a tuple made of the specific
        item feature and the opinion strength about that feature.
        :param fileids: a list or regexp specifying the ids of the files whose
            features have to be returned.
        :return: all features for the item(s) in the given file(s).
        :rtype: list(tuple)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(fileid, self._read_features, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )
    def reviews(self, fileids=None):
        """
        Return all the reviews as a list of Review objects. If `fileids` is
        specified, return all the reviews from each of the specified files.
        :param fileids: a list or regexp specifying the ids of the files whose
            reviews have to be returned.
        :return: the given file(s) as a list of reviews.
        :rtype: list(Review)
        """
        if fileids is None:
            fileids = self._fileids
        return concat(
            [
                self.CorpusView(fileid, self._read_review_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )
    def sents(self, fileids=None):
        """
        Return all sentences in the corpus or in the specified files.
        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: the given file(s) as a list of sentences, each encoded as a
            list of word strings.
        :rtype: list(list(str))
        """
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )
    def words(self, fileids=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        files.
        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )
    def _read_features(self, stream):
        # Collect every FEATURES match from the next 20 lines.
        features = []
        for i in range(20):
            line = stream.readline()
            if not line:
                return features
            features.extend(re.findall(FEATURES, line))
        return features
    def _read_review_block(self, stream):
        # Skip ahead to the next "[t]" title line, which starts a review.
        while True:
            line = stream.readline()
            if not line:
                return []  # end of file.
            title_match = re.match(TITLE, line)
            if title_match:
                review = Review(
                    title=title_match.group(1).strip()
                )  # We create a new review
                break
        # Scan until we find another line matching the regexp, or EOF.
        while True:
            oldpos = stream.tell()
            line = stream.readline()
            # End of file:
            if not line:
                return [review]
            # Start of a new review: backup to just before it starts, and
            # return the review we've already collected.
            if re.match(TITLE, line):
                stream.seek(oldpos)
                return [review]
            # Anything else is part of the review line.
            feats = re.findall(FEATURES, line)
            notes = re.findall(NOTES, line)
            sent = re.findall(SENT, line)
            if sent:
                sent = self._word_tokenizer.tokenize(sent[0])
            review_line = ReviewLine(sent=sent, features=feats, notes=notes)
            review.add_line(review_line)
    def _read_sent_block(self, stream):
        # One review per block; flatten its sentences.
        sents = []
        for review in self._read_review_block(stream):
            sents.extend([sent for sent in review.sents()])
        return sents
    def _read_word_block(self, stream):
        # Tokenize the "##"-marked sentence text of the next 20 lines.
        words = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            sent = re.findall(SENT, line)
            if sent:
                words.extend(self._word_tokenizer.tokenize(sent[0]))
        return words

View File

@@ -0,0 +1,146 @@
# Natural Language Toolkit: RTE Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for the Recognizing Textual Entailment (RTE) Challenge Corpora.
The files were taken from the RTE1, RTE2 and RTE3 datasets and the files
were regularized.
Filenames are of the form rte*_dev.xml and rte*_test.xml. The latter are the
gold standard annotated files.
Each entailment corpus is a list of 'text'/'hypothesis' pairs. The following
example is taken from RTE3::
<pair id="1" entailment="YES" task="IE" length="short" >
<t>The sale was made to pay Yukos' US$ 27.5 billion tax bill,
Yuganskneftegaz was originally sold for US$ 9.4 billion to a little known
company Baikalfinansgroup which was later bought by the Russian
state-owned oil company Rosneft .</t>
<h>Baikalfinansgroup was sold to Rosneft.</h>
</pair>
In order to provide globally unique IDs for each pair, a new attribute
``challenge`` has been added to the root element ``entailment-corpus`` of each
file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the
challenge number and 'n' is the pair ID.
"""
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.xmldocs import *
def norm(value_string):
    """
    Normalize the string value in an RTE pair's ``value`` or ``entailment``
    attribute as an integer (1, 0).

    :param value_string: the label used to classify a text/hypothesis pair
    :type value_string: str
    :rtype: int
    """
    # Case-insensitive lookup; unknown labels raise KeyError, as before.
    return {"TRUE": 1, "FALSE": 0, "YES": 1, "NO": 0}[value_string.upper()]
class RTEPair:
    """
    Container for RTE text-hypothesis pairs.

    The entailment relation is signalled by the ``value`` attribute in RTE1,
    and by ``entailment`` in RTE2 and RTE3.  Both are mapped onto the
    normalized ``value`` attribute of this class.
    """

    def __init__(
        self,
        pair,
        challenge=None,
        id=None,
        text=None,
        hyp=None,
        value=None,
        task=None,
        length=None,
    ):
        """
        :param challenge: version of the RTE challenge (i.e., RTE1, RTE2 or RTE3)
        :param id: identifier for the pair
        :param text: the text component of the pair
        :param hyp: the hypothesis component of the pair
        :param value: classification label for the pair
        :param task: attribute for the particular NLP task that the data was drawn from
        :param length: attribute for the length of the text of the pair
        """
        attribs = pair.attrib
        self.challenge = challenge
        self.id = attribs["id"]
        # Globally unique id: "<challenge>-<pair id>".
        self.gid = f"{self.challenge}-{self.id}"
        # First child is <t> (text), second is <h> (hypothesis).
        self.text = pair[0].text
        self.hyp = pair[1].text
        if "value" in attribs:
            self.value = norm(attribs["value"])
        elif "entailment" in attribs:
            self.value = norm(attribs["entailment"])
        else:
            self.value = value
        self.task = attribs.get("task", task)
        self.length = attribs.get("length", length)

    def __repr__(self):
        if self.challenge:
            return f"<RTEPair: gid={self.challenge}-{self.id}>"
        return "<RTEPair: id=%s>" % self.id
class RTECorpusReader(XMLCorpusReader):
    """
    Corpus reader for corpora in RTE challenges.

    This is just a wrapper around the XMLCorpusReader. See module docstring above for the expected
    structure of input documents.
    """

    def _read_etree(self, doc):
        """
        Map the XML input into a list of RTEPairs.

        This uses the ``iter()`` method from the ElementTree package to
        find all the ``<pair>`` elements.  (The docstring previously named
        the deprecated ``getiterator()``, which the code does not use.)

        :param doc: a parsed XML document
        :rtype: list(RTEPair)
        """
        try:
            challenge = doc.attrib["challenge"]
        except KeyError:
            # Files lacking the "challenge" attribute yield pairs whose GIDs
            # are formed with a None prefix.
            challenge = None
        pairiter = doc.iter("pair")
        return [RTEPair(pair, challenge=challenge) for pair in pairiter]

    def pairs(self, fileids):
        """
        Build a list of RTEPairs from a RTE corpus.

        :param fileids: a list of RTE corpus fileids
        :type fileids: list
        :rtype: list(RTEPair)
        """
        if isinstance(fileids, str):
            fileids = [fileids]
        return concat([self._read_etree(self.xml(fileid)) for fileid in fileids])

View File

@@ -0,0 +1,296 @@
# Natural Language Toolkit: SemCor Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Nathan Schneider <nschneid@cs.cmu.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for the SemCor Corpus.
"""
__docformat__ = "epytext en"
from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
from nltk.tree import Tree
class SemcorCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the SemCor Corpus.
    For access to the complete XML data structure, use the ``xml()``
    method. For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
    """

    def __init__(self, root, fileids, wordnet, lazy=True):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param wordnet: The WordNet corpus reader used to resolve sense keys.
        :param lazy: If true (default), use stream-backed corpus views;
            otherwise parse each file eagerly.
        """
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy
        self._wordnet = wordnet

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return self._items(fileids, "word", False, False, False)

    def chunks(self, fileids=None):
        """
        :return: the given file(s) as a list of chunks,
            each of which is a list of words and punctuation symbols
            that form a unit.
        :rtype: list(list(str))
        """
        return self._items(fileids, "chunk", False, False, False)

    def tagged_chunks(self, fileids=None, tag="pos"):
        """
        :return: the given file(s) as a list of tagged chunks, represented
            in tree form.
        :rtype: list(Tree)

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include. Semantic tags consist of
            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
            without a specific entry in WordNet. (Named entities of type 'other'
            have no lemma. Other chunks not in WordNet have no semantic tag.
            Punctuation tokens have `None` for their part of speech tag.)

        .. note:: The former default ``("pos" or "sem" or "both")`` always
            evaluated to just ``"pos"``; it is now written explicitly, with
            identical behavior.
        """
        return self._items(fileids, "chunk", False, tag != "sem", tag != "pos")

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of word strings.
        :rtype: list(list(str))
        """
        return self._items(fileids, "word", True, False, False)

    def chunk_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of chunks.
        :rtype: list(list(list(str)))
        """
        return self._items(fileids, "chunk", True, False, False)

    def tagged_sents(self, fileids=None, tag="pos"):
        """
        :return: the given file(s) as a list of sentences. Each sentence
            is represented as a list of tagged chunks (in tree form).
        :rtype: list(list(Tree))

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include. Semantic tags consist of
            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
            without a specific entry in WordNet. (Named entities of type 'other'
            have no lemma. Other chunks not in WordNet have no semantic tag.
            Punctuation tokens have `None` for their part of speech tag.)

        .. note:: The former default ``("pos" or "sem" or "both")`` always
            evaluated to just ``"pos"``; it is now written explicitly, with
            identical behavior.
        """
        return self._items(fileids, "chunk", True, tag != "sem", tag != "pos")

    def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag):
        # Choose the lazy (stream-backed) or eager reader implementation.
        if unit == "word" and not bracket_sent:
            # the result of the SemcorWordView may be a multiword unit, so the
            # LazyConcatenation will make sure the sentence is flattened
            _ = lambda *args: LazyConcatenation(
                (SemcorWordView if self._lazy else self._words)(*args)
            )
        else:
            _ = SemcorWordView if self._lazy else self._words
        return concat(
            [
                _(fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet)
                for fileid in self.abspaths(fileids)
            ]
        )

    def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
        """
        Helper used to implement the view methods -- returns a list of
        tokens, (segmented) words, chunks, or sentences. The tokens
        and chunks may optionally be tagged (with POS and sense
        information).

        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        assert unit in ("token", "word", "chunk")
        result = []
        xmldoc = ElementTree.parse(fileid).getroot()
        for xmlsent in xmldoc.findall(".//s"):
            sent = []
            for xmlword in _all_xmlwords_in(xmlsent):
                itm = SemcorCorpusReader._word(
                    xmlword, unit, pos_tag, sem_tag, self._wordnet
                )
                if unit == "word":
                    # Multiword tokens were split into several words; flatten.
                    sent.extend(itm)
                else:
                    sent.append(itm)
            if bracket_sent:
                result.append(SemcorSentence(xmlsent.attrib["snum"], sent))
            else:
                result.extend(sent)
        assert None not in result
        return result

    @staticmethod
    def _word(xmlword, unit, pos_tag, sem_tag, wordnet):
        """Convert one <wf>/<punc> element into a token, word list, or chunk."""
        tkn = xmlword.text
        if not tkn:
            tkn = ""  # fixes issue 337?
        lemma = xmlword.get("lemma", tkn)  # lemma or NE class
        lexsn = xmlword.get("lexsn")  # lex_sense (locator for the lemma's sense)
        if lexsn is not None:
            sense_key = lemma + "%" + lexsn
            wnpos = ("n", "v", "a", "r", "s")[
                int(lexsn.split(":")[0]) - 1
            ]  # see http://wordnet.princeton.edu/man/senseidx.5WN.html
        else:
            sense_key = wnpos = None
        redef = xmlword.get(
            "rdf", tkn
        )  # redefinition--this indicates the lookup string
        # does not exactly match the enclosed string, e.g. due to typographical adjustments
        # or discontinuity of a multiword expression. If a redefinition has occurred,
        # the "rdf" attribute holds its inflected form and "lemma" holds its lemma.
        # For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class).
        sensenum = xmlword.get("wnsn")  # WordNet sense number
        isOOVEntity = "pn" in xmlword.keys()  # a "personal name" (NE) not in WordNet
        pos = xmlword.get(
            "pos"
        )  # part of speech for the whole chunk (None for punctuation)
        if unit == "token":
            if not pos_tag and not sem_tag:
                itm = tkn
            else:
                itm = (
                    (tkn,)
                    + ((pos,) if pos_tag else ())
                    + ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ())
                )
            return itm
        else:
            ww = tkn.split("_")  # TODO: case where punctuation intervenes in MWE
            if unit == "word":
                return ww
            else:
                if sensenum is not None:
                    try:
                        sense = wordnet.lemma_from_key(sense_key)  # Lemma object
                    except Exception:
                        # cannot retrieve the wordnet.Lemma object. possible reasons:
                        # (a) the wordnet corpus is not downloaded;
                        # (b) a nonexistent sense is annotated: e.g., such.s.00 triggers:
                        # nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00'
                        # solution: just use the lemma name as a string
                        try:
                            sense = "%s.%s.%02d" % (
                                lemma,
                                wnpos,
                                int(sensenum),
                            )  # e.g.: reach.v.02
                        except ValueError:
                            sense = (
                                lemma + "." + wnpos + "." + sensenum
                            )  # e.g. the sense number may be "2;1"
                bottom = [Tree(pos, ww)] if pos_tag else ww
                if sem_tag and isOOVEntity:
                    if sensenum is not None:
                        return Tree(sense, [Tree("NE", bottom)])
                    else:  # 'other' NE
                        return Tree("NE", bottom)
                elif sem_tag and sensenum is not None:
                    return Tree(sense, bottom)
                elif pos_tag:
                    return bottom[0]
                else:
                    return bottom  # chunk as a list
def _all_xmlwords_in(elt, result=None):
    """
    Recursively collect every ``<wf>`` and ``<punc>`` element under *elt*,
    in document order, appending them to *result* (a fresh list by default).
    """
    result = [] if result is None else result
    for node in elt:
        if node.tag == "wf" or node.tag == "punc":
            result.append(node)
        else:
            # Descend into wrapper elements to find nested word elements.
            _all_xmlwords_in(node, result)
    return result
class SemcorSentence(list):
    """
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (the ``n`` attribute from the XML).
    """

    def __init__(self, num, items):
        super().__init__(items)
        # Sentence identifier taken from the XML "snum" attribute.
        self.num = num
class SemcorWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the SemCor corpus.
    (The docstring previously said "BNC corpus" -- a copy-paste error.)
    """

    def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet):
        """
        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        :param wordnet: The WordNet corpus reader used to resolve sense keys.
        """
        # Select whole sentences, or only the word/punctuation elements,
        # depending on whether sentence bracketing was requested.
        if bracket_sent:
            tagspec = ".*/s"
        else:
            tagspec = ".*/s/(punc|wf)"
        self._unit = unit
        self._sent = bracket_sent
        self._pos_tag = pos_tag
        self._sem_tag = sem_tag
        self._wordnet = wordnet
        XMLCorpusView.__init__(self, fileid, tagspec)

    def handle_elt(self, elt, context):
        # Dispatch based on the tagspec chosen in __init__.
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        return SemcorCorpusReader._word(
            elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet
        )

    def handle_sent(self, elt):
        sent = []
        for child in elt:
            if child.tag in ("wf", "punc"):
                itm = self.handle_word(child)
                if self._unit == "word":
                    # Multiword tokens were split; flatten into the sentence.
                    sent.extend(itm)
                else:
                    sent.append(itm)
            else:
                raise ValueError("Unexpected element %s" % child.tag)
        return SemcorSentence(elt.attrib["snum"], sent)

View File

@@ -0,0 +1,196 @@
# Natural Language Toolkit: Senseval 2 Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Steven Bird <stevenbird1@gmail.com> (modifications)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Read from the Senseval 2 Corpus.
SENSEVAL [http://www.senseval.org/]
Evaluation exercises for Word Sense Disambiguation.
Organized by ACL-SIGLEX [https://www.siglex.org/]
Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,
https://www.d.umn.edu/~tpederse/data.html
Distributed with permission.
The NLTK version of the Senseval 2 files uses well-formed XML.
Each instance of the ambiguous words "hard", "interest", "line", and "serve"
is tagged with a sense identifier, and supplied with context.
"""
import re
from xml.etree import ElementTree
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tokenize import *
class SensevalInstance:
    """A single tagged occurrence of an ambiguous word, with its context."""

    def __init__(self, word, position, context, senses):
        # The ambiguous lexical item (lexelt), e.g. "hard-a".
        self.word = word
        # Sense identifiers, stored as an immutable tuple.
        self.senses = tuple(senses)
        # Index of the head word within ``context``.
        self.position = position
        self.context = context

    def __repr__(self):
        return (
            "SensevalInstance(word=%r, position=%r, "
            "context=%r, senses=%r)"
            % (self.word, self.position, self.context, self.senses)
        )
class SensevalCorpusReader(CorpusReader):
    """Corpus reader for the Senseval 2 corpus files."""

    def instances(self, fileids=None):
        """Return a concatenated view of the SensevalInstances in *fileids*."""
        views = [
            SensevalCorpusView(path, enc)
            for (path, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)

    def _entry(self, tree):
        """Extract (sense, tagged-context) pairs from a parsed lexelt tree."""
        entries = []
        for lexelt in tree.findall("lexelt"):
            for inst in lexelt.findall("instance"):
                sense = inst[0].attrib["senseid"]
                context = [(w.text, w.attrib["pos"]) for w in inst[1]]
                entries.append((sense, context))
        return entries
class SensevalCorpusView(StreamBackedCorpusView):
    """
    Stream-backed view that yields one SensevalInstance per block.

    The underlying file interleaves ``<lexelt>`` headers with ``<instance>``
    elements; the view records the stream position where each lexelt starts
    so that a block read from any position can recover which lexical element
    it belongs to.
    """

    def __init__(self, fileid, encoding):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
        self._word_tokenizer = WhitespaceTokenizer()
        self._lexelt_starts = [0]  # list of streampos
        self._lexelts = [None]  # list of lexelt names

    def read_block(self, stream):
        """Read one <instance>...</instance> element and parse it."""
        # Decide which lexical element we're in.
        lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell()) - 1
        lexelt = self._lexelts[lexelt_num]
        instance_lines = []
        in_instance = False
        while True:
            line = stream.readline()
            if line == "":
                # EOF: a partial instance here would be a malformed file.
                assert instance_lines == []
                return []
            # Start of a lexical element?
            if line.lstrip().startswith("<lexelt"):
                lexelt_num += 1
                m = re.search("item=(\"[^\"]+\"|'[^']+')", line)
                assert m is not None  # <lexelt> has no 'item=...'
                lexelt = m.group(1)[1:-1]
                if lexelt_num < len(self._lexelts):
                    # Re-visiting a known lexelt: sanity-check the cached name.
                    assert lexelt == self._lexelts[lexelt_num]
                else:
                    self._lexelts.append(lexelt)
                    self._lexelt_starts.append(stream.tell())
            # Start of an instance?
            if line.lstrip().startswith("<instance"):
                assert instance_lines == []
                in_instance = True
            # Body of an instance?
            if in_instance:
                instance_lines.append(line)
            # End of an instance?
            if line.lstrip().startswith("</instance"):
                xml_block = "\n".join(instance_lines)
                # Repair the pseudo-XML before handing it to ElementTree.
                xml_block = _fixXML(xml_block)
                inst = ElementTree.fromstring(xml_block)
                return [self._parse_instance(inst, lexelt)]

    def _parse_instance(self, instance, lexelt):
        """Convert a parsed <instance> element into a SensevalInstance."""
        senses = []
        context = []
        position = None
        for child in instance:
            if child.tag == "answer":
                senses.append(child.attrib["senseid"])
            elif child.tag == "context":
                context += self._word_tokenizer.tokenize(child.text)
                for cword in child:
                    if cword.tag == "compound":
                        cword = cword[0]  # is this ok to do?
                    if cword.tag == "head":
                        # Some sanity checks:
                        assert position is None, "head specified twice"
                        assert cword.text.strip() or len(cword) == 1
                        assert not (cword.text.strip() and len(cword) == 1)
                        # Record the position of the head:
                        position = len(context)
                        # Add on the head word itself:
                        if cword.text.strip():
                            context.append(cword.text.strip())
                        elif cword[0].tag == "wf":
                            context.append((cword[0].text, cword[0].attrib["pos"]))
                            if cword[0].tail:
                                context += self._word_tokenizer.tokenize(cword[0].tail)
                        else:
                            assert False, "expected CDATA or wf in <head>"
                    elif cword.tag == "wf":
                        context.append((cword.text, cword.attrib["pos"]))
                    elif cword.tag == "s":
                        pass  # Sentence boundary marker.
                    else:
                        # NOTE(review): debug print leftover; consider removing
                        # it and raising a ValueError instead of asserting.
                        print("ACK", cword.tag)
                        assert False, "expected CDATA or <wf> or <head>"
                    if cword.tail:
                        context += self._word_tokenizer.tokenize(cword.tail)
            else:
                assert False, "unexpected tag %s" % child.tag
        return SensevalInstance(lexelt, position, context, senses)
def _fixXML(text):
    """
    Fix the various issues with Senseval pseudo-XML.
    """
    # Each (pattern, replacement) pair repairs one known defect in the
    # pseudo-XML; the pairs are applied in order.
    repairs = [
        # <~> or <^> => ~ or ^
        (r"<([~\^])>", r"\1"),
        # fix lone &
        (r"(\s+)\&(\s+)", r"\1&amp;\2"),
        # fix """
        (r'"""', "'\"'"),
        # fix <s snum=dd> => <s snum="dd"/>
        (r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>'),
        # fix foreign word tag
        (r"<\&frasl>\s*<p[^>]*>", "FRASL"),
        # remove <&I .>
        (r"<\&I[^>]*>", ""),
        # fix <{word}>
        (r"<{([^}]+)}>", r"\1"),
        # remove <@>, <p>, </p>
        (r"<(@|/?p)>", r""),
        # remove <&M .> and <&T .> and <&Ms .>
        (r"<&\w+ \.>", r""),
        # remove <!DOCTYPE... > lines
        (r"<!DOCTYPE[^>]*>", r""),
        # remove <[hi]> and <[/p]> etc
        (r"<\[\/?[^>]+\]*>", r""),
        # take the thing out of the brackets: <&hellip;>
        (r"<(\&\w+;)>", r"\1"),
        # and remove the & for those patterns that aren't regular XML
        (r"&(?!amp|gt|lt|apos|quot)", r""),
        # fix 'abc <p="foo"/>' style tags - now <wf pos="foo">abc</wf>
        (r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>', r' <wf pos="\2">\1</wf>'),
        (r'\s*"\s*<p=\'"\'/>', " <wf pos='\"'>\"</wf>"),
    ]
    for pattern, replacement in repairs:
        text = re.sub(pattern, replacement, text)
    return text

View File

@@ -0,0 +1,136 @@
# Natural Language Toolkit: SentiWordNet
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Christopher Potts <cgpotts@stanford.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
An NLTK interface for SentiWordNet
SentiWordNet is a lexical resource for opinion mining.
SentiWordNet assigns to each synset of WordNet three
sentiment scores: positivity, negativity, and objectivity.
For details about SentiWordNet see:
http://sentiwordnet.isti.cnr.it/
>>> from nltk.corpus import sentiwordnet as swn
>>> print(swn.senti_synset('breakdown.n.03'))
<breakdown.n.03: PosScore=0.0 NegScore=0.25>
>>> list(swn.senti_synsets('slow'))
[SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'),\
SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),\
SentiSynset('slow.a.02'), SentiSynset('dense.s.04'),\
SentiSynset('slow.a.04'), SentiSynset('boring.s.01'),\
SentiSynset('dull.s.05'), SentiSynset('slowly.r.01'),\
SentiSynset('behind.r.03')]
>>> happy = swn.senti_synsets('happy', 'a')
>>> happy0 = list(happy)[0]
>>> happy0.pos_score()
0.875
>>> happy0.neg_score()
0.0
>>> happy0.obj_score()
0.125
"""
import re
from nltk.corpus.reader import CorpusReader
class SentiWordNetCorpusReader(CorpusReader):
    """
    Corpus reader for the SentiWordNet sentiment lexicon.

    The lexicon maps each WordNet ``(pos, offset)`` pair to a positivity
    and a negativity score; objectivity is ``1 - (pos + neg)``.
    """

    def __init__(self, root, fileids, encoding="utf-8"):
        """
        Construct a new SentiWordNet Corpus Reader, using data from
        the specified file.
        """
        super().__init__(root, fileids, encoding=encoding)
        if len(self._fileids) != 1:
            raise ValueError("Exactly one file must be specified")
        self._db = {}
        self._parse_src_file()

    def _parse_src_file(self):
        """Parse the SentiWordNet source file into the ``_db`` index."""
        fp = self.open(self._fileids[0])
        try:
            lines = fp.read().splitlines()
        finally:
            # Close the stream explicitly instead of leaking it.
            fp.close()
        lines = filter((lambda x: not re.search(r"^\s*#", x)), lines)
        for i, line in enumerate(lines):
            fields = [field.strip() for field in re.split(r"\t+", line)]
            try:
                pos, offset, pos_score, neg_score, synset_terms, gloss = fields
            except ValueError as e:
                # Narrowed from BaseException: only a wrong field count can
                # make the unpacking above fail.
                raise ValueError(f"Line {i} formatted incorrectly: {line}\n") from e
            if pos and offset:
                offset = int(offset)
                self._db[(pos, offset)] = (float(pos_score), float(neg_score))

    def senti_synset(self, *vals):
        """
        Look up a SentiSynset either by ``(pos, offset)`` pair or by a
        synset name such as ``'breakdown.n.03'``.

        :return: a SentiSynset, or None if the synset has no entry.
        """
        from nltk.corpus import wordnet as wn

        if tuple(vals) in self._db:
            pos_score, neg_score = self._db[tuple(vals)]
            pos, offset = vals
            if pos == "s":
                # Satellite adjectives are looked up under "a" in WordNet.
                pos = "a"
            synset = wn.synset_from_pos_and_offset(pos, offset)
            return SentiSynset(pos_score, neg_score, synset)
        else:
            synset = wn.synset(vals[0])
            pos = synset.pos()
            if pos == "s":
                pos = "a"
            offset = synset.offset()
            if (pos, offset) in self._db:
                pos_score, neg_score = self._db[(pos, offset)]
                return SentiSynset(pos_score, neg_score, synset)
            else:
                return None

    def senti_synsets(self, string, pos=None):
        """
        Return an iterator of SentiSynsets for all senses of *string*,
        optionally restricted to part of speech *pos*.
        """
        from nltk.corpus import wordnet as wn

        sentis = []
        synset_list = wn.synsets(string, pos)
        for synset in synset_list:
            sentis.append(self.senti_synset(synset.name()))
        # Drop senses with no SentiWordNet entry (senti_synset returned None).
        sentis = filter(lambda x: x, sentis)
        return sentis

    def all_senti_synsets(self):
        """Iterate over every SentiSynset in the database."""
        from nltk.corpus import wordnet as wn

        for key, fields in self._db.items():
            pos, offset = key
            pos_score, neg_score = fields
            synset = wn.synset_from_pos_and_offset(pos, offset)
            yield SentiSynset(pos_score, neg_score, synset)
class SentiSynset:
    """A WordNet synset decorated with SentiWordNet sentiment scores."""

    def __init__(self, pos_score, neg_score, synset):
        self._pos_score = pos_score
        self._neg_score = neg_score
        # Objectivity is whatever probability mass remains.
        self._obj_score = 1.0 - (self._pos_score + self._neg_score)
        self.synset = synset

    def pos_score(self):
        """Return the positivity score."""
        return self._pos_score

    def neg_score(self):
        """Return the negativity score."""
        return self._neg_score

    def obj_score(self):
        """Return the objectivity score."""
        return self._obj_score

    def __str__(self):
        """Prints just the Pos/Neg scores for now."""
        return "<{}: PosScore={} NegScore={}>".format(
            self.synset.name(), self._pos_score, self._neg_score
        )

    def __repr__(self):
        return "Senti" + repr(self.synset)

View File

@@ -0,0 +1,75 @@
# Natural Language Toolkit: Sinica Treebank Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Sinica Treebank Corpus Sample
http://rocling.iis.sinica.edu.tw/CKIP/engversion/treebank.htm
10,000 parsed sentences, drawn from the Academia Sinica Balanced
Corpus of Modern Chinese. Parse tree notation is based on
Information-based Case Grammar. Tagset documentation is available
at https://www.sinica.edu.tw/SinicaCorpus/modern_e_wordtype.html
Language and Knowledge Processing Group, Institute of Information
Science, Academia Sinica
The data is distributed with the Natural Language Toolkit under the terms of
the Creative Commons Attribution-NonCommercial-ShareAlike License
[https://creativecommons.org/licenses/by-nc-sa/2.5/].
References:
Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999)
The Construction of Sinica Treebank. Computational Linguistics and
Chinese Language Processing, 4, pp 87-104.
Huang Chu-Ren, Keh-Jiann Chen, Feng-Yi Chen, Keh-Jiann Chen, Zhao-Ming
Gao, and Kuang-Yu Chen. 2000. Sinica Treebank: Design Criteria,
Annotation Guidelines, and On-line Interface. Proceedings of 2nd
Chinese Language Processing Workshop, Association for Computational
Linguistics.
Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar
Extraction, Proceedings of IJCNLP-04, pp560-565.
"""
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tag import map_tag
from nltk.tree import sinica_parse
# Regexps used to dismantle one raw treebank line:
IDENTIFIER = re.compile(r"^#\S+\s")  # leading "#<id> " sentence identifier
APPENDIX = re.compile(r"(?<=\))#.*$")  # trailing "#..." text after the final ")"
TAGWORD = re.compile(r":([^:()|]+):([^:()|]+)")  # captures (tag, word) pairs
WORD = re.compile(r":[^:()|]+:([^:()|]+)")  # captures just the word
class SinicaTreebankCorpusReader(SyntaxCorpusReader):
    """
    Reader for the sinica treebank.
    """

    def _read_block(self, stream):
        # One parse per line: strip the identifier prefix and any trailing
        # appendix before handing the line on.
        line = stream.readline()
        line = APPENDIX.sub("", IDENTIFIER.sub("", line))
        return [line]

    def _parse(self, sent):
        # Delegate to the dedicated Sinica bracket parser.
        return sinica_parse(sent)

    def _tag(self, sent, tagset=None):
        # TAGWORD captures (tag, word); flip each pair to (word, tag).
        pairs = [(word, tag) for (tag, word) in TAGWORD.findall(sent)]
        if tagset and tagset != self._tagset:
            pairs = [
                (word, map_tag(self._tagset, tagset, tag)) for (word, tag) in pairs
            ]
        return pairs

    def _word(self, sent):
        return WORD.findall(sent)

View File

@@ -0,0 +1,56 @@
# Natural Language Toolkit: String Category Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Read tuples from a corpus consisting of categorized strings.
For example, from the question classification corpus:
NUM:dist How far is it from Denver to Aspen ?
LOC:city What county is Modesto , California in ?
HUM:desc Who was Galileo ?
DESC:def What is an atom ?
NUM:date When did Hawaii become a state ?
"""
from nltk.corpus.reader.api import *
# based on PPAttachmentCorpusReader
from nltk.corpus.reader.util import *
# [xx] Should the order of the tuple be reversed -- in most other places
# in nltk, we use the form (data, tag) -- e.g., tagged words and
# labeled texts for classifiers.
class StringCategoryCorpusReader(CorpusReader):
    """
    Reader for corpora of category-prefixed strings, e.g. the question
    classification corpus (one ``CATEGORY<delimiter>text`` entry per line).
    """

    def __init__(self, root, fileids, delimiter=" ", encoding="utf8"):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param delimiter: Field delimiter
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._delimiter = delimiter

    def tuples(self, fileids=None):
        """Return the given file(s) as a list of (category, text) tuples."""
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        views = [
            StreamBackedCorpusView(path, self._read_tuple_block, encoding=enc)
            for (path, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)

    def _read_tuple_block(self, stream):
        # One tuple per non-blank line, split on the first delimiter only.
        raw = stream.readline().strip()
        if not raw:
            return []
        return [tuple(raw.split(self._delimiter, 1))]

View File

@@ -0,0 +1,125 @@
# Natural Language Toolkit: Switchboard Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tag import map_tag, str2tuple
class SwitchboardTurn(list):
    """
    A specialized list object used to encode switchboard utterances.
    The elements of the list are the words in the utterance; and two
    attributes, ``speaker`` and ``id``, are provided to retrieve the
    speaker identifier and utterance id.  Note that utterance ids
    are only unique within a given discourse.  (Fixed docstring typo
    "spearker" -> "speaker".)
    """

    def __init__(self, words, speaker, id):
        list.__init__(self, words)
        self.speaker = speaker
        # Utterance ids arrive as strings from the source file; store as int.
        self.id = int(id)

    def __repr__(self):
        if len(self) == 0:
            text = ""
        elif isinstance(self[0], tuple):
            # Tagged turn: render each element as word/tag.
            text = " ".join("%s/%s" % w for w in self)
        else:
            text = " ".join(self)
        return f"<{self.speaker}.{self.id}: {text!r}>"
class SwitchboardCorpusReader(CorpusReader):
    """
    Reader for the switchboard corpus.  Every accessor is backed by the
    single "tagged" file -- it is used even for the untagged methods,
    since it is already tokenized.
    """

    _FILES = ["tagged"]

    def __init__(self, root, tagset=None):
        """
        :param root: The root directory for this corpus.
        :param tagset: The native tagset of the corpus, used when mapping
            tags to another tagset on request.
        """
        CorpusReader.__init__(self, root, self._FILES)
        self._tagset = tagset

    def words(self):
        """:return: a corpus view of all words, untagged."""
        return StreamBackedCorpusView(self.abspath("tagged"), self._words_block_reader)

    def tagged_words(self, tagset=None):
        """:return: a corpus view of all words as (word, tag) tuples."""
        def tagged_words_block_reader(stream):
            return self._tagged_words_block_reader(stream, tagset)

        return StreamBackedCorpusView(self.abspath("tagged"), tagged_words_block_reader)

    def turns(self):
        """:return: a corpus view of untagged SwitchboardTurns."""
        return StreamBackedCorpusView(self.abspath("tagged"), self._turns_block_reader)

    def tagged_turns(self, tagset=None):
        """:return: a corpus view of tagged SwitchboardTurns."""
        def tagged_turns_block_reader(stream):
            return self._tagged_turns_block_reader(stream, tagset)

        return StreamBackedCorpusView(self.abspath("tagged"), tagged_turns_block_reader)

    def discourses(self):
        """:return: a corpus view of discourses (lists of untagged turns)."""
        return StreamBackedCorpusView(
            self.abspath("tagged"), self._discourses_block_reader
        )

    def tagged_discourses(self, tagset=None):
        """:return: a corpus view of discourses (lists of tagged turns).

        The ``tagset`` default was ``False``, inconsistent with every sibling
        method; it is now ``None``, which behaves identically (both falsy).
        """
        def tagged_discourses_block_reader(stream):
            return self._tagged_discourses_block_reader(stream, tagset)

        return StreamBackedCorpusView(
            self.abspath("tagged"), tagged_discourses_block_reader
        )

    def _discourses_block_reader(self, stream):
        # returns at most 1 discourse. (The other methods depend on this.)
        return [
            [
                self._parse_utterance(u, include_tag=False)
                for b in read_blankline_block(stream)
                for u in b.split("\n")
                if u.strip()
            ]
        ]

    def _tagged_discourses_block_reader(self, stream, tagset=None):
        # returns at most 1 discourse. (The other methods depend on this.)
        return [
            [
                self._parse_utterance(u, include_tag=True, tagset=tagset)
                for b in read_blankline_block(stream)
                for u in b.split("\n")
                if u.strip()
            ]
        ]

    def _turns_block_reader(self, stream):
        return self._discourses_block_reader(stream)[0]

    def _tagged_turns_block_reader(self, stream, tagset=None):
        return self._tagged_discourses_block_reader(stream, tagset)[0]

    def _words_block_reader(self, stream):
        # Flatten the single discourse into one list of words.
        return sum(self._discourses_block_reader(stream)[0], [])

    def _tagged_words_block_reader(self, stream, tagset=None):
        return sum(self._tagged_discourses_block_reader(stream, tagset)[0], [])

    # Utterance lines look like "A.1: word/TAG word/TAG ...".
    _UTTERANCE_RE = re.compile(r"(\w+)\.(\d+)\:\s*(.*)")
    _SEP = "/"

    def _parse_utterance(self, utterance, include_tag, tagset=None):
        """Parse one utterance line into a SwitchboardTurn."""
        m = self._UTTERANCE_RE.match(utterance)
        if m is None:
            raise ValueError("Bad utterance %r" % utterance)
        speaker, id, text = m.groups()
        words = [str2tuple(s, self._SEP) for s in text.split()]
        if not include_tag:
            words = [w for (w, t) in words]
        elif tagset and tagset != self._tagset:
            words = [(w, map_tag(self._tagset, tagset, t)) for (w, t) in words]
        return SwitchboardTurn(words, speaker, id)

View File

@@ -0,0 +1,354 @@
# Natural Language Toolkit: Tagged Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# Jacob Perkins <japerk@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A reader for corpora whose documents contain part-of-speech-tagged words.
"""
import os
from nltk.corpus.reader.api import *
from nltk.corpus.reader.timit import read_timit_block
from nltk.corpus.reader.util import *
from nltk.tag import map_tag, str2tuple
from nltk.tokenize import *
class TaggedCorpusReader(CorpusReader):
"""
Reader for simple part-of-speech tagged corpora. Paragraphs are
assumed to be split using blank lines. Sentences and words can be
tokenized using the default tokenizers, or by custom tokenizers
specified as parameters to the constructor. Words are parsed
using ``nltk.tag.str2tuple``. By default, ``'/'`` is used as the
separator. I.e., words should have the form::
word1/tag1 word2/tag2 word3/tag3 ...
But custom separators may be specified as parameters to the
constructor. Part of speech tags are case-normalized to upper
case.
"""
    def __init__(
        self,
        root,
        fileids,
        sep="/",
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
        encoding="utf8",
        tagset=None,
    ):
        """
        Construct a new Tagged Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = TaggedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param sep: Separator between a word and its tag (default ``'/'``).
        :param word_tokenizer: Tokenizer used to split sentences into word
            tokens.  The default instance is shared across readers; this is
            safe as long as the tokenizer is stateless.
        :param sent_tokenizer: Tokenizer used to split paragraph blocks into
            sentences (by default, one sentence per line).
        :param para_block_reader: Function that reads one paragraph block at
            a time from a stream (by default, blank-line separated).
        :param encoding: The file encoding (defaults to utf8).
        :param tagset: The native tagset of the corpus, used for mapping to
            other tagsets on request.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._tagset = tagset
def words(self, fileids=None):
"""
:return: the given file(s) as a list of words
and punctuation symbols.
:rtype: list(str)
"""
return concat(
[
TaggedCorpusView(
fileid,
enc,
False,
False,
False,
self._sep,
self._word_tokenizer,
self._sent_tokenizer,
self._para_block_reader,
None,
)
for (fileid, enc) in self.abspaths(fileids, True)
]
)
def sents(self, fileids=None):
"""
:return: the given file(s) as a list of
sentences or utterances, each encoded as a list of word
strings.
:rtype: list(list(str))
"""
return concat(
[
TaggedCorpusView(
fileid,
enc,
False,
True,
False,
self._sep,
self._word_tokenizer,
self._sent_tokenizer,
self._para_block_reader,
None,
)
for (fileid, enc) in self.abspaths(fileids, True)
]
)
def paras(self, fileids=None):
"""
:return: the given file(s) as a list of
paragraphs, each encoded as a list of sentences, which are
in turn encoded as lists of word strings.
:rtype: list(list(list(str)))
"""
return concat(
[
TaggedCorpusView(
fileid,
enc,
False,
True,
True,
self._sep,
self._word_tokenizer,
self._sent_tokenizer,
self._para_block_reader,
None,
)
for (fileid, enc) in self.abspaths(fileids, True)
]
)
def tagged_words(self, fileids=None, tagset=None):
"""
:return: the given file(s) as a list of tagged
words and punctuation symbols, encoded as tuples
``(word,tag)``.
:rtype: list(tuple(str,str))
"""
if tagset and tagset != self._tagset:
tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
else:
tag_mapping_function = None
return concat(
[
TaggedCorpusView(
fileid,
enc,
True,
False,
False,
self._sep,
self._word_tokenizer,
self._sent_tokenizer,
self._para_block_reader,
tag_mapping_function,
)
for (fileid, enc) in self.abspaths(fileids, True)
]
)
def tagged_sents(self, fileids=None, tagset=None):
"""
:return: the given file(s) as a list of
sentences, each encoded as a list of ``(word,tag)`` tuples.
:rtype: list(list(tuple(str,str)))
"""
if tagset and tagset != self._tagset:
tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
else:
tag_mapping_function = None
return concat(
[
TaggedCorpusView(
fileid,
enc,
True,
True,
False,
self._sep,
self._word_tokenizer,
self._sent_tokenizer,
self._para_block_reader,
tag_mapping_function,
)
for (fileid, enc) in self.abspaths(fileids, True)
]
)
def tagged_paras(self, fileids=None, tagset=None):
"""
:return: the given file(s) as a list of
paragraphs, each encoded as a list of sentences, which are
in turn encoded as lists of ``(word,tag)`` tuples.
:rtype: list(list(list(tuple(str,str))))
"""
if tagset and tagset != self._tagset:
tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
else:
tag_mapping_function = None
return concat(
[
TaggedCorpusView(
fileid,
enc,
True,
True,
True,
self._sep,
self._word_tokenizer,
self._sent_tokenizer,
self._para_block_reader,
tag_mapping_function,
)
for (fileid, enc) in self.abspaths(fileids, True)
]
)
class CategorizedTaggedCorpusReader(CategorizedCorpusReader, TaggedCorpusReader):
    """
    A reader for part-of-speech tagged corpora whose documents are
    divided into categories based on their file identifiers.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``TaggedCorpusReader``.
        """
        # CategorizedCorpusReader consumes the categorization keywords
        # from ``kwargs``; whatever is left configures the tagged reader.
        CategorizedCorpusReader.__init__(self, kwargs)
        TaggedCorpusReader.__init__(self, *args, **kwargs)

    def tagged_words(self, fileids=None, categories=None, tagset=None):
        """Tagged words, optionally restricted to the given categories."""
        resolved = self._resolve(fileids, categories)
        return super().tagged_words(resolved, tagset)

    def tagged_sents(self, fileids=None, categories=None, tagset=None):
        """Tagged sentences, optionally restricted to the given categories."""
        resolved = self._resolve(fileids, categories)
        return super().tagged_sents(resolved, tagset)

    def tagged_paras(self, fileids=None, categories=None, tagset=None):
        """Tagged paragraphs, optionally restricted to the given categories."""
        resolved = self._resolve(fileids, categories)
        return super().tagged_paras(resolved, tagset)
class TaggedCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for tagged documents.  It can be
    customized via flags to divide the tagged corpus documents up by
    sentence or paragraph, and to include or omit part of speech tags.
    ``TaggedCorpusView`` objects are typically created by
    ``TaggedCorpusReader`` (not directly by nltk users).
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        sep,
        word_tokenizer,
        sent_tokenizer,
        para_block_reader,
        tag_mapping_function=None,
    ):
        # Flags controlling whether tags are kept and how tokens are grouped.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        # Parsing machinery: word/tag separator, tokenizers, paragraph reader,
        # and an optional function for converting tags to another tagset.
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._tag_mapping_function = tag_mapping_function
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Reads one paragraph at a time."""
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                tokens = self._word_tokenizer.tokenize(sent_str)
                sent = [str2tuple(tok, self._sep) for tok in tokens]
                if self._tag_mapping_function:
                    sent = [(word, self._tag_mapping_function(tag)) for (word, tag) in sent]
                if not self._tagged:
                    sent = [word for (word, tag) in sent]
                # Keep sentences as sub-lists, or flatten them into the paragraph.
                (para.append if self._group_by_sent else para.extend)(sent)
            # Keep paragraphs as sub-lists, or flatten them into the block.
            (block.append if self._group_by_para else block.extend)(para)
        return block
# needs to implement simplified tags
class MacMorphoCorpusReader(TaggedCorpusReader):
    """
    A corpus reader for the MAC_MORPHO corpus.  Each line contains a
    single tagged word, using '_' as a separator.  Sentence boundaries
    are based on the end-sentence tag ('_.').  Paragraph information
    is not included in the corpus, so each paragraph returned by
    ``self.paras()`` and ``self.tagged_paras()`` contains a single
    sentence.
    """

    def __init__(self, root, fileids, encoding="utf8", tagset=None):
        # Each word is on its own line, so lines are "words" and a
        # sentence runs up to (and including) its terminating newline.
        super().__init__(
            root,
            fileids,
            sep="_",
            word_tokenizer=LineTokenizer(),
            sent_tokenizer=RegexpTokenizer(".*\n"),
            para_block_reader=self._read_block,
            encoding=encoding,
            tagset=tagset,
        )

    def _read_block(self, stream):
        # One "paragraph" per sentence: consume lines until the
        # end-of-sentence tag ('_.') is seen.
        return read_regexp_block(stream, r".*", r".*_\.")
class TimitTaggedCorpusReader(TaggedCorpusReader):
    """
    A corpus reader for tagged sentences that are included in the TIMIT corpus.
    """

    def __init__(self, *args, **kwargs):
        # TIMIT files carry one numbered sentence per block, so the
        # block reader is fixed; everything else is caller-supplied.
        super().__init__(para_block_reader=read_timit_block, *args, **kwargs)

    def paras(self):
        """Paragraph structure is not available in TIMIT."""
        raise NotImplementedError("use sents() instead")

    def tagged_paras(self):
        """Paragraph structure is not available in TIMIT."""
        raise NotImplementedError("use tagged_sents() instead")

View File

@@ -0,0 +1,510 @@
# Natural Language Toolkit: TIMIT Corpus Reader
#
# Copyright (C) 2001-2007 NLTK Project
# Author: Haejoong Lee <haejoong@ldc.upenn.edu>
# Steven Bird <stevenbird1@gmail.com>
# Jacob Perkins <japerk@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
# [xx] this docstring is out-of-date:
"""
Read tokens, phonemes and audio data from the NLTK TIMIT Corpus.
This corpus contains a selected portion of the TIMIT corpus.
- 16 speakers from 8 dialect regions
- 1 male and 1 female from each dialect region
- total 130 sentences (10 sentences per speaker. Note that some
sentences are shared among other speakers, especially sa1 and sa2
are spoken by all speakers.)
- total 160 recording of sentences (10 recordings per speaker)
- audio format: NIST Sphere, single channel, 16kHz sampling,
16 bit sample, PCM encoding
Module contents
===============
The timit corpus reader provides 4 functions and 4 data items.
- utterances
List of utterances in the corpus. There are total 160 utterances,
each of which corresponds to a unique utterance of a speaker.
Here's an example of an utterance identifier in the list::
dr1-fvmh0/sx206
- _---- _---
| | | | |
| | | | |
| | | | `--- sentence number
| | | `----- sentence type (a:all, i:shared, x:exclusive)
| | `--------- speaker ID
| `------------ sex (m:male, f:female)
`-------------- dialect region (1..8)
- speakers
List of speaker IDs. An example of speaker ID::
dr1-fvmh0
  Note that if you split an item ID with a slash ('/') and take the first
  element of the result, you will get a speaker ID.
>>> itemid = 'dr1-fvmh0/sx206'
>>> spkrid , sentid = itemid.split('/')
>>> spkrid
'dr1-fvmh0'
The second element of the result is a sentence ID.
- dictionary()
Phonetic dictionary of words contained in this corpus. This is a Python
dictionary from words to phoneme lists.
- spkrinfo()
Speaker information table. It's a Python dictionary from speaker IDs to
  records of 10 fields.  Speaker IDs are the same as the ones in timit.speakers.
Each record is a dictionary from field names to values, and the fields are
as follows::
id speaker ID as defined in the original TIMIT speaker info table
sex speaker gender (M:male, F:female)
dr speaker dialect region (1:new england, 2:northern,
3:north midland, 4:south midland, 5:southern, 6:new york city,
7:western, 8:army brat (moved around))
use corpus type (TRN:training, TST:test)
in this sample corpus only TRN is available
recdate recording date
birthdate speaker birth date
ht speaker height
race speaker race (WHT:white, BLK:black, AMR:american indian,
SPN:spanish-american, ORN:oriental,???:unknown)
edu speaker education level (HS:high school, AS:associate degree,
BS:bachelor's degree (BS or BA), MS:master's degree (MS or MA),
PHD:doctorate degree (PhD,JD,MD), ??:unknown)
comments comments by the recorder
The 4 functions are as follows.
- tokenized(sentences=items, offset=False)
Given a list of items, returns an iterator of a list of word lists,
each of which corresponds to an item (sentence). If offset is set to True,
each element of the word list is a tuple of word(string), start offset and
end offset, where offset is represented as a number of 16kHz samples.
- phonetic(sentences=items, offset=False)
Given a list of items, returns an iterator of a list of phoneme lists,
each of which corresponds to an item (sentence). If offset is set to True,
each element of the phoneme list is a tuple of word(string), start offset
and end offset, where offset is represented as a number of 16kHz samples.
- audiodata(item, start=0, end=None)
Given an item, returns a chunk of audio samples formatted into a string.
When the function is called, if start and end are omitted, the entire
samples of the recording will be returned. If only end is omitted,
samples from the start offset to the end of the recording will be returned.
- play(data)
Play the given audio samples. The audio samples can be obtained from the
timit.audiodata function.
"""
import sys
import time
from nltk.corpus.reader.api import *
from nltk.internals import import_from_stdlib
from nltk.tree import Tree
class TimitCorpusReader(CorpusReader):
    """
    Reader for the TIMIT corpus (or any other corpus with the same
    file layout and use of file formats).  The corpus root directory
    should contain the following files:

    - timitdic.txt: dictionary of standard transcriptions
    - spkrinfo.txt: table of speaker information

    In addition, the root directory should contain one subdirectory
    for each speaker, containing three files for each utterance:

    - <utterance-id>.txt: text content of utterances
    - <utterance-id>.wrd: tokenized text content of utterances
    - <utterance-id>.phn: phonetic transcription of utterances
    - <utterance-id>.wav: utterance sound file
    """

    _FILE_RE = r"(\w+-\w+/\w+\.(phn|txt|wav|wrd))|" + r"timitdic\.txt|spkrinfo\.txt"
    """A regexp matching fileids that are used by this corpus reader."""
    _UTTERANCE_RE = r"\w+-\w+/\w+\.txt"

    def __init__(self, root, encoding="utf8"):
        """
        Construct a new TIMIT corpus reader in the given directory.

        :param root: The root directory for this corpus.
        :param encoding: The encoding used for the text files.  Audio
            (``.wav``) files are always read as raw bytes.
        """
        # Ensure that wave files don't get treated as unicode data:
        if isinstance(encoding, str):
            encoding = [(r".*\.wav", None), (".*", encoding)]

        CorpusReader.__init__(
            self, root, find_corpus_fileids(root, self._FILE_RE), encoding=encoding
        )

        self._utterances = [
            name[:-4] for name in find_corpus_fileids(root, self._UTTERANCE_RE)
        ]
        """A list of the utterance identifiers for all utterances in
        this corpus."""

        self._speakerinfo = None  # lazily populated by spkrinfo()
        self._root = root
        self.speakers = sorted({u.split("/")[0] for u in self._utterances})

    def fileids(self, filetype=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus.

        :param filetype: If specified, then ``filetype`` indicates that
            only the files that have the given type should be
            returned.  Accepted values are: ``txt``, ``wrd``, ``phn``,
            ``wav``, or ``metadata``,
        """
        if filetype is None:
            return CorpusReader.fileids(self)
        elif filetype in ("txt", "wrd", "phn", "wav"):
            return [f"{u}.{filetype}" for u in self._utterances]
        elif filetype == "metadata":
            return ["timitdic.txt", "spkrinfo.txt"]
        else:
            raise ValueError("Bad value for filetype: %r" % filetype)

    def utteranceids(
        self, dialect=None, sex=None, spkrid=None, sent_type=None, sentid=None
    ):
        """
        :return: A list of the utterance identifiers for all
            utterances in this corpus, or for the given speaker, dialect
            region, gender, sentence type, or sentence number, if
            specified.
        """
        # Normalize every filter to a list so membership tests work
        # uniformly for single values and collections.
        if isinstance(dialect, str):
            dialect = [dialect]
        if isinstance(sex, str):
            sex = [sex]
        if isinstance(spkrid, str):
            spkrid = [spkrid]
        if isinstance(sent_type, str):
            sent_type = [sent_type]
        if isinstance(sentid, str):
            sentid = [sentid]

        # Utterance ids look like 'dr1-fvmh0/sx206':
        #   u[2]    dialect region digit
        #   u[4]    sex character ('m' or 'f')
        #   u[:9]   speaker id ('dr1-fvmh0')
        #   u[11]   sentence type ('a', 'i' or 'x')
        #   u[10:]  sentence id ('sx206')
        utterances = self._utterances[:]
        if dialect is not None:
            utterances = [u for u in utterances if u[2] in dialect]
        if sex is not None:
            utterances = [u for u in utterances if u[4] in sex]
        if spkrid is not None:
            utterances = [u for u in utterances if u[:9] in spkrid]
        if sent_type is not None:
            utterances = [u for u in utterances if u[11] in sent_type]
        if sentid is not None:
            # Bug fix: this previously filtered against ``spkrid``, so
            # the ``sentid`` argument was silently ignored (and crashed
            # when ``spkrid`` was None).
            utterances = [u for u in utterances if u[10:] in sentid]
        return utterances

    def transcription_dict(self):
        """
        :return: A dictionary giving the 'standard' transcription for
            each word.
        """
        _transcriptions = {}
        with self.open("timitdic.txt") as fp:
            for line in fp:
                if not line.strip() or line[0] == ";":
                    continue  # skip blank and comment lines
                m = re.match(r"\s*(\S+)\s+/(.*)/\s*$", line)
                if not m:
                    raise ValueError("Bad line: %r" % line)
                _transcriptions[m.group(1)] = m.group(2).split()
        return _transcriptions

    def spkrid(self, utterance):
        """Return the speaker-id portion of an utterance id."""
        return utterance.split("/")[0]

    def sentid(self, utterance):
        """Return the sentence-id portion of an utterance id."""
        return utterance.split("/")[1]

    def utterance(self, spkrid, sentid):
        """Combine a speaker id and a sentence id into an utterance id."""
        return f"{spkrid}/{sentid}"

    def spkrutteranceids(self, speaker):
        """
        :return: A list of all utterances associated with a given
            speaker.
        """
        return [
            utterance
            for utterance in self._utterances
            if utterance.startswith(speaker + "/")
        ]

    def spkrinfo(self, speaker):
        """
        :return: A ``SpeakerInfo`` record for the given speaker id
            (an utterance id is also accepted).
        """
        if speaker in self._utterances:
            speaker = self.spkrid(speaker)

        # Parse spkrinfo.txt once and cache the result.
        if self._speakerinfo is None:
            self._speakerinfo = {}
            with self.open("spkrinfo.txt") as fp:
                for line in fp:
                    if not line.strip() or line[0] == ";":
                        continue  # skip blank and comment lines
                    rec = line.strip().split(None, 9)
                    key = f"dr{rec[2]}-{rec[1].lower()}{rec[0].lower()}"
                    self._speakerinfo[key] = SpeakerInfo(*rec)

        return self._speakerinfo[speaker]

    def phones(self, utterances=None):
        """
        :return: A list of the phones in the given utterance(s).
        """
        results = []
        for fileid in self._utterance_fileids(utterances, ".phn"):
            with self.open(fileid) as fp:
                for line in fp:
                    if line.strip():
                        results.append(line.split()[-1])
        return results

    def phone_times(self, utterances=None):
        """
        :return: A list of ``(phone, start, end)`` tuples for the given
            utterance(s).  Offsets are represented as numbers of 16kHz
            samples.
        """
        results = []
        for fileid in self._utterance_fileids(utterances, ".phn"):
            with self.open(fileid) as fp:
                for line in fp:
                    if line.strip():
                        # Each line is "<start> <end> <phone>".
                        fields = line.split()
                        results.append((fields[2], int(fields[0]), int(fields[1])))
        return results

    def words(self, utterances=None):
        """
        :return: A list of the words in the given utterance(s).
        """
        results = []
        for fileid in self._utterance_fileids(utterances, ".wrd"):
            with self.open(fileid) as fp:
                for line in fp:
                    if line.strip():
                        results.append(line.split()[-1])
        return results

    def word_times(self, utterances=None):
        """
        :return: A list of ``(word, start, end)`` tuples for the given
            utterance(s), with offsets in 16kHz samples.
        """
        results = []
        for fileid in self._utterance_fileids(utterances, ".wrd"):
            with self.open(fileid) as fp:
                for line in fp:
                    if line.strip():
                        # Each line is "<start> <end> <word>".
                        fields = line.split()
                        results.append((fields[2], int(fields[0]), int(fields[1])))
        return results

    def sents(self, utterances=None):
        """
        :return: The given utterance(s), each as a list of words.
        """
        results = []
        for fileid in self._utterance_fileids(utterances, ".wrd"):
            with self.open(fileid) as fp:
                results.append([line.split()[-1] for line in fp if line.strip()])
        return results

    def sent_times(self, utterances=None):
        """
        :return: A list of ``(sentence, start, end)`` tuples for the
            given utterance(s).
        """
        # TODO: Check this
        return [
            (
                line.split(None, 2)[-1].strip(),
                int(line.split()[0]),
                int(line.split()[1]),
            )
            for fileid in self._utterance_fileids(utterances, ".txt")
            for line in self.open(fileid)
            if line.strip()
        ]

    def phone_trees(self, utterances=None):
        """
        :return: A list of sentence trees whose children are word
            subtrees containing the phones of each word.
        """
        if utterances is None:
            utterances = self._utterances
        if isinstance(utterances, str):
            utterances = [utterances]

        trees = []
        for utterance in utterances:
            word_times = self.word_times(utterance)
            phone_times = self.phone_times(utterance)
            sent_times = self.sent_times(utterance)

            while sent_times:
                (sent, sent_start, sent_end) = sent_times.pop(0)
                trees.append(Tree("S", []))
                # Phones that precede the first word (e.g. leading silence)
                # attach directly to the sentence node.
                while (
                    word_times and phone_times and phone_times[0][2] <= word_times[0][1]
                ):
                    trees[-1].append(phone_times.pop(0)[0])
                while word_times and word_times[0][2] <= sent_end:
                    (word, word_start, word_end) = word_times.pop(0)
                    trees[-1].append(Tree(word, []))
                    while phone_times and phone_times[0][2] <= word_end:
                        trees[-1][-1].append(phone_times.pop(0)[0])
                # Trailing phones (e.g. final silence) attach to the sentence.
                while phone_times and phone_times[0][2] <= sent_end:
                    trees[-1].append(phone_times.pop(0)[0])
        return trees

    # [xx] NOTE: This is currently broken -- we're assuming that the
    # fileids are WAV fileids (aka RIFF), but they're actually NIST SPHERE
    # fileids.
    def wav(self, utterance, start=0, end=None):
        """
        :return: The pickled audio samples (RIFF format) for the given
            utterance, optionally restricted to ``[start:end]`` frames.
        """
        # nltk.chunk conflicts with the stdlib module 'chunk'
        wave = import_from_stdlib("wave")
        # Bug fix: ``tempfile`` was used below but never imported in
        # this module, causing a NameError.
        import tempfile

        w = wave.open(self.open(utterance + ".wav"), "rb")

        if end is None:
            end = w.getnframes()

        # Skip past frames before start, then read the frames we want
        w.readframes(start)
        frames = w.readframes(end - start)

        # Open a new temporary file -- the wave module requires
        # an actual file, and won't work w/ stringio. :(
        tf = tempfile.TemporaryFile()
        out = wave.open(tf, "w")

        # Write the parameters & data to the new file.
        out.setparams(w.getparams())
        out.writeframes(frames)
        out.close()

        # Read the data back from the file, and return it.  The
        # file will automatically be deleted when we return.
        tf.seek(0)
        return tf.read()

    def audiodata(self, utterance, start=0, end=None):
        """
        :return: The raw audio samples for the given utterance, skipping
            the file header.  Offsets are in 16kHz 16-bit samples, hence
            the factor of 2 when converting to byte positions.
        """
        assert end is None or end > start
        headersize = 44  # standard RIFF/WAV header size in bytes
        with self.open(utterance + ".wav") as fp:
            if end is None:
                data = fp.read()
            else:
                data = fp.read(headersize + end * 2)
        return data[headersize + start * 2 :]

    def _utterance_fileids(self, utterances, extension):
        """Map utterance id(s) to fileids with the given extension."""
        if utterances is None:
            utterances = self._utterances
        if isinstance(utterances, str):
            utterances = [utterances]
        return [f"{u}{extension}" for u in utterances]

    def play(self, utterance, start=0, end=None):
        """
        Play the given audio sample.

        :param utterance: The utterance id of the sample to play
        """
        # Method 1: os audio dev.
        try:
            import ossaudiodev

            try:
                dsp = ossaudiodev.open("w")
                dsp.setfmt(ossaudiodev.AFMT_S16_LE)
                dsp.channels(1)
                dsp.speed(16000)
                dsp.write(self.audiodata(utterance, start, end))
                dsp.close()
            except OSError as e:
                print(
                    (
                        "can't acquire the audio device; please "
                        "activate your audio device."
                    ),
                    file=sys.stderr,
                )
                print("system error message:", str(e), file=sys.stderr)
            return
        except ImportError:
            pass

        # Method 2: pygame
        try:
            # FIXME: this won't work under python 3
            import pygame.mixer

            import StringIO

            pygame.mixer.init(16000)
            f = StringIO.StringIO(self.wav(utterance, start, end))
            pygame.mixer.Sound(f).play()
            while pygame.mixer.get_busy():
                time.sleep(0.01)
            return
        except ImportError:
            pass

        # Method 3: complain. :)
        print(
            ("you must install pygame or ossaudiodev " "for audio playback."),
            file=sys.stderr,
        )
class SpeakerInfo:
    """A record of metadata about a single TIMIT speaker, holding the
    fields of one row of the ``spkrinfo.txt`` table verbatim."""

    # Field names, in the order they appear in the speaker table.
    _FIELDS = (
        "id",
        "sex",
        "dr",
        "use",
        "recdate",
        "birthdate",
        "ht",
        "race",
        "edu",
        "comments",
    )

    def __init__(
        self, id, sex, dr, use, recdate, birthdate, ht, race, edu, comments=None
    ):
        values = (id, sex, dr, use, recdate, birthdate, ht, race, edu, comments)
        for name, value in zip(self._FIELDS, values):
            setattr(self, name, value)

    def __repr__(self):
        rendered = ", ".join(f"{name}={getattr(self, name)!r}" for name in self._FIELDS)
        return f"SpeakerInfo({rendered})"
def read_timit_block(stream):
    """
    Block reader for timit tagged sentences, which are preceded by a sentence
    number that will be ignored.
    """
    raw = stream.readline()
    if not raw:
        # End of file: no more blocks.
        return []
    # Drop the leading sentence number; keep everything after the first space.
    _, tagged_sent = raw.split(" ", 1)
    return [tagged_sent]

View File

@@ -0,0 +1,76 @@
# Natural Language Toolkit: Toolbox Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Greg Aumann <greg_aumann@sil.org>
# Stuart Robinson <Stuart.Robinson@mpi.nl>
# Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Module for reading, writing and manipulating
Toolbox databases and settings fileids.
"""
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.toolbox import ToolboxData
class ToolboxCorpusReader(CorpusReader):
    """Corpus reader for Toolbox (Shoebox) databases and settings files."""

    def xml(self, fileids, key=None):
        """
        :return: the given file(s) parsed into (a concatenation of)
            ElementTree structures via ``ToolboxData.parse``.
        """
        return concat(
            [
                ToolboxData(path, enc).parse(key=key)
                for (path, enc) in self.abspaths(fileids, True)
            ]
        )

    def fields(
        self,
        fileids,
        strip=True,
        unwrap=True,
        encoding="utf8",
        errors="strict",
        unicode_fields=None,
    ):
        """
        :return: the given file(s) as a list of ``(marker, contents)``
            field pairs.
        :param strip: strip trailing whitespace from field contents.
        :param unwrap: join wrapped lines of a field into one line.
        """
        return concat(
            [
                list(
                    ToolboxData(fileid, enc).fields(
                        strip, unwrap, encoding, errors, unicode_fields
                    )
                )
                for (fileid, enc) in self.abspaths(fileids, include_encoding=True)
            ]
        )

    # should probably be done lazily:
    def entries(self, fileids, **kwargs):
        """
        :return: the given file(s) as a list of
            ``(entry_contents, [(marker, contents), ...])`` pairs, where a
            new entry starts at each field whose marker equals ``key``
            (keyword argument; defaults to ``'lx'``, the MDF lexeme marker).
        """
        key = kwargs.pop("key", "lx")  # the default key in MDF
        entries = []
        for marker, contents in self.fields(fileids, **kwargs):
            if marker == key:
                entries.append((contents, []))
            elif entries:
                # Attach the field to the most recent entry; fields seen
                # before the first key marker are discarded (as before).
                entries[-1][-1].append((marker, contents))
        return entries

    def words(self, fileids, key="lx"):
        """
        :return: the contents of every field whose marker equals ``key``.
        """
        return [contents for marker, contents in self.fields(fileids) if marker == key]
def demo():
    """Placeholder demo; no Toolbox demonstration is implemented yet."""
    pass
if __name__ == "__main__":
    demo()

View File

@@ -0,0 +1,136 @@
# Natural Language Toolkit: Twitter Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A reader for corpora that consist of Tweets. It is assumed that the Tweets
have been serialised into line-delimited JSON.
"""
import json
import os
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import StreamBackedCorpusView, ZipFilePathPointer, concat
from nltk.tokenize import TweetTokenizer
class TwitterCorpusReader(CorpusReader):
    r"""
    Reader for corpora that consist of Tweets represented as a list of line-delimited JSON.

    Individual Tweets can be tokenized using the default tokenizer, or by a
    custom tokenizer specified as a parameter to the constructor.

    Construct a new Tweet corpus reader for a set of documents
    located at the given root directory.

    If you made your own tweet collection in a directory called
    `twitter-files`, then you can initialise the reader as::

        from nltk.corpus import TwitterCorpusReader
        reader = TwitterCorpusReader(root='/path/to/twitter-files', '.*\.json')

    However, the recommended approach is to set the relevant directory as the
    value of the environmental variable `TWITTER`, and then invoke the reader
    as follows::

        root = os.environ['TWITTER']
        reader = TwitterCorpusReader(root, '.*\.json')

    If you want to work directly with the raw Tweets, the `json` library can
    be used::

        import json
        for tweet in reader.docs():
            print(json.dumps(tweet, indent=1, sort_keys=True))
    """

    CorpusView = StreamBackedCorpusView
    """
    The corpus view class used by this reader.
    """

    def __init__(
        self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding="utf8"
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking the text of Tweets into
            smaller units, including but not limited to words.
        """
        CorpusReader.__init__(self, root, fileids, encoding)

        # Check that all user-created corpus files are non-empty.  (This
        # was previously a stray no-op string statement after the loop.)
        for path in self.abspaths(self._fileids):
            if isinstance(path, ZipFilePathPointer):
                pass
            elif os.path.getsize(path) == 0:
                raise ValueError(f"File {path} is empty")

        self._word_tokenizer = word_tokenizer

    def docs(self, fileids=None):
        """
        Returns the full Tweet objects, as specified by `Twitter
        documentation on Tweets
        <https://dev.twitter.com/docs/platform-objects/tweets>`_

        :return: the given file(s) as a list of dictionaries deserialised
            from JSON.
        :rtype: list(dict)
        """
        return concat(
            [
                self.CorpusView(path, self._read_tweets, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def strings(self, fileids=None):
        """
        Returns only the text content of Tweets in the file(s)

        :return: the given file(s) as a list of Tweets.
        :rtype: list(str)
        """
        fulltweets = self.docs(fileids)
        tweets = []
        for jsono in fulltweets:
            try:
                text = jsono["text"]
                if isinstance(text, bytes):
                    # NOTE(review): CorpusReader exposes encoding(fileid) as a
                    # method; whether ``self.encoding`` resolves usefully here
                    # should be confirmed.
                    text = text.decode(self.encoding)
                tweets.append(text)
            except KeyError:
                # Tweets without a 'text' field (e.g. deletion notices)
                # are skipped.
                pass
        return tweets

    def tokenized(self, fileids=None):
        """
        :return: the given file(s) as a list of the text content of Tweets as
            a list of words, screen names, hashtags, URLs and punctuation symbols.
        :rtype: list(list(str))
        """
        tweets = self.strings(fileids)
        tokenizer = self._word_tokenizer
        return [tokenizer.tokenize(t) for t in tweets]

    def _read_tweets(self, stream):
        """
        Assumes that each line in ``stream`` is a JSON-serialised object.
        Reads at most 10 Tweets per block.
        """
        tweets = []
        for _ in range(10):
            line = stream.readline()
            if not line:
                return tweets
            tweets.append(json.loads(line))
        return tweets

View File

@@ -0,0 +1,74 @@
"""
UDHR corpus reader. It mostly deals with encodings.
"""
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus.reader.util import find_corpus_fileids
class UdhrCorpusReader(PlaintextCorpusReader):
    """
    Plaintext reader for the UDHR corpus, whose files use many different
    (and sometimes mislabeled) character encodings.  This subclass mainly
    supplies a per-fileid encoding table and a skip-list of files that
    cannot be decoded.
    """

    # (fileid regexp, encoding) pairs used to decode each file; entries
    # whose names contradict their actual encoding are mapped explicitly.
    ENCODINGS = [
        (".*-Latin1$", "latin-1"),
        (".*-Hebrew$", "hebrew"),
        (".*-Arabic$", "cp1256"),
        ("Czech_Cesky-UTF8", "cp1250"),  # yeah  (sic: not UTF-8 despite the name)
        ("Polish-Latin2", "cp1250"),
        ("Polish_Polski-Latin2", "cp1250"),
        (".*-Cyrillic$", "cyrillic"),
        (".*-SJIS$", "SJIS"),
        (".*-GB2312$", "GB2312"),
        (".*-Latin2$", "ISO-8859-2"),
        (".*-Greek$", "greek"),
        (".*-UTF8$", "utf-8"),
        ("Hungarian_Magyar-Unicode", "utf-16-le"),
        ("Amahuaca", "latin1"),
        ("Turkish_Turkce-Turkish", "latin5"),
        ("Lithuanian_Lietuviskai-Baltic", "latin4"),
        ("Japanese_Nihongo-EUC", "EUC-JP"),
        ("Japanese_Nihongo-JIS", "iso2022_jp"),
        ("Chinese_Mandarin-HZ", "hz"),
        (r"Abkhaz\-Cyrillic\+Abkh", "cp1251"),
    ]

    # Fileids excluded from the corpus entirely; the reasons are grouped
    # in the inline comments below.
    SKIP = {
        # The following files are not fully decodable because they
        # were truncated at wrong bytes:
        "Burmese_Myanmar-UTF8",
        "Japanese_Nihongo-JIS",
        "Chinese_Mandarin-HZ",
        "Chinese_Mandarin-UTF8",
        "Gujarati-UTF8",
        "Hungarian_Magyar-Unicode",
        "Lao-UTF8",
        "Magahi-UTF8",
        "Marathi-UTF8",
        "Tamil-UTF8",
        # Unfortunately, encodings required for reading
        # the following files are not supported by Python:
        "Vietnamese-VPS",
        "Vietnamese-VIQR",
        "Vietnamese-TCVN",
        "Magahi-Agra",
        "Bhojpuri-Agra",
        "Esperanto-T61",  # latin3 raises an exception
        # The following files are encoded for specific fonts:
        "Burmese_Myanmar-WinResearcher",
        "Armenian-DallakHelv",
        "Tigrinya_Tigrigna-VG2Main",
        "Amharic-Afenegus6..60375",  # ?
        "Navaho_Dine-Navajo-Navaho-font",
        # What are these?
        "Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117",
        "Azeri_Azerbaijani_Latin-Az.Times.Lat0117",
        # The following files are unintended:
        "Czech-Latin2-err",
        "Russian_Russky-UTF8~",
    }

    def __init__(self, root="udhr"):
        # Match everything except README and hidden files, then drop
        # the files listed in SKIP.
        fileids = find_corpus_fileids(root, r"(?!README|\.).*")
        super().__init__(
            root,
            [fileid for fileid in fileids if fileid not in self.SKIP],
            encoding=self.ENCODINGS,
        )

View File

@@ -0,0 +1,780 @@
# Natural Language Toolkit: Corpus Reader Utilities
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import bisect
import os
import pickle
import re
import tempfile
from functools import reduce
from xml.etree import ElementTree
from nltk.data import (
FileSystemPathPointer,
PathPointer,
SeekableUnicodeStreamReader,
ZipFilePathPointer,
)
from nltk.internals import slice_bounds
from nltk.tokenize import wordpunct_tokenize
from nltk.util import AbstractLazySequence, LazyConcatenation, LazySubsequence
######################################################################
# { Corpus View
######################################################################
class StreamBackedCorpusView(AbstractLazySequence):
"""
A 'view' of a corpus file, which acts like a sequence of tokens:
it can be accessed by index, iterated over, etc. However, the
tokens are only constructed as-needed -- the entire corpus is
never stored in memory at once.
The constructor to ``StreamBackedCorpusView`` takes two arguments:
a corpus fileid (specified as a string or as a ``PathPointer``);
and a block reader. A "block reader" is a function that reads
zero or more tokens from a stream, and returns them as a list. A
very simple example of a block reader is:
>>> def simple_block_reader(stream):
... return stream.readline().split()
This simple block reader reads a single line at a time, and
returns a single token (consisting of a string) for each
whitespace-separated substring on the line.
When deciding how to define the block reader for a given
corpus, careful consideration should be given to the size of
blocks handled by the block reader. Smaller block sizes will
increase the memory requirements of the corpus view's internal
data structures (by 2 integers per block). On the other hand,
larger block sizes may decrease performance for random access to
the corpus. (But note that larger block sizes will *not*
decrease performance for iteration.)
Internally, ``CorpusView`` maintains a partial mapping from token
index to file position, with one entry per block. When a token
with a given index *i* is requested, the ``CorpusView`` constructs
it as follows:
1. First, it searches the toknum/filepos mapping for the token
index closest to (but less than or equal to) *i*.
2. Then, starting at the file position corresponding to that
index, it reads one block at a time using the block reader
until it reaches the requested token.
The toknum/filepos mapping is created lazily: it is initially
empty, but every time a new block is read, the block's
initial token is added to the mapping. (Thus, the toknum/filepos
map has one entry per block.)
In order to increase efficiency for random access patterns that
have high degrees of locality, the corpus view may cache one or
more blocks.
:note: Each ``CorpusView`` object internally maintains an open file
object for its underlying corpus file. This file should be
automatically closed when the ``CorpusView`` is garbage collected,
but if you wish to close it manually, use the ``close()``
method. If you access a ``CorpusView``'s items after it has been
closed, the file object will be automatically re-opened.
:warning: If the contents of the file are modified during the
lifetime of the ``CorpusView``, then the ``CorpusView``'s behavior
is undefined.
:warning: If a unicode encoding is specified when constructing a
``CorpusView``, then the block reader may only call
``stream.seek()`` with offsets that have been returned by
``stream.tell()``; in particular, calling ``stream.seek()`` with
relative offsets, or with offsets based on string lengths, may
lead to incorrect behavior.
:ivar _block_reader: The function used to read
a single block from the underlying file stream.
:ivar _toknum: A list containing the token index of each block
that has been processed. In particular, ``_toknum[i]`` is the
token index of the first token in block ``i``. Together
with ``_filepos``, this forms a partial mapping between token
indices and file positions.
:ivar _filepos: A list containing the file position of each block
that has been processed. In particular, ``_toknum[i]`` is the
file position of the first character in block ``i``. Together
with ``_toknum``, this forms a partial mapping between token
indices and file positions.
:ivar _stream: The stream used to access the underlying corpus file.
:ivar _len: The total number of tokens in the corpus, if known;
or None, if the number of tokens is not yet known.
:ivar _eofpos: The character position of the last character in the
file. This is calculated when the corpus view is initialized,
and is used to decide when the end of file has been reached.
:ivar _cache: A cache of the most recently read block. It
is encoded as a tuple (start_toknum, end_toknum, tokens), where
start_toknum is the token index of the first token in the block;
end_toknum is the token index of the first token not in the
block; and tokens is a list of the tokens in the block.
"""
def __init__(self, fileid, block_reader=None, startpos=0, encoding="utf8"):
    """
    Create a new corpus view, based on the file ``fileid``, and
    read with ``block_reader``.  See the class documentation
    for more information.

    :param fileid: The path to the file that is read by this
        corpus view.  ``fileid`` can either be a string or a
        ``PathPointer``.

    :param startpos: The file position at which the view will
        start reading.  This can be used to skip over preface
        sections.

    :param encoding: The unicode encoding that should be used to
        read the file's contents.  If no encoding is specified,
        then the file's contents will be read as a non-unicode
        string (i.e., a str).

    :raise ValueError: If the file cannot be opened or stat'd
        to determine its size.
    """
    if block_reader:
        self.read_block = block_reader
    # Initialize our toknum/filepos mapping.
    self._toknum = [0]
    self._filepos = [startpos]
    self._encoding = encoding
    # We don't know our length (number of tokens) yet; computed
    # lazily by iterate_from() when it reaches end-of-file.
    self._len = None
    self._fileid = fileid
    self._stream = None

    self._current_toknum = None
    """This variable is set to the index of the next token that
    will be read, immediately before ``self.read_block()`` is
    called.  This is provided for the benefit of the block
    reader, which under rare circumstances may need to know
    the current token number."""

    self._current_blocknum = None
    """This variable is set to the index of the next block that
    will be read, immediately before ``self.read_block()`` is
    called.  This is provided for the benefit of the block
    reader, which under rare circumstances may need to know
    the current block number."""

    # Find the length of the file.  Doing this eagerly makes a bad
    # path fail fast, at construction time.
    try:
        if isinstance(self._fileid, PathPointer):
            self._eofpos = self._fileid.file_size()
        else:
            self._eofpos = os.stat(self._fileid).st_size
    except Exception as exc:
        raise ValueError(f"Unable to open or access {fileid!r} -- {exc}") from exc

    # Maintain a cache of the most recently read block, to
    # increase efficiency of random access.
    self._cache = (-1, -1, None)
fileid = property(
lambda self: self._fileid,
doc="""
The fileid of the file that is accessed by this view.
:type: str or PathPointer""",
)
def read_block(self, stream):
    """
    Read a block from the input stream.  This is an abstract stub:
    subclasses must override it, or a ``block_reader`` callable must
    be supplied to the constructor (which replaces this attribute).

    :return: a block of tokens from the input stream
    :rtype: list(any)
    :param stream: an input stream
    :type stream: stream
    :raise NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError("Abstract Method")
def _open(self):
    """
    Open the file stream associated with this corpus view.  Called
    automatically whenever a value is read from the view while its
    file stream is closed.
    """
    # PathPointers know how to open themselves (zip entries, etc.).
    if isinstance(self._fileid, PathPointer):
        self._stream = self._fileid.open(self._encoding)
        return
    # Plain path: wrap the raw byte stream in a seekable unicode
    # reader when an encoding was requested; otherwise read bytes.
    if self._encoding:
        self._stream = SeekableUnicodeStreamReader(
            open(self._fileid, "rb"), self._encoding
        )
    else:
        self._stream = open(self._fileid, "rb")
def close(self):
    """
    Close the file stream associated with this corpus view.  This
    can be useful if you are worried about running out of file
    handles (the stream is also closed automatically when the view
    is garbage collected).  Accessing the view after closing it
    transparently re-opens the stream.
    """
    if self._stream is not None:
        self._stream.close()
        self._stream = None
def __enter__(self):
    # Support ``with view: ...`` -- the view itself is the context value.
    return self

def __exit__(self, type, value, traceback):
    # Release the underlying file handle on context exit.
    self.close()
def __len__(self):
    # The length is computed lazily: exhausting the iterator forces
    # the whole file to be read once.
    if self._len is None:
        # iterate_from() sets self._len when it reaches the end
        # of the file:
        for tok in self.iterate_from(self._toknum[-1]):
            pass
    return self._len
def __getitem__(self, i):
    """
    Return the token at index ``i``, or a ``LazySubsequence`` for a
    slice.  Single-token lookups and fully-cached slices are served
    from the most-recently-read block when possible.
    """
    if isinstance(i, slice):
        start, stop = slice_bounds(self, i)
        cache_start, cache_stop, cached_toks = self._cache
        # Serve the slice from the cached block when it fits entirely.
        if cache_start <= start and stop <= cache_stop:
            return cached_toks[start - cache_start : stop - cache_start]
        # Otherwise defer evaluation.
        return LazySubsequence(self, start, stop)

    # Normalize negative indices relative to the full length.
    if i < 0:
        i += len(self)
    if i < 0:
        raise IndexError("index out of range")
    cache_start, cache_stop, cached_toks = self._cache
    if cache_start <= i < cache_stop:
        return cached_toks[i - cache_start]
    # Fall back to sequential iteration from index i.
    try:
        return next(self.iterate_from(i))
    except StopIteration as err:
        raise IndexError("index out of range") from err
# If we wanted to be thread-safe, then this method would need to
# do some locking.
def iterate_from(self, start_tok):
    """
    Yield tokens starting at token index ``start_tok``, reading
    blocks from the underlying file on demand and memoizing the
    token-index -> file-position mapping as blocks are discovered.
    Sets ``self._len`` once end-of-file is reached.
    """
    # Start by feeding from the cache, if possible.
    if self._cache[0] <= start_tok < self._cache[1]:
        for tok in self._cache[2][start_tok - self._cache[0] :]:
            yield tok
            start_tok += 1

    # Decide where in the file we should start.  If `start` is in
    # our mapping, then we can jump straight to the correct block;
    # otherwise, start at the last block we've processed.
    if start_tok < self._toknum[-1]:
        block_index = bisect.bisect_right(self._toknum, start_tok) - 1
        toknum = self._toknum[block_index]
        filepos = self._filepos[block_index]
    else:
        block_index = len(self._toknum) - 1
        toknum = self._toknum[-1]
        filepos = self._filepos[-1]

    # Open the stream, if it's not open already.
    if self._stream is None:
        self._open()

    # If the file is empty, the while loop will never run.
    # This *seems* to be all the state we need to set:
    if self._eofpos == 0:
        self._len = 0

    # Each iteration through this loop, we read a single block
    # from the stream.
    while filepos < self._eofpos:
        # Read the next block.  _current_toknum/_current_blocknum are
        # published for the benefit of the block reader.
        self._stream.seek(filepos)
        self._current_toknum = toknum
        self._current_blocknum = block_index
        tokens = self.read_block(self._stream)
        assert isinstance(tokens, (tuple, list, AbstractLazySequence)), (
            "block reader %s() should return list or tuple."
            % self.read_block.__name__
        )
        num_toks = len(tokens)
        new_filepos = self._stream.tell()
        assert (
            new_filepos > filepos
        ), "block reader %s() should consume at least 1 byte (filepos=%d)" % (
            self.read_block.__name__,
            filepos,
        )

        # Update our cache.
        self._cache = (toknum, toknum + num_toks, list(tokens))

        # Update our mapping.  Empty blocks are not recorded -- only
        # blocks that produced tokens extend the toknum/filepos tables.
        assert toknum <= self._toknum[-1]
        if num_toks > 0:
            block_index += 1
            if toknum == self._toknum[-1]:
                assert new_filepos > self._filepos[-1]  # monotonic!
                self._filepos.append(new_filepos)
                self._toknum.append(toknum + num_toks)
            else:
                # A re-read of an already-mapped block: check for
                # consistency with what we recorded the first time.
                assert (
                    new_filepos == self._filepos[block_index]
                ), "inconsistent block reader (num chars read)"
                assert (
                    toknum + num_toks == self._toknum[block_index]
                ), "inconsistent block reader (num tokens returned)"

        # If we reached the end of the file, then update self._len
        if new_filepos == self._eofpos:
            self._len = toknum + num_toks

        # Generate the tokens in this block (but skip any tokens
        # before start_tok).  Note that between yields, our state
        # may be modified.
        for tok in tokens[max(0, start_tok - toknum) :]:
            yield tok

        # If we're at the end of the file, then we're done.
        assert new_filepos <= self._eofpos
        if new_filepos == self._eofpos:
            break

        # Update our indices
        toknum += num_toks
        filepos = new_filepos

    # If we reach this point, then we should know our length.
    assert self._len is not None
    # Enforce closing of stream once we reached end of file
    # We should have reached EOF once we're out of the while loop.
    self.close()
# Use concat for these, so we can use a ConcatenatedCorpusView
# when possible.
def __add__(self, other):
    # view + other -> lazy concatenation (no data is copied).
    return concat([self, other])

def __radd__(self, other):
    # other + view -> lazy concatenation, preserving operand order.
    return concat([other, self])

def __mul__(self, count):
    # view * count -> this view repeated ``count`` times, lazily.
    return concat([self] * count)

def __rmul__(self, count):
    # count * view -> same as view * count.
    return concat([self] * count)
class ConcatenatedCorpusView(AbstractLazySequence):
    """
    A 'view' of a corpus file that joins together one or more
    ``StreamBackedCorpusViews<StreamBackedCorpusView>``.  At most
    one file handle is left open at any time.
    """

    def __init__(self, corpus_views):
        self._pieces = corpus_views
        """A list of the corpus subviews that make up this
        concatenation."""

        self._offsets = [0]
        """A list of offsets, indicating the index at which each
        subview begins.  In particular::
            offsets[i] = sum([len(p) for p in pieces[:i]])"""

        self._open_piece = None
        """The most recently accessed corpus subview (or None).
        Before a new subview is accessed, this subview will be closed."""

    def __len__(self):
        # The offset table is extended lazily, one entry per piece, as
        # pieces are iterated; once all pieces have been measured the
        # final offset is the total length.
        if len(self._offsets) <= len(self._pieces):
            # Iterate to the end of the corpus.
            for tok in self.iterate_from(self._offsets[-1]):
                pass

        return self._offsets[-1]

    def close(self):
        # Close every subview's file handle.
        for piece in self._pieces:
            piece.close()

    def iterate_from(self, start_tok):
        # Locate the piece containing start_tok via the offset table.
        piecenum = bisect.bisect_right(self._offsets, start_tok) - 1

        while piecenum < len(self._pieces):
            offset = self._offsets[piecenum]
            piece = self._pieces[piecenum]

            # If we've got another piece open, close it first.
            if self._open_piece is not piece:
                if self._open_piece is not None:
                    self._open_piece.close()
                self._open_piece = piece

            # Get everything we can from this piece.
            yield from piece.iterate_from(max(0, start_tok - offset))

            # Update the offset table (measuring the piece just consumed).
            if piecenum + 1 == len(self._offsets):
                self._offsets.append(self._offsets[-1] + len(piece))

            # Move on to the next piece.
            piecenum += 1
def concat(docs):
    """
    Concatenate together the contents of multiple documents from a
    single corpus, choosing a concatenation strategy appropriate to
    the documents' common type.  This utility function is used by
    corpus readers when the user requests more than one document at
    a time.

    :raise ValueError: If ``docs`` is empty, or if no concatenation
        strategy is known for the documents' types.
    """
    if len(docs) == 0:
        raise ValueError("concat() expects at least one object!")
    if len(docs) == 1:
        return docs[0]

    types = {d.__class__ for d in docs}

    # All strings: plain string concatenation.
    if all(isinstance(doc, str) for doc in docs):
        return "".join(docs)

    # All corpus views: join them with a ConcatenatedCorpusView.
    if all(
        issubclass(typ, (StreamBackedCorpusView, ConcatenatedCorpusView))
        for typ in types
    ):
        return ConcatenatedCorpusView(docs)

    # All lazy sequences: concatenate lazily.
    if all(issubclass(typ, AbstractLazySequence) for typ in types):
        return LazyConcatenation(docs)

    # Homogeneous lists / tuples / XML elements:
    if len(types) == 1:
        typ = list(types)[0]
        if issubclass(typ, list):
            return reduce((lambda a, b: a + b), docs, [])
        if issubclass(typ, tuple):
            return reduce((lambda a, b: a + b), docs, ())
        if ElementTree.iselement(typ):
            xmltree = ElementTree.Element("documents")
            for doc in docs:
                xmltree.append(doc)
            return xmltree

    # No method found!
    raise ValueError("Don't know how to concatenate types: %r" % types)
######################################################################
# { Block Readers
######################################################################


def read_whitespace_block(stream):
    """Read up to 20 lines from ``stream`` and return their
    whitespace-separated tokens as one flat list."""
    tokens = []
    for _ in range(20):  # Read 20 lines at a time.
        tokens.extend(stream.readline().split())
    return tokens
def read_wordpunct_block(stream):
    """Read up to 20 lines from ``stream`` and return their
    wordpunct-tokenized tokens as one flat list."""
    tokens = []
    for _ in range(20):  # Read 20 lines at a time.
        tokens.extend(wordpunct_tokenize(stream.readline()))
    return tokens
def read_line_block(stream):
    """Read up to 20 lines from ``stream``, returning each line
    (with its trailing newline removed) as a separate token."""
    lines = []
    for _ in range(20):
        line = stream.readline()
        if not line:  # end of file
            break
        lines.append(line.rstrip("\n"))
    return lines
def read_blankline_block(stream):
    """Read one blank-line-delimited paragraph from ``stream`` and
    return it as a singleton list, or [] at end of file.  Leading
    blank lines are skipped."""
    para = ""
    while True:
        line = stream.readline()
        if not line:
            # End of file: return whatever was accumulated.
            return [para] if para else []
        if not line.strip():
            # Blank line: ends the paragraph (unless we have not
            # collected anything yet, in which case keep scanning).
            if para:
                return [para]
        else:
            # Content line: accumulate it.
            para += line
def read_alignedsent_block(stream):
    """
    Read one aligned-sentence block from ``stream``.

    Content lines are accumulated until an alignment line of the
    form ``<n>-<m> ...`` is seen, which terminates the block.
    Separator lines (starting with ``=``) and blank lines are
    skipped.

    :param stream: an input stream
    :return: a singleton list containing the block's text, or an
        empty list at end of file.
    """
    s = ""
    while True:
        line = stream.readline()
        # End of file.  This check MUST come before indexing
        # ``line[0]`` below: at EOF readline() returns "", and
        # ``""[0]`` would raise IndexError.
        if not line:
            if s:
                return [s]
            else:
                return []
        # Separator / blank lines:
        if line[0] == "=" or line[0] == "\n" or line[:2] == "\r\n":
            continue
        # Other line:
        s += line
        # An alignment line ("<i>-<j> ...") ends the block.
        if re.match(r"^\d+-\d+", line) is not None:
            return [s]
def read_regexp_block(stream, start_re, end_re=None):
    """
    Read a sequence of tokens from a stream, where tokens begin with
    lines that match ``start_re``.  If ``end_re`` is specified, then
    tokens end with lines that match ``end_re``; otherwise, tokens end
    whenever the next line matching ``start_re`` or EOF is found.
    """
    # Skip ahead until a line opens a token (or EOF).
    line = stream.readline()
    while line and not re.match(start_re, line):
        line = stream.readline()
    if not line:
        return []  # end of file.

    collected = [line]
    while True:
        # Remember where this line starts so we can rewind to it.
        oldpos = stream.tell()
        line = stream.readline()
        # End of file:
        if not line:
            return ["".join(collected)]
        # Explicit terminator line ends the token:
        if end_re is not None and re.match(end_re, line):
            return ["".join(collected)]
        # Without an explicit terminator, a new start line ends the
        # token: back up to just before it and return what we have.
        if end_re is None and re.match(start_re, line):
            stream.seek(oldpos)
            return ["".join(collected)]
        # Anything else is part of the token.
        collected.append(line)
def read_sexpr_block(stream, block_size=16384, comment_char=None):
    """
    Read a sequence of s-expressions from the stream, and leave the
    stream's file position at the end the last complete s-expression
    read.  This function will always return at least one s-expression,
    unless there are no more s-expressions in the file.

    If the file ends in in the middle of an s-expression, then that
    incomplete s-expression is returned when the end of the file is
    reached.

    :param block_size: The default block size for reading.  If an
        s-expression is longer than one block, then more than one
        block will be read.
    :param comment_char: A character that marks comments.  Any lines
        that begin with this character will be stripped out.
        (If spaces or tabs precede the comment character, then the
        line will not be stripped.)
    """
    start = stream.tell()
    block = stream.read(block_size)
    encoding = getattr(stream, "encoding", None)
    assert encoding is not None or isinstance(block, str)
    if encoding not in (None, "utf-8"):
        import warnings

        warnings.warn(
            "Parsing may fail, depending on the properties "
            "of the %s encoding!" % encoding
        )
        # (e.g., the utf-16 encoding does not work because it insists
        # on adding BOMs to the beginning of encoded strings.)

    if comment_char:
        COMMENT = re.compile("(?m)^%s.*$" % re.escape(comment_char))
    while True:
        try:
            # If we're stripping comments, then make sure our block ends
            # on a line boundary; and then replace any comments with
            # space characters.  (We can't just strip them out -- that
            # would make our offset wrong.)
            if comment_char:
                block += stream.readline()
                block = re.sub(COMMENT, _sub_space, block)
            # Read the block.
            tokens, offset = _parse_sexpr_block(block)
            # Skip whitespace
            offset = re.compile(r"\s*").search(block, offset).end()

            # Move to the end position.  With a unicode stream, the
            # char offset must be converted back to a byte offset.
            if encoding is None:
                stream.seek(start + offset)
            else:
                stream.seek(start + len(block[:offset].encode(encoding)))

            # Return the list of tokens we processed
            return tokens
        except ValueError as e:
            if e.args[0] == "Block too small":
                # An s-expression was truncated: read more and retry.
                next_block = stream.read(block_size)
                if next_block:
                    block += next_block
                    continue
                else:
                    # The file ended mid-sexpr -- return what we got.
                    return [block.strip()]
            else:
                raise
def _sub_space(m):
    """Helper function: given a regexp match, return a string of
    spaces with the same length as the matched string."""
    return " " * len(m.group())
def _parse_sexpr_block(block):
    """
    Parse as many complete s-expressions as possible from ``block``.

    :return: a tuple ``(tokens, end)`` where ``tokens`` is the list of
        s-expression strings found and ``end`` is the character offset
        just past the last complete s-expression.
    :raise ValueError: ``"Block too small"`` when the block holds no
        complete s-expression at all.
    """
    nonspace = re.compile(r"\S")
    atom_end = re.compile(r"[\s(]")
    paren = re.compile(r"[()]")

    tokens = []
    pos = 0
    while pos < len(block):
        m = nonspace.search(block, pos)
        if not m:
            return tokens, pos
        begin = m.start()

        if m.group() != "(":
            # Case 1: a bare atom -- runs to the next whitespace or '('.
            m2 = atom_end.search(block, begin)
            if m2 is None:
                # Atom may be truncated at the block boundary.
                if tokens:
                    return tokens, pos
                raise ValueError("Block too small")
            pos = m2.start()
        else:
            # Case 2: a parenthesized s-expression -- scan to the
            # matching close paren.
            depth = 0
            for pm in paren.finditer(block, begin):
                depth += 1 if pm.group() == "(" else -1
                if depth == 0:
                    pos = pm.end()
                    break
            else:
                # Unbalanced: the s-expression is truncated.
                if tokens:
                    return tokens, pos
                raise ValueError("Block too small")

        tokens.append(block[begin:pos])

    return tokens, pos
######################################################################
# { Finding Corpus Items
######################################################################


def find_corpus_fileids(root, regexp):
    """Return the sorted list of file identifiers under ``root``
    (a ``PathPointer``) whose relative path fully matches ``regexp``."""
    if not isinstance(root, PathPointer):
        raise TypeError("find_corpus_fileids: expected a PathPointer")
    regexp += "$"

    # Find fileids in a zipfile: scan the zipfile's namelist.  Filter
    # out entries that end in '/' -- they're directories.
    if isinstance(root, ZipFilePathPointer):
        entry_len = len(root.entry)
        candidates = (
            name[entry_len:]
            for name in root.zipfile.namelist()
            if not name.endswith("/")
        )
        return sorted(name for name in candidates if re.match(regexp, name))

    # Find fileids in a directory: use os.walk to search all (proper
    # or symlinked) subdirectories, and match paths against the regexp.
    elif isinstance(root, FileSystemPathPointer):
        matches = []
        for dirname, subdirs, filenames in os.walk(root.path):
            prefix = "".join("%s/" % p for p in _path_from(root.path, dirname))
            matches.extend(
                prefix + fname
                for fname in filenames
                if re.match(regexp, prefix + fname)
            )
            # Don't visit svn directories:
            if ".svn" in subdirs:
                subdirs.remove(".svn")
        return sorted(matches)

    else:
        raise AssertionError("Don't know how to handle %r" % root)
def _path_from(parent, child):
    """Return the list of path components leading from directory
    ``parent`` down to ``child`` (``child`` must lie under ``parent``)."""
    # Normalize away a trailing separator on the parent.
    if os.path.split(parent)[1] == "":
        parent = os.path.split(parent)[0]
    components = []
    while parent != child:
        child, tail = os.path.split(child)
        components.insert(0, tail)
        # Guard against an infinite loop if child is not under parent.
        assert os.path.split(child)[0] != child
    return components
######################################################################
# { Paragraph structure in Treebank files
######################################################################


def tagged_treebank_para_block_reader(stream):
    """Read one paragraph from a tagged Treebank file, where
    paragraphs are delimited by ``======`` separator lines."""
    para = ""
    while True:
        line = stream.readline()
        if re.match(r"======+\s*$", line):
            # Separator line: ends the paragraph if we have content;
            # otherwise keep scanning (e.g. a leading separator).
            if para.strip():
                return [para]
        elif line == "":
            # End of file: return any accumulated content.
            if para.strip():
                return [para]
            else:
                return []
        else:
            # Content line.
            para += line

View File

@@ -0,0 +1,629 @@
# Natural Language Toolkit: Verbnet Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
An NLTK interface to the VerbNet verb lexicon
For details about VerbNet see:
https://verbs.colorado.edu/~mpalmer/projects/verbnet.html
"""
import re
import textwrap
from collections import defaultdict
from nltk.corpus.reader.xmldocs import XMLCorpusReader
class VerbnetCorpusReader(XMLCorpusReader):
"""
An NLTK interface to the VerbNet verb lexicon.
From the VerbNet site: "VerbNet (VN) (Kipper-Schuler 2006) is the largest
on-line verb lexicon currently available for English. It is a hierarchical
domain-independent, broad-coverage verb lexicon with mappings to other
lexical resources such as WordNet (Miller, 1990; Fellbaum, 1998), XTAG
(XTAG Research Group, 2001), and FrameNet (Baker et al., 1998)."
For details about VerbNet see:
https://verbs.colorado.edu/~mpalmer/projects/verbnet.html
"""
# No unicode encoding param, since the data files are all XML.
def __init__(self, root, fileids, wrap_etree=False):
    XMLCorpusReader.__init__(self, root, fileids, wrap_etree)

    self._lemma_to_class = defaultdict(list)
    """A dictionary mapping from verb lemma strings to lists of
    VerbNet class identifiers."""

    self._wordnet_to_class = defaultdict(list)
    """A dictionary mapping from wordnet identifier strings to
    lists of VerbNet class identifiers."""

    self._class_to_fileid = {}
    """A dictionary mapping from class identifiers to
    corresponding file identifiers.  The keys of this dictionary
    provide a complete list of all classes and subclasses."""

    # Maps short class ids (e.g. '9.1') to long ids (e.g. 'put-9.1').
    self._shortid_to_longid = {}

    # Initialize the dictionaries.  Use the quick (regexp-based)
    # method instead of the slow (xml-based) method, because it
    # runs 2-30 times faster.
    self._quick_index()
# Class identifiers come in two flavors: long ('put-9.1') and
# short ('9.1').  These patterns recognize (and split) each form.
_LONGID_RE = re.compile(r"([^\-\.]*)-([\d+.\-]+)$")
"""Regular expression that matches (and decomposes) longids"""

_SHORTID_RE = re.compile(r"[\d+.\-]+$")
"""Regular expression that matches shortids"""

_INDEX_RE = re.compile(
    r'<MEMBER name="\??([^"]+)" wn="([^"]*)"[^>]+>|' r'<VNSUBCLASS ID="([^"]+)"/?>'
)
"""Regular expression used by ``_index()`` to quickly scan the corpus
for basic information."""
def lemmas(self, vnclass=None):
    """
    Return a list of all verb lemmas that appear in any class, or
    in the ``classid`` if specified.
    """
    if vnclass is None:
        return sorted(self._lemma_to_class.keys())
    # [xx] should this include subclass members?
    if isinstance(vnclass, str):
        vnclass = self.vnclass(vnclass)
    return [member.get("name") for member in vnclass.findall("MEMBERS/MEMBER")]
def wordnetids(self, vnclass=None):
    """
    Return a list of all wordnet identifiers that appear in any
    class, or in ``classid`` if specified.
    """
    if vnclass is None:
        return sorted(self._wordnet_to_class.keys())
    # [xx] should this include subclass members?
    if isinstance(vnclass, str):
        vnclass = self.vnclass(vnclass)
    ids = []
    for member in vnclass.findall("MEMBERS/MEMBER"):
        # The "wn" attribute holds zero or more space-separated ids.
        ids.extend(member.get("wn", "").split())
    return ids
def classids(self, lemma=None, wordnetid=None, fileid=None, classid=None):
    """
    Return a list of the VerbNet class identifiers.  If a file
    identifier is specified, then return only the VerbNet class
    identifiers for classes (and subclasses) defined by that file.
    If a lemma is specified, then return only VerbNet class
    identifiers for classes that contain that lemma as a member.
    If a wordnetid is specified, then return only identifiers for
    classes that contain that wordnetid as a member.  If a classid
    is specified, then return only identifiers for subclasses of
    the specified VerbNet class.
    If nothing is specified, return all classids within VerbNet
    """
    if fileid is not None:
        return [c for c, f in self._class_to_fileid.items() if f == fileid]
    if lemma is not None:
        return self._lemma_to_class[lemma]
    if wordnetid is not None:
        return self._wordnet_to_class[wordnetid]
    if classid is not None:
        tree = self.vnclass(classid)
        return [
            subclass.get("ID")
            for subclass in tree.findall("SUBCLASSES/VNSUBCLASS")
        ]
    return sorted(self._class_to_fileid.keys())
def vnclass(self, fileid_or_classid):
    """Returns VerbNet class ElementTree

    Return an ElementTree containing the xml for the specified
    VerbNet class.

    :param fileid_or_classid: An identifier specifying which class
        should be returned.  Can be a file identifier (such as
        ``'put-9.1.xml'``), or a VerbNet class identifier (such as
        ``'put-9.1'``) or a short VerbNet class identifier (such as
        ``'9.1'``).

    :raise ValueError: If the identifier is not a known fileid,
        class identifier, or short class identifier.
    """
    # File identifier: just return the xml.
    if fileid_or_classid in self._fileids:
        return self.xml(fileid_or_classid)

    # Class identifier: get the xml, and find the right elt.
    classid = self.longid(fileid_or_classid)
    if classid in self._class_to_fileid:
        fileid = self._class_to_fileid[self.longid(classid)]
        tree = self.xml(fileid)
        if classid == tree.get("ID"):
            return tree
        else:
            # The class is a subclass nested somewhere inside the file.
            for subclass in tree.findall(".//VNSUBCLASS"):
                if classid == subclass.get("ID"):
                    return subclass
            else:
                assert False  # we saw it during _index()!
    else:
        raise ValueError(f"Unknown identifier {fileid_or_classid}")
def fileids(self, vnclass_ids=None):
    """
    Return a list of fileids that make up this corpus.  If
    ``vnclass_ids`` is specified, then return the fileids that make
    up the specified VerbNet class(es).
    """
    if vnclass_ids is None:
        return self._fileids
    # Accept either one class id or a sequence of them.
    if isinstance(vnclass_ids, str):
        vnclass_ids = [vnclass_ids]
    return [
        self._class_to_fileid[self.longid(vnclass_id)]
        for vnclass_id in vnclass_ids
    ]
def frames(self, vnclass):
    """Given a VerbNet class, this method returns VerbNet frames

    The members returned for each frame are:

    1) Example
    2) Description
    3) Syntax
    4) Semantics

    :param vnclass: A VerbNet class identifier; or an ElementTree
        containing the xml contents of a VerbNet class.
    :return: frames - a list of frame dictionaries
    """
    if isinstance(vnclass, str):
        vnclass = self.vnclass(vnclass)
    return [
        {
            "example": self._get_example_within_frame(vnframe),
            "description": self._get_description_within_frame(vnframe),
            "syntax": self._get_syntactic_list_within_frame(vnframe),
            "semantics": self._get_semantics_within_frame(vnframe),
        }
        for vnframe in vnclass.findall("FRAMES/FRAME")
    ]
def subclasses(self, vnclass):
    """Returns subclass ids, if any exist

    Given a VerbNet class, this method returns subclass ids (if they
    exist) in a list of strings.

    :param vnclass: A VerbNet class identifier; or an ElementTree
        containing the xml contents of a VerbNet class.
    :return: list of subclasses
    """
    if isinstance(vnclass, str):
        vnclass = self.vnclass(vnclass)
    return [
        subclass.get("ID")
        for subclass in vnclass.findall("SUBCLASSES/VNSUBCLASS")
    ]
def themroles(self, vnclass):
    """Returns thematic roles participating in a VerbNet class

    Members returned as part of each role are:

    1) Type
    2) Modifiers

    :param vnclass: A VerbNet class identifier; or an ElementTree
        containing the xml contents of a VerbNet class.
    :return: themroles: A list of thematic roles in the VerbNet class
    """
    if isinstance(vnclass, str):
        vnclass = self.vnclass(vnclass)
    roles = []
    for trole in vnclass.findall("THEMROLES/THEMROLE"):
        modifiers = [
            {"value": restr.get("Value"), "type": restr.get("type")}
            for restr in trole.findall("SELRESTRS/SELRESTR")
        ]
        roles.append({"type": trole.get("type"), "modifiers": modifiers})
    return roles
######################################################################
# { Index Initialization
######################################################################

def _index(self):
    """
    Initialize the indexes ``_lemma_to_class``,
    ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning
    through the corpus fileids.  This is fast if ElementTree
    uses the C implementation (<0.1 secs), but quite slow (>10 secs)
    if only the python implementation is available.
    """
    # _index_helper() recurses into each class's subclasses.
    for fileid in self._fileids:
        self._index_helper(self.xml(fileid), fileid)
def _index_helper(self, xmltree, fileid):
    """Helper for ``_index()``: record one class element (and,
    recursively, its subclasses) in the index dictionaries."""
    class_id = xmltree.get("ID")
    self._class_to_fileid[class_id] = fileid
    self._shortid_to_longid[self.shortid(class_id)] = class_id
    for member in xmltree.findall("MEMBERS/MEMBER"):
        self._lemma_to_class[member.get("name")].append(class_id)
        # The "wn" attribute holds zero or more space-separated ids.
        for wn_id in member.get("wn", "").split():
            self._wordnet_to_class[wn_id].append(class_id)
    for subclass in xmltree.findall("SUBCLASSES/VNSUBCLASS"):
        self._index_helper(subclass, fileid)
def _quick_index(self):
    """
    Initialize the indexes ``_lemma_to_class``,
    ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning
    through the corpus fileids.  This doesn't do proper xml parsing,
    but is good enough to find everything in the standard VerbNet
    corpus -- and it runs about 30 times faster than xml parsing
    (with the python ElementTree; only 2-3 times faster
    if ElementTree uses the C implementation).
    """
    # nb: if we got rid of wordnet_to_class, this would run 2-3
    # times faster.
    for fileid in self._fileids:
        vnclass = fileid[:-4]  # strip the '.xml'
        self._class_to_fileid[vnclass] = fileid
        self._shortid_to_longid[self.shortid(vnclass)] = vnclass
        with self.open(fileid) as fp:
            for m in self._INDEX_RE.finditer(fp.read()):
                groups = m.groups()
                if groups[0] is not None:
                    # <MEMBER> match: groups = (name, wn-ids, None).
                    self._lemma_to_class[groups[0]].append(vnclass)
                    for wn in groups[1].split():
                        self._wordnet_to_class[wn].append(vnclass)
                elif groups[2] is not None:
                    # <VNSUBCLASS> match: rebind vnclass so that any
                    # following <MEMBER> elts attach to the subclass.
                    self._class_to_fileid[groups[2]] = fileid
                    vnclass = groups[2]  # for <MEMBER> elts.
                    self._shortid_to_longid[self.shortid(vnclass)] = vnclass
                else:
                    assert False, "unexpected match condition"
######################################################################
# { Identifier conversion
######################################################################

def longid(self, shortid):
    """Returns longid of a VerbNet class

    Given a short VerbNet class identifier (eg '37.10'), map it
    to a long id (eg 'confess-37.10').  If ``shortid`` is already a
    long id, then return it as-is"""
    if self._LONGID_RE.match(shortid):
        # Already a longid -- nothing to do.
        return shortid
    if not self._SHORTID_RE.match(shortid):
        raise ValueError("vnclass identifier %r not found" % shortid)
    try:
        return self._shortid_to_longid[shortid]
    except KeyError as e:
        raise ValueError("vnclass identifier %r not found" % shortid) from e
def shortid(self, longid):
    """Returns shortid of a VerbNet class

    Given a long VerbNet class identifier (eg 'confess-37.10'),
    map it to a short id (eg '37.10').  If ``longid`` is already a
    short id, then return it as-is."""
    if self._SHORTID_RE.match(longid):
        # Already a shortid -- nothing to do.
        return longid
    m = self._LONGID_RE.match(longid)
    if not m:
        raise ValueError("vnclass identifier %r not found" % longid)
    # Group 2 of the longid pattern is the numeric (short) part.
    return m.group(2)
######################################################################
# { Frame access utility functions
######################################################################

def _get_semantics_within_frame(self, vnframe):
    """Returns semantics within a single frame

    A utility function to retrieve semantics within a frame in
    VerbNet.  Members of each predicate dictionary:

    1) Predicate value
    2) Arguments
    3) Negated flag

    :param vnframe: An ElementTree containing the xml contents of
        a VerbNet frame.
    :return: semantics: list of predicate dictionaries
    """
    predicates = []
    for pred in vnframe.findall("SEMANTICS/PRED"):
        arguments = [
            {"type": arg.get("type"), "value": arg.get("value")}
            for arg in pred.findall("ARGS/ARG")
        ]
        predicates.append(
            {
                "predicate_value": pred.get("value"),
                "arguments": arguments,
                # bool="!" marks a negated predicate in the XML.
                "negated": pred.get("bool") == "!",
            }
        )
    return predicates
def _get_example_within_frame(self, vnframe):
    """Returns example within a frame

    A utility function to retrieve an example within a frame in
    VerbNet.

    :param vnframe: An ElementTree containing the xml contents of
        a VerbNet frame.
    :return: example_text: The example sentence for this particular
        frame, or ``""`` if the frame has no EXAMPLE element or the
        element is empty.
    """
    example_element = vnframe.find("EXAMPLES/EXAMPLE")
    if example_element is None:
        return ""
    # An empty <EXAMPLE/> element has text=None; normalize to ""
    # so callers always receive a string.
    return example_element.text or ""
def _get_description_within_frame(self, vnframe):
    """Returns member description within frame

    A utility function to retrieve a description of participating
    members within a frame in VerbNet.

    :param vnframe: An ElementTree containing the xml contents of
        a VerbNet frame.
    :return: description: a description dictionary with members -
        primary and secondary
    :raise KeyError: If the DESCRIPTION element lacks the required
        "primary" attribute.
    """
    description_element = vnframe.find("DESCRIPTION")
    # NOTE(review): assumes every FRAME has a DESCRIPTION element --
    # find() returning None would raise AttributeError here.
    return {
        "primary": description_element.attrib["primary"],
        "secondary": description_element.get("secondary", ""),
    }
def _get_syntactic_list_within_frame(self, vnframe):
    """Returns syntax within a frame

    A utility function to retrieve the syntactic role sequence within
    a frame in VerbNet.  Members of each syntactic dictionary:

    1) POS Tag
    2) Modifiers

    :param vnframe: An ElementTree containing the xml contents of
        a VerbNet frame.
    :return: syntax_within_single_frame
    """
    roles = []
    # Each child of SYNTAX is one syntactic slot; its tag is the POS.
    for elt in vnframe.find("SYNTAX"):
        modifiers = {
            "value": elt.get("value") if "value" in elt.attrib else "",
            "selrestrs": [
                {"value": restr.get("Value"), "type": restr.get("type")}
                for restr in elt.findall("SELRESTRS/SELRESTR")
            ],
            "synrestrs": [
                {"value": restr.get("Value"), "type": restr.get("type")}
                for restr in elt.findall("SYNRESTRS/SYNRESTR")
            ],
        }
        roles.append({"pos_tag": elt.tag, "modifiers": modifiers})
    return roles
######################################################################
# { Pretty Printing
######################################################################

def pprint(self, vnclass):
    """Returns pretty printed version of a VerbNet class

    Return a string containing a pretty-printed representation of
    the given VerbNet class.

    :param vnclass: A VerbNet class identifier; or an ElementTree
        containing the xml contents of a VerbNet class.
    """
    if isinstance(vnclass, str):
        vnclass = self.vnclass(vnclass)
    # Assemble the sections line by line.
    sections = [
        vnclass.get("ID"),
        self.pprint_subclasses(vnclass, indent=" "),
        self.pprint_members(vnclass, indent=" "),
        " Thematic roles:",
        self.pprint_themroles(vnclass, indent=" "),
        " Frames:",
        self.pprint_frames(vnclass, indent=" "),
    ]
    return "\n".join(sections)
def pprint_subclasses(self, vnclass, indent=""):
    """Returns pretty printed version of subclasses of VerbNet class

    Return a string containing a pretty-printed representation of
    the given VerbNet class's subclasses.

    :param vnclass: A VerbNet class identifier; or an ElementTree
        containing the xml contents of a VerbNet class.
    """
    if isinstance(vnclass, str):
        vnclass = self.vnclass(vnclass)
    names = self.subclasses(vnclass) or ["(none)"]
    text = "Subclasses: " + " ".join(names)
    # Wrap long lists, indenting continuation lines a bit deeper.
    return textwrap.fill(
        text, 70, initial_indent=indent, subsequent_indent=indent + " "
    )
def pprint_members(self, vnclass, indent=""):
"""Returns pretty printed version of members in a VerbNet class
Return a string containing a pretty-printed representation of
the given VerbNet class's member verbs.
:param vnclass: A VerbNet class identifier; or an ElementTree
containing the xml contents of a VerbNet class.
"""
if isinstance(vnclass, str):
vnclass = self.vnclass(vnclass)
members = self.lemmas(vnclass)
if not members:
members = ["(none)"]
s = "Members: " + " ".join(members)
return textwrap.fill(
s, 70, initial_indent=indent, subsequent_indent=indent + " "
)
def pprint_themroles(self, vnclass, indent=""):
"""Returns pretty printed version of thematic roles in a VerbNet class
Return a string containing a pretty-printed representation of
the given VerbNet class's thematic roles.
:param vnclass: A VerbNet class identifier; or an ElementTree
containing the xml contents of a VerbNet class.
"""
if isinstance(vnclass, str):
vnclass = self.vnclass(vnclass)
pieces = []
for themrole in self.themroles(vnclass):
piece = indent + "* " + themrole.get("type")
modifiers = [
modifier["value"] + modifier["type"]
for modifier in themrole["modifiers"]
]
if modifiers:
piece += "[{}]".format(" ".join(modifiers))
pieces.append(piece)
return "\n".join(pieces)
def pprint_frames(self, vnclass, indent=""):
"""Returns pretty version of all frames in a VerbNet class
Return a string containing a pretty-printed representation of
the list of frames within the VerbNet class.
:param vnclass: A VerbNet class identifier; or an ElementTree
containing the xml contents of a VerbNet class.
"""
if isinstance(vnclass, str):
vnclass = self.vnclass(vnclass)
pieces = []
for vnframe in self.frames(vnclass):
pieces.append(self._pprint_single_frame(vnframe, indent))
return "\n".join(pieces)
def _pprint_single_frame(self, vnframe, indent=""):
"""Returns pretty printed version of a single frame in a VerbNet class
Returns a string containing a pretty-printed representation of
the given frame.
:param vnframe: An ElementTree containing the xml contents of
a VerbNet frame.
"""
frame_string = self._pprint_description_within_frame(vnframe, indent) + "\n"
frame_string += self._pprint_example_within_frame(vnframe, indent + " ") + "\n"
frame_string += (
self._pprint_syntax_within_frame(vnframe, indent + " Syntax: ") + "\n"
)
frame_string += indent + " Semantics:\n"
frame_string += self._pprint_semantics_within_frame(vnframe, indent + " ")
return frame_string
def _pprint_example_within_frame(self, vnframe, indent=""):
"""Returns pretty printed version of example within frame in a VerbNet class
Return a string containing a pretty-printed representation of
the given VerbNet frame example.
:param vnframe: An ElementTree containing the xml contents of
a Verbnet frame.
"""
if vnframe["example"]:
return indent + " Example: " + vnframe["example"]
def _pprint_description_within_frame(self, vnframe, indent=""):
"""Returns pretty printed version of a VerbNet frame description
Return a string containing a pretty-printed representation of
the given VerbNet frame description.
:param vnframe: An ElementTree containing the xml contents of
a VerbNet frame.
"""
description = indent + vnframe["description"]["primary"]
if vnframe["description"]["secondary"]:
description += " ({})".format(vnframe["description"]["secondary"])
return description
def _pprint_syntax_within_frame(self, vnframe, indent=""):
"""Returns pretty printed version of syntax within a frame in a VerbNet class
Return a string containing a pretty-printed representation of
the given VerbNet frame syntax.
:param vnframe: An ElementTree containing the xml contents of
a VerbNet frame.
"""
pieces = []
for element in vnframe["syntax"]:
piece = element["pos_tag"]
modifier_list = []
if "value" in element["modifiers"] and element["modifiers"]["value"]:
modifier_list.append(element["modifiers"]["value"])
modifier_list += [
"{}{}".format(restr["value"], restr["type"])
for restr in (
element["modifiers"]["selrestrs"]
+ element["modifiers"]["synrestrs"]
)
]
if modifier_list:
piece += "[{}]".format(" ".join(modifier_list))
pieces.append(piece)
return indent + " ".join(pieces)
def _pprint_semantics_within_frame(self, vnframe, indent=""):
"""Returns a pretty printed version of semantics within frame in a VerbNet class
Return a string containing a pretty-printed representation of
the given VerbNet frame semantics.
:param vnframe: An ElementTree containing the xml contents of
a VerbNet frame.
"""
pieces = []
for predicate in vnframe["semantics"]:
arguments = [argument["value"] for argument in predicate["arguments"]]
pieces.append(
f"{'¬' if predicate['negated'] else ''}{predicate['predicate_value']}({', '.join(arguments)})"
)
return "\n".join(f"{indent}* {piece}" for piece in pieces)

View File

@@ -0,0 +1,166 @@
# Natural Language Toolkit: Word List Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tokenize import line_tokenize
class WordListCorpusReader(CorpusReader):
    """
    Corpus reader for word lists stored one word per line.  Blank
    lines are ignored.
    """

    def words(self, fileids=None, ignore_lines_startswith="\n"):
        """Return the words in the given files, skipping any line that
        starts with ``ignore_lines_startswith`` (by default, blank
        lines)."""
        tokens = line_tokenize(self.raw(fileids))
        return [tok for tok in tokens if not tok.startswith(ignore_lines_startswith)]
class SwadeshCorpusReader(WordListCorpusReader):
    """Reader for Swadesh word lists: parallel wordlists, one file per
    language, aligned line-by-line."""

    def entries(self, fileids=None):
        """
        :return: a tuple of words for the specified fileids.
        """
        # Default to every file when no explicit selection is given.
        fileids = fileids or self.fileids()
        # Zip the per-language columns into aligned tuples.
        columns = [self.words(fileid) for fileid in fileids]
        return list(zip(*columns))
class NonbreakingPrefixesCorpusReader(WordListCorpusReader):
    """
    This is a class to read the nonbreaking prefixes textfiles from the
    Moses Machine Translation toolkit. These lists are used in the Python port
    of the Moses' word tokenizer.
    """

    # Maps full language names to their two-letter language codes; the
    # corpus files are named ``nonbreaking_prefix.<code>``.
    available_langs = {
        "catalan": "ca",
        "czech": "cs",
        "german": "de",
        "greek": "el",
        "english": "en",
        "spanish": "es",
        "finnish": "fi",
        "french": "fr",
        "hungarian": "hu",
        "icelandic": "is",
        "italian": "it",
        "latvian": "lv",
        "dutch": "nl",
        "polish": "pl",
        "portuguese": "pt",
        "romanian": "ro",
        "russian": "ru",
        "slovak": "sk",
        "slovenian": "sl",
        "swedish": "sv",
        "tamil": "ta",
    }
    # Also, add the lang IDs as the keys, so lookups work with either
    # the full language name or the short code.
    available_langs.update({v: v for v in available_langs.values()})

    def words(self, lang=None, fileids=None, ignore_lines_startswith="#"):
        """
        This module returns a list of nonbreaking prefixes for the specified
        language(s).

        >>> from nltk.corpus import nonbreaking_prefixes as nbp
        >>> nbp.words('en')[:10] == [u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H', u'I', u'J']
        True
        >>> nbp.words('ta')[:5] == [u'\u0b85', u'\u0b86', u'\u0b87', u'\u0b88', u'\u0b89']
        True

        :return: a list words for the specified language(s).
        """
        # If *lang* in list of languages available, allocate apt fileid.
        # Otherwise, the function returns non-breaking prefixes for
        # all languages when fileids==None.
        if lang in self.available_langs:
            lang = self.available_langs[lang]
            fileids = ["nonbreaking_prefix." + lang]
        # Lines beginning with the comment marker ('#' by default) are
        # skipped.
        return [
            line
            for line in line_tokenize(self.raw(fileids))
            if not line.startswith(ignore_lines_startswith)
        ]
class UnicharsCorpusReader(WordListCorpusReader):
    """
    This class is used to read lists of characters from the Perl Unicode
    Properties (see https://perldoc.perl.org/perluniprops.html).
    The files in the perluniprop.zip are extracted using the Unicode::Tussle
    module from https://search.cpan.org/~bdfoy/Unicode-Tussle-1.11/lib/Unicode/Tussle.pm
    """

    # These are categories similar to the Perl Unicode Properties;
    # each category corresponds to one ``<category>.txt`` corpus file.
    available_categories = [
        "Close_Punctuation",
        "Currency_Symbol",
        "IsAlnum",
        "IsAlpha",
        "IsLower",
        "IsN",
        "IsSc",
        "IsSo",
        "IsUpper",
        "Line_Separator",
        "Number",
        "Open_Punctuation",
        "Punctuation",
        "Separator",
        "Symbol",
    ]

    def chars(self, category=None, fileids=None):
        """
        This module returns a list of characters from  the Perl Unicode Properties.
        They are very useful when porting Perl tokenizers to Python.

        >>> from nltk.corpus import perluniprops as pup
        >>> pup.chars('Open_Punctuation')[:5] == [u'(', u'[', u'{', u'\u0f3a', u'\u0f3c']
        True
        >>> pup.chars('Currency_Symbol')[:5] == [u'$', u'\xa2', u'\xa3', u'\xa4', u'\xa5']
        True
        >>> pup.available_categories
        ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc', 'IsSo', 'IsUpper', 'Line_Separator', 'Number', 'Open_Punctuation', 'Punctuation', 'Separator', 'Symbol']

        :return: a list of characters given the specific unicode character category
        """
        # A known category selects its own single file; otherwise the
        # caller-supplied fileids (possibly None = all files) are used.
        if category in self.available_categories:
            fileids = [category + ".txt"]
        # Each file is one run of characters; strip surrounding
        # whitespace and explode into single-character strings.
        return list(self.raw(fileids).strip())
class MWAPPDBCorpusReader(WordListCorpusReader):
    """
    Reader for the subset of lexical pairs of The Paraphrase Database
    (PPDB) XXXL used in the Monolingual Word Alignment (MWA) algorithm
    described in Sultan et al. (2014a, 2014b, 2015):

    - http://acl2014.org/acl2014/Q14/pdf/Q14-1017
    - https://www.aclweb.org/anthology/S14-2039
    - https://www.aclweb.org/anthology/S15-2027

    The original source of the full PPDB corpus can be found on
    https://www.cis.upenn.edu/~ccb/ppdb/

    :return: a list of tuples of similar lexical terms.
    """

    # Default corpus file holding one tab-separated synonym pair per line.
    mwa_ppdb_xxxl_file = "ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs"

    def entries(self, fileids=mwa_ppdb_xxxl_file):
        """
        :return: a tuple of synonym word pairs.
        """
        lines = line_tokenize(self.raw(fileids))
        return [tuple(line.split("\t")) for line in lines]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,397 @@
# Natural Language Toolkit: XML Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for corpora whose documents are xml files.
(note -- not named 'xml' to avoid conflicting w/ standard xml package)
"""
import codecs
from xml.etree import ElementTree
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import *
from nltk.data import SeekableUnicodeStreamReader
from nltk.internals import ElementWrapper
from nltk.tokenize import WordPunctTokenizer
class XMLCorpusReader(CorpusReader):
    """
    Corpus reader for corpora whose documents are xml files.

    Note that the ``XMLCorpusReader`` constructor does not take an
    ``encoding`` argument, because the unicode encoding is specified by
    the XML files themselves.  See the XML specs for more info.
    """

    def __init__(self, root, fileids, wrap_etree=False):
        # If true, elements returned by xml() are wrapped in ElementWrapper.
        self._wrap_etree = wrap_etree
        CorpusReader.__init__(self, root, fileids)

    def xml(self, fileid=None):
        """Return the parsed XML document for the given fileid.

        :raises TypeError: if zero or more than one fileid is implied --
            XML documents cannot be concatenated.
        """
        # Make sure we have exactly one file -- no concatenating XML.
        if fileid is None and len(self._fileids) == 1:
            fileid = self._fileids[0]
        if not isinstance(fileid, str):
            raise TypeError("Expected a single file identifier string")
        # Read the XML in using ElementTree.
        with self.abspath(fileid).open() as fp:
            elt = ElementTree.parse(fp).getroot()
        # If requested, wrap it.
        if self._wrap_etree:
            elt = ElementWrapper(elt)
        # Return the ElementTree element.
        return elt

    def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- ie, tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        """
        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        word_tokenizer = WordPunctTokenizer()
        # ``Element.getiterator`` was removed in Python 3.9; fall back to
        # ``Element.iter`` on modern versions.  Catch only AttributeError
        # so that genuine errors are no longer silently swallowed (the
        # previous code used a bare ``except:``).
        try:
            iterator = elt.getiterator()
        except AttributeError:
            iterator = elt.iter()
        out = []
        for node in iterator:
            text = node.text
            if text is not None:
                # Decode byte strings using the file's declared encoding.
                if isinstance(text, bytes):
                    text = text.decode(encoding)
                toks = word_tokenizer.tokenize(text)
                out.extend(toks)
        return out
class XMLCorpusView(StreamBackedCorpusView):
    """
    A corpus view that selects out specified elements from an XML
    file, and provides a flat list-like interface for accessing them.
    (Note: ``XMLCorpusView`` is not used by ``XMLCorpusReader`` itself,
    but may be used by subclasses of ``XMLCorpusReader``.)

    Every XML corpus view has a "tag specification", indicating what
    XML elements should be included in the view; and each (non-nested)
    element that matches this specification corresponds to one item in
    the view.  Tag specifications are regular expressions over tag
    paths, where a tag path is a list of element tag names, separated
    by '/', indicating the ancestry of the element.  Some examples:

    - ``'foo'``: A top-level element whose tag is ``foo``.
    - ``'foo/bar'``: An element whose tag is ``bar`` and whose parent
      is a top-level element whose tag is ``foo``.
    - ``'.*/foo'``: An element whose tag is ``foo``, appearing anywhere
      in the xml tree.
    - ``'.*/(foo|bar)'``: An element whose tag is ``foo`` or ``bar``,
      appearing anywhere in the xml tree.

    The view items are generated from the selected XML elements via
    the method ``handle_elt()``.  By default, this method returns the
    element as-is (i.e., as an ElementTree object); but it can be
    overridden, either via subclassing or via the ``elt_handler``
    constructor parameter.
    """

    #: If true, then display debugging output to stdout when reading
    #: blocks.
    _DEBUG = False

    #: The number of characters read at a time by this corpus reader.
    _BLOCK_SIZE = 1024

    def __init__(self, fileid, tagspec, elt_handler=None):
        """
        Create a new corpus view based on a specified XML file.

        Note that the ``XMLCorpusView`` constructor does not take an
        ``encoding`` argument, because the unicode encoding is
        specified by the XML files themselves.

        :type tagspec: str
        :param tagspec: A tag specification, indicating what XML
            elements should be included in the view.  Each non-nested
            element that matches this specification corresponds to one
            item in the view.

        :param elt_handler: A function used to transform each element
            to a value for the view.  If no handler is specified, then
            ``self.handle_elt()`` is called, which returns the element
            as an ElementTree object.  The signature of elt_handler is::

                elt_handler(elt, tagspec) -> value
        """
        # Allow per-instance override of handle_elt without subclassing.
        if elt_handler:
            self.handle_elt = elt_handler

        # Anchor the tag spec so it must match the *entire* tag path.
        self._tagspec = re.compile(tagspec + r"\Z")
        """The tag specification for this corpus view."""

        self._tag_context = {0: ()}
        """A dictionary mapping from file positions (as returned by
        ``stream.seek()`` to XML contexts. An XML context is a
        tuple of XML tag names, indicating which tags have not yet
        been closed."""

        # The stream encoding is sniffed from the file itself (BOM or
        # XML declaration), not passed in by the caller.
        encoding = self._detect_encoding(fileid)
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)

    def _detect_encoding(self, fileid):
        # Sniff the first line of the file for a byte-order mark or an
        # XML encoding declaration.
        if isinstance(fileid, PathPointer):
            try:
                infile = fileid.open()
                s = infile.readline()
            finally:
                infile.close()
        else:
            with open(fileid, "rb") as infile:
                s = infile.readline()
        # BOM checks must come first: a BOM'd file may also carry an
        # (ASCII-incompatible) declaration.
        if s.startswith(codecs.BOM_UTF16_BE):
            return "utf-16-be"
        if s.startswith(codecs.BOM_UTF16_LE):
            return "utf-16-le"
        if s.startswith(codecs.BOM_UTF32_BE):
            return "utf-32-be"
        if s.startswith(codecs.BOM_UTF32_LE):
            return "utf-32-le"
        if s.startswith(codecs.BOM_UTF8):
            return "utf-8"
        # encoding="..." or encoding='...' in the <?xml ...?> declaration.
        m = re.match(rb'\s*<\?xml\b.*\bencoding="([^"]+)"', s)
        if m:
            return m.group(1).decode()
        m = re.match(rb"\s*<\?xml\b.*\bencoding='([^']+)'", s)
        if m:
            return m.group(1).decode()
        # No encoding found -- what should the default be?
        return "utf-8"

    def handle_elt(self, elt, context):
        """
        Convert an element into an appropriate value for inclusion in
        the view.  Unless overridden by a subclass or by the
        ``elt_handler`` constructor argument, this method simply
        returns ``elt``.

        :return: The view value corresponding to ``elt``.

        :type elt: ElementTree
        :param elt: The element that should be converted.

        :type context: str
        :param context: A string composed of element tags separated by
            forward slashes, indicating the XML context of the given
            element.  For example, the string ``'foo/bar/baz'``
            indicates that the element is a ``baz`` element whose
            parent is a ``bar`` element and whose grandparent is a
            top-level ``foo`` element.
        """
        return elt

    #: A regular expression that matches XML fragments that do not
    #: contain any un-closed tags.
    #: NOTE(review): the ``[CDATA[`` brackets below are unescaped, so
    #: they act as regex character classes rather than literal text;
    #: this matches the long-standing upstream pattern -- confirm
    #: before "fixing".
    _VALID_XML_RE = re.compile(
        r"""
        [^<]*
        (
          ((<!--.*?-->)                         |  # comment
           (<![CDATA[.*?]])                     |  # raw character data
           (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) |  # doctype decl
           (<[^!>][^>]*>))                         # tag or PI
          [^<]*)*
        \Z""",
        re.DOTALL | re.VERBOSE,
    )

    #: A regular expression used to extract the tag name from a start tag,
    #: end tag, or empty-elt tag string.
    _XML_TAG_NAME = re.compile(r"<\s*(?:/\s*)?([^\s>]+)")

    #: A regular expression used to find all start-tags, end-tags, and
    #: empty-elt tags in an XML file.  This regexp is more lenient than
    #: the XML spec -- e.g., it allows spaces in some places where the
    #: spec does not.
    _XML_PIECE = re.compile(
        r"""
        # Include these so we can skip them:
        (?P<COMMENT>        <!--.*?-->                          )|
        (?P<CDATA>          <![CDATA[.*?]]>                     )|
        (?P<PI>             <\?.*?\?>                           )|
        (?P<DOCTYPE>        <!DOCTYPE\s+[^\[^>]*(\[[^\]]*])?\s*>)|
        # These are the ones we actually care about:
        (?P<EMPTY_ELT_TAG>  <\s*[^>/\?!\s][^>]*/\s*>            )|
        (?P<START_TAG>      <\s*[^>/\?!\s][^>]*>                )|
        (?P<END_TAG>        <\s*/[^>/\?!\s][^>]*>               )""",
        re.DOTALL | re.VERBOSE,
    )

    def _read_xml_fragment(self, stream):
        """
        Read a string from the given stream that does not contain any
        un-closed tags.  In particular, this function first reads a
        block from the stream of size ``self._BLOCK_SIZE``.  It then
        checks if that block contains an un-closed tag.  If it does,
        then this function either backtracks to the last '<', or reads
        another block.
        """
        fragment = ""

        # Remember the start position so we can seek back character-wise
        # (SeekableUnicodeStreamReader positions are byte-based).
        if isinstance(stream, SeekableUnicodeStreamReader):
            startpos = stream.tell()
        while True:
            # Read a block and add it to the fragment.
            xml_block = stream.read(self._BLOCK_SIZE)
            fragment += xml_block

            # Do we have a well-formed xml fragment?
            if self._VALID_XML_RE.match(fragment):
                return fragment

            # Do we have a fragment that will never be well-formed?
            # (A '>' appearing before any '<' cannot be valid here.)
            if re.search("[<>]", fragment).group(0) == ">":
                pos = stream.tell() - (
                    len(fragment) - re.search("[<>]", fragment).end()
                )
                raise ValueError('Unexpected ">" near char %s' % pos)

            # End of file?
            if not xml_block:
                raise ValueError("Unexpected end of file: tag not closed")

            # If not, then we must be in the middle of a <..tag..>.
            # If appropriate, backtrack to the most recent '<'
            # character.
            last_open_bracket = fragment.rfind("<")
            if last_open_bracket > 0:
                if self._VALID_XML_RE.match(fragment[:last_open_bracket]):
                    if isinstance(stream, SeekableUnicodeStreamReader):
                        stream.seek(startpos)
                        stream.char_seek_forward(last_open_bracket)
                    else:
                        stream.seek(-(len(fragment) - last_open_bracket), 1)
                    return fragment[:last_open_bracket]

            # Otherwise, read another block. (i.e., return to the
            # top of the loop.)

    def read_block(self, stream, tagspec=None, elt_handler=None):
        """
        Read from ``stream`` until we find at least one element that
        matches ``tagspec``, and return the result of applying
        ``elt_handler`` to each element found.
        """
        if tagspec is None:
            tagspec = self._tagspec
        if elt_handler is None:
            elt_handler = self.handle_elt

        # Use a stack of strings to keep track of our context:
        context = list(self._tag_context.get(stream.tell()))
        assert context is not None  # check this -- could it ever happen?

        elts = []

        elt_start = None  # where does the elt start
        elt_depth = None  # what context depth
        elt_text = ""

        # Keep reading fragments until at least one complete element has
        # been collected (and any partially-read element is finished).
        while elts == [] or elt_start is not None:
            if isinstance(stream, SeekableUnicodeStreamReader):
                startpos = stream.tell()
            xml_fragment = self._read_xml_fragment(stream)

            # End of file.
            if not xml_fragment:
                if elt_start is None:
                    break
                else:
                    raise ValueError("Unexpected end of file")

            # Process each <tag> in the xml fragment.
            for piece in self._XML_PIECE.finditer(xml_fragment):
                if self._DEBUG:
                    print("{:>25} {}".format("/".join(context)[-20:], piece.group()))

                if piece.group("START_TAG"):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    # Keep context up-to-date.
                    context.append(name)
                    # Is this one of the elts we're looking for?
                    if elt_start is None:
                        if re.match(tagspec, "/".join(context)):
                            elt_start = piece.start()
                            elt_depth = len(context)

                elif piece.group("END_TAG"):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    # sanity checks:
                    if not context:
                        raise ValueError("Unmatched tag </%s>" % name)
                    if name != context[-1]:
                        raise ValueError(f"Unmatched tag <{context[-1]}>...</{name}>")
                    # Is this the end of an element?
                    if elt_start is not None and elt_depth == len(context):
                        elt_text += xml_fragment[elt_start : piece.end()]
                        elts.append((elt_text, "/".join(context)))
                        elt_start = elt_depth = None
                        elt_text = ""
                    # Keep context up-to-date
                    context.pop()

                elif piece.group("EMPTY_ELT_TAG"):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    # An empty element matches (and completes) in one step.
                    if elt_start is None:
                        if re.match(tagspec, "/".join(context) + "/" + name):
                            elts.append((piece.group(), "/".join(context) + "/" + name))

            if elt_start is not None:
                # If we haven't found any elements yet, then keep
                # looping until we do.
                if elts == []:
                    elt_text += xml_fragment[elt_start:]
                    elt_start = 0

                # If we've found at least one element, then try
                # backtracking to the start of the element that we're
                # inside of.
                else:
                    # take back the last start-tag, and return what
                    # we've gotten so far (elts is non-empty).
                    if self._DEBUG:
                        print(" " * 36 + "(backtrack)")
                    if isinstance(stream, SeekableUnicodeStreamReader):
                        stream.seek(startpos)
                        stream.char_seek_forward(elt_start)
                    else:
                        stream.seek(-(len(xml_fragment) - elt_start), 1)
                    context = context[: elt_depth - 1]
                    elt_start = elt_depth = None
                    elt_text = ""

        # Update the _tag_context dict.
        pos = stream.tell()
        if pos in self._tag_context:
            assert tuple(context) == self._tag_context[pos]
        else:
            self._tag_context[pos] = tuple(context)

        # Non-ASCII characters are escaped as character references so the
        # collected text can be parsed as bytes by ElementTree.
        return [
            elt_handler(
                ElementTree.fromstring(elt.encode("ascii", "xmlcharrefreplace")),
                context,
            )
            for (elt, context) in elts
        ]

View File

@@ -0,0 +1,256 @@
# Natural Language Toolkit: York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE)
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Selina Dennis <selina@tranzfusion.net>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
English Prose (YCOE), a 1.5 million word syntactically-annotated
corpus of Old English prose texts. The corpus is distributed by the
Oxford Text Archive: http://www.ota.ahds.ac.uk/ It is not included
with NLTK.
The YCOE corpus is divided into 100 files, each representing
an Old English prose text. Tags used within each text complies
to the YCOE standard: https://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm
"""
import os
import re
from nltk.corpus.reader.api import *
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
from nltk.corpus.reader.tagged import TaggedCorpusReader
from nltk.corpus.reader.util import *
from nltk.tokenize import RegexpTokenizer
class YCOECorpusReader(CorpusReader):
    """
    Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
    English Prose (YCOE), a 1.5 million word syntactically-annotated
    corpus of Old English prose texts.

    Each document exists in two forms: a parsed version under ``psd/``
    and a POS-tagged version under ``pos/``.  This reader keeps the two
    in sync and delegates to a specialized sub-reader for each.
    """

    def __init__(self, root, encoding="utf8"):
        CorpusReader.__init__(self, root, [], encoding)

        # NOTE(review): the third positional arguments (".psd" / ".pos")
        # fall onto later parameters of the sub-reader constructors;
        # this mirrors the long-standing upstream call and is left as-is.
        self._psd_reader = YCOEParseCorpusReader(
            self.root.join("psd"), ".*", ".psd", encoding=encoding
        )
        self._pos_reader = YCOETaggedCorpusReader(self.root.join("pos"), ".*", ".pos")

        # Make sure we have a consistent set of items:
        documents = {f[:-4] for f in self._psd_reader.fileids()}
        if {f[:-4] for f in self._pos_reader.fileids()} != documents:
            raise ValueError('Items in "psd" and "pos" ' "subdirectories do not match.")

        fileids = sorted(
            ["%s.psd" % doc for doc in documents]
            + ["%s.pos" % doc for doc in documents]
        )
        # Re-initialize with the complete fileid list now that it is known.
        CorpusReader.__init__(self, root, fileids, encoding)
        self._documents = sorted(documents)

    def documents(self, fileids=None):
        """
        Return a list of document identifiers for all documents in
        this corpus, or for the documents with the given file(s) if
        specified.

        :raises KeyError: if a given fileid is not part of this corpus.
        """
        if fileids is None:
            return self._documents
        if isinstance(fileids, str):
            fileids = [fileids]
        for f in fileids:
            if f not in self._fileids:
                # Bug fix: report the offending fileid ``f``, not the
                # whole ``fileids`` list.
                raise KeyError("File id %s not found" % f)
        # Strip off the '.pos' and '.psd' extensions.
        return sorted({f[:-4] for f in fileids})

    def fileids(self, documents=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that store the given document(s) if specified.
        """
        if documents is None:
            return self._fileids
        elif isinstance(documents, str):
            documents = [documents]
        # Each document contributes both its .pos and .psd file.
        return sorted(
            set(
                ["%s.pos" % doc for doc in documents]
                + ["%s.psd" % doc for doc in documents]
            )
        )

    def _getfileids(self, documents, subcorpus):
        """
        Helper that selects the appropriate fileids for a given set of
        documents from a given subcorpus (pos or psd).
        """
        if documents is None:
            documents = self._documents
        else:
            if isinstance(documents, str):
                documents = [documents]
            for document in documents:
                if document not in self._documents:
                    # Give a more helpful message when a fileid was
                    # passed where a document id was expected.
                    if document[-4:] in (".pos", ".psd"):
                        raise ValueError(
                            "Expected a document identifier, not a file "
                            "identifier. (Use corpus.documents() to get "
                            "a list of document identifiers."
                        )
                    else:
                        raise ValueError("Document identifier %s not found" % document)
        return [f"{d}.{subcorpus}" for d in documents]

    # Delegate to one of our two sub-readers:
    def words(self, documents=None):
        return self._pos_reader.words(self._getfileids(documents, "pos"))

    def sents(self, documents=None):
        return self._pos_reader.sents(self._getfileids(documents, "pos"))

    def paras(self, documents=None):
        return self._pos_reader.paras(self._getfileids(documents, "pos"))

    def tagged_words(self, documents=None):
        return self._pos_reader.tagged_words(self._getfileids(documents, "pos"))

    def tagged_sents(self, documents=None):
        return self._pos_reader.tagged_sents(self._getfileids(documents, "pos"))

    def tagged_paras(self, documents=None):
        return self._pos_reader.tagged_paras(self._getfileids(documents, "pos"))

    def parsed_sents(self, documents=None):
        return self._psd_reader.parsed_sents(self._getfileids(documents, "psd"))
class YCOEParseCorpusReader(BracketParseCorpusReader):
    """Specialized version of the standard bracket parse corpus reader
    that strips out (CODE ...) and (ID ...) nodes."""

    def _parse(self, t):
        # Remove (CODE ...) and (ID ...) annotation nodes before parsing.
        cleaned = re.sub(r"(?u)\((CODE|ID)[^\)]*\)", "", t)
        # A tree that is empty after stripping yields no parse.
        if re.match(r"\s*\(\s*\)\s*$", cleaned):
            return None
        return BracketParseCorpusReader._parse(self, cleaned)
class YCOETaggedCorpusReader(TaggedCorpusReader):
    # Tagged reader for the YCOE 'pos' files: words and tags are joined
    # with '_'; the sentence tokenizer splits after '/.' tokens and its
    # gap pattern also removes *_CODE and *_ID tokens.
    def __init__(self, root, items, encoding="utf8"):
        # NOTE(review): ``encoding`` is accepted but never forwarded to
        # TaggedCorpusReader.  Forwarding it looks tempting, but
        # YCOECorpusReader passes ".pos" as this positional argument, so
        # forwarding would install a bogus encoding -- confirm the
        # intended signature upstream before changing this.
        gaps_re = r"(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*"
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        TaggedCorpusReader.__init__(
            self, root, items, sep="_", sent_tokenizer=sent_tokenizer
        )
#: A dict mapping each YCOE document identifier to its human-readable
#: title.  (Identifiers match the corpus file basenames, without the
#: .pos/.psd extension.)
documents = {
    "coadrian.o34": "Adrian and Ritheus",
    "coaelhom.o3": "Ælfric, Supplemental Homilies",
    "coaelive.o3": "Ælfric's Lives of Saints",
    "coalcuin": "Alcuin De virtutibus et vitiis",
    "coalex.o23": "Alexander's Letter to Aristotle",
    "coapollo.o3": "Apollonius of Tyre",
    "coaugust": "Augustine",
    "cobede.o2": "Bede's History of the English Church",
    "cobenrul.o3": "Benedictine Rule",
    "coblick.o23": "Blickling Homilies",
    "coboeth.o2": "Boethius' Consolation of Philosophy",
    "cobyrhtf.o3": "Byrhtferth's Manual",
    "cocanedgD": "Canons of Edgar (D)",
    "cocanedgX": "Canons of Edgar (X)",
    "cocathom1.o3": "Ælfric's Catholic Homilies I",
    "cocathom2.o3": "Ælfric's Catholic Homilies II",
    "cochad.o24": "Saint Chad",
    "cochdrul": "Chrodegang of Metz, Rule",
    "cochristoph": "Saint Christopher",
    "cochronA.o23": "Anglo-Saxon Chronicle A",
    "cochronC": "Anglo-Saxon Chronicle C",
    "cochronD": "Anglo-Saxon Chronicle D",
    "cochronE.o34": "Anglo-Saxon Chronicle E",
    "cocura.o2": "Cura Pastoralis",
    "cocuraC": "Cura Pastoralis (Cotton)",
    "codicts.o34": "Dicts of Cato",
    "codocu1.o1": "Documents 1 (O1)",
    "codocu2.o12": "Documents 2 (O1/O2)",
    "codocu2.o2": "Documents 2 (O2)",
    "codocu3.o23": "Documents 3 (O2/O3)",
    "codocu3.o3": "Documents 3 (O3)",
    "codocu4.o24": "Documents 4 (O2/O4)",
    "coeluc1": "Honorius of Autun, Elucidarium 1",
    "coeluc2": "Honorius of Autun, Elucidarium 1",
    "coepigen.o3": "Ælfric's Epilogue to Genesis",
    "coeuphr": "Saint Euphrosyne",
    "coeust": "Saint Eustace and his companions",
    "coexodusP": "Exodus (P)",
    "cogenesiC": "Genesis (C)",
    "cogregdC.o24": "Gregory's Dialogues (C)",
    "cogregdH.o23": "Gregory's Dialogues (H)",
    "coherbar": "Pseudo-Apuleius, Herbarium",
    "coinspolD.o34": "Wulfstan's Institute of Polity (D)",
    "coinspolX": "Wulfstan's Institute of Polity (X)",
    "cojames": "Saint James",
    "colacnu.o23": "Lacnunga",
    "colaece.o2": "Leechdoms",
    "colaw1cn.o3": "Laws, Cnut I",
    "colaw2cn.o3": "Laws, Cnut II",
    "colaw5atr.o3": "Laws, Æthelred V",
    "colaw6atr.o3": "Laws, Æthelred VI",
    "colawaf.o2": "Laws, Alfred",
    "colawafint.o2": "Alfred's Introduction to Laws",
    "colawger.o34": "Laws, Gerefa",
    "colawine.ox2": "Laws, Ine",
    "colawnorthu.o3": "Northumbra Preosta Lagu",
    "colawwllad.o4": "Laws, William I, Lad",
    "coleofri.o4": "Leofric",
    "colsigef.o3": "Ælfric's Letter to Sigefyrth",
    "colsigewB": "Ælfric's Letter to Sigeweard (B)",
    "colsigewZ.o34": "Ælfric's Letter to Sigeweard (Z)",
    "colwgeat": "Ælfric's Letter to Wulfgeat",
    "colwsigeT": "Ælfric's Letter to Wulfsige (T)",
    "colwsigeXa.o34": "Ælfric's Letter to Wulfsige (Xa)",
    "colwstan1.o3": "Ælfric's Letter to Wulfstan I",
    "colwstan2.o3": "Ælfric's Letter to Wulfstan II",
    "comargaC.o34": "Saint Margaret (C)",
    "comargaT": "Saint Margaret (T)",
    "comart1": "Martyrology, I",
    "comart2": "Martyrology, II",
    "comart3.o23": "Martyrology, III",
    "comarvel.o23": "Marvels of the East",
    "comary": "Mary of Egypt",
    "coneot": "Saint Neot",
    "conicodA": "Gospel of Nicodemus (A)",
    "conicodC": "Gospel of Nicodemus (C)",
    "conicodD": "Gospel of Nicodemus (D)",
    "conicodE": "Gospel of Nicodemus (E)",
    "coorosiu.o2": "Orosius",
    "cootest.o3": "Heptateuch",
    "coprefcath1.o3": "Ælfric's Preface to Catholic Homilies I",
    "coprefcath2.o3": "Ælfric's Preface to Catholic Homilies II",
    "coprefcura.o2": "Preface to the Cura Pastoralis",
    "coprefgen.o3": "Ælfric's Preface to Genesis",
    "copreflives.o3": "Ælfric's Preface to Lives of Saints",
    "coprefsolilo": "Preface to Augustine's Soliloquies",
    "coquadru.o23": "Pseudo-Apuleius, Medicina de quadrupedibus",
    "corood": "History of the Holy Rood-Tree",
    "cosevensl": "Seven Sleepers",
    "cosolilo": "St. Augustine's Soliloquies",
    "cosolsat1.o4": "Solomon and Saturn I",
    "cosolsat2": "Solomon and Saturn II",
    "cotempo.o3": "Ælfric's De Temporibus Anni",
    "coverhom": "Vercelli Homilies",
    "coverhomE": "Vercelli Homilies (E)",
    "coverhomL": "Vercelli Homilies (L)",
    "covinceB": "Saint Vincent (Bodley 343)",
    "covinsal": "Vindicta Salvatoris",
    "cowsgosp.o3": "West-Saxon Gospels",
    "cowulf.o34": "Wulfstan's Homilies",
}

View File

@@ -0,0 +1,153 @@
# Natural Language Toolkit: Corpus Reader Utility Functions
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
######################################################################
# { Lazy Corpus Loader
######################################################################
import gc
import re
import nltk
TRY_ZIPFILE_FIRST = False
class LazyCorpusLoader:
    """
    To see the API documentation for this lazily loaded corpus, first
    run corpus.ensure_loaded(), and then run help(this_corpus).

    LazyCorpusLoader is a proxy object which is used to stand in for a
    corpus object before the corpus is loaded.  This allows NLTK to
    create an object for each corpus, but defer the costs associated
    with loading those corpora until the first time that they're
    actually accessed.

    The first time this object is accessed in any way, it will load
    the corresponding corpus, and transform itself into that corpus
    (by modifying its own ``__class__`` and ``__dict__`` attributes).

    If the corpus can not be found, then accessing this object will
    raise an exception, displaying installation instructions for the
    NLTK data package.  Once they've properly installed the data
    package (or modified ``nltk.data.path`` to point to its location),
    they can then use the corpus object without restarting python.

    :param name: The name of the corpus
    :type name: str
    :param reader_cls: The specific CorpusReader class, e.g. PlaintextCorpusReader, WordListCorpusReader
    :type reader: nltk.corpus.reader.api.CorpusReader
    :param nltk_data_subdir: The subdirectory where the corpus is stored.
    :type nltk_data_subdir: str
    :param `*args`: Any other non-keywords arguments that `reader_cls` might need.
    :param `**kwargs`: Any other keywords arguments that `reader_cls` might need.
    """

    def __init__(self, name, reader_cls, *args, **kwargs):
        # Imported locally rather than at module level, presumably to
        # avoid a circular import with nltk.corpus.reader -- TODO confirm.
        from nltk.corpus.reader.api import CorpusReader

        assert issubclass(reader_cls, CorpusReader)
        # ``__name__`` is set alongside the mangled attribute so the proxy
        # carries a name for introspection before the real corpus replaces it.
        self.__name = self.__name__ = name
        self.__reader_cls = reader_cls
        # If nltk_data_subdir is set explicitly
        if "nltk_data_subdir" in kwargs:
            # Use the specified subdirectory path
            self.subdir = kwargs["nltk_data_subdir"]
            # Pops the `nltk_data_subdir` argument, we don't need it anymore.
            kwargs.pop("nltk_data_subdir", None)
        else:  # Otherwise use 'nltk_data/corpora'
            self.subdir = "corpora"
        # Constructor arguments are saved so __load() can build the reader
        # lazily, and so _unload() can later rebuild an equivalent loader.
        self.__args = args
        self.__kwargs = kwargs

    def __load(self):
        """Locate the corpus data, construct the reader, and transform this
        proxy into the real corpus object (attaching an ``_unload`` hook)."""
        # Find the corpus root directory.  ``zip_name`` rewrites a corpus
        # path such as "name/sub" into "name.zip/name/sub/" so the same
        # corpus can be located inside its zip archive.
        zip_name = re.sub(r"(([^/]+)(/.*)?)", r"\2.zip/\1/", self.__name)
        if TRY_ZIPFILE_FIRST:
            try:
                root = nltk.data.find(f"{self.subdir}/{zip_name}")
            except LookupError as e:
                try:
                    root = nltk.data.find(f"{self.subdir}/{self.__name}")
                except LookupError:
                    # Re-raise the error from the preferred (zip) lookup so
                    # the user-facing message names the expected location.
                    raise e
        else:
            try:
                root = nltk.data.find(f"{self.subdir}/{self.__name}")
            except LookupError as e:
                try:
                    root = nltk.data.find(f"{self.subdir}/{zip_name}")
                except LookupError:
                    # Re-raise the error from the preferred (directory)
                    # lookup, not the zip fallback.
                    raise e

        # Load the corpus.
        corpus = self.__reader_cls(root, *self.__args, **self.__kwargs)

        # This is where the magic happens!  Transform ourselves into
        # the corpus by modifying our own __dict__ and __class__ to
        # match that of the corpus.

        # Capture the loader's state in locals *before* __dict__ is
        # replaced below -- after that point the name-mangled attributes
        # no longer exist on ``self``.
        args, kwargs = self.__args, self.__kwargs
        name, reader_cls = self.__name, self.__reader_cls

        self.__dict__ = corpus.__dict__
        self.__class__ = corpus.__class__

        # _unload support: assign __dict__ and __class__ back, then do GC.
        # after reassigning __dict__ there shouldn't be any references to
        # corpus data so the memory should be deallocated after gc.collect()
        def _unload(self):
            lazy_reader = LazyCorpusLoader(name, reader_cls, *args, **kwargs)
            self.__dict__ = lazy_reader.__dict__
            self.__class__ = lazy_reader.__class__
            gc.collect()

        self._unload = _make_bound_method(_unload, self)

    def __getattr__(self, attr):
        """Trigger corpus loading on first attribute access, then delegate
        the lookup to the (now loaded) corpus object."""
        # Fix for inspect.isclass under Python 2.6
        # (see https://bugs.python.org/issue1225107).
        # Without this fix tests may take extra 1.5GB RAM
        # because all corpora gets loaded during test collection.
        if attr == "__bases__":
            raise AttributeError("LazyCorpusLoader object has no attribute '__bases__'")

        self.__load()
        # This looks circular, but its not, since __load() changes our
        # __class__ to something new:
        return getattr(self, attr)

    def __repr__(self):
        # Deliberately does NOT touch __getattr__-guarded state beyond the
        # mangled attributes, so printing the loader never loads the corpus.
        return "<{} in {!r} (not loaded yet)>".format(
            self.__reader_cls.__name__,
            ".../corpora/" + self.__name,
        )

    def _unload(self):
        # If an exception occurs during corpus loading then
        # '_unload' method may be unattached, so __getattr__ can be called;
        # we shouldn't trigger corpus loading again in this case.
        pass
def _make_bound_method(func, self):
"""
Magic for creating bound methods (used for _unload).
"""
class Foo:
def meth(self):
pass
f = Foo()
bound_method = type(f.meth)
try:
return bound_method(func, self, self.__class__)
except TypeError: # python3
return bound_method(func, self)