Initial commit

This commit is contained in:
2026-02-01 09:31:38 +01:00
commit e02db93960
4396 changed files with 1511612 additions and 0 deletions

View File

@@ -0,0 +1 @@
3.9.2

View File

@@ -0,0 +1,208 @@
# Natural Language Toolkit (NLTK)
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
The Natural Language Toolkit (NLTK) is an open source Python library
for Natural Language Processing. A free online book is available.
(If you use the library for academic research, please cite the book.)
Steven Bird, Ewan Klein, and Edward Loper (2009).
Natural Language Processing with Python. O'Reilly Media Inc.
https://www.nltk.org/book/
isort:skip_file
"""
import os
import importlib
# //////////////////////////////////////////////////////
# Metadata
# //////////////////////////////////////////////////////
# Version. For each new release, the version number should be updated
# in the file VERSION.
try:
    # If a VERSION file exists, use it!
    version_file = os.path.join(os.path.dirname(__file__), "VERSION")
    with open(version_file) as infile:
        __version__ = infile.read().strip()
except NameError:
    # __file__ is undefined when this code is executed interactively
    # (e.g. pasted into a REPL), so there is no package directory to search.
    __version__ = "unknown (running code interactively?)"
except OSError as ex:
    # VERSION file missing or unreadable; record the reason in the string.
    __version__ = "unknown (%s)" % ex
if __doc__ is not None:  # fix for the ``python -OO``
    # Under ``python -OO`` docstrings are stripped and __doc__ is None,
    # so only append the version when a docstring actually exists.
    __doc__ += "\n@version: " + __version__
# Copyright notice
__copyright__ = """\
Copyright (C) 2001-2025 NLTK Project.
Distributed and Licensed under the Apache License, Version 2.0,
which is included by reference.
"""
__license__ = "Apache License, Version 2.0"

# Description of the toolkit, keywords, and the project's primary URL.
# These values are consumed by packaging tooling (see setup configuration).
__longdescr__ = """\
The Natural Language Toolkit (NLTK) is a Python package for
natural language processing. NLTK requires Python 3.9, 3.10, 3.11, 3.12 or 3.13."""
__keywords__ = [
    "NLP",
    "CL",
    "natural language processing",
    "computational linguistics",
    "parsing",
    "tagging",
    "tokenizing",
    "syntax",
    "linguistics",
    "language",
    "natural language",
    "text analytics",
]
__url__ = "https://www.nltk.org/"

# Maintainer, contributors, etc.
__maintainer__ = "NLTK Team"
__maintainer_email__ = "nltk.team@gmail.com"
# Author metadata mirrors the maintainer metadata.
__author__ = __maintainer__
__author_email__ = __maintainer_email__

# "Trove" classifiers for Python Package Index.
__classifiers__ = [
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Intended Audience :: Education",
    "Intended Audience :: Information Technology",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Topic :: Scientific/Engineering",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Scientific/Engineering :: Human Machine Interfaces",
    "Topic :: Scientific/Engineering :: Information Analysis",
    "Topic :: Text Processing",
    "Topic :: Text Processing :: Filters",
    "Topic :: Text Processing :: General",
    "Topic :: Text Processing :: Indexing",
    "Topic :: Text Processing :: Linguistic",
]
from nltk.internals import config_java
# support numpy from pypy
try:
    import numpypy
except ImportError:
    pass

# Override missing methods on environments where it cannot be used like GAE.
import subprocess

if not hasattr(subprocess, "PIPE"):

    def _fake_PIPE(*args, **kwargs):
        """Stand-in that fails loudly when ``subprocess.PIPE`` is unsupported."""
        raise NotImplementedError("subprocess.PIPE is not supported.")

    subprocess.PIPE = _fake_PIPE

if not hasattr(subprocess, "Popen"):

    def _fake_Popen(*args, **kwargs):
        """Stand-in that fails loudly when ``subprocess.Popen`` is unsupported."""
        raise NotImplementedError("subprocess.Popen is not supported.")

    subprocess.Popen = _fake_Popen
###########################################################
# TOP-LEVEL MODULES
###########################################################
# Import top-level functionality into top-level namespace
from nltk.collocations import *
from nltk.decorators import decorator, memoize
from nltk.featstruct import *
from nltk.grammar import *
from nltk.probability import *
from nltk.text import *
from nltk.util import *
from nltk.jsontags import *
###########################################################
# PACKAGES
###########################################################
from nltk.chunk import *
from nltk.classify import *
from nltk.inference import *
from nltk.metrics import *
from nltk.parse import *
from nltk.tag import *
from nltk.tokenize import *
from nltk.translate import *
from nltk.tree import *
from nltk.sem import *
from nltk.stem import *
# Packages which can be lazily imported
# (a) we don't import *
# (b) they're slow to import or have run-time dependencies
#     that can safely fail at run time
from nltk import lazyimport

# Placeholders that load the real subpackage on first attribute access.
app = lazyimport.LazyModule("app", locals(), globals())
chat = lazyimport.LazyModule("chat", locals(), globals())
corpus = lazyimport.LazyModule("corpus", locals(), globals())
draw = lazyimport.LazyModule("draw", locals(), globals())
toolbox = lazyimport.LazyModule("toolbox", locals(), globals())

# Optional loading
try:
    import numpy
except ImportError:
    # numpy is an optional dependency; numpy-based subpackages are simply
    # not exposed when it is absent.
    pass
else:
    from nltk import cluster
from nltk.downloader import download, download_shell

# Check if tkinter exists without importing it to avoid crashes after
# forks on macOS. Only nltk.app, nltk.draw, and demo modules should
# have top-level tkinter imports. See #2949 for more details.
# NOTE(review): this relies on ``importlib.util`` being reachable after a
# bare ``import importlib`` (L33); an explicit ``import importlib.util``
# would be more robust — confirm against supported Python versions.
if importlib.util.find_spec("tkinter"):
    try:
        from nltk.downloader import download_gui
    except RuntimeError as e:
        import warnings

        warnings.warn(
            "Corpus downloader GUI not loaded "
            "(RuntimeError during import: %s)" % str(e)
        )

# explicitly import all top-level modules (ensuring
# they override the same names inadvertently imported
# from a subpackage)
from nltk import ccg, chunk, classify, collocations
from nltk import data, featstruct, grammar, help, inference, metrics
from nltk import misc, parse, probability, sem, stem, wsd
from nltk import tag, tbl, text, tokenize, translate, tree, util
# FIXME: override any accidentally imported demo, see https://github.com/nltk/nltk/issues/2116
def demo():
    """Explain how to invoke a submodule's demo (there is no top-level demo)."""
    print("To run the demo code for a module, type nltk.module.demo()")

View File

@@ -0,0 +1,47 @@
# Natural Language Toolkit: Applications package
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Interactive NLTK Applications:
chartparser: Chart Parser
chunkparser: Regular-Expression Chunk Parser
collocations: Find collocations in text
concordance: Part-of-speech concordancer
nemo: Finding (and Replacing) Nemo regular expression tool
rdparser: Recursive Descent Parser
srparser: Shift-Reduce Parser
wordnet: WordNet Browser
"""
# Import Tkinter-based modules if Tkinter is installed
try:
    import tkinter
except ImportError:
    import warnings

    # The GUI apps cannot run without Tk; warn rather than break the import.
    warnings.warn("nltk.app package not loaded (please install Tkinter library).")
else:
    # Each submodule exposes its entry point as ``app``; re-export them
    # under friendlier names (nltk.app.chartparser, nltk.app.chunkparser, ...).
    from nltk.app.chartparser_app import app as chartparser
    from nltk.app.chunkparser_app import app as chunkparser
    from nltk.app.collocations_app import app as collocations
    from nltk.app.concordance_app import app as concordance
    from nltk.app.nemo_app import app as nemo
    from nltk.app.rdparser_app import app as rdparser
    from nltk.app.srparser_app import app as srparser
    from nltk.app.wordnet_app import app as wordnet

# The word-frequency app additionally needs matplotlib; it is optional.
try:
    from matplotlib import pylab
except ImportError:
    import warnings

    warnings.warn("nltk.app.wordfreq not loaded (requires the matplotlib library).")
else:
    from nltk.app.wordfreq_app import app as wordfreq

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,438 @@
# Natural Language Toolkit: Collocations Application
# Much of the GUI code is imported from concordance.py; We intend to merge these tools together
# Copyright (C) 2001-2025 NLTK Project
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#
import queue as q
import threading
from tkinter import (
END,
LEFT,
SUNKEN,
Button,
Frame,
IntVar,
Label,
Menu,
OptionMenu,
Scrollbar,
StringVar,
Text,
Tk,
)
from tkinter.font import Font
from nltk.corpus import (
alpino,
brown,
cess_cat,
cess_esp,
floresta,
indian,
mac_morpho,
machado,
nps_chat,
sinica_treebank,
treebank,
)
from nltk.probability import FreqDist
from nltk.util import in_idle
# Virtual-event names used to hand results from the loader thread to the GUI.
CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
# How often (in milliseconds) the view polls its event queue.
POLL_INTERVAL = 100

# NB All corpora are wrapped in lambda expressions so that none of them is
# actually loaded when this module is imported.
_DEFAULT = "English: Brown Corpus (Humor)"
_CORPORA = {
    "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(
        categories=["news", "editorial", "reviews"]
    ),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(
        categories="science_fiction"
    ),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}
class CollocationsView:
    """Tk window that lists bigram collocations for a selectable corpus.

    The corpus is loaded and scored on a background thread owned by
    :class:`CollocationsModel`; completion/error events arrive on a queue
    that this view polls every ``POLL_INTERVAL`` ms, so all widget updates
    happen in the Tk mainloop thread.
    """

    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        self.queue = q.Queue()  # events posted by the model's worker thread
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        self._init_top(self.top)
        self._init_menubar()
        self._init_widgets(self.top)
        self.load_corpus(self.model.DEFAULT_CORPUS)
        # Keep the after-id so destroy() can cancel the pending poll.
        self.after = self.top.after(POLL_INTERVAL, self._poll)

    def _init_top(self, top):
        """Configure the toplevel window: size, title, close handlers."""
        top.geometry("550x650+50+50")
        top.title("NLTK Collocations List")
        top.bind("<Control-q>", self.destroy)
        top.protocol("WM_DELETE_WINDOW", self.destroy)
        top.minsize(550, 650)

    def _init_widgets(self, parent):
        """Build the main frame and its child widget groups, top to bottom."""
        self.main_frame = Frame(
            parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1)
        )
        self._init_corpus_select(self.main_frame)
        self._init_results_box(self.main_frame)
        self._init_paging(self.main_frame)
        self._init_status(self.main_frame)
        self.main_frame.pack(fill="both", expand=True)

    def _init_corpus_select(self, parent):
        """Build the corpus drop-down, with the default corpus preselected."""
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        self.var = StringVar(innerframe)
        self.var.set(self.model.DEFAULT_CORPUS)
        Label(
            innerframe,
            justify=LEFT,
            text=" Corpus: ",
            background=self._BACKGROUND_COLOUR,
            padx=2,
            pady=1,
            border=0,
        ).pack(side="left")
        # (A previous revision computed ``list(...).remove(...)`` here, which
        # always yields None and was unused; the non-default choices come from
        # model.non_default_corpora() below.)
        om = OptionMenu(
            innerframe,
            self.var,
            self.model.DEFAULT_CORPUS,
            command=self.corpus_selected,
            *self.model.non_default_corpora()
        )
        om["borderwidth"] = 0
        om["highlightthickness"] = 1
        om.pack(side="left")
        innerframe.pack(side="top", fill="x", anchor="n")

    def _init_status(self, parent):
        """Build the status label used for load/error progress messages."""
        self.status = Label(
            parent,
            justify=LEFT,
            relief=SUNKEN,
            background=self._BACKGROUND_COLOUR,
            border=0,
            padx=1,
            pady=0,
        )
        self.status.pack(side="top", anchor="sw")

    def _init_menubar(self):
        """Build the File/Edit menus; Edit lets the user pick the page size."""
        self._result_size = IntVar(self.top)
        menubar = Menu(self.top)
        filemenu = Menu(menubar, tearoff=0, borderwidth=0)
        filemenu.add_command(
            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
        )
        menubar.add_cascade(label="File", underline=0, menu=filemenu)
        editmenu = Menu(menubar, tearoff=0)
        rescntmenu = Menu(editmenu, tearoff=0)
        rescntmenu.add_radiobutton(
            label="20",
            variable=self._result_size,
            underline=0,
            value=20,
            command=self.set_result_size,
        )
        rescntmenu.add_radiobutton(
            label="50",
            variable=self._result_size,
            underline=0,
            value=50,
            command=self.set_result_size,
        )
        rescntmenu.add_radiobutton(
            label="100",
            variable=self._result_size,
            underline=0,
            value=100,
            command=self.set_result_size,
        )
        # Select the "50" entry by default (index 1), which also fires
        # set_result_size so the model starts with a page size.
        rescntmenu.invoke(1)
        editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu)
        menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
        self.top.config(menu=menubar)

    def set_result_size(self, **kwargs):
        """Menu callback: push the chosen page size into the model."""
        self.model.result_count = self._result_size.get()

    def _init_results_box(self, parent):
        """Build the read-only text area with vertical/horizontal scrollbars."""
        innerframe = Frame(parent)
        i1 = Frame(innerframe)
        i2 = Frame(innerframe)
        vscrollbar = Scrollbar(i1, borderwidth=1)
        hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz")
        self.results_box = Text(
            i1,
            font=Font(family="courier", size="16"),
            state="disabled",
            borderwidth=1,
            yscrollcommand=vscrollbar.set,
            xscrollcommand=hscrollbar.set,
            wrap="none",
            width="40",
            height="20",
            exportselection=1,
        )
        self.results_box.pack(side="left", fill="both", expand=True)
        vscrollbar.pack(side="left", fill="y", anchor="e")
        vscrollbar.config(command=self.results_box.yview)
        hscrollbar.pack(side="left", fill="x", expand=True, anchor="w")
        hscrollbar.config(command=self.results_box.xview)
        # there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!!
        Label(i2, text="   ", background=self._BACKGROUND_COLOUR).pack(
            side="left", anchor="e"
        )
        i1.pack(side="top", fill="both", expand=True, anchor="n")
        i2.pack(side="bottom", fill="x", anchor="s")
        innerframe.pack(side="top", fill="both", expand=True)

    def _init_paging(self, parent):
        """Build Previous/Next buttons; both start disabled until a load ends."""
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        self.prev = prev = Button(
            innerframe,
            text="Previous",
            command=self.previous,
            width="10",
            borderwidth=1,
            highlightthickness=1,
            state="disabled",
        )
        prev.pack(side="left", anchor="center")
        self.next = next = Button(
            innerframe,
            text="Next",
            command=self.__next__,
            width="10",
            borderwidth=1,
            highlightthickness=1,
            state="disabled",
        )
        next.pack(side="right", anchor="center")
        innerframe.pack(side="top", fill="y")
        self.reset_current_page()

    def reset_current_page(self):
        """Mark that no results page is displayed yet."""
        self.current_page = -1

    def _poll(self):
        """Drain one queued model event (if any) and reschedule the poll."""
        try:
            event = self.queue.get(block=False)
        except q.Empty:
            pass
        else:
            if event == CORPUS_LOADED_EVENT:
                self.handle_corpus_loaded(event)
            elif event == ERROR_LOADING_CORPUS_EVENT:
                self.handle_error_loading_corpus(event)
        self.after = self.top.after(POLL_INTERVAL, self._poll)

    def handle_error_loading_corpus(self, event):
        """Report the failed load and leave the view disabled and empty."""
        self.status["text"] = "Error in loading " + self.var.get()
        self.unfreeze_editable()
        self.clear_results_box()
        self.freeze_editable()
        self.reset_current_page()

    def handle_corpus_loaded(self, event):
        """Re-enable the view and immediately display the first results page."""
        self.status["text"] = self.var.get() + " is loaded"
        self.unfreeze_editable()
        self.clear_results_box()
        self.reset_current_page()
        # self.next()
        collocations = self.model.next(self.current_page + 1)
        self.write_results(collocations)
        self.current_page += 1

    def corpus_selected(self, *args):
        """Drop-down callback: load whichever corpus is now selected."""
        new_selection = self.var.get()
        self.load_corpus(new_selection)

    def previous(self):
        """Show the previous page of cached results."""
        self.freeze_editable()
        collocations = self.model.prev(self.current_page - 1)
        self.current_page = self.current_page - 1
        self.clear_results_box()
        self.write_results(collocations)
        self.unfreeze_editable()

    def __next__(self):
        """Show the next page, generating it in the model if necessary."""
        self.freeze_editable()
        collocations = self.model.next(self.current_page + 1)
        self.clear_results_box()
        self.write_results(collocations)
        self.current_page += 1
        self.unfreeze_editable()

    def load_corpus(self, selection):
        """Start a background load of ``selection`` unless already loaded."""
        if self.model.selected_corpus != selection:
            self.status["text"] = "Loading " + selection + "..."
            self.freeze_editable()
            self.model.load_corpus(selection)

    def freeze_editable(self):
        """Disable paging while a load or page render is in flight."""
        self.prev["state"] = "disabled"
        self.next["state"] = "disabled"

    def clear_results_box(self):
        """Wipe the (normally read-only) results text widget."""
        self.results_box["state"] = "normal"
        self.results_box.delete("1.0", END)
        self.results_box["state"] = "disabled"

    def fire_event(self, event):
        # Firing an event so that rendering of widgets happen in the mainloop thread
        self.top.event_generate(event, when="tail")

    def destroy(self, *e):
        """Tear down the window; safe to call more than once."""
        if self.top is None:
            return
        self.top.after_cancel(self.after)
        self.top.destroy()
        self.top = None

    def mainloop(self, *args, **kwargs):
        """Enter the Tk mainloop (no-op when running inside IDLE)."""
        if in_idle():
            return
        self.top.mainloop(*args, **kwargs)

    def unfreeze_editable(self):
        """Re-enable whichever paging buttons make sense for this page."""
        self.set_paging_button_states()

    def set_paging_button_states(self):
        """Enable Previous/Next based on current page position and extent."""
        if self.current_page == -1 or self.current_page == 0:
            self.prev["state"] = "disabled"
        else:
            self.prev["state"] = "normal"
        if self.model.is_last_page(self.current_page):
            self.next["state"] = "disabled"
        else:
            self.next["state"] = "normal"

    def write_results(self, results):
        """Append one "w1 w2" line per collocation to the results box."""
        self.results_box["state"] = "normal"
        row = 1
        for each in results:
            self.results_box.insert(str(row) + ".0", each[0] + " " + each[1] + "\n")
            row += 1
        self.results_box["state"] = "disabled"
class CollocationsModel:
    """State holder for the collocations GUI.

    Loads a corpus on a worker thread, scores its bigrams, and serves the
    resulting collocation list to the view one fixed-size page at a time.
    """

    def __init__(self, queue):
        # Nothing is selected or computed until load_corpus() runs.
        self.result_count = None
        self.selected_corpus = None
        self.collocations = None
        self.CORPORA = _CORPORA
        self.DEFAULT_CORPUS = _DEFAULT
        self.queue = queue
        self.reset_results()

    def reset_results(self):
        """Discard all cached pages and the served-results counter."""
        self.result_pages = []
        self.results_returned = 0

    def load_corpus(self, name):
        """Start loading ``name`` in the background; completion is queued."""
        self.selected_corpus = name
        self.collocations = None
        worker = self.LoadCorpus(name, self)
        worker.start()
        self.reset_results()

    def non_default_corpora(self):
        """Return every corpus name except the default, sorted."""
        return sorted(key for key in self.CORPORA if key != self.DEFAULT_CORPUS)

    def is_last_page(self, number):
        """Return True when page ``number`` would exhaust the collocations."""
        cached = len(self.result_pages)
        if number < cached:
            # An already-cached page before the newest one cannot be last.
            return False
        projected = self.results_returned + (number - cached) * self.result_count
        return projected >= len(self.collocations)

    def next(self, page):
        """Return page ``page``, generating and caching pages up to it."""
        while len(self.result_pages) <= page:
            start = self.results_returned
            stop = start + self.result_count
            self.result_pages.append(self.collocations[start:stop])
            self.results_returned = stop
        return self.result_pages[page]

    def prev(self, page):
        """Return the cached page ``page`` (empty list before the first)."""
        return [] if page == -1 else self.result_pages[page]

    class LoadCorpus(threading.Thread):
        """Worker thread: load one corpus and score its bigram collocations."""

        def __init__(self, name, model):
            threading.Thread.__init__(self)
            self.model, self.name = model, name

        def run(self):
            try:
                from operator import itemgetter

                raw_words = self.model.CORPORA[self.name]()
                # Drop short tokens (punctuation, stop-ish words of <= 2 chars).
                text = [token for token in raw_words if len(token) > 2]
                bigram_fd = FreqDist(
                    (text[i], text[i + 1]) for i in range(len(text) - 1)
                )
                vocab = FreqDist(text)
                # Score each bigram by freq(pair)^3 / (freq(w1) * freq(w2)).
                scored = [
                    (pair, bigram_fd[pair] ** 3 / (vocab[pair[0]] * vocab[pair[1]]))
                    for pair in bigram_fd
                ]
                scored.sort(key=itemgetter(1), reverse=True)
                self.model.collocations = [pair for pair, _score in scored]
                self.model.queue.put(CORPUS_LOADED_EVENT)
            except Exception as e:
                print(e)
                self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)
# def collocations():
# colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations[:num]]
def app():
    """Create the collocations window and run its Tk event loop."""
    view = CollocationsView()
    view.mainloop()


if __name__ == "__main__":
    app()

__all__ = ["app"]

View File

@@ -0,0 +1,709 @@
# Natural Language Toolkit: Concordance Application
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import queue as q
import re
import threading
from tkinter import (
END,
LEFT,
SUNKEN,
Button,
Entry,
Frame,
IntVar,
Label,
Menu,
OptionMenu,
Scrollbar,
StringVar,
Text,
Tk,
)
from tkinter.font import Font
from nltk.corpus import (
alpino,
brown,
cess_cat,
cess_esp,
floresta,
indian,
mac_morpho,
nps_chat,
sinica_treebank,
treebank,
)
from nltk.draw.util import ShowText
from nltk.util import in_idle
# Regex fragments used when building concordance search patterns:
# WORD_OR_TAG matches one word or POS tag in "word/TAG" notation.
WORD_OR_TAG = "[^/ ]+"
BOUNDARY = r"\b"

# Virtual-event names posted by worker threads and consumed by the GUI.
CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
SEARCH_TERMINATED_EVENT = "<<ST_EVENT>>"
SEARCH_ERROR_EVENT = "<<SE_EVENT>>"
ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
# How often (in milliseconds) the view polls its event queue.
POLL_INTERVAL = 50

# NB All corpora must be specified in a lambda expression so as not to be
# loaded when the module is imported.
_DEFAULT = "English: Brown Corpus (Humor, simplified)"
_CORPORA = {
    "Catalan: CESS-CAT Corpus (simplified)": lambda: cess_cat.tagged_sents(
        tagset="universal"
    ),
    "English: Brown Corpus": lambda: brown.tagged_sents(),
    "English: Brown Corpus (simplified)": lambda: brown.tagged_sents(
        tagset="universal"
    ),
    "English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents(
        categories=["news", "editorial", "reviews"], tagset="universal"
    ),
    "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(
        categories="religion", tagset="universal"
    ),
    "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(
        categories="learned", tagset="universal"
    ),
    "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(
        categories="science_fiction", tagset="universal"
    ),
    "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(
        categories="romance", tagset="universal"
    ),
    "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(
        categories="humor", tagset="universal"
    ),
    "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
    "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(
        tagset="universal"
    ),
    "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
    "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(
        tagset="universal"
    ),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
    "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(
        tagset="universal"
    ),
    "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
    "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(
        tagset="universal"
    ),
    "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"),
    "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(
        files="hindi.pos", tagset="universal"
    ),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
    "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(
        tagset="universal"
    ),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(),
    "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(
        tagset="universal"
    ),
    "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(
        tagset="universal"
    ),
}
class ConcordanceSearchView:
_BACKGROUND_COLOUR = "#FFF" # white
# Colour of highlighted results
_HIGHLIGHT_WORD_COLOUR = "#F00" # red
_HIGHLIGHT_WORD_TAG = "HL_WRD_TAG"
_HIGHLIGHT_LABEL_COLOUR = "#C0C0C0" # dark grey
_HIGHLIGHT_LABEL_TAG = "HL_LBL_TAG"
# Percentage of text left of the scrollbar position
_FRACTION_LEFT_TEXT = 0.30
def __init__(self):
self.queue = q.Queue()
self.model = ConcordanceSearchModel(self.queue)
self.top = Tk()
self._init_top(self.top)
self._init_menubar()
self._init_widgets(self.top)
self.load_corpus(self.model.DEFAULT_CORPUS)
self.after = self.top.after(POLL_INTERVAL, self._poll)
def _init_top(self, top):
top.geometry("950x680+50+50")
top.title("NLTK Concordance Search")
top.bind("<Control-q>", self.destroy)
top.protocol("WM_DELETE_WINDOW", self.destroy)
top.minsize(950, 680)
def _init_widgets(self, parent):
self.main_frame = Frame(
parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1)
)
self._init_corpus_select(self.main_frame)
self._init_query_box(self.main_frame)
self._init_results_box(self.main_frame)
self._init_paging(self.main_frame)
self._init_status(self.main_frame)
self.main_frame.pack(fill="both", expand=True)
def _init_menubar(self):
self._result_size = IntVar(self.top)
self._cntx_bf_len = IntVar(self.top)
self._cntx_af_len = IntVar(self.top)
menubar = Menu(self.top)
filemenu = Menu(menubar, tearoff=0, borderwidth=0)
filemenu.add_command(
label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
)
menubar.add_cascade(label="File", underline=0, menu=filemenu)
editmenu = Menu(menubar, tearoff=0)
rescntmenu = Menu(editmenu, tearoff=0)
rescntmenu.add_radiobutton(
label="20",
variable=self._result_size,
underline=0,
value=20,
command=self.set_result_size,
)
rescntmenu.add_radiobutton(
label="50",
variable=self._result_size,
underline=0,
value=50,
command=self.set_result_size,
)
rescntmenu.add_radiobutton(
label="100",
variable=self._result_size,
underline=0,
value=100,
command=self.set_result_size,
)
rescntmenu.invoke(1)
editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu)
cntxmenu = Menu(editmenu, tearoff=0)
cntxbfmenu = Menu(cntxmenu, tearoff=0)
cntxbfmenu.add_radiobutton(
label="60 characters",
variable=self._cntx_bf_len,
underline=0,
value=60,
command=self.set_cntx_bf_len,
)
cntxbfmenu.add_radiobutton(
label="80 characters",
variable=self._cntx_bf_len,
underline=0,
value=80,
command=self.set_cntx_bf_len,
)
cntxbfmenu.add_radiobutton(
label="100 characters",
variable=self._cntx_bf_len,
underline=0,
value=100,
command=self.set_cntx_bf_len,
)
cntxbfmenu.invoke(1)
cntxmenu.add_cascade(label="Before", underline=0, menu=cntxbfmenu)
cntxafmenu = Menu(cntxmenu, tearoff=0)
cntxafmenu.add_radiobutton(
label="70 characters",
variable=self._cntx_af_len,
underline=0,
value=70,
command=self.set_cntx_af_len,
)
cntxafmenu.add_radiobutton(
label="90 characters",
variable=self._cntx_af_len,
underline=0,
value=90,
command=self.set_cntx_af_len,
)
cntxafmenu.add_radiobutton(
label="110 characters",
variable=self._cntx_af_len,
underline=0,
value=110,
command=self.set_cntx_af_len,
)
cntxafmenu.invoke(1)
cntxmenu.add_cascade(label="After", underline=0, menu=cntxafmenu)
editmenu.add_cascade(label="Context", underline=0, menu=cntxmenu)
menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
self.top.config(menu=menubar)
def set_result_size(self, **kwargs):
self.model.result_count = self._result_size.get()
def set_cntx_af_len(self, **kwargs):
self._char_after = self._cntx_af_len.get()
def set_cntx_bf_len(self, **kwargs):
self._char_before = self._cntx_bf_len.get()
def _init_corpus_select(self, parent):
innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
self.var = StringVar(innerframe)
self.var.set(self.model.DEFAULT_CORPUS)
Label(
innerframe,
justify=LEFT,
text=" Corpus: ",
background=self._BACKGROUND_COLOUR,
padx=2,
pady=1,
border=0,
).pack(side="left")
other_corpora = list(self.model.CORPORA.keys()).remove(
self.model.DEFAULT_CORPUS
)
om = OptionMenu(
innerframe,
self.var,
self.model.DEFAULT_CORPUS,
command=self.corpus_selected,
*self.model.non_default_corpora()
)
om["borderwidth"] = 0
om["highlightthickness"] = 1
om.pack(side="left")
innerframe.pack(side="top", fill="x", anchor="n")
def _init_status(self, parent):
self.status = Label(
parent,
justify=LEFT,
relief=SUNKEN,
background=self._BACKGROUND_COLOUR,
border=0,
padx=1,
pady=0,
)
self.status.pack(side="top", anchor="sw")
def _init_query_box(self, parent):
innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
another = Frame(innerframe, background=self._BACKGROUND_COLOUR)
self.query_box = Entry(another, width=60)
self.query_box.pack(side="left", fill="x", pady=25, anchor="center")
self.search_button = Button(
another,
text="Search",
command=self.search,
borderwidth=1,
highlightthickness=1,
)
self.search_button.pack(side="left", fill="x", pady=25, anchor="center")
self.query_box.bind("<KeyPress-Return>", self.search_enter_keypress_handler)
another.pack()
innerframe.pack(side="top", fill="x", anchor="n")
def search_enter_keypress_handler(self, *event):
self.search()
def _init_results_box(self, parent):
innerframe = Frame(parent)
i1 = Frame(innerframe)
i2 = Frame(innerframe)
vscrollbar = Scrollbar(i1, borderwidth=1)
hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz")
self.results_box = Text(
i1,
font=Font(family="courier", size="16"),
state="disabled",
borderwidth=1,
yscrollcommand=vscrollbar.set,
xscrollcommand=hscrollbar.set,
wrap="none",
width="40",
height="20",
exportselection=1,
)
self.results_box.pack(side="left", fill="both", expand=True)
self.results_box.tag_config(
self._HIGHLIGHT_WORD_TAG, foreground=self._HIGHLIGHT_WORD_COLOUR
)
self.results_box.tag_config(
self._HIGHLIGHT_LABEL_TAG, foreground=self._HIGHLIGHT_LABEL_COLOUR
)
vscrollbar.pack(side="left", fill="y", anchor="e")
vscrollbar.config(command=self.results_box.yview)
hscrollbar.pack(side="left", fill="x", expand=True, anchor="w")
hscrollbar.config(command=self.results_box.xview)
# there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!!
Label(i2, text=" ", background=self._BACKGROUND_COLOUR).pack(
side="left", anchor="e"
)
i1.pack(side="top", fill="both", expand=True, anchor="n")
i2.pack(side="bottom", fill="x", anchor="s")
innerframe.pack(side="top", fill="both", expand=True)
def _init_paging(self, parent):
innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
self.prev = prev = Button(
innerframe,
text="Previous",
command=self.previous,
width="10",
borderwidth=1,
highlightthickness=1,
state="disabled",
)
prev.pack(side="left", anchor="center")
self.next = next = Button(
innerframe,
text="Next",
command=self.__next__,
width="10",
borderwidth=1,
highlightthickness=1,
state="disabled",
)
next.pack(side="right", anchor="center")
innerframe.pack(side="top", fill="y")
self.current_page = 0
def previous(self):
self.clear_results_box()
self.freeze_editable()
self.model.prev(self.current_page - 1)
def __next__(self):
self.clear_results_box()
self.freeze_editable()
self.model.next(self.current_page + 1)
def about(self, *e):
ABOUT = "NLTK Concordance Search Demo\n"
TITLE = "About: NLTK Concordance Search Demo"
try:
from tkinter.messagebox import Message
Message(message=ABOUT, title=TITLE, parent=self.main_frame).show()
except:
ShowText(self.top, TITLE, ABOUT)
def _bind_event_handlers(self):
self.top.bind(CORPUS_LOADED_EVENT, self.handle_corpus_loaded)
self.top.bind(SEARCH_TERMINATED_EVENT, self.handle_search_terminated)
self.top.bind(SEARCH_ERROR_EVENT, self.handle_search_error)
self.top.bind(ERROR_LOADING_CORPUS_EVENT, self.handle_error_loading_corpus)
def _poll(self):
try:
event = self.queue.get(block=False)
except q.Empty:
pass
else:
if event == CORPUS_LOADED_EVENT:
self.handle_corpus_loaded(event)
elif event == SEARCH_TERMINATED_EVENT:
self.handle_search_terminated(event)
elif event == SEARCH_ERROR_EVENT:
self.handle_search_error(event)
elif event == ERROR_LOADING_CORPUS_EVENT:
self.handle_error_loading_corpus(event)
self.after = self.top.after(POLL_INTERVAL, self._poll)
def handle_error_loading_corpus(self, event):
self.status["text"] = "Error in loading " + self.var.get()
self.unfreeze_editable()
self.clear_all()
self.freeze_editable()
def handle_corpus_loaded(self, event):
self.status["text"] = self.var.get() + " is loaded"
self.unfreeze_editable()
self.clear_all()
self.query_box.focus_set()
def handle_search_terminated(self, event):
# todo: refactor the model such that it is less state sensitive
results = self.model.get_results()
self.write_results(results)
self.status["text"] = ""
if len(results) == 0:
self.status["text"] = "No results found for " + self.model.query
else:
self.current_page = self.model.last_requested_page
self.unfreeze_editable()
self.results_box.xview_moveto(self._FRACTION_LEFT_TEXT)
def handle_search_error(self, event):
self.status["text"] = "Error in query " + self.model.query
self.unfreeze_editable()
def corpus_selected(self, *args):
new_selection = self.var.get()
self.load_corpus(new_selection)
def load_corpus(self, selection):
if self.model.selected_corpus != selection:
self.status["text"] = "Loading " + selection + "..."
self.freeze_editable()
self.model.load_corpus(selection)
def search(self):
self.current_page = 0
self.clear_results_box()
self.model.reset_results()
query = self.query_box.get()
if len(query.strip()) == 0:
return
self.status["text"] = "Searching for " + query
self.freeze_editable()
self.model.search(query, self.current_page + 1)
    def write_results(self, results):
        """Write one result per line into the (read-only) results box.

        Each result is a ``(tagged_sentence, match_start, match_end)``
        triple.  The displayed line is the sentence clipped to a fixed
        window around the match, and the matched words and tags are
        highlighted via Tk text tags at "row.column" indices.
        """
        self.results_box["state"] = "normal"
        row = 1
        for each in results:
            sent, pos1, pos2 = each[0].strip(), each[1], each[2]
            if len(sent) != 0:
                if pos1 < self._char_before:
                    # Pad so there is enough left context to display.
                    sent, pos1, pos2 = self.pad(sent, pos1, pos2)
                sentence = sent[pos1 - self._char_before : pos1 + self._char_after]
                if not row == len(results):
                    # No trailing newline after the last result.
                    sentence += "\n"
                self.results_box.insert(str(row) + ".0", sentence)
                word_markers, label_markers = self.words_and_labels(sent, pos1, pos2)
                for marker in word_markers:
                    self.results_box.tag_add(
                        self._HIGHLIGHT_WORD_TAG,
                        str(row) + "." + str(marker[0]),
                        str(row) + "." + str(marker[1]),
                    )
                for marker in label_markers:
                    self.results_box.tag_add(
                        self._HIGHLIGHT_LABEL_TAG,
                        str(row) + "." + str(marker[0]),
                        str(row) + "." + str(marker[1]),
                    )
                row += 1
        self.results_box["state"] = "disabled"
def words_and_labels(self, sentence, pos1, pos2):
search_exp = sentence[pos1:pos2]
words, labels = [], []
labeled_words = search_exp.split(" ")
index = 0
for each in labeled_words:
if each == "":
index += 1
else:
word, label = each.split("/")
words.append(
(self._char_before + index, self._char_before + index + len(word))
)
index += len(word) + 1
labels.append(
(self._char_before + index, self._char_before + index + len(label))
)
index += len(label)
index += 1
return words, labels
def pad(self, sent, hstart, hend):
if hstart >= self._char_before:
return sent, hstart, hend
d = self._char_before - hstart
sent = "".join([" "] * d) + sent
return sent, hstart + d, hend + d
    def destroy(self, *e):
        """Tear down the window, cancelling the pending poll callback.

        Safe to call more than once; subsequent calls are no-ops.
        """
        if self.top is None:
            return
        self.top.after_cancel(self.after)
        self.top.destroy()
        self.top = None
    def clear_all(self):
        # Reset the query box, the stored query, and the results pane.
        self.query_box.delete(0, END)
        self.model.reset_query()
        self.clear_results_box()
    def clear_results_box(self):
        # The results box is kept read-only; enable it just long enough
        # to delete its contents.
        self.results_box["state"] = "normal"
        self.results_box.delete("1.0", END)
        self.results_box["state"] = "disabled"
    def freeze_editable(self):
        # Disable all input widgets while a background task runs.
        self.query_box["state"] = "disabled"
        self.search_button["state"] = "disabled"
        self.prev["state"] = "disabled"
        self.next["state"] = "disabled"
    def unfreeze_editable(self):
        # Re-enable input; the paging buttons depend on the current page.
        self.query_box["state"] = "normal"
        self.search_button["state"] = "normal"
        self.set_paging_button_states()
def set_paging_button_states(self):
if self.current_page == 0 or self.current_page == 1:
self.prev["state"] = "disabled"
else:
self.prev["state"] = "normal"
if self.model.has_more_pages(self.current_page):
self.next["state"] = "normal"
else:
self.next["state"] = "disabled"
    def fire_event(self, event):
        # Firing an event so that rendering of widgets happens in the
        # mainloop thread; "when='tail'" queues it after pending events.
        self.top.event_generate(event, when="tail")
    def mainloop(self, *args, **kwargs):
        """Enter the Tk mainloop (no-op when running under IDLE)."""
        if in_idle():
            return
        self.top.mainloop(*args, **kwargs)
class ConcordanceSearchModel:
    """Model for the concordance search GUI.

    Owns the selected corpus, the current query, and the paged search
    results.  Long-running work (corpus loading, searching) runs on
    worker threads that report completion by putting event names on
    ``self.queue``; the view polls that queue from the mainloop thread.
    """

    def __init__(self, queue):
        self.queue = queue
        self.CORPORA = _CORPORA
        self.DEFAULT_CORPUS = _DEFAULT
        self.selected_corpus = None
        self.reset_query()
        self.reset_results()
        # Results per page; presumably assigned by the view before the
        # first search -- TODO confirm against the (unseen) caller.
        self.result_count = None
        # Index of the last sentence scanned, so paging resumes there.
        self.last_sent_searched = 0

    def non_default_corpora(self):
        # All corpus names except the default, sorted alphabetically.
        copy = []
        copy.extend(list(self.CORPORA.keys()))
        copy.remove(self.DEFAULT_CORPUS)
        copy.sort()
        return copy

    def load_corpus(self, name):
        """Load *name* on a background thread; completion is signalled
        via CORPUS_LOADED_EVENT or ERROR_LOADING_CORPUS_EVENT."""
        self.selected_corpus = name
        self.tagged_sents = []
        runner_thread = self.LoadCorpus(name, self)
        runner_thread.start()

    def search(self, query, page):
        """Search for *query* on a background thread, filling *page*."""
        self.query = query
        self.last_requested_page = page
        self.SearchCorpus(self, page, self.result_count).start()

    def next(self, page):
        # Fetch the next page, searching further only if not cached yet.
        self.last_requested_page = page
        if len(self.results) < page:
            self.search(self.query, page)
        else:
            self.queue.put(SEARCH_TERMINATED_EVENT)

    def prev(self, page):
        # Previous pages are always cached, so just trigger a re-render.
        self.last_requested_page = page
        self.queue.put(SEARCH_TERMINATED_EVENT)

    def reset_results(self):
        self.last_sent_searched = 0
        self.results = []
        self.last_page = None

    def reset_query(self):
        self.query = None

    def set_results(self, page, resultset):
        self.results.insert(page - 1, resultset)

    def get_results(self):
        return self.results[self.last_requested_page - 1]

    def has_more_pages(self, page):
        """Return True when *page* is not known to be the last one."""
        if self.results == [] or self.results[0] == []:
            return False
        if self.last_page is None:
            # Last page not yet determined: assume more may exist.
            return True
        return page < self.last_page

    class LoadCorpus(threading.Thread):
        """Worker thread that loads and flattens a tagged corpus."""

        def __init__(self, name, model):
            threading.Thread.__init__(self)
            self.model, self.name = model, name

        def run(self):
            try:
                ts = self.model.CORPORA[self.name]()
                # Flatten each tagged sentence to a "word/TAG ..." string
                # so searches can be done with plain regexps.
                self.model.tagged_sents = [
                    " ".join(w + "/" + t for (w, t) in sent) for sent in ts
                ]
                self.model.queue.put(CORPUS_LOADED_EVENT)
            except Exception as e:
                # Best-effort: report the failure to the view via the
                # queue instead of letting the thread die silently.
                print(e)
                self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)

    class SearchCorpus(threading.Thread):
        """Worker thread that scans tagged sentences for the query."""

        def __init__(self, model, page, count):
            self.model, self.count, self.page = model, count, page
            threading.Thread.__init__(self)

        def run(self):
            q = self.processed_query()
            sent_pos, i, sent_count = [], 0, 0
            for sent in self.model.tagged_sents[self.model.last_sent_searched :]:
                try:
                    m = re.search(q, sent)
                except re.error:
                    # Malformed user regexp: abandon this search.
                    self.model.reset_results()
                    self.model.queue.put(SEARCH_ERROR_EVENT)
                    return
                if m:
                    sent_pos.append((sent, m.start(), m.end()))
                    i += 1
                    if i > self.count:
                        # One extra hit collected: a full page exists and
                        # at least one more result lies beyond it.
                        self.model.last_sent_searched += sent_count - 1
                        break
                sent_count += 1
            if self.count >= len(sent_pos):
                # Fewer hits than a full page: this is the last page.
                self.model.last_sent_searched += sent_count - 1
                self.model.last_page = self.page
                self.model.set_results(self.page, sent_pos)
            else:
                # Drop the extra sentinel hit before storing the page.
                self.model.set_results(self.page, sent_pos[:-1])
            self.model.queue.put(SEARCH_TERMINATED_EVENT)

        def processed_query(self):
            """Translate the user query into a regexp over word/TAG text."""
            new = []
            for term in self.model.query.split():
                # "." in a term matches any single char except "/" and " ".
                term = re.sub(r"\.", r"[^/ ]", term)
                if re.match("[A-Z]+$", term):
                    # Bare all-uppercase term: treat it as a POS tag.
                    new.append(BOUNDARY + WORD_OR_TAG + "/" + term + BOUNDARY)
                elif "/" in term:
                    # Explicit word/tag term: use it as-is.
                    new.append(BOUNDARY + term + BOUNDARY)
                else:
                    # Bare word: match it with any tag.
                    new.append(BOUNDARY + term + "/" + WORD_OR_TAG + BOUNDARY)
            return " ".join(new)
def app():
    """Launch the concordance search GUI and block in the Tk mainloop."""
    view = ConcordanceSearchView()
    view.mainloop()
# Allow running this demo directly from the command line.
if __name__ == "__main__":
    app()
__all__ = ["app"]

View File

@@ -0,0 +1,163 @@
# Finding (and Replacing) Nemo, Version 1.1, Aristide Grange 2006/06/06
# https://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/496783
"""
Finding (and Replacing) Nemo
Instant Regular Expressions
Created by Aristide Grange
"""
import itertools
import re
from tkinter import SEL_FIRST, SEL_LAST, Frame, Label, PhotoImage, Scrollbar, Text, Tk
windowTitle = "Finding (and Replacing) Nemo"
# Regexp and replacement template shown when the demo starts.
initialFind = r"n(.*?)e(.*?)m(.*?)o"
initialRepl = r"M\1A\2K\3I"
# Sample text searched by the demo (standard lorem-ipsum filler).
initialText = """\
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
"""
images = {
"FIND": "R0lGODlhMAAiAPcAMf/////37//35//n1v97Off///f/9/f37/fexvfOvfeEQvd7QvdrQvdrKfdaKfdSMfdSIe/v9+/v7+/v5+/n3u/e1u/Wxu/Gre+1lO+tnO+thO+Ua+97Y+97Oe97Me9rOe9rMe9jOe9jMe9jIe9aMefe5+fe3ufezuece+eEWudzQudaIedSIedKMedKIedCKedCId7e1t7Wzt7Oxt7Gvd69vd69rd61pd6ljN6UjN6Ue96EY95zY95rUt5rQt5jMd5SId5KIdbn59be3tbGztbGvda1rdaEa9Z7a9Z7WtZzQtZzOdZzMdZjMdZaQtZSOdZSMdZKMdZCKdZCGNY5Ic7W1s7Oxs7Gtc69xs69tc69rc6tpc6llM6clM6cjM6Ue86EY85zWs5rSs5SKc5KKc5KGMa1tcatrcalvcalnMaUpcZ7c8ZzMcZrUsZrOcZrMcZaQsZSOcZSMcZKMcZCKcZCGMYxIcYxGL3Gxr21tb21rb2lpb2crb2cjL2UnL2UlL2UhL2Ec717Wr17Ur1zWr1rMb1jUr1KMb1KIb1CIb0xGLWlrbWlpbWcnLWEe7V7c7VzY7VzUrVSKbVKMbVCMbVCIbU5KbUxIbUxEK2lta2lpa2clK2UjK2MnK2MlK2Ea617e61za61rY61rMa1jSq1aUq1aSq1SQq1KKa0xEKWlnKWcnKWUnKWUhKWMjKWEa6Vza6VrWqVjMaVaUqVaKaVSMaVCMaU5KaUxIaUxGJyclJyMe5yElJyEhJx7e5x7c5xrOZxaQpxSOZxKQpw5IZSMhJSEjJR7c5Rre5RrY5RrUpRSQpRSKZRCOZRCKZQxKZQxIYyEhIx7hIxza4xzY4xrc4xjUoxaa4xaUoxSSoxKQoxCMYw5GIR7c4Rzc4Rre4RjY4RjWoRaa4RSWoRSUoRSMYRKQoRCOYQ5KYQxIXtra3taY3taSntKOXtCMXtCKXNCMXM5MXMxIWtSUmtKSmtKQmtCOWs5MWs5KWs5IWNCKWMxIVIxKUIQCDkhGAAAACH+AS4ALAAAAAAwACIAAAj/AAEIHEiwoMGDCBMqXMiwoUOHMqxIeEiRoZVp7cpZ29WrF4WKIAd208dGAQEVbiTVChUjZMU9+pYQmPmBZpxgvVw+nDdKwQICNVcIXQEkTgKdDdUJ+/nggVAXK1xI3TEA6UIr2uJ8iBqka1cXXTlkqGoVYRZ7iLyqBSs0iiEtZQVKiDGxBI1u3NR6lUpGDKg8MSgEQCphU7Z22vhg0dILXRCpYLuSCcYJT4wqXASBQaBzU7klHxC127OHD7ZDJFpERqRt0x5OnwQpmZmCLEhrbgg4WIHO1RY+nbQ9WRGEDJlmnXwJ+9FBgXMCIzYMVijBBgYMFxIMqJBMSc0Ht7qh/+Gjpte2rnYsYeNlasWIBgQ6yCewIoPCCp/cyP/wgUGbXVu0QcADZNBDnh98gHMLGXYQUw02w61QU3wdbNWDbQVVIIhMMwFF1DaZiPLBAy7E04kafrjSizaK3LFNNc0AAYRQDsAHHQlJ2IDQJ2zE1+EKDjiAijShkECCC8Qgw4cr7ZgyzC2WaHPNLWWoNeNWPiRAw0QFWQFMhz8C+QQ20yAiVSrY+MGOJCsccsst2GCzoHFxxEGGC+8hgs0MB2kyCpgzrUDCbs1Es41UdtATHFFkWELMOtsoQsYcgvRRQw5RSDgGOjZMR1AvPQIq6KCo9AKOJWDd48owQlHR4DXEKP9iyRrK+DNNBTu4RwIPFeTAGUG7hAomkA84gEg1m6ADljy9PBKGGJY4ig0xlsTBRSn98FOFDUC8pwQOPkgHbCGAzhTkA850s0c7j6Hjix9+gBIrMXLeAccWXUCyiRBcBEECdEJ98KtAqtBCYQc/OvDENnl4gYpUxISCIjjzylkGGV9okYUVNogRhAOBuuAEhjG08wOgDYzAgA5bCjIoCe5uwUk80RKTTSppPREGGGCIISOQ9AXBg6c
C6WIywvCpoMHAocRBwhP4bHLFLujYkV42xNxBRhAyGrc113EgYtRBerDDDHMoDCyQEL5sE083EkgwQyBhxGFHMM206DUixGxmE0wssbQjCQ4JCaFKFwgQTVAVVhQUwAVPIFJKrHfYYRwi6OCDzzuIJIFhXAD0EccPsYRiSyqKSDpFcWSMIcZRoBMkQyA2BGZDIKSYcggih8TRRg4VxM5QABVYYLxgwiev/PLMCxQQADs=",
"find": "R0lGODlhMAAiAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OSkpKRgYGAAAAAAAAAAAAAAAAAAAACH+AS4ALAAAAAAwACIAAAX/ICCOZGmeaKquY2AGLiuvMCAUBuHWc48Kh0iFInEYCb4kSQCxPBiMxkMigRQEgJiSFVBYHNGG0RiZOHjblWAiiY4fkDhEYoBp06dAWfyAQyKAgAwDaHgnB0RwgYASgQ0IhDuGJDAIFhMRVFSLEX8QCJJ4AQM5AgQHTZqqjBAOCQQEkWkCDRMUFQsICQ4Vm5maEwwHOAsPDTpKMAsUDlO4CssTcb+2DAp8YGCyNFoCEsZwFQ3QDRTTVBRS0g1QbgsCd5QAAwgIBwYFAwStzQ8UEdCKVchky0yVBw7YuXkAKt4IAg74vXHVagqFBRgXSCAyYWAVCH0SNhDTitCJfSL5/4RbAPKPhQYYjVCYYAvCP0BxEDaD8CheAAHNwqh8MMGPSwgLeJWhwHSjqkYI+xg4MMCEgQjtRvZ7UAYCpghMF7CxONOWJkYR+rCpY4JlVpVxKDwYWEactKW9mhYRtqCTgwgWEMArERSK1j5q//6T8KXonFsShpiJkAECgQYVjykooCVA0JGHEWNiYCHThTFeb3UkoiCCBgwGEKQ1kuAJlhFwhA71h5SukwUM5qqeCSGBgicEWkfNiWSERtBad4JNIBaQBaQah1ToyGZBAnsIuIJs1qnqiAIVjIE2gnAB1T5x0icgzXT79ipgMOOEH6HBbREBMJCeGEY08IoLAkzB1YYFwjxwSUGSNULQJnNUwRYlCcyEkALIxECAP9cNMMABYpRhy3ZsSLDaR70oUAiABGCkAxowCGCAAfDYIQACXoElGRsdXWDBdg2Y90IWktDYGYAB9PWHP0PMdFZaF07SQgAFNDAMAQg0QA1UC8xoZQl22JGFPgWkOUCOL1pZQyhjxinnnCWEAAA7",
"REPL": "R0lGODlhMAAjAPcAMf/////3//+lOf+UKf+MEPf///f39/f35/fv7/ecQvecOfecKfeUIfeUGPeUEPeUCPeMAO/37+/v9+/v3u/n3u/n1u+9jO+9c++1hO+ta++tY++tWu+tUu+tSu+lUu+lQu+lMe+UMe+UKe+UGO+UEO+UAO+MCOfv5+fvxufn7+fn5+fnzue9lOe9c+e1jOe1e+e1c+e1a+etWuetUuelQuecOeeUUueUCN7e597e3t7e1t7ezt7evd7Wzt7Oxt7Ovd7Otd7Opd7OnN7Gtd7Gpd69lN61hN6ta96lStbextberdbW3tbWztbWxtbOvdbOrda1hNalUtaECM7W1s7Ozs7Oxs7Otc7Gxs7Gvc69tc69rc69pc61jM6lc8bWlMbOvcbGxsbGpca9tca9pca1nMaMAL3OhL3Gtb21vb21tb2tpb2tnL2tlLW9tbW9pbW9e7W1pbWtjLWcKa21nK2tra2tnK2tlK2lpa2llK2ljK2le6WlnKWljKWUe6WUc6WUY5y1QpyclJycjJychJyUc5yMY5StY5SUe5SMhJSMe5SMc5SMWpSEa5SESoyUe4yMhIyEY4SlKYScWoSMe4SEe4SEa4R7c4R7Y3uMY3uEe3t7e3t7c3tza3tzY3trKXtjIXOcAHOUMXOEY3Nzc3NzWnNrSmulCGuUMWuMGGtzWmtrY2taMWtaGGOUOWOMAGNzUmNjWmNjSmNaUmNaQmNaOWNaIWNSCFqcAFpjUlpSMVpSIVpSEFpKKVKMAFJSUlJSSlJSMVJKMVJKGFJKAFI5CEqUAEqEAEpzQkpKIUpCQkpCGEpCAEo5EEoxAEJjOUJCOUJCAEI5IUIxADl7ADlaITlCOTkxMTkxKTkxEDkhADFzADFrGDE5OTExADEpEClrCCkxKSkpKSkpISkpACkhCCkhACkYACFzACFrACEhCCEYGBhjEBhjABghABgYCBgYABgQEBgQABAQABAIAAhjAAhSAAhKAAgIEAgICABaAABCAAAhAAAQAAAIAAAAAAAAACH+AS4ALAAAAAAwACMAAAj/AAEIHEiwoMGDCBMqXMiwocOHAA4cgEixIIIJO3JMmAjADIqKFU/8MHIkg5EgYXx4iaTkI0iHE6wE2TCggYILQayEAgXIy8uGCKz8sDCAQAMRG3iEcXULlJkJPwli3OFjh9UdYYLE6NBhA04UXHoVA2XoTZgfPKBWlOBDphAWOdfMcfMDLloeO3hIMjbWVCQ5Fn6E2UFxgpsgFjYIEBADrZU6luqEEfqjTqpt54z1uuWqTIcgWAk7PECGzIUQDRosDmxlUrVJkwQJkqVuX71v06YZcyUlROAdbnLAJKPFyAYFAhoMwFlnEh0rWkpz8raPHm7dqKKc/KFFkBUrVn1M/ziBcEIeLUEQI8/AYk0i9Be4sqjsrN66c9/OnbobhpR3HkIUoZ0WVnBE0AGLFKKFD0HAFUQe77HQgQI1hRBDEHMcY0899bBzihZuCPILJD8EccEGGzwAQhFaUHHQH82sUkgeNHISDBk8WCCCcsqFUEQWmOyzjz3sUGNNOO5Y48YOEgowAAQhnBScQV00k82V47jzjy9CXZBcjziFoco//4CDiSOyhPMPLkJZkEBqJmRQxA9uZGEQD8Ncmc044/zzDF2IZQBCCDYE8QMZz/iiCSx0neHGI7BIhhhNn+1gxRpokEcQAp7seWU7/PwTyxqG/iCEEVzQmUombnDRxRExzP9nBR2PCKLFD3UJwcMPa/SRqUGNWJmNOVn+M44ukMRB4KGcWDNLVhuUMEIJAlzwA3DJBHMJIXm4sQYhqyxCRQQGLSIsn1qac2UzysQSyzX/hLMGD0F0IMCODYAQBA9W/PKPOcRiw0wzwxTiokF9dLMnuv/Mo+fCZF7jBr0xbDDCACWEYKgb1vzjDp/jZNOMLX0IZxAKq2TZTjtaOjwOsXyG+s8sZJTIQsUdIGHoJPf8w487QI/TDSt
5mGwQFZxc406o8HiDJchk/ltLHpSlJwSvz5DpTjvmuGNOM57koelBOaAhiCaaPBLL0wwbm003peRBnBZqJMJL1ECz/HXYYx/NdAIOOVCxQyLorswymU93o0wuwfAiTDNR/xz0MLXU0XdCE+UwSTRZAq2lsSATu+4wkGvt+TjNzPLrQyegAUku2Hij5cd8LhxyM8QIg4w18HgcdC6BTBFSDmfQqsovttveDcG7lFLHI75cE841sARCxeWsnxC4G9HADPK6ywzDCRqBo0EHHWhMgT1IJzziNci1N7PMKnSYfML96/90AiJKey/0KtbLX1QK0rrNnQ541xugQ7SHhkXBghN0SKACWRc4KlAhBwKcIOYymJCAAAA7",
"repl": "R0lGODlhMAAjAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OTExMSkpKSEhIRgYGBAQEAgICAAAACH+AS4ALAAAAAAwACMAAAX/ICCOZGmeaKqubOu+gCDANBkIQ1EMQhAghFptYEAkEgjEwXBo7ISvweGgWCwUysPjwTgEoCafTySYIhYMxgLBjEQgCULvCw0QdAZdoVhUIJUFChISEAxYeQM1N1OMTAp+UwZ5eA4TEhFbDWYFdC4ECVMJjwl5BwsQa0umEhUVlhESDgqlBp0rAn5nVpBMDxeZDRQbHBgWFBSWDgtLBnFjKwRYCI9VqQsPs0YKEcMXFq0UEalFDWx4BAO2IwPjppAKDkrTWKYUGd7fEJJFEZpM00cOzCgh4EE8SaoWxKNixQooBRMyZMBwAYIRBhUgLDGS4MoBJeoANMhAgQsaCRZm/5lqaCUJhA4cNHjDoKEDBlJUHqkBlYBTiQUZNGjYMMxDhY3VWk6R4MEDBoMUak5AqoYBqANIBo4wcGGDUKIeLlzVZmWJggsVIkwAZaQSA3kdZzlKkIiEAAlDvW5oOkEBs488JTw44oeUIwdvVTFTUK7uiAAPgubt8GFDhQepqETAQCFU1UMGzlqAgFhUsAcCS0AO6lUDhw8xNRSbENGDhgWSHjWUe6ACbKITizmopZoBa6KvOwj9uuHDhwxyj3xekgDDhw5EvWKo0IB4iQLCOCC/njc7ZQ8UeGvza+ABZZgcxJNc4FO1gc0cOsCUrHevc8tdIMTIAhc4F198G2Qwwd8CBIQUAwEINABBBJUwR9R5wElgVRLwWODBBx4cGB8GEzDQIAo33CGJA8gh+JoH/clUgQU0YvDhdfmJdwEFC6Sjgg8yEPAABsPkh2F22cl2AQbn6QdTghTQ5eAJAQyQAAQV0MSBB9gRVZ4GE1mw5JZOAmiAVi1UWcAZDrDyZXYTeaOhA/bIVuIBPtKQ4h7ViYekUPdcEAEbzTzCRp5CADmAAwj+ORGPBcgwAAHo9ABGCYtm0ChwFHShlRiXhmHlkAcCiOeUodqQw5W0oXLAiamy4MOkjOyAaqxUymApDCEAADs=",
}
# Alternating background colors for successive matches in the text panes.
colors = ["#FF7B39", "#80F121"]
# Foreground colors emphasizing the selected group within each match.
emphColors = ["#DAFC33", "#F42548"]
# Options for the two small regexp entry fields.
fieldParams = {
    "height": 3,
    "width": 70,
    "font": ("monaco", 14),
    "highlightthickness": 0,
    "borderwidth": 0,
    "background": "white",
}
# Options for the two large text panes.
textParams = {
    "bg": "#F7E0D4",
    "fg": "#2321F1",
    "highlightthickness": 0,
    "width": 1,
    "height": 10,
    "font": ("verdana", 16),
    "wrap": "word",
}
class Zone:
    """One panel of the demo: an icon, a regexp entry field, and a text
    pane whose tags show the effect of the regexp.  Subclasses implement
    ``substitute()``.
    """

    def __init__(self, image, initialField, initialText):
        frm = Frame(root)
        frm.config(background="white")
        # Bright icon for a valid regexp, dimmed icon for a malformed one.
        self.image = PhotoImage(format="gif", data=images[image.upper()])
        self.imageDimmed = PhotoImage(format="gif", data=images[image])
        self.img = Label(frm)
        self.img.config(borderwidth=0)
        self.img.pack(side="left")
        self.fld = Text(frm, **fieldParams)
        self.initScrollText(frm, self.fld, initialField)
        frm = Frame(root)
        self.txt = Text(frm, **textParams)
        self.initScrollText(frm, self.txt, initialText)
        # One pair of tags (background + emphasis) per highlight color.
        for i in range(2):
            self.txt.tag_config(colors[i], background=colors[i])
            self.txt.tag_config("emph" + colors[i], foreground=emphColors[i])

    def initScrollText(self, frm, txt, contents):
        # Attach a vertical scrollbar to *txt* and insert *contents*.
        scl = Scrollbar(frm)
        scl.config(command=txt.yview)
        scl.pack(side="right", fill="y")
        txt.pack(side="left", expand=True, fill="x")
        txt.config(yscrollcommand=scl.set)
        txt.insert("1.0", contents)
        frm.pack(fill="x")
        # Thin ridge separating this zone from the next.
        Frame(height=2, bd=1, relief="ridge").pack(fill="x")

    def refresh(self):
        """Re-run the substitution; dim the icon if the regexp is bad."""
        self.colorCycle = itertools.cycle(colors)
        try:
            self.substitute()
            self.img.config(image=self.image)
        except re.error:
            self.img.config(image=self.imageDimmed)
class FindZone(Zone):
    """Zone showing the "find" regexp and the text with matches colored."""

    def addTags(self, m):
        """Tag one match *m*, alternating background colors."""
        color = next(self.colorCycle)
        self.txt.tag_add(color, "1.0+%sc" % m.start(), "1.0+%sc" % m.end())
        try:
            self.txt.tag_add(
                "emph" + color, "1.0+%sc" % m.start("emph"), "1.0+%sc" % m.end("emph")
            )
        except Exception:
            # No "emph" group in the pattern, or it did not participate in
            # this match: skip emphasis coloring.  (Narrowed from a bare
            # ``except:`` so KeyboardInterrupt/SystemExit propagate.)
            pass

    def substitute(self, *args):
        """Recompile the regexp from the entry field and re-tag matches.

        When part of the field is selected, that part becomes a named
        ``emph`` group so the corresponding submatch can be emphasized.
        """
        for color in colors:
            self.txt.tag_remove(color, "1.0", "end")
            self.txt.tag_remove("emph" + color, "1.0", "end")
        self.rex = re.compile("")  # default value in case of malformed regexp
        self.rex = re.compile(self.fld.get("1.0", "end")[:-1], re.MULTILINE)
        try:
            re.compile("(?P<emph>%s)" % self.fld.get(SEL_FIRST, SEL_LAST))
            self.rexSel = re.compile(
                "%s(?P<emph>%s)%s"
                % (
                    self.fld.get("1.0", SEL_FIRST),
                    self.fld.get(SEL_FIRST, SEL_LAST),
                    self.fld.get(SEL_LAST, "end")[:-1],
                ),
                re.MULTILINE,
            )
        except Exception:
            # Nothing selected (Tk raises TclError) or the selection makes
            # an invalid regexp (re.error): highlight without emphasis.
            # (Narrowed from a bare ``except:``.)
            self.rexSel = self.rex
        self.rexSel.sub(self.addTags, self.txt.get("1.0", "end"))
class ReplaceZone(Zone):
    """Zone showing the replacement template and the substituted text."""

    def addTags(self, m):
        # Replace this match in place; ``self.diff`` tracks how much the
        # text has grown/shrunk from earlier replacements in this pass.
        s = sz.rex.sub(self.repl, m.group())
        self.txt.delete(
            "1.0+%sc" % (m.start() + self.diff), "1.0+%sc" % (m.end() + self.diff)
        )
        self.txt.insert("1.0+%sc" % (m.start() + self.diff), s, next(self.colorCycle))
        self.diff += len(s) - (m.end() - m.start())

    def substitute(self):
        # Copy the find-zone text, then rewrite each match using the
        # replacement field (with \N rewritten to \g<N> group refs).
        self.txt.delete("1.0", "end")
        self.txt.insert("1.0", sz.txt.get("1.0", "end")[:-1])
        self.diff = 0
        self.repl = rex0.sub(r"\\g<\1>", self.fld.get("1.0", "end")[:-1])
        sz.rex.sub(self.addTags, sz.txt.get("1.0", "end")[:-1])
def launchRefresh(_):
    # Defer both refreshes until Tk is idle, so the widgets' contents
    # (including the triggering keystroke) are final when read.
    sz.fld.after_idle(sz.refresh)
    rz.fld.after_idle(rz.refresh)
def app():
    """Build the Find/Replace demo window and run the Tk mainloop."""
    global root, sz, rz, rex0
    root = Tk()
    root.resizable(height=False, width=True)
    root.title(windowTitle)
    root.minsize(width=250, height=0)
    sz = FindZone("find", initialFind, initialText)
    # Refresh on mouse interaction with the find field, since selection
    # changes affect which group is emphasized.
    sz.fld.bind("<Button-1>", launchRefresh)
    sz.fld.bind("<ButtonRelease-1>", launchRefresh)
    sz.fld.bind("<B1-Motion>", launchRefresh)
    sz.rexSel = re.compile("")
    rz = ReplaceZone("repl", initialRepl, "")
    # \N back-references in the replacement field (but not \\N literals).
    rex0 = re.compile(r"(?<!\\)\\([0-9]+)")
    root.bind_all("<Key>", launchRefresh)
    launchRefresh(None)
    root.mainloop()
# Allow running this demo directly from the command line.
if __name__ == "__main__":
    app()
__all__ = ["app"]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,937 @@
# Natural Language Toolkit: Shift-Reduce Parser Application
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A graphical tool for exploring the shift-reduce parser.
The shift-reduce parser maintains a stack, which records the structure
of the portion of the text that has been parsed. The stack is
initially empty. Its contents are shown on the left side of the main
canvas.
On the right side of the main canvas is the remaining text. This is
the portion of the text which has not yet been considered by the
parser.
The parser builds up a tree structure for the text using two
operations:
- "shift" moves the first token from the remaining text to the top
of the stack. In the demo, the top of the stack is its right-hand
side.
- "reduce" uses a grammar production to combine the rightmost stack
elements into a single tree token.
You can control the parser's operation by using the "shift" and
"reduce" buttons; or you can use the "step" button to let the parser
automatically decide which operation to apply. The parser uses the
following rules to decide which operation to apply:
- Only shift if no reductions are available.
- If multiple reductions are available, then apply the reduction
whose CFG production is listed earliest in the grammar.
The "reduce" button applies the reduction whose CFG production is
listed earliest in the grammar. There are two ways to manually choose
which reduction to apply:
- Click on a CFG production from the list of available reductions,
on the left side of the main window. The reduction based on that
production will be applied to the top of the stack.
- Click on one of the stack elements. A popup window will appear,
containing all available reductions. Select one, and it will be
applied to the top of the stack.
Note that reductions can only be applied to the top of the stack.
Keyboard Shortcuts::
[Space]\t Perform the next shift or reduce operation
[s]\t Perform a shift operation
[r]\t Perform a reduction operation
[Ctrl-z]\t Undo most recent operation
[Delete]\t Reset the parser
[g]\t Show/hide available production list
[Ctrl-a]\t Toggle animations
[h]\t Help
[Ctrl-p]\t Print
[q]\t Quit
"""
from tkinter import Button, Frame, IntVar, Label, Listbox, Menu, Scrollbar, Tk
from tkinter.font import Font
from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment
from nltk.draw.util import CanvasFrame, EntryDialog, ShowText, TextWidget
from nltk.parse import SteppingShiftReduceParser
from nltk.tree import Tree
from nltk.util import in_idle
"""
Possible future improvements:
- button/window to change and/or select text. Just pop up a window
with an entry, and let them modify the text; and then retokenize
it? Maybe give a warning if it contains tokens whose types are
not in the grammar.
- button/window to change and/or select grammar. Select from
several alternative grammars? Or actually change the grammar? If
    the latter, then I'd want to define nltk.draw.cfg, which would be
responsible for that.
"""
class ShiftReduceApp:
"""
A graphical tool for exploring the shift-reduce parser. The tool
displays the parser's stack and the remaining text, and allows the
user to control the parser's operation. In particular, the user
can shift tokens onto the stack, and can perform reductions on the
top elements of the stack. A "step" button simply steps through
the parsing process, performing the operations that
``nltk.parse.ShiftReduceParser`` would use.
"""
    def __init__(self, grammar, sent, trace=0):
        """Create the application window for parsing *sent* with *grammar*.

        :param grammar: the CFG used by the shift-reduce parser.
        :param sent: the list of tokens to parse.
        :param trace: trace level passed through to the parser.
        """
        self._sent = sent
        self._parser = SteppingShiftReduceParser(grammar, trace)
        # Set up the main window.
        self._top = Tk()
        self._top.title("Shift Reduce Parser Application")
        # Animations.  animating_lock is a lock to prevent the demo
        # from performing new operations while it's animating.
        self._animating_lock = 0
        self._animate = IntVar(self._top)
        self._animate.set(10)  # = medium
        # The user can hide the grammar.
        self._show_grammar = IntVar(self._top)
        self._show_grammar.set(1)
        # Initialize fonts.
        self._init_fonts(self._top)
        # Set up key bindings.
        self._init_bindings()
        # Create the basic frames.
        self._init_menubar(self._top)
        self._init_buttons(self._top)
        self._init_feedback(self._top)
        self._init_grammar(self._top)
        self._init_canvas(self._top)
        # A popup menu for reducing.
        self._reduce_menu = Menu(self._canvas, tearoff=0)
        # Reset the demo, and set the feedback frame to empty.
        self.reset()
        self._lastoper1["text"] = ""
#########################################
## Initialization Helpers
#########################################
    def _init_fonts(self, root):
        """Create the fonts used by the widgets, sized by ``self._size``."""
        # See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
        self._sysfont = Font(font=Button()["font"])
        root.option_add("*Font", self._sysfont)
        # What's our font size (default=same as sysfont)
        self._size = IntVar(root)
        self._size.set(self._sysfont.cget("size"))
        self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
        self._font = Font(family="helvetica", size=self._size.get())
    def _init_grammar(self, parent):
        """Build the listbox of grammar productions; reducible productions
        get highlighted as the parse proceeds."""
        # Grammar view.
        self._prodframe = listframe = Frame(parent)
        self._prodframe.pack(fill="both", side="left", padx=2)
        self._prodlist_label = Label(
            self._prodframe, font=self._boldfont, text="Available Reductions"
        )
        self._prodlist_label.pack()
        self._prodlist = Listbox(
            self._prodframe,
            selectmode="single",
            relief="groove",
            background="white",
            foreground="#909090",
            font=self._font,
            selectforeground="#004040",
            selectbackground="#c0f0c0",
        )
        self._prodlist.pack(side="right", fill="both", expand=1)
        self._productions = list(self._parser.grammar().productions())
        for production in self._productions:
            self._prodlist.insert("end", (" %s" % production))
        self._prodlist.config(height=min(len(self._productions), 25))
        # Add a scrollbar if there are more than 25 productions.
        # NOTE(review): the condition is hard-wired to true, so the
        # scrollbar is always shown -- confirm whether that is intended.
        if 1:  # len(self._productions) > 25:
            listscroll = Scrollbar(self._prodframe, orient="vertical")
            self._prodlist.config(yscrollcommand=listscroll.set)
            listscroll.config(command=self._prodlist.yview)
            listscroll.pack(side="left", fill="y")
        # If they select a production, apply it.
        self._prodlist.bind("<<ListboxSelect>>", self._prodlist_select)
        # When they hover over a production, highlight it.
        self._hover = -1
        self._prodlist.bind("<Motion>", self._highlight_hover)
        self._prodlist.bind("<Leave>", self._clear_hover)
    def _init_bindings(self):
        """Bind keyboard shortcuts; see the module docstring for the list."""
        # Quit
        self._top.bind("<Control-q>", self.destroy)
        self._top.bind("<Control-x>", self.destroy)
        self._top.bind("<Alt-q>", self.destroy)
        self._top.bind("<Alt-x>", self.destroy)
        # Ops (step, shift, reduce, undo)
        self._top.bind("<space>", self.step)
        self._top.bind("<s>", self.shift)
        self._top.bind("<Alt-s>", self.shift)
        self._top.bind("<Control-s>", self.shift)
        self._top.bind("<r>", self.reduce)
        self._top.bind("<Alt-r>", self.reduce)
        self._top.bind("<Control-r>", self.reduce)
        self._top.bind("<Delete>", self.reset)
        self._top.bind("<u>", self.undo)
        self._top.bind("<Alt-u>", self.undo)
        self._top.bind("<Control-u>", self.undo)
        self._top.bind("<Control-z>", self.undo)
        self._top.bind("<BackSpace>", self.undo)
        # Misc
        self._top.bind("<Control-p>", self.postscript)
        self._top.bind("<Control-h>", self.help)
        self._top.bind("<F1>", self.help)
        self._top.bind("<Control-g>", self.edit_grammar)
        self._top.bind("<Control-t>", self.edit_sentence)
        # Animation speed control: "-" slow, "=" normal, "+" fast.
        self._top.bind("-", lambda e, a=self._animate: a.set(20))
        self._top.bind("=", lambda e, a=self._animate: a.set(10))
        self._top.bind("+", lambda e, a=self._animate: a.set(4))
    def _init_buttons(self, parent):
        """Build the Step/Shift/Reduce/Undo button row."""
        # Set up the frames.
        self._buttonframe = buttonframe = Frame(parent)
        buttonframe.pack(fill="none", side="bottom")
        Button(
            buttonframe,
            text="Step",
            background="#90c0d0",
            foreground="black",
            command=self.step,
        ).pack(side="left")
        Button(
            buttonframe,
            text="Shift",
            underline=0,
            background="#90f090",
            foreground="black",
            command=self.shift,
        ).pack(side="left")
        Button(
            buttonframe,
            text="Reduce",
            underline=0,
            background="#90f090",
            foreground="black",
            command=self.reduce,
        ).pack(side="left")
        Button(
            buttonframe,
            text="Undo",
            underline=0,
            background="#f0a0a0",
            foreground="black",
            command=self.undo,
        ).pack(side="left")
    def _init_menubar(self, parent):
        """Build the File/Edit/Apply/View/Animate/Help menu bar."""
        menubar = Menu(parent)

        # File: reset, print, exit.
        filemenu = Menu(menubar, tearoff=0)
        filemenu.add_command(
            label="Reset Parser", underline=0, command=self.reset, accelerator="Del"
        )
        filemenu.add_command(
            label="Print to Postscript",
            underline=0,
            command=self.postscript,
            accelerator="Ctrl-p",
        )
        filemenu.add_command(
            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
        )
        menubar.add_cascade(label="File", underline=0, menu=filemenu)

        # Edit: change the grammar or the sentence being parsed.
        editmenu = Menu(menubar, tearoff=0)
        editmenu.add_command(
            label="Edit Grammar",
            underline=5,
            command=self.edit_grammar,
            accelerator="Ctrl-g",
        )
        editmenu.add_command(
            label="Edit Text",
            underline=5,
            command=self.edit_sentence,
            accelerator="Ctrl-t",
        )
        menubar.add_cascade(label="Edit", underline=0, menu=editmenu)

        # Apply: the parser operations.
        rulemenu = Menu(menubar, tearoff=0)
        rulemenu.add_command(
            label="Step", underline=1, command=self.step, accelerator="Space"
        )
        rulemenu.add_separator()
        rulemenu.add_command(
            label="Shift", underline=0, command=self.shift, accelerator="Ctrl-s"
        )
        rulemenu.add_command(
            label="Reduce", underline=0, command=self.reduce, accelerator="Ctrl-r"
        )
        rulemenu.add_separator()
        rulemenu.add_command(
            label="Undo", underline=0, command=self.undo, accelerator="Ctrl-u"
        )
        menubar.add_cascade(label="Apply", underline=0, menu=rulemenu)

        # View: grammar visibility and font size.
        viewmenu = Menu(menubar, tearoff=0)
        viewmenu.add_checkbutton(
            label="Show Grammar",
            underline=0,
            variable=self._show_grammar,
            command=self._toggle_grammar,
        )
        viewmenu.add_separator()
        viewmenu.add_radiobutton(
            label="Tiny",
            variable=self._size,
            underline=0,
            value=10,
            command=self.resize,
        )
        viewmenu.add_radiobutton(
            label="Small",
            variable=self._size,
            underline=0,
            value=12,
            command=self.resize,
        )
        viewmenu.add_radiobutton(
            label="Medium",
            variable=self._size,
            underline=0,
            value=14,
            command=self.resize,
        )
        viewmenu.add_radiobutton(
            label="Large",
            variable=self._size,
            underline=0,
            value=18,
            command=self.resize,
        )
        viewmenu.add_radiobutton(
            label="Huge",
            variable=self._size,
            underline=0,
            value=24,
            command=self.resize,
        )
        menubar.add_cascade(label="View", underline=0, menu=viewmenu)

        # Animate: animation speed (values match the keyboard bindings).
        animatemenu = Menu(menubar, tearoff=0)
        animatemenu.add_radiobutton(
            label="No Animation", underline=0, variable=self._animate, value=0
        )
        animatemenu.add_radiobutton(
            label="Slow Animation",
            underline=0,
            variable=self._animate,
            value=20,
            accelerator="-",
        )
        animatemenu.add_radiobutton(
            label="Normal Animation",
            underline=0,
            variable=self._animate,
            value=10,
            accelerator="=",
        )
        animatemenu.add_radiobutton(
            label="Fast Animation",
            underline=0,
            variable=self._animate,
            value=4,
            accelerator="+",
        )
        menubar.add_cascade(label="Animate", underline=1, menu=animatemenu)

        # Help.
        helpmenu = Menu(menubar, tearoff=0)
        helpmenu.add_command(label="About", underline=0, command=self.about)
        helpmenu.add_command(
            label="Instructions", underline=0, command=self.help, accelerator="F1"
        )
        menubar.add_cascade(label="Help", underline=0, menu=helpmenu)

        parent.config(menu=menubar)
    def _init_feedback(self, parent):
        """Build the "Last Operation:" feedback strip along the bottom."""
        self._feedbackframe = feedbackframe = Frame(parent)
        feedbackframe.pack(fill="x", side="bottom", padx=3, pady=3)
        self._lastoper_label = Label(
            feedbackframe, text="Last Operation:", font=self._font
        )
        self._lastoper_label.pack(side="left")
        lastoperframe = Frame(feedbackframe, relief="sunken", border=1)
        lastoperframe.pack(fill="x", side="right", expand=1, padx=5)
        # _lastoper1 shows the operation name, _lastoper2 its argument.
        self._lastoper1 = Label(
            lastoperframe, foreground="#007070", background="#f0f0f0", font=self._font
        )
        self._lastoper2 = Label(
            lastoperframe,
            anchor="w",
            width=30,
            foreground="#004040",
            background="#f0f0f0",
            font=self._font,
        )
        self._lastoper1.pack(side="left")
        self._lastoper2.pack(side="left", fill="x", expand=1)
def _init_canvas(self, parent):
self._cframe = CanvasFrame(
parent,
background="white",
width=525,
closeenough=10,
border=2,
relief="sunken",
)
self._cframe.pack(expand=1, fill="both", side="top", pady=2)
canvas = self._canvas = self._cframe.canvas()
self._stackwidgets = []
self._rtextwidgets = []
self._titlebar = canvas.create_rectangle(
0, 0, 0, 0, fill="#c0f0f0", outline="black"
)
self._exprline = canvas.create_line(0, 0, 0, 0, dash=".")
self._stacktop = canvas.create_line(0, 0, 0, 0, fill="#408080")
size = self._size.get() + 4
self._stacklabel = TextWidget(
canvas, "Stack", color="#004040", font=self._boldfont
)
self._rtextlabel = TextWidget(
canvas, "Remaining Text", color="#004040", font=self._boldfont
)
self._cframe.add_widget(self._stacklabel)
self._cframe.add_widget(self._rtextlabel)
#########################################
## Main draw procedure
#########################################
    def _redraw(self):
        """Rebuild the stack and remaining-text widgets from the parser
        state, and reposition the labels and the stack/text divider."""
        scrollregion = self._canvas["scrollregion"].split()
        (cx1, cy1, cx2, cy2) = (int(c) for c in scrollregion)
        # Delete the old stack & rtext widgets.
        for stackwidget in self._stackwidgets:
            self._cframe.destroy_widget(stackwidget)
        self._stackwidgets = []
        for rtextwidget in self._rtextwidgets:
            self._cframe.destroy_widget(rtextwidget)
        self._rtextwidgets = []
        # Position the titlebar & exprline
        (x1, y1, x2, y2) = self._stacklabel.bbox()
        y = y2 - y1 + 10
        self._canvas.coords(self._titlebar, -5000, 0, 5000, y - 4)
        self._canvas.coords(self._exprline, 0, y * 2 - 10, 5000, y * 2 - 10)
        # Position the titlebar labels..
        (x1, y1, x2, y2) = self._stacklabel.bbox()
        self._stacklabel.move(5 - x1, 3 - y1)
        (x1, y1, x2, y2) = self._rtextlabel.bbox()
        self._rtextlabel.move(cx2 - x2 - 5, 3 - y1)
        # Draw the stack.
        stackx = 5
        for tok in self._parser.stack():
            if isinstance(tok, Tree):
                attribs = {
                    "tree_color": "#4080a0",
                    "tree_width": 2,
                    "node_font": self._boldfont,
                    "node_color": "#006060",
                    "leaf_color": "#006060",
                    "leaf_font": self._font,
                }
                widget = tree_to_treesegment(self._canvas, tok, **attribs)
                widget.label()["color"] = "#000000"
            else:
                widget = TextWidget(self._canvas, tok, color="#000000", font=self._font)
            # Clicking a stack element pops up the reduction menu.
            widget.bind_click(self._popup_reduce)
            self._stackwidgets.append(widget)
            self._cframe.add_widget(widget, stackx, y)
            stackx = widget.bbox()[2] + 10
        # Draw the remaining text.
        rtextwidth = 0
        for tok in self._parser.remaining_text():
            widget = TextWidget(self._canvas, tok, color="#000000", font=self._font)
            self._rtextwidgets.append(widget)
            self._cframe.add_widget(widget, rtextwidth, y)
            rtextwidth = widget.bbox()[2] + 4
        # Allow enough room to shift the next token (for animations)
        if len(self._rtextwidgets) > 0:
            stackx += self._rtextwidgets[0].width()
        # Move the remaining text to the correct location (keep it
        # right-justified, when possible); and move the remaining text
        # label, if necessary.
        stackx = max(stackx, self._stacklabel.width() + 25)
        rlabelwidth = self._rtextlabel.width() + 10
        if stackx >= cx2 - max(rtextwidth, rlabelwidth):
            cx2 = stackx + max(rtextwidth, rlabelwidth)
        for rtextwidget in self._rtextwidgets:
            rtextwidget.move(4 + cx2 - rtextwidth, 0)
        self._rtextlabel.move(cx2 - self._rtextlabel.bbox()[2] - 5, 0)
        # Divider between stack and remaining text.
        midx = (stackx + cx2 - max(rtextwidth, rlabelwidth)) / 2
        self._canvas.coords(self._stacktop, midx, 0, midx, 5000)
        (x1, y1, x2, y2) = self._stacklabel.bbox()
        # Set up binding to allow them to shift a token by dragging it.
        if len(self._rtextwidgets) > 0:
            def drag_shift(widget, midx=midx, self=self):
                # Dragging the next token past the divider shifts it.
                if widget.bbox()[0] < midx:
                    self.shift()
                else:
                    self._redraw()
            self._rtextwidgets[0].bind_drag(drag_shift)
            self._rtextwidgets[0].bind_click(self.shift)
        # Draw the stack top.
        self._highlight_productions()
def _draw_stack_top(self, widget):
# hack..
midx = widget.bbox()[2] + 50
self._canvas.coords(self._stacktop, midx, 0, midx, 5000)
def _highlight_productions(self):
# Highlight the productions that can be reduced.
self._prodlist.selection_clear(0, "end")
for prod in self._parser.reducible_productions():
index = self._productions.index(prod)
self._prodlist.selection_set(index)
#########################################
## Button Callbacks
#########################################
def destroy(self, *e):
if self._top is None:
return
self._top.destroy()
self._top = None
def reset(self, *e):
self._parser.initialize(self._sent)
self._lastoper1["text"] = "Reset App"
self._lastoper2["text"] = ""
self._redraw()
def step(self, *e):
if self.reduce():
return True
elif self.shift():
return True
else:
if list(self._parser.parses()):
self._lastoper1["text"] = "Finished:"
self._lastoper2["text"] = "Success"
else:
self._lastoper1["text"] = "Finished:"
self._lastoper2["text"] = "Failure"
def shift(self, *e):
if self._animating_lock:
return
if self._parser.shift():
tok = self._parser.stack()[-1]
self._lastoper1["text"] = "Shift:"
self._lastoper2["text"] = "%r" % tok
if self._animate.get():
self._animate_shift()
else:
self._redraw()
return True
return False
def reduce(self, *e):
if self._animating_lock:
return
production = self._parser.reduce()
if production:
self._lastoper1["text"] = "Reduce:"
self._lastoper2["text"] = "%s" % production
if self._animate.get():
self._animate_reduce()
else:
self._redraw()
return production
def undo(self, *e):
if self._animating_lock:
return
if self._parser.undo():
self._redraw()
    def postscript(self, *e):
        # Write a PostScript rendering of the canvas to a user-chosen file.
        self._cframe.print_to_file()
    def mainloop(self, *args, **kwargs):
        """
        Enter the Tkinter mainloop.  This function must be called if
        this demo is created from a non-interactive program (e.g.
        from a script); otherwise, the demo will close as soon as
        the script completes.
        """
        # Inside IDLE, a mainloop is already running; starting another
        # would block it.
        if in_idle():
            return
        self._top.mainloop(*args, **kwargs)
#########################################
## Menubar callbacks
#########################################
def resize(self, size=None):
if size is not None:
self._size.set(size)
size = self._size.get()
self._font.configure(size=-(abs(size)))
self._boldfont.configure(size=-(abs(size)))
self._sysfont.configure(size=-(abs(size)))
# self._stacklabel['font'] = ('helvetica', -size-4, 'bold')
# self._rtextlabel['font'] = ('helvetica', -size-4, 'bold')
# self._lastoper_label['font'] = ('helvetica', -size)
# self._lastoper1['font'] = ('helvetica', -size)
# self._lastoper2['font'] = ('helvetica', -size)
# self._prodlist['font'] = ('helvetica', -size)
# self._prodlist_label['font'] = ('helvetica', -size-2, 'bold')
self._redraw()
def help(self, *e):
# The default font's not very legible; try using 'fixed' instead.
try:
ShowText(
self._top,
"Help: Shift-Reduce Parser Application",
(__doc__ or "").strip(),
width=75,
font="fixed",
)
except:
ShowText(
self._top,
"Help: Shift-Reduce Parser Application",
(__doc__ or "").strip(),
width=75,
)
def about(self, *e):
ABOUT = "NLTK Shift-Reduce Parser Application\n" + "Written by Edward Loper"
TITLE = "About: Shift-Reduce Parser Application"
try:
from tkinter.messagebox import Message
Message(message=ABOUT, title=TITLE).show()
except:
ShowText(self._top, TITLE, ABOUT)
    def edit_grammar(self, *e):
        # Open a CFG editor on the current grammar; set_grammar is the
        # callback invoked with the edited grammar.
        CFGEditor(self._top, self._parser.grammar(), self.set_grammar)
def set_grammar(self, grammar):
self._parser.set_grammar(grammar)
self._productions = list(grammar.productions())
self._prodlist.delete(0, "end")
for production in self._productions:
self._prodlist.insert("end", (" %s" % production))
    def edit_sentence(self, *e):
        # Prompt for a replacement sentence; set_sentence is the callback.
        sentence = " ".join(self._sent)
        title = "Edit Text"
        instr = "Enter a new sentence to parse."
        EntryDialog(self._top, sentence, instr, self.set_sentence, title)
    def set_sentence(self, sent):
        # Tokenize on whitespace and restart the parser on the new sentence.
        self._sent = sent.split()  # [XX] use tagged?
        self.reset()
#########################################
## Reduce Production Selection
#########################################
    def _toggle_grammar(self, *e):
        # Show or hide the production-list frame, tracking the
        # "show grammar" checkbox variable.
        if self._show_grammar.get():
            self._prodframe.pack(
                fill="both", side="left", padx=2, after=self._feedbackframe
            )
            self._lastoper1["text"] = "Show Grammar"
        else:
            self._prodframe.pack_forget()
            self._lastoper1["text"] = "Hide Grammar"
        self._lastoper2["text"] = ""
    def _prodlist_select(self, event):
        # Try to reduce using the production the user selected in the
        # production listbox.
        selection = self._prodlist.curselection()
        if len(selection) != 1:
            return
        index = int(selection[0])
        production = self._parser.reduce(self._productions[index])
        if production:
            self._lastoper1["text"] = "Reduce:"
            self._lastoper2["text"] = "%s" % production
            if self._animate.get():
                self._animate_reduce()
            else:
                self._redraw()
        else:
            # The chosen production was not applicable: restore the listbox
            # selection to the set of currently reducible productions.
            self._prodlist.selection_clear(0, "end")
            for prod in self._parser.reducible_productions():
                index = self._productions.index(prod)
                self._prodlist.selection_set(index)
    def _popup_reduce(self, widget):
        # Pop up a menu of the currently reducible productions at the
        # mouse pointer's position.
        # Remove old commands.
        productions = self._parser.reducible_productions()
        if len(productions) == 0:
            return
        self._reduce_menu.delete(0, "end")
        for production in productions:
            # NOTE(review): each entry invokes self.reduce, which chooses its
            # own production -- the labelled production is not passed along.
            self._reduce_menu.add_command(label=str(production), command=self.reduce)
        self._reduce_menu.post(
            self._canvas.winfo_pointerx(), self._canvas.winfo_pointery()
        )
#########################################
## Animations
#########################################
    def _animate_shift(self):
        # Animate sliding the next remaining-text token leftwards onto the
        # stack.
        # What widget are we shifting?
        widget = self._rtextwidgets[0]
        # Where are we shifting from & to?
        right = widget.bbox()[0]
        if len(self._stackwidgets) == 0:
            left = 5
        else:
            left = self._stackwidgets[-1].bbox()[2] + 10
        # Start animating.  The animation-speed setting doubles as the
        # number of frames.
        dt = self._animate.get()
        dx = (left - right) * 1.0 / dt
        self._animate_shift_frame(dt, widget, dx)
    def _animate_shift_frame(self, frame, widget, dx):
        # Draw one frame of the shift animation, rescheduling itself via
        # Tk's after() until the frame count reaches zero.
        if frame > 0:
            self._animating_lock = 1
            widget.move(dx, 0)
            self._top.after(10, self._animate_shift_frame, frame - 1, widget, dx)
        else:
            # but: stacktop??
            # Shift the widget to the stack.
            del self._rtextwidgets[0]
            self._stackwidgets.append(widget)
            self._animating_lock = 0
            # Display the available productions.
            self._draw_stack_top(widget)
            self._highlight_productions()
    def _animate_reduce(self):
        # Animate moving the reduced widgets down to make room for the new
        # tree-segment node.
        # What widgets are we shifting?
        numwidgets = len(self._parser.stack()[-1])  # number of children
        widgets = self._stackwidgets[-numwidgets:]
        # How far are we moving?
        if isinstance(widgets[0], TreeSegmentWidget):
            ydist = 15 + widgets[0].label().height()
        else:
            ydist = 15 + widgets[0].height()
        # Start animating.  The animation-speed setting doubles as the
        # number of frames.
        dt = self._animate.get()
        dy = ydist * 2.0 / dt
        self._animate_reduce_frame(dt / 2, widgets, dy)
    def _animate_reduce_frame(self, frame, widgets, dy):
        # Draw one frame of the reduce animation; when the frame count
        # reaches zero, replace the animated widgets with a single tree
        # segment at the top of the stack.
        if frame > 0:
            self._animating_lock = 1
            for widget in widgets:
                widget.move(0, dy)
            self._top.after(10, self._animate_reduce_frame, frame - 1, widgets, dy)
        else:
            # Detach the consumed widgets from the stack display.
            del self._stackwidgets[-len(widgets) :]
            for widget in widgets:
                self._cframe.remove_widget(widget)
            # The parser's reduce left a Tree on top of the stack; build a
            # tree-segment widget for it with the old widgets as children.
            tok = self._parser.stack()[-1]
            if not isinstance(tok, Tree):
                raise ValueError()
            label = TextWidget(
                self._canvas, str(tok.label()), color="#006060", font=self._boldfont
            )
            widget = TreeSegmentWidget(self._canvas, label, widgets, width=2)
            (x1, y1, x2, y2) = self._stacklabel.bbox()
            y = y2 - y1 + 10
            if not self._stackwidgets:
                x = 5
            else:
                x = self._stackwidgets[-1].bbox()[2] + 10
            self._cframe.add_widget(widget, x, y)
            self._stackwidgets.append(widget)
            # Display the available productions.
            self._draw_stack_top(widget)
            self._highlight_productions()
            self._animating_lock = 0
#########################################
## Hovering.
#########################################
    def _highlight_hover(self, event):
        # Colour the stack widgets that would be consumed by the production
        # currently under the mouse in the production listbox.
        # What production are we hovering over?
        index = self._prodlist.nearest(event.y)
        if self._hover == index:
            return
        # Clear any previous hover highlighting.
        self._clear_hover()
        # If the production corresponds to an available reduction,
        # highlight the stack.
        selection = [int(s) for s in self._prodlist.curselection()]
        if index in selection:
            rhslen = len(self._productions[index].rhs())
            for stackwidget in self._stackwidgets[-rhslen:]:
                if isinstance(stackwidget, TreeSegmentWidget):
                    stackwidget.label()["color"] = "#00a000"
                else:
                    stackwidget["color"] = "#00a000"
        # Remember what production we're hovering over.
        self._hover = index
def _clear_hover(self, *event):
# Clear any previous hover highlighting.
if self._hover == -1:
return
self._hover = -1
for stackwidget in self._stackwidgets:
if isinstance(stackwidget, TreeSegmentWidget):
stackwidget.label()["color"] = "black"
else:
stackwidget["color"] = "black"
def app():
    """
    Create a shift reduce parser app, using a simple grammar and
    text.
    """
    from nltk.grammar import CFG, Nonterminal, Production

    nonterminals = "S VP NP PP P N Name V Det"
    (S, VP, NP, PP, P, N, Name, V, Det) = (Nonterminal(s) for s in nonterminals.split())

    # A small demonstration grammar.
    productions = (
        # Syntactic Productions
        Production(S, [NP, VP]),
        Production(NP, [Det, N]),
        Production(NP, [NP, PP]),
        Production(VP, [VP, PP]),
        Production(VP, [V, NP, PP]),
        Production(VP, [V, NP]),
        Production(PP, [P, NP]),
        # Lexical Productions
        Production(NP, ["I"]),
        Production(Det, ["the"]),
        Production(Det, ["a"]),
        Production(N, ["man"]),
        Production(V, ["saw"]),
        Production(P, ["in"]),
        Production(P, ["with"]),
        Production(N, ["park"]),
        Production(N, ["dog"]),
        Production(N, ["statue"]),
        Production(Det, ["my"]),
    )

    grammar = CFG(S, productions)

    # tokenize the sentence
    sent = "my dog saw a man in the park with a statue".split()

    ShiftReduceApp(grammar, sent).mainloop()
# Script entry point; the module's public API is just app().
if __name__ == "__main__":
    app()
__all__ = ["app"]

View File

@@ -0,0 +1,36 @@
# Natural Language Toolkit: Wordfreq Application
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from matplotlib import pylab
from nltk.corpus import gutenberg
from nltk.text import Text
def plot_word_freq_dist(text):
    """Plot the cumulative percentage of tokens covered by the 50 most
    common words of *text*."""
    fd = text.vocab()

    samples = [item for item, _ in fd.most_common(50)]
    values = [fd[sample] for sample in samples]
    # Convert raw counts to a cumulative percentage of all tokens.
    values = [sum(values[: i + 1]) * 100.0 / fd.N() for i in range(len(values))]
    pylab.title(text.name)
    pylab.xlabel("Samples")
    pylab.ylabel("Cumulative Percentage")
    pylab.plot(values)
    pylab.xticks(range(len(samples)), [str(s) for s in samples], rotation=90)
    pylab.show()
def app():
    # Plot word-frequency coverage for Moby Dick.
    t1 = Text(gutenberg.words("melville-moby_dick.txt"))
    plot_word_freq_dist(t1)
# Script entry point; the module's public API is just app().
if __name__ == "__main__":
    app()
__all__ = ["app"]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,213 @@
# Natural Language Toolkit: Some texts for exploration in chapter 1 of the book
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from nltk.corpus import (
genesis,
gutenberg,
inaugural,
nps_chat,
treebank,
webtext,
wordnet,
)
from nltk.probability import FreqDist
from nltk.text import Text
from nltk.util import bigrams
print("*** Introductory Examples for the NLTK Book ***")
print("Loading text1, ..., text9 and sent1, ..., sent9")
print("Type the name of the text or sentence to view it.")
print("Type: 'texts()' or 'sents()' to list the materials.")
text1 = Text(gutenberg.words("melville-moby_dick.txt"))
print("text1:", text1.name)
text2 = Text(gutenberg.words("austen-sense.txt"))
print("text2:", text2.name)
text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis")
print("text3:", text3.name)
text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)
text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)
text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail")
print("text6:", text6.name)
text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)
text8 = Text(webtext.words("singles.txt"), name="Personals Corpus")
print("text8:", text8.name)
text9 = Text(gutenberg.words("chesterton-thursday.txt"))
print("text9:", text9.name)
def texts():
    """Print the name of each of the nine book texts."""
    for num, text in enumerate(
        (text1, text2, text3, text4, text5, text6, text7, text8, text9), start=1
    ):
        print(f"text{num}:", text.name)
# The opening sentence of each text, pre-tokenized.  Since every token is
# whitespace-delimited, str.split() yields exactly the same lists as the
# original hand-written literals.
sent1 = ["Call", "me", "Ishmael", "."]
sent2 = "The family of Dashwood had long been settled in Sussex .".split()
sent3 = "In the beginning God created the heaven and the earth .".split()
sent4 = (
    "Fellow - Citizens of the Senate and of the House of Representatives :".split()
)
sent5 = "I have a problem with people PMing me to lol JOIN".split()
sent6 = "SCENE 1 : [ wind ] [ clop clop clop ] KING ARTHUR : Whoa there !".split()
sent7 = (
    "Pierre Vinken , 61 years old , will join the board as a "
    "nonexecutive director Nov. 29 ."
).split()
sent8 = (
    "25 SEXY MALE , seeks attrac older single lady , for discreet encounters ."
).split()
sent9 = (
    "THE suburb of Saffron Park lay on the sunset side of London , "
    "as red and ragged as a cloud of sunset ."
).split()
def sents():
    """Print the pre-tokenized opening sentence of each text."""
    for num, sent in enumerate(
        (sent1, sent2, sent3, sent4, sent5, sent6, sent7, sent8, sent9), start=1
    ):
        print(f"sent{num}:", " ".join(sent))

View File

@@ -0,0 +1,34 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Combinatory Categorial Grammar.
For more information see nltk/doc/contrib/ccg/ccg.pdf
"""
from nltk.ccg.chart import CCGChart, CCGChartParser, CCGEdge, CCGLeafEdge
from nltk.ccg.combinator import (
BackwardApplication,
BackwardBx,
BackwardCombinator,
BackwardComposition,
BackwardSx,
BackwardT,
DirectedBinaryCombinator,
ForwardApplication,
ForwardCombinator,
ForwardComposition,
ForwardSubstitution,
ForwardT,
UndirectedBinaryCombinator,
UndirectedComposition,
UndirectedFunctionApplication,
UndirectedSubstitution,
UndirectedTypeRaise,
)
from nltk.ccg.lexicon import CCGLexicon

View File

@@ -0,0 +1,358 @@
# Natural Language Toolkit: CCG Categories
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from abc import ABCMeta, abstractmethod
from functools import total_ordering
from nltk.internals import raise_unorderable_types
@total_ordering
class AbstractCCGCategory(metaclass=ABCMeta):
    """
    Interface for categories in combinatory grammars.

    Concrete subclasses must set ``self._comparison_key`` (a hashable
    value) in their constructor; equality, ordering and hashing below are
    all driven by it.
    """

    @abstractmethod
    def is_primitive(self):
        """
        Returns true if the category is primitive.
        """

    @abstractmethod
    def is_function(self):
        """
        Returns true if the category is a function application.
        """

    @abstractmethod
    def is_var(self):
        """
        Returns true if the category is a variable.
        """

    @abstractmethod
    def substitute(self, substitutions):
        """
        Takes a set of (var, category) substitutions, and replaces every
        occurrence of the variable with the corresponding category.
        """

    @abstractmethod
    def can_unify(self, other):
        """
        Determines whether two categories can be unified.
        - Returns None if they cannot be unified
        - Returns a list of necessary substitutions if they can.
        """

    # Utility functions: comparison, strings and hashing.
    @abstractmethod
    def __str__(self):
        pass

    def __eq__(self, other):
        # Equal only when the concrete classes match and the comparison
        # keys are equal.
        return (
            self.__class__ is other.__class__
            and self._comparison_key == other._comparison_key
        )

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        # Order by comparison key within a class, otherwise by class name;
        # @total_ordering supplies the remaining comparison operators.
        if not isinstance(other, AbstractCCGCategory):
            raise_unorderable_types("<", self, other)
        if self.__class__ is other.__class__:
            return self._comparison_key < other._comparison_key
        else:
            return self.__class__.__name__ < other.__class__.__name__

    def __hash__(self):
        # Cache the hash of the comparison key on first use.
        try:
            return self._hash
        except AttributeError:
            self._hash = hash(self._comparison_key)
            return self._hash
class CCGVar(AbstractCCGCategory):
    """
    Class representing a variable CCG category.
    Used for conjunctions (and possibly type-raising, if implemented as a
    unary rule).
    """

    # Class-level counter used to hand out unique variable identifiers.
    _maxID = 0

    def __init__(self, prim_only=False):
        """Initialize a variable (selects a new identifier)

        :param prim_only: a boolean that determines whether the variable is
                          restricted to primitives
        :type prim_only: bool
        """
        self._id = self.new_id()
        self._prim_only = prim_only
        self._comparison_key = self._id

    @classmethod
    def new_id(cls):
        """
        A class method allowing generation of unique variable identifiers.
        """
        cls._maxID = cls._maxID + 1
        return cls._maxID - 1

    @classmethod
    def reset_id(cls):
        # Restart the identifier sequence.
        cls._maxID = 0

    def is_primitive(self):
        return False

    def is_function(self):
        return False

    def is_var(self):
        return True

    def substitute(self, substitutions):
        """If there is a substitution corresponding to this variable,
        return the substituted category.
        """
        for var, cat in substitutions:
            if var == self:
                return cat
        return self

    def can_unify(self, other):
        """If the variable can be replaced with other
        a substitution is returned.
        """
        if other.is_primitive() or not self._prim_only:
            return [(self, other)]
        return None

    def id(self):
        return self._id

    def __str__(self):
        return "_var" + str(self._id)
@total_ordering
class Direction:
    """
    The direction of a function application, together with the
    restrictions on which combinators may be used with the category.
    """

    def __init__(self, dir, restrictions):
        self._dir = dir
        self._restrs = restrictions
        self._comparison_key = (dir, tuple(restrictions))

    # Application-direction tests
    def is_forward(self):
        """True for the forward ('/') direction."""
        return self._dir == "/"

    def is_backward(self):
        """True for the backward ('\\') direction."""
        return self._dir == "\\"

    def dir(self):
        return self._dir

    def restrs(self):
        """A list of restrictions on the combinators.
        '.' denotes that permuting operations are disallowed
        ',' denotes that function composition is disallowed
        '_' denotes that the direction has variable restrictions.
        (This is redundant in the current implementation of type-raising)
        """
        return self._restrs

    def is_variable(self):
        return self._restrs == "_"

    # Unification and substitution of variable directions.
    # Used only if type-raising is implemented as a unary rule, as it
    # must inherit restrictions from the argument category.
    def can_unify(self, other):
        """Return the substitutions needed to unify the two directions'
        restrictions, or None if they cannot be unified."""
        if other.is_variable():
            return [("_", self.restrs())]
        if self.is_variable():
            return [("_", other.restrs())]
        return [] if self.restrs() == other.restrs() else None

    def substitute(self, subs):
        """Apply (var, restrictions) substitutions to a variable direction."""
        if self.is_variable():
            for var, restrs in subs:
                if var == "_":
                    return Direction(self._dir, restrs)
        return self

    # Testing permitted combinators
    def can_compose(self):
        return "," not in self._restrs

    def can_cross(self):
        return "." not in self._restrs

    def __eq__(self, other):
        return (
            self.__class__ is other.__class__
            and self._comparison_key == other._comparison_key
        )

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        if not isinstance(other, Direction):
            raise_unorderable_types("<", self, other)
        if self.__class__ is other.__class__:
            return self._comparison_key < other._comparison_key
        return self.__class__.__name__ < other.__class__.__name__

    def __hash__(self):
        # Cache the hash of the comparison key on first use.
        try:
            return self._hash
        except AttributeError:
            self._hash = hash(self._comparison_key)
            return self._hash

    def __str__(self):
        return self._dir + "".join("%s" % r for r in self._restrs)

    # The negation operator reverses the direction of the application
    def __neg__(self):
        return Direction("\\" if self._dir == "/" else "/", self._restrs)
class PrimitiveCategory(AbstractCCGCategory):
    """
    Class representing primitive categories.
    Takes a string representation of the category, and a
    list of strings specifying the morphological subcategories.
    """

    def __init__(self, categ, restrictions=None):
        # ``None`` (the new default) means "no restrictions"; a fresh list
        # is created per instance.  The previous mutable default
        # (``restrictions=[]``) was shared across all calls.
        if restrictions is None:
            restrictions = []
        self._categ = categ
        self._restrs = restrictions
        self._comparison_key = (categ, tuple(restrictions))

    def is_primitive(self):
        return True

    def is_function(self):
        return False

    def is_var(self):
        return False

    def restrs(self):
        return self._restrs

    def categ(self):
        return self._categ

    # Substitution does nothing to a primitive category
    def substitute(self, subs):
        return self

    # A primitive can be unified with a class of the same
    # base category, given that the other category shares all
    # of its subclasses, or with a variable.
    def can_unify(self, other):
        if not other.is_primitive():
            return None
        if other.is_var():
            return [(other, self)]
        if other.categ() == self.categ():
            # Every restriction on self must also hold on other.
            for restr in self._restrs:
                if restr not in other.restrs():
                    return None
            return []
        return None

    def __str__(self):
        if self._restrs == []:
            return "%s" % self._categ
        restrictions = "[%s]" % ",".join(repr(r) for r in self._restrs)
        return f"{self._categ}{restrictions}"
class FunctionalCategory(AbstractCCGCategory):
    """
    Class that represents a function application category.
    Consists of argument and result categories, together with
    an application direction.
    """

    def __init__(self, res, arg, dir):
        self._res = res
        self._arg = arg
        self._dir = dir
        self._comparison_key = (arg, dir, res)

    def is_primitive(self):
        return False

    def is_function(self):
        return True

    def is_var(self):
        return False

    # Substitution returns the category consisting of the
    # substitution applied to each of its constituents.
    def substitute(self, subs):
        sub_res = self._res.substitute(subs)
        sub_dir = self._dir.substitute(subs)
        sub_arg = self._arg.substitute(subs)
        # BUGFIX: use the substituted direction.  Previously the original
        # ``self._dir`` was passed, silently discarding the direction
        # substitution computed above.
        return FunctionalCategory(sub_res, sub_arg, sub_dir)

    # A function can unify with another function, so long as its
    # constituents can unify, or with an unrestricted variable.
    def can_unify(self, other):
        if other.is_var():
            return [(other, self)]
        if other.is_function():
            sa = self._res.can_unify(other.res())
            sd = self._dir.can_unify(other.dir())
            if sa is not None and sd is not None:
                sb = self._arg.substitute(sa).can_unify(other.arg().substitute(sa))
                if sb is not None:
                    return sa + sb
        return None

    # Constituent accessors
    def arg(self):
        return self._arg

    def res(self):
        return self._res

    def dir(self):
        return self._dir

    def __str__(self):
        return f"({self._res}{self._dir}{self._arg})"

View File

@@ -0,0 +1,480 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
The lexicon is constructed by calling
``lexicon.fromstring(<lexicon string>)``.
In order to construct a parser, you also need a rule set.
The standard English rules are provided in chart as
``chart.DefaultRuleSet``.
The parser can then be constructed by calling, for example:
``parser = chart.CCGChartParser(<lexicon>, <ruleset>)``
Parsing is then performed by running
``parser.parse(<sentence>.split())``.
While this returns a list of trees, the default representation
of the produced trees is not very enlightening, particularly
given that it uses the same tree class as the CFG parsers.
It is probably better to call:
``chart.printCCGDerivation(<parse tree extracted from list>)``
which should print a nice representation of the derivation.
This entire process is shown far more clearly in the demonstration:
python chart.py
"""
import itertools
from nltk.ccg.combinator import *
from nltk.ccg.combinator import (
BackwardApplication,
BackwardBx,
BackwardComposition,
BackwardSx,
BackwardT,
ForwardApplication,
ForwardComposition,
ForwardSubstitution,
ForwardT,
)
from nltk.ccg.lexicon import Token, fromstring
from nltk.ccg.logic import *
from nltk.parse import ParserI
from nltk.parse.chart import AbstractChartRule, Chart, EdgeI
from nltk.sem.logic import *
from nltk.tree import Tree
# Based on the EdgeI class from NLTK.
# A number of the properties of the EdgeI interface don't
# transfer well to CCGs, however.
class CCGEdge(EdgeI):
    """A complete chart edge: a CCG category spanning a token range."""

    def __init__(self, span, categ, rule):
        self._span = span
        self._categ = categ
        self._rule = rule
        self._comparison_key = (span, categ, rule)

    # Accessors
    def lhs(self):
        return self._categ

    def span(self):
        return self._span

    def start(self):
        return self._span[0]

    def end(self):
        return self._span[1]

    def length(self):
        # BUGFIX: was ``self._span[1] - self.span[0]``, which subscripts
        # the bound method ``span`` and raises TypeError when called.
        return self._span[1] - self._span[0]

    def rhs(self):
        return ()

    def dot(self):
        return 0

    def is_complete(self):
        return True

    def is_incomplete(self):
        return False

    def nextsym(self):
        return None

    def categ(self):
        return self._categ

    def rule(self):
        return self._rule
class CCGLeafEdge(EdgeI):
    """
    Class representing leaf edges in a CCG derivation.
    """

    def __init__(self, pos, token, leaf):
        self._pos = pos
        self._token = token
        self._leaf = leaf
        self._comparison_key = (pos, token.categ(), leaf)

    # Accessors
    def lhs(self):
        return self._token.categ()

    def span(self):
        # A leaf edge always covers exactly one token.
        return (self._pos, self._pos + 1)

    def start(self):
        return self._pos

    def end(self):
        return self._pos + 1

    def length(self):
        return 1

    def rhs(self):
        return self._leaf

    def dot(self):
        return 0

    def is_complete(self):
        return True

    def is_incomplete(self):
        return False

    def nextsym(self):
        return None

    def token(self):
        return self._token

    def categ(self):
        return self._token.categ()

    def leaf(self):
        return self._leaf
class BinaryCombinatorRule(AbstractChartRule):
    """
    Class implementing application of a binary combinator to a chart.
    Takes the directed combinator to apply.
    """

    NUMEDGES = 2

    def __init__(self, combinator):
        self._combinator = combinator

    # Apply a combinator
    def apply(self, chart, grammar, left_edge, right_edge):
        # The left & right edges must be touching.
        if not (left_edge.end() == right_edge.start()):
            return

        # Check if the two edges are permitted to combine.
        # If so, generate the corresponding edge.
        if self._combinator.can_combine(left_edge.categ(), right_edge.categ()):
            for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
                new_edge = CCGEdge(
                    span=(left_edge.start(), right_edge.end()),
                    categ=res,
                    rule=self._combinator,
                )
                # Only yield edges that were actually new to the chart.
                if chart.insert(new_edge, (left_edge, right_edge)):
                    yield new_edge

    # The representation of the combinator (for printing derivations)
    def __str__(self):
        return "%s" % self._combinator
# Type-raising must be handled slightly differently to the other rules, as the
# resulting rules only span a single edge, rather than both edges.
class ForwardTypeRaiseRule(AbstractChartRule):
    """
    Class for applying forward type raising
    """

    # Two edges are inspected, but the resulting edge spans only the left.
    NUMEDGES = 2

    def __init__(self):
        self._combinator = ForwardT

    def apply(self, chart, grammar, left_edge, right_edge):
        # The two edges must be adjacent.
        if not (left_edge.end() == right_edge.start()):
            return

        for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
            new_edge = CCGEdge(span=left_edge.span(), categ=res, rule=self._combinator)
            if chart.insert(new_edge, (left_edge,)):
                yield new_edge

    def __str__(self):
        return "%s" % self._combinator
class BackwardTypeRaiseRule(AbstractChartRule):
    """
    Class for applying backward type raising.
    """

    # Two edges are inspected, but the resulting edge spans only the right.
    NUMEDGES = 2

    def __init__(self):
        self._combinator = BackwardT

    def apply(self, chart, grammar, left_edge, right_edge):
        # The two edges must be adjacent.
        if not (left_edge.end() == right_edge.start()):
            return

        for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
            new_edge = CCGEdge(span=right_edge.span(), categ=res, rule=self._combinator)
            if chart.insert(new_edge, (right_edge,)):
                yield new_edge

    def __str__(self):
        return "%s" % self._combinator
# Common sets of combinators used for English derivations.
ApplicationRuleSet = [
    BinaryCombinatorRule(ForwardApplication),
    BinaryCombinatorRule(BackwardApplication),
]
CompositionRuleSet = [
    BinaryCombinatorRule(ForwardComposition),
    BinaryCombinatorRule(BackwardComposition),
    BinaryCombinatorRule(BackwardBx),
]
SubstitutionRuleSet = [
    BinaryCombinatorRule(ForwardSubstitution),
    BinaryCombinatorRule(BackwardSx),
]
TypeRaiseRuleSet = [ForwardTypeRaiseRule(), BackwardTypeRaiseRule()]

# The standard English rule set.
DefaultRuleSet = (
    ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet + TypeRaiseRuleSet
)
class CCGChartParser(ParserI):
    """
    Chart parser for CCGs.
    Based largely on the ChartParser class from NLTK.
    """

    def __init__(self, lexicon, rules, trace=0):
        """
        :param lexicon: the CCG lexicon, mapping words to categories
        :param rules: the chart rules (combinators) to apply
        :param trace: tracing level (stored but not used in parse())
        """
        self._lexicon = lexicon
        self._rules = rules
        self._trace = trace

    def lexicon(self):
        return self._lexicon

    # Implements the CYK algorithm
    def parse(self, tokens):
        # One defensive copy is enough (previously the list was copied
        # twice before being handed to the chart).
        chart = CCGChart(list(tokens))
        lex = self._lexicon

        # Initialize leaf edges.
        for index in range(chart.num_leaves()):
            for token in lex.categories(chart.leaf(index)):
                new_edge = CCGLeafEdge(index, token, chart.leaf(index))
                chart.insert(new_edge, ())

        # Select a span for the new edges
        for span in range(2, chart.num_leaves() + 1):
            for start in range(0, chart.num_leaves() - span + 1):
                # Try all possible pairs of edges that could generate
                # an edge for that span
                for part in range(1, span):
                    lstart = start
                    mid = start + part
                    rend = start + span

                    for left in chart.select(span=(lstart, mid)):
                        for right in chart.select(span=(mid, rend)):
                            # Generate all possible combinations of the
                            # two edges.  rule.apply() is a generator whose
                            # side effect inserts edges into the chart, so
                            # it must be exhausted.  (An unused per-rule
                            # edge counter was removed.)
                            for rule in self._rules:
                                for _ in rule.apply(chart, lex, left, right):
                                    pass

        # Output the resulting parses
        return chart.parses(lex.start())
class CCGChart(Chart):
    def __init__(self, tokens):
        Chart.__init__(self, tokens)

    # Constructs the trees for a given parse. Unfortunately, the parse trees need to be
    # constructed slightly differently to those in the default Chart class, so it has to
    # be reimplemented
    def _trees(self, edge, complete, memo, tree_class):
        assert complete, "CCGChart cannot build incomplete trees"

        # Memoize per edge to avoid rebuilding shared subtrees.
        if edge in memo:
            return memo[edge]

        if isinstance(edge, CCGLeafEdge):
            word = tree_class(edge.token(), [self._tokens[edge.start()]])
            leaf = tree_class((edge.token(), "Leaf"), [word])
            memo[edge] = [leaf]
            return [leaf]

        # Placeholder guards against infinite recursion on cyclic
        # child-pointer lists.
        memo[edge] = []
        trees = []

        for cpl in self.child_pointer_lists(edge):
            child_choices = [self._trees(cp, complete, memo, tree_class) for cp in cpl]
            # One tree per combination of child subtrees.
            for children in itertools.product(*child_choices):
                lhs = (
                    Token(
                        self._tokens[edge.start() : edge.end()],
                        edge.lhs(),
                        compute_semantics(children, edge),
                    ),
                    str(edge.rule()),
                )
                trees.append(tree_class(lhs, children))

        memo[edge] = trees
        return trees
def compute_semantics(children, edge):
    """Compute the semantics of an edge from its children's semantics.

    Returns None when the tokens carry no semantics.  Raises
    AssertionError for an unsupported binary combinator.
    """
    if children[0].label()[0].semantics() is None:
        return None

    if len(children) == 2:
        # For backward combinators the functor is on the right.
        if isinstance(edge.rule(), BackwardCombinator):
            children = [children[1], children[0]]

        combinator = edge.rule()._combinator
        function = children[0].label()[0].semantics()
        argument = children[1].label()[0].semantics()

        if isinstance(combinator, UndirectedFunctionApplication):
            return compute_function_semantics(function, argument)
        elif isinstance(combinator, UndirectedComposition):
            return compute_composition_semantics(function, argument)
        elif isinstance(combinator, UndirectedSubstitution):
            return compute_substitution_semantics(function, argument)
        else:
            # BUGFIX: format the combinator with %s.  Concatenating the
            # combinator object directly raised TypeError instead of the
            # intended AssertionError.
            raise AssertionError("Unsupported combinator '%s'" % combinator)
    else:
        return compute_type_raised_semantics(children[0].label()[0].semantics())
# --------
# Displaying derivations
# --------
def printCCGDerivation(tree):
    """Pretty-print a CCG derivation: the words, their lexical categories,
    and then every derivation step, all column-aligned."""
    # (word, category) pairs for the leaves, in surface order.
    leafcats = tree.pos()

    words_line = ""
    cats_line = ""

    # Centre each word and its category inside a shared cell that is two
    # characters wider than the longer of the two.
    for word, category in leafcats:
        cat_text = "%s" % category
        cell = 2 + max(len(word), len(cat_text))

        pad = (cell - len(cat_text)) // 2
        cats_line += " " * pad + cat_text + " " * (cell - len(cat_text) - pad)

        pad = (cell - len(word)) // 2
        words_line += " " * pad + word + " " * (cell - len(word) - pad)

    print(words_line.rstrip())
    print(cats_line.rstrip())

    # Display the derivation steps
    printCCGTree(0, tree)
# Prints the sequence of derivation steps.
def printCCGTree(lwidth, tree):
    """Recursively print the derivation step for ``tree``.

    ``lwidth`` is the column at which this subtree starts; the return value
    is the column just past the subtree, so callers can align siblings.
    """
    rwidth = lwidth

    # Is a leaf (word).
    # Increment the span by the space occupied by the leaf.
    if not isinstance(tree, Tree):
        return 2 + lwidth + len(tree)

    # Find the width of the current derivation step
    for child in tree:
        rwidth = max(rwidth, printCCGTree(rwidth, child))

    # Is a leaf node.
    # Don't print anything, but account for the space occupied.
    if not isinstance(tree.label(), tuple):
        return max(
            rwidth, 2 + lwidth + len("%s" % tree.label()), 2 + lwidth + len(tree[0])
        )

    (token, op) = tree.label()

    # "Leaf" labels were already printed by printCCGDerivation.
    if op == "Leaf":
        return rwidth

    # Pad to the left with spaces, followed by a sequence of '-'
    # and the derivation rule.
    print(lwidth * " " + (rwidth - lwidth) * "-" + "%s" % op)

    # Print the resulting category on a new line, centred under the rule.
    str_res = "%s" % (token.categ())
    if token.semantics() is not None:
        str_res += " {" + str(token.semantics()) + "}"
    respadlen = (rwidth - lwidth - len(str_res)) // 2 + lwidth
    print(respadlen * " " + str_res)
    return rwidth
### Demonstration code
# Construct the lexicon
lex = fromstring(
"""
:- S, NP, N, VP # Primitive categories, S is the target primitive
Det :: NP/N # Family of words
Pro :: NP
TV :: VP/NP
Modal :: (S\\NP)/VP # Backslashes need to be escaped
I => Pro # Word -> Category mapping
you => Pro
the => Det
# Variables have the special keyword 'var'
# '.' prevents permutation
# ',' prevents composition
and => var\\.,var/.,var
which => (N\\N)/(S/NP)
will => Modal # Categories can be either explicit, or families.
might => Modal
cook => TV
eat => TV
mushrooms => N
parsnips => N
bacon => N
"""
)
def demo():
    """Parse an example sentence with the default rule set and print every
    derivation found."""
    parser = CCGChartParser(lex, DefaultRuleSet)
    sentence = "I might cook and eat the bacon".split()
    for derivation in parser.parse(sentence):
        printCCGDerivation(derivation)


if __name__ == "__main__":
    demo()

View File

@@ -0,0 +1,340 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CCG Combinators
"""
from abc import ABCMeta, abstractmethod
from nltk.ccg.api import FunctionalCategory
class UndirectedBinaryCombinator(metaclass=ABCMeta):
    """
    Abstract class for representing a binary combinator.
    Merely defines functions for checking if the function and argument
    are able to be combined, and what the resulting category is.

    Note that as no assumptions are made as to direction, the unrestricted
    combinators can perform all backward, forward and crossed variations
    of the combinators; these restrictions must be added in the rule
    class.
    """

    @abstractmethod
    def can_combine(self, function, argument):
        """Return whether ``function`` and ``argument`` may be combined."""
        pass

    @abstractmethod
    def combine(self, function, argument):
        """Generate the result categories of combining the pair."""
        pass
class DirectedBinaryCombinator(metaclass=ABCMeta):
    """
    Wrapper for the undirected binary combinator.
    It takes left and right categories, and decides which is to be
    the function, and which the argument.
    It then decides whether or not they can be combined.
    """

    @abstractmethod
    def can_combine(self, left, right):
        """Return whether the left/right categories may be combined."""
        pass

    @abstractmethod
    def combine(self, left, right):
        """Generate the result categories of combining left and right."""
        pass
class ForwardCombinator(DirectedBinaryCombinator):
    """
    Directed combinator whose primary functor is the left category.

    Pairs an undirected combinator with a predicate imposing any extra
    constraints (typically directional) on when the rule may apply.
    """

    def __init__(self, combinator, predicate, suffix=""):
        self._combinator = combinator
        self._predicate = predicate
        self._suffix = suffix

    def can_combine(self, left, right):
        # The wrapped combinator must accept the pair, and the extra
        # predicate must hold as well.
        applicable = self._combinator.can_combine(left, right)
        return applicable and self._predicate(left, right)

    def combine(self, left, right):
        # Left is the functor; delegate in the given order.
        yield from self._combinator.combine(left, right)

    def __str__(self):
        return ">" + str(self._combinator) + self._suffix
class BackwardCombinator(DirectedBinaryCombinator):
    """
    The backward equivalent of the ForwardCombinator class: the primary
    functor is the right category, so the pair is swapped before being
    handed to the wrapped undirected combinator.
    """

    def __init__(self, combinator, predicate, suffix=""):
        self._combinator = combinator
        self._predicate = predicate
        self._suffix = suffix

    def can_combine(self, left, right):
        # Swap: right is the functor for a backward rule.
        applicable = self._combinator.can_combine(right, left)
        return applicable and self._predicate(left, right)

    def combine(self, left, right):
        yield from self._combinator.combine(right, left)

    def __str__(self):
        return "<" + str(self._combinator) + self._suffix
class UndirectedFunctionApplication(UndirectedBinaryCombinator):
    """
    Function application combinator.

    Implements rules of the form:
    X/Y Y -> X (>)
    And the corresponding backwards application rule
    """

    def can_combine(self, function, argument):
        # Only a functional category can be applied, and its argument slot
        # must unify with the supplied argument.
        if not function.is_function():
            return False
        return function.arg().can_unify(argument) is not None

    def combine(self, function, argument):
        if not function.is_function():
            return

        substitution = function.arg().can_unify(argument)
        if substitution is None:
            return

        # The result is the functor's result category under the unifying
        # substitution.
        yield function.res().substitute(substitution)

    def __str__(self):
        return ""
# Predicates for function application.

# Ensures the left functor takes an argument on the right
def forwardOnly(left, right):
    """Return True iff the left category seeks its argument to the right."""
    direction = left.dir()
    return direction.is_forward()
# Ensures the right functor takes an argument on the left
def backwardOnly(left, right):
    """Return True iff the right category seeks its argument to the left."""
    direction = right.dir()
    return direction.is_backward()
# Application combinator instances
# Forward (>) and backward (<) function application: the shared undirected
# combinator restricted by the matching directional predicate.
ForwardApplication = ForwardCombinator(UndirectedFunctionApplication(), forwardOnly)
BackwardApplication = BackwardCombinator(UndirectedFunctionApplication(), backwardOnly)
class UndirectedComposition(UndirectedBinaryCombinator):
    """
    Functional composition (harmonic) combinator.
    Implements rules of the form
    X/Y Y/Z -> X/Z (B>)
    And the corresponding backwards and crossed variations.
    """

    def can_combine(self, function, argument):
        # Composition is defined only between two functions whose slash
        # annotations both permit composition.
        if not (function.is_function() and argument.is_function()):
            return False
        if not (function.dir().can_compose() and argument.dir().can_compose()):
            return False
        return function.arg().can_unify(argument.res()) is not None

    def combine(self, function, argument):
        if not (function.is_function() and argument.is_function()):
            return
        if not (function.dir().can_compose() and argument.dir().can_compose()):
            return

        substitution = function.arg().can_unify(argument.res())
        if substitution is None:
            return

        # X/Y composed with Y/Z yields X/Z, keeping the argument's slash.
        yield FunctionalCategory(
            function.res().substitute(substitution),
            argument.arg().substitute(substitution),
            argument.dir(),
        )

    def __str__(self):
        return "B"
# Predicates for restricting application of straight composition.
def bothForward(left, right):
    # Both categories seek their argument to the right.
    return left.dir().is_forward() and right.dir().is_forward()


def bothBackward(left, right):
    # Both categories seek their argument to the left.
    return left.dir().is_backward() and right.dir().is_backward()


# Predicates for crossed composition
def crossedDirs(left, right):
    # Left looks rightward while right looks leftward (crossed inwards).
    return left.dir().is_forward() and right.dir().is_backward()
def backwardBxConstraint(left, right):
    """Constraint predicate for backward crossed composition (Bx)."""
    # The functors must be crossed inwards
    if not crossedDirs(left, right):
        return False
    # Permuting combinators must be allowed on BOTH slashes.
    # BUG FIX: the original `not left.dir().can_cross() and right.dir().can_cross()`
    # parsed as `(not left...) and right...` because `not` binds tighter than
    # `and`, so the rule could fire even when crossing was forbidden.
    if not (left.dir().can_cross() and right.dir().can_cross()):
        return False
    # The resulting argument category is restricted to be primitive
    return left.arg().is_primitive()
# Straight composition combinators
# Forward (B>) and backward (B<) harmonic composition.
ForwardComposition = ForwardCombinator(UndirectedComposition(), forwardOnly)
BackwardComposition = BackwardCombinator(UndirectedComposition(), backwardOnly)

# Backward crossed composition
BackwardBx = BackwardCombinator(
    UndirectedComposition(), backwardBxConstraint, suffix="x"
)
class UndirectedSubstitution(UndirectedBinaryCombinator):
    r"""
    Substitution (permutation) combinator.
    Implements rules of the form
    Y/Z (X\Y)/Z -> X/Z (<Sx)
    And other variations.
    """

    def can_combine(self, function, argument):
        # Both categories must themselves be functions.
        if function.is_primitive() or argument.is_primitive():
            return False

        # These could potentially be moved to the predicates, as the
        # constraints may not be general to all languages.
        if function.res().is_primitive():
            return False
        if not function.arg().is_primitive():
            return False

        if not (function.dir().can_compose() and argument.dir().can_compose()):
            return False

        shares_inner = function.res().arg() == argument.res()
        shares_outer = function.arg() == argument.arg()
        return shares_inner and shares_outer

    def combine(self, function, argument):
        if not self.can_combine(function, argument):
            return
        yield FunctionalCategory(
            function.res().res(), argument.arg(), argument.dir()
        )

    def __str__(self):
        return "S"
# Predicate for forward substitution
def forwardSConstraint(left, right):
    # Both slashes forward, the left result's slash forward, and the shared
    # argument category primitive.
    if not bothForward(left, right):
        return False
    return left.res().dir().is_forward() and left.arg().is_primitive()
# Predicate for backward crossed substitution
def backwardSxConstraint(left, right):
    """Constraint predicate for backward crossed substitution (Sx)."""
    # Crossing must be permitted on BOTH slashes.
    # BUG FIX: the original `not left.dir().can_cross() and right.dir().can_cross()`
    # parsed as `(not left...) and right...` because `not` binds tighter than
    # `and`, so the rule could fire even when crossing was forbidden.
    if not (left.dir().can_cross() and right.dir().can_cross()):
        return False
    if not bothForward(left, right):
        return False
    return right.res().dir().is_backward() and right.arg().is_primitive()
# Instances of substitution combinators
ForwardSubstitution = ForwardCombinator(UndirectedSubstitution(), forwardSConstraint)
# Note: the "x" suffix is passed positionally here.
BackwardSx = BackwardCombinator(UndirectedSubstitution(), backwardSxConstraint, "x")
# Retrieves the left-most functional category.
# ie, (N\N)/(S/NP) => N\N
def innermostFunction(categ):
    """Follow result categories down until the result is non-functional,
    returning the innermost functional category."""
    current = categ
    while current.res().is_function():
        current = current.res()
    return current
class UndirectedTypeRaise(UndirectedBinaryCombinator):
    """
    Undirected combinator for type raising.
    """

    def can_combine(self, function, arg):
        # The argument must be a function.
        # The restriction that arg.res() must be a function
        # merely reduces redundant type-raising; if arg.res() is
        # primitive, we have:
        # X Y\X =>(<T) Y/(Y\X) Y\X =>(>) Y
        # which is equivalent to
        # X Y\X =>(<) Y
        if not (arg.is_function() and arg.res().is_function()):
            return False

        arg = innermostFunction(arg)

        # BUG FIX: the original referenced undefined names `left` and
        # `arg_categ`; mirror combine() by unifying `function` with the
        # innermost argument category.
        subs = function.can_unify(arg.arg())
        return subs is not None

    def combine(self, function, arg):
        if not (
            function.is_primitive() and arg.is_function() and arg.res().is_function()
        ):
            return

        # Type-raising matches only the innermost application.
        arg = innermostFunction(arg)

        subs = function.can_unify(arg.arg())
        if subs is not None:
            xcat = arg.res().substitute(subs)
            # X becomes Y|(Y|X), with the inner slash flipped relative to
            # the matched functor's direction.
            yield FunctionalCategory(
                xcat, FunctionalCategory(xcat, function, arg.dir()), -(arg.dir())
            )

    def __str__(self):
        return "T"
# Predicates for type-raising
# The direction of the innermost category must be towards
# the primary functor.
# The restriction that the variable must be primitive is not
# common to all versions of CCGs; some authors have other restrictions.
def forwardTConstraint(left, right):
    # Forward T: the innermost functor on the right must look left and
    # yield a primitive result.
    arg = innermostFunction(right)
    return arg.dir().is_backward() and arg.res().is_primitive()


def backwardTConstraint(left, right):
    # Backward T: mirror image of forwardTConstraint.
    arg = innermostFunction(left)
    return arg.dir().is_forward() and arg.res().is_primitive()


# Instances of type-raising combinators
ForwardT = ForwardCombinator(UndirectedTypeRaise(), forwardTConstraint)
BackwardT = BackwardCombinator(UndirectedTypeRaise(), backwardTConstraint)

View File

@@ -0,0 +1,338 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CCG Lexicons
"""
import re
from collections import defaultdict
from nltk.ccg.api import CCGVar, Direction, FunctionalCategory, PrimitiveCategory
from nltk.internals import deprecated
from nltk.sem.logic import Expression
# ------------
# Regular expressions used for parsing components of the lexicon
# ------------

# Parses a primitive category and subscripts, e.g. "NP[sg]" -> ("NP", "[sg]")
PRIM_RE = re.compile(r"""([A-Za-z]+)(\[[A-Za-z,]+\])?""")

# Separates the next primitive category from the remainder of the
# string
NEXTPRIM_RE = re.compile(r"""([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)""")

# Separates the next application operator from the remainder
# (a slash plus up to two '.'/',' restriction flags)
APP_RE = re.compile(r"""([\\/])([.,]?)([.,]?)(.*)""")

# Parses the definition of the right-hand side (rhs) of either a word or a family
LEX_RE = re.compile(r"""([\S_]+)\s*(::|[-=]+>)\s*(.+)""", re.UNICODE)

# Parses the right hand side that contains category and maybe semantic predicate
RHS_RE = re.compile(r"""([^{}]*[^ {}])\s*(\{[^}]+\})?""", re.UNICODE)

# Parses the semantic predicate
SEMANTICS_RE = re.compile(r"""\{([^}]+)\}""", re.UNICODE)

# Strips comments from a line (everything after the first '#')
COMMENTS_RE = re.compile("""([^#]*)(?:#.*)?""")
class Token:
    """
    Class representing a token.

    token => category {semantics}
    e.g. eat => S\\var[pl]/var {\\x y.eat(x,y)}

    * `token` (string)
    * `categ` (string)
    * `semantics` (Expression)
    """

    def __init__(self, token, categ, semantics=None):
        self._token = token
        self._categ = categ
        self._semantics = semantics

    def categ(self):
        """Return the token's syntactic category."""
        return self._categ

    def semantics(self):
        """Return the token's semantic expression, or None."""
        return self._semantics

    def __str__(self):
        semantics_str = ""
        if self._semantics is not None:
            semantics_str = " {" + str(self._semantics) + "}"
        return str(self._categ) + semantics_str

    def __cmp__(self, other):
        # NOTE(review): __cmp__ is the Python 2 comparison protocol and is
        # never invoked under Python 3; kept only for backward compatibility.
        if not isinstance(other, Token):
            return -1
        # BUG FIX: cmp() takes exactly two operands; the original passed
        # three arguments (a tuple plus two extras), which raised TypeError.
        return cmp(
            (self._categ, self._semantics), (other.categ(), other.semantics())
        )
class CCGLexicon:
    """
    Class representing a lexicon for CCG grammars.

    * `primitives`: The list of primitive categories for the lexicon
    * `families`: Families of categories
    * `entries`: A mapping of words to possible categories
    """

    def __init__(self, start, primitives, families, entries):
        # ``start`` is wrapped in a PrimitiveCategory: it is the target
        # category a complete parse must derive.
        self._start = PrimitiveCategory(start)
        self._primitives = primitives
        self._families = families
        self._entries = entries

    def categories(self, word):
        """
        Returns all the possible categories for a word
        """
        # NOTE(review): when ``entries`` is a defaultdict (as built by
        # fromstring), looking up an unknown word silently inserts and
        # returns an empty list instead of raising KeyError.
        return self._entries[word]

    def start(self):
        """
        Return the target category for the parser
        """
        return self._start

    def __str__(self):
        """
        String representation of the lexicon. Used for debugging.
        """
        string = ""
        first = True
        for ident in sorted(self._entries):
            # ``first`` does double duty: it suppresses the newline before
            # the very first word, and (after being reset below) the " | "
            # separator before each word's first category.
            if not first:
                string = string + "\n"
            string = string + ident + " => "

            first = True
            for cat in self._entries[ident]:
                if not first:
                    string = string + " | "
                else:
                    first = False
                string = string + "%s" % cat
        return string
# -----------
# Parsing lexicons
# -----------


def matchBrackets(string):
    """
    Separate the contents matching the first set of brackets from the rest of
    the input.

    ``string`` is assumed to begin with "(". Returns a pair
    ``(bracketed_part, remainder)``; raises AssertionError when the opening
    bracket is never closed.
    """
    remainder = string[1:]
    contents = "("

    while remainder and not remainder.startswith(")"):
        if remainder[0] == "(":
            # Recurse to consume a nested bracketed group whole.
            nested, remainder = matchBrackets(remainder)
            contents += nested
        else:
            contents += remainder[0]
            remainder = remainder[1:]

    if remainder.startswith(")"):
        return (contents + ")", remainder[1:])
    raise AssertionError("Unmatched bracket in string '" + string + "'")
def nextCategory(string):
    """
    Separate the string for the next portion of the category from the rest
    of the string

    Returns a ``(category_chunk, remainder)`` pair: bracketed chunks are
    matched with matchBrackets, otherwise the leading primitive (with any
    subscript) is split off via NEXTPRIM_RE.
    """
    if string.startswith("("):
        return matchBrackets(string)
    return NEXTPRIM_RE.match(string).groups()
def parseApplication(app):
    """
    Parse an application operator

    ``app`` is the sequence of APP_RE groups: the slash character followed
    by up to two restriction flags ('.' or ',').
    """
    return Direction(app[0], app[1:])
def parseSubscripts(subscr):
    """
    Parse the subscripts for a primitive category.

    ``subscr`` is either a "[a,b,...]" string or a false value (None/"");
    returns the list of subscript names, empty when there are none.
    """
    if not subscr:
        return []
    # Drop the surrounding brackets and split on commas.
    return subscr[1:-1].split(",")
def parsePrimitiveCategory(chunks, primitives, families, var):
    """
    Parse a primitive category

    If the primitive is the special category 'var', replace it with the
    correct `CCGVar`.

    ``chunks`` are the PRIM_RE groups (name, optional "[...]" subscript);
    returns a ``(category, var)`` pair so the same variable can be shared
    across the whole category being parsed.
    """
    if chunks[0] == "var":
        if chunks[1] is None:
            # Lazily allocate a single shared CCGVar per category string.
            if var is None:
                var = CCGVar()
            return (var, var)

    catstr = chunks[0]
    if catstr in families:
        (cat, cvar) = families[catstr]
        if var is None:
            var = cvar
        else:
            # Rename the family's variable to the one already in use here.
            cat = cat.substitute([(cvar, var)])
        return (cat, var)

    if catstr in primitives:
        subscrs = parseSubscripts(chunks[1])
        return (PrimitiveCategory(catstr, subscrs), var)
    raise AssertionError(
        "String '" + catstr + "' is neither a family nor primitive category."
    )
def augParseCategory(line, primitives, families, var=None):
    """
    Parse a string representing a category, and returns a tuple with
    (possibly) the CCG variable for the category

    The first chunk becomes the result category; each following
    slash-operator/chunk pair wraps it in a FunctionalCategory.
    """
    (cat_string, rest) = nextCategory(line)
    (res, var) = _parse_category_chunk(cat_string, primitives, families, var)

    while rest != "":
        app = APP_RE.match(rest).groups()
        direction = parseApplication(app[0:3])
        rest = app[3]

        (cat_string, rest) = nextCategory(rest)
        (arg, var) = _parse_category_chunk(cat_string, primitives, families, var)
        res = FunctionalCategory(res, arg, direction)

    return (res, var)


def _parse_category_chunk(cat_string, primitives, families, var):
    # Parse a single category chunk: recurse into a bracketed subcategory,
    # otherwise treat it as a primitive/family name with optional subscript.
    if cat_string.startswith("("):
        return augParseCategory(cat_string[1:-1], primitives, families, var)
    return parsePrimitiveCategory(
        PRIM_RE.match(cat_string).groups(), primitives, families, var
    )
def fromstring(lex_str, include_semantics=False):
    """
    Convert string representation into a lexicon for CCGs.

    Lines starting with ":-" declare primitive categories (the first is the
    parser's target); "ident :: cat" defines a family; "ident => cat" adds
    a word entry, optionally followed by "{semantics}".  Raises IndexError
    if no primitive categories are declared, and AssertionError when
    include_semantics is True but an entry lacks a semantic predicate.
    """
    CCGVar.reset_id()
    primitives = []
    families = {}
    entries = defaultdict(list)
    for line in lex_str.splitlines():
        # Strip comments and leading/trailing whitespace.
        line = COMMENTS_RE.match(line).groups()[0].strip()
        if line == "":
            continue

        if line.startswith(":-"):
            # A line of primitive categories.
            # The first one is the target category
            # ie, :- S, N, NP, VP
            primitives = primitives + [
                prim.strip() for prim in line[2:].strip().split(",")
            ]
        else:
            # Either a family definition, or a word definition
            (ident, sep, rhs) = LEX_RE.match(line).groups()
            (catstr, semantics_str) = RHS_RE.match(rhs).groups()
            (cat, var) = augParseCategory(catstr, primitives, families)

            if sep == "::":
                # Family definition
                # ie, Det :: NP/N
                families[ident] = (cat, var)
            else:
                semantics = None
                if include_semantics is True:
                    if semantics_str is None:
                        raise AssertionError(
                            line
                            + " must contain semantics because include_semantics is set to True"
                        )
                    else:
                        semantics = Expression.fromstring(
                            SEMANTICS_RE.match(semantics_str).groups()[0]
                        )
                # Word definition
                # ie, which => (N\N)/(S/NP)
                entries[ident].append(Token(ident, cat, semantics))
    return CCGLexicon(primitives[0], primitives, families, entries)
@deprecated("Use fromstring() instead.")
def parseLexicon(lex_str):
    # Deprecated alias kept for backward compatibility.
    return fromstring(lex_str)
openccg_tinytiny = fromstring(
"""
# Rather minimal lexicon based on the openccg `tinytiny' grammar.
# Only incorporates a subset of the morphological subcategories, however.
:- S,NP,N # Primitive categories
Det :: NP/N # Determiners
Pro :: NP
IntransVsg :: S\\NP[sg] # Tensed intransitive verbs (singular)
IntransVpl :: S\\NP[pl] # Plural
TransVsg :: S\\NP[sg]/NP # Tensed transitive verbs (singular)
TransVpl :: S\\NP[pl]/NP # Plural
the => NP[sg]/N[sg]
the => NP[pl]/N[pl]
I => Pro
me => Pro
we => Pro
us => Pro
book => N[sg]
books => N[pl]
peach => N[sg]
peaches => N[pl]
policeman => N[sg]
policemen => N[pl]
boy => N[sg]
boys => N[pl]
sleep => IntransVsg
sleep => IntransVpl
eat => IntransVpl
eat => TransVpl
eats => IntransVsg
eats => TransVsg
see => TransVpl
sees => TransVsg
"""
)

View File

@@ -0,0 +1,63 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Tanin Na Nakorn (@tanin)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Helper functions for CCG semantics computation
"""
import copy
from nltk.sem.logic import *
def compute_type_raised_semantics(semantics):
    """Type-raise a semantic expression.

    Deep-copies ``semantics``, walks past its leading lambda binders to the
    core term, applies a fresh function variable F to that core, and wraps
    the whole expression in a lambda over F.
    """
    semantics_copy = copy.deepcopy(semantics)
    core = semantics_copy
    parent = None
    # Descend to the innermost term under the leading lambda binders.
    while isinstance(core, LambdaExpression):
        parent = core
        core = core.term

    # Choose a function-variable name that is not free in the core.
    var = Variable("F")
    while var in core.free():
        var = unique_variable(pattern=var)
    core = ApplicationExpression(FunctionVariableExpression(var), core)

    if parent is not None:
        # Splice the wrapped core back under the binders (mutates only the
        # deep copy, never the caller's expression).
        parent.term = core
    else:
        semantics_copy = core

    return LambdaExpression(var, semantics_copy)
def compute_function_semantics(function, argument):
    """Semantics of function application: apply and beta-reduce."""
    return ApplicationExpression(function, argument).simplify()
def compute_composition_semantics(function, argument):
    """Semantics of composition: \\x. function(argument-body), reusing the
    argument's bound variable.

    :raises AssertionError: if ``argument`` is not a lambda expression.
    """
    assert isinstance(argument, LambdaExpression), (
        "`" + str(argument) + "` must be a lambda expression"
    )
    return LambdaExpression(
        argument.variable, ApplicationExpression(function, argument.term).simplify()
    )
def compute_substitution_semantics(function, argument):
    """Semantics of substitution: \\x. function-body(argument(x)), where x is
    the (two-argument) function's outer bound variable.

    :raises AssertionError: if ``function`` is not a two-argument lambda, or
        ``argument`` is not a lambda expression.
    """
    assert isinstance(function, LambdaExpression) and isinstance(
        function.term, LambdaExpression
    ), ("`" + str(function) + "` must be a lambda expression with 2 arguments")
    assert isinstance(argument, LambdaExpression), (
        "`" + str(argument) + "` must be a lambda expression"
    )

    # Feed the shared variable to the argument first, then to the function's
    # inner body, beta-reducing at each step.
    new_argument = ApplicationExpression(
        argument, VariableExpression(function.variable)
    ).simplify()
    new_term = ApplicationExpression(function.term, new_argument).simplify()

    return LambdaExpression(function.variable, new_term)

View File

@@ -0,0 +1,48 @@
# Natural Language Toolkit: Chatbots
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
# Based on an Eliza implementation by Joe Strout <joe@strout.net>,
# Jeff Epler <jepler@inetnebr.com> and Jez Higgins <jez@jezuk.co.uk>.
"""
A class for simple chatbots. These perform simple pattern matching on sentences
typed by users, and respond with automatically generated sentences.
These chatbots may not work using the windows command line or the
windows IDLE GUI.
"""
from nltk.chat.eliza import eliza_chat
from nltk.chat.iesha import iesha_chat
from nltk.chat.rude import rude_chat
from nltk.chat.suntsu import suntsu_chat
from nltk.chat.util import Chat
from nltk.chat.zen import zen_chat
# (callable, description) pairs for each demo chatbot, in menu order.
bots = [
    (eliza_chat, "Eliza (psycho-babble)"),
    (iesha_chat, "Iesha (teen anime junky)"),
    (rude_chat, "Rude (abusive bot)"),
    (suntsu_chat, "Suntsu (Chinese sayings)"),
    (zen_chat, "Zen (gems of wisdom)"),
]
def chatbots():
    """Prompt the user to pick one of the demo chatbots and run it."""
    print("Which chatbot would you like to talk to?")
    botcount = len(bots)
    # Menu is 1-based for the user.
    for index, (_, description) in enumerate(bots, start=1):
        print(" %d: %s" % (index, description))
    while True:
        choice = input(f"\nEnter a number in the range 1-{botcount}: ").strip()
        if choice.isdigit() and (int(choice) - 1) in range(botcount):
            break
        print(" Error: bad chatbot number")

    chatbot = bots[int(choice) - 1][0]
    chatbot()

View File

@@ -0,0 +1,337 @@
# Natural Language Toolkit: Eliza
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
# Based on an Eliza implementation by Joe Strout <joe@strout.net>,
# Jeff Epler <jepler@inetnebr.com> and Jez Higgins <mailto:jez@jezuk.co.uk>.
# a translation table used to convert things you say into things the
# computer says back, e.g. "I am" --> "you are"
from nltk.chat.util import Chat, reflections
# a table of response pairs, where each pair consists of a
# regular expression, and a list of possible responses,
# with group-macros labelled as %1, %2.
pairs = (
(
r"I need (.*)",
(
"Why do you need %1?",
"Would it really help you to get %1?",
"Are you sure you need %1?",
),
),
(
r"Why don\'t you (.*)",
(
"Do you really think I don't %1?",
"Perhaps eventually I will %1.",
"Do you really want me to %1?",
),
),
(
r"Why can\'t I (.*)",
(
"Do you think you should be able to %1?",
"If you could %1, what would you do?",
"I don't know -- why can't you %1?",
"Have you really tried?",
),
),
(
r"I can\'t (.*)",
(
"How do you know you can't %1?",
"Perhaps you could %1 if you tried.",
"What would it take for you to %1?",
),
),
(
r"I am (.*)",
(
"Did you come to me because you are %1?",
"How long have you been %1?",
"How do you feel about being %1?",
),
),
(
r"I\'m (.*)",
(
"How does being %1 make you feel?",
"Do you enjoy being %1?",
"Why do you tell me you're %1?",
"Why do you think you're %1?",
),
),
(
r"Are you (.*)",
(
"Why does it matter whether I am %1?",
"Would you prefer it if I were not %1?",
"Perhaps you believe I am %1.",
"I may be %1 -- what do you think?",
),
),
(
r"What (.*)",
(
"Why do you ask?",
"How would an answer to that help you?",
"What do you think?",
),
),
(
r"How (.*)",
(
"How do you suppose?",
"Perhaps you can answer your own question.",
"What is it you're really asking?",
),
),
(
r"Because (.*)",
(
"Is that the real reason?",
"What other reasons come to mind?",
"Does that reason apply to anything else?",
"If %1, what else must be true?",
),
),
(
r"(.*) sorry (.*)",
(
"There are many times when no apology is needed.",
"What feelings do you have when you apologize?",
),
),
(
r"Hello(.*)",
(
"Hello... I'm glad you could drop by today.",
"Hi there... how are you today?",
"Hello, how are you feeling today?",
),
),
(
r"I think (.*)",
("Do you doubt %1?", "Do you really think so?", "But you're not sure %1?"),
),
(
r"(.*) friend (.*)",
(
"Tell me more about your friends.",
"When you think of a friend, what comes to mind?",
"Why don't you tell me about a childhood friend?",
),
),
(r"Yes", ("You seem quite sure.", "OK, but can you elaborate a bit?")),
(
r"(.*) computer(.*)",
(
"Are you really talking about me?",
"Does it seem strange to talk to a computer?",
"How do computers make you feel?",
"Do you feel threatened by computers?",
),
),
(
r"Is it (.*)",
(
"Do you think it is %1?",
"Perhaps it's %1 -- what do you think?",
"If it were %1, what would you do?",
"It could well be that %1.",
),
),
(
r"It is (.*)",
(
"You seem very certain.",
"If I told you that it probably isn't %1, what would you feel?",
),
),
(
r"Can you (.*)",
(
"What makes you think I can't %1?",
"If I could %1, then what?",
"Why do you ask if I can %1?",
),
),
(
r"Can I (.*)",
(
"Perhaps you don't want to %1.",
"Do you want to be able to %1?",
"If you could %1, would you?",
),
),
(
r"You are (.*)",
(
"Why do you think I am %1?",
"Does it please you to think that I'm %1?",
"Perhaps you would like me to be %1.",
"Perhaps you're really talking about yourself?",
),
),
(
r"You\'re (.*)",
(
"Why do you say I am %1?",
"Why do you think I am %1?",
"Are we talking about you, or me?",
),
),
(
r"I don\'t (.*)",
("Don't you really %1?", "Why don't you %1?", "Do you want to %1?"),
),
(
r"I feel (.*)",
(
"Good, tell me more about these feelings.",
"Do you often feel %1?",
"When do you usually feel %1?",
"When you feel %1, what do you do?",
),
),
(
r"I have (.*)",
(
"Why do you tell me that you've %1?",
"Have you really %1?",
"Now that you have %1, what will you do next?",
),
),
(
r"I would (.*)",
(
"Could you explain why you would %1?",
"Why would you %1?",
"Who else knows that you would %1?",
),
),
(
r"Is there (.*)",
(
"Do you think there is %1?",
"It's likely that there is %1.",
"Would you like there to be %1?",
),
),
(
r"My (.*)",
(
"I see, your %1.",
"Why do you say that your %1?",
"When your %1, how do you feel?",
),
),
(
r"You (.*)",
(
"We should be discussing you, not me.",
"Why do you say that about me?",
"Why do you care whether I %1?",
),
),
(r"Why (.*)", ("Why don't you tell me the reason why %1?", "Why do you think %1?")),
(
r"I want (.*)",
(
"What would it mean to you if you got %1?",
"Why do you want %1?",
"What would you do if you got %1?",
"If you got %1, then what would you do?",
),
),
(
r"(.*) mother(.*)",
(
"Tell me more about your mother.",
"What was your relationship with your mother like?",
"How do you feel about your mother?",
"How does this relate to your feelings today?",
"Good family relations are important.",
),
),
(
r"(.*) father(.*)",
(
"Tell me more about your father.",
"How did your father make you feel?",
"How do you feel about your father?",
"Does your relationship with your father relate to your feelings today?",
"Do you have trouble showing affection with your family?",
),
),
(
r"(.*) child(.*)",
(
"Did you have close friends as a child?",
"What is your favorite childhood memory?",
"Do you remember any dreams or nightmares from childhood?",
"Did the other children sometimes tease you?",
"How do you think your childhood experiences relate to your feelings today?",
),
),
(
r"(.*)\?",
(
"Why do you ask that?",
"Please consider whether you can answer your own question.",
"Perhaps the answer lies within yourself?",
"Why don't you tell me?",
),
),
(
r"quit",
(
"Thank you for talking with me.",
"Good-bye.",
"Thank you, that will be $150. Have a good day!",
),
),
(
r"(.*)",
(
"Please tell me more.",
"Let's change focus a bit... Tell me about your family.",
"Can you elaborate on that?",
"Why do you say that %1?",
"I see.",
"Very interesting.",
"%1.",
"I see. And what does that tell you?",
"How does that make you feel?",
"How do you feel when you say that?",
),
),
)
# Module-level Chat instance shared by eliza_chat(); built once at import time.
eliza_chatbot = Chat(pairs, reflections)
def eliza_chat():
    """Run an interactive Eliza session on stdin/stdout until the user quits."""
    for banner_line in (
        "Therapist\n---------",
        "Talk to the program by typing in plain English, using normal upper-",
        'and lower-case letters and punctuation. Enter "quit" when done.',
        "=" * 72,
        "Hello. How are you feeling today?",
    ):
        print(banner_line)
    eliza_chatbot.converse()
def demo():
    """Console demo entry point; starts an interactive Eliza session."""
    eliza_chat()


if __name__ == "__main__":
    # Consistency fix: the sibling chat modules run demo() here; this one
    # previously called eliza_chat() directly, bypassing demo().
    demo()

View File

@@ -0,0 +1,160 @@
# Natural Language Toolkit: Teen Chatbot
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Selina Dennis <sjmd@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
This chatbot is a tongue-in-cheek take on the average teen
anime junky that frequents YahooMessenger or MSNM.
All spelling mistakes and flawed grammar are intentional.
"""
from nltk.chat.util import Chat
# First/second-person swaps (in the bot's netspeak spelling) applied by Chat
# when echoing the user's words back, e.g. "my" -> "ur".
reflections = {
    "am": "r",
    "was": "were",
    "i": "u",
    "i'd": "u'd",
    "i've": "u'v",
    "ive": "u'v",
    "i'll": "u'll",
    "my": "ur",
    "are": "am",
    "you're": "im",
    "you've": "ive",
    "you'll": "i'll",
    "your": "my",
    "yours": "mine",
    "you": "me",
    "u": "me",
    "ur": "my",
    "urs": "mine",
    "me": "u",
}
# Note: %1/2/etc are used without spaces prior as the chat bot seems
# to add a superfluous space when matching.
pairs = (
(
r"I\'m (.*)",
(
"ur%1?? that's so cool! kekekekeke ^_^ tell me more!",
"ur%1? neat!! kekeke >_<",
),
),
(
r"(.*) don\'t you (.*)",
(
r"u think I can%2??! really?? kekeke \<_\<",
"what do u mean%2??!",
"i could if i wanted, don't you think!! kekeke",
),
),
(r"ye[as] [iI] (.*)", ("u%1? cool!! how?", "how come u%1??", "u%1? so do i!!")),
(
r"do (you|u) (.*)\??",
("do i%2? only on tuesdays! kekeke *_*", "i dunno! do u%2??"),
),
(
r"(.*)\?",
(
"man u ask lots of questions!",
"booooring! how old r u??",
"boooooring!! ur not very fun",
),
),
(
r"(cos|because) (.*)",
("hee! i don't believe u! >_<", "nuh-uh! >_<", "ooooh i agree!"),
),
(
r"why can\'t [iI] (.*)",
(
"i dunno! y u askin me for!",
"try harder, silly! hee! ^_^",
"i dunno! but when i can't%1 i jump up and down!",
),
),
(
r"I can\'t (.*)",
(
"u can't what??! >_<",
"that's ok! i can't%1 either! kekekekeke ^_^",
"try harder, silly! hee! ^&^",
),
),
(
r"(.*) (like|love|watch) anime",
(
"omg i love anime!! do u like sailor moon??! ^&^",
"anime yay! anime rocks sooooo much!",
"oooh anime! i love anime more than anything!",
"anime is the bestest evar! evangelion is the best!",
"hee anime is the best! do you have ur fav??",
),
),
(
r"I (like|love|watch|play) (.*)",
("yay! %2 rocks!", "yay! %2 is neat!", "cool! do u like other stuff?? ^_^"),
),
(
r"anime sucks|(.*) (hate|detest) anime",
(
"ur a liar! i'm not gonna talk to u nemore if u h8 anime *;*",
"no way! anime is the best ever!",
"nuh-uh, anime is the best!",
),
),
(
r"(are|r) (you|u) (.*)",
("am i%1??! how come u ask that!", "maybe! y shud i tell u?? kekeke >_>"),
),
(
r"what (.*)",
("hee u think im gonna tell u? .v.", "booooooooring! ask me somethin else!"),
),
(r"how (.*)", ("not tellin!! kekekekekeke ^_^",)),
(r"(hi|hello|hey) (.*)", ("hi!!! how r u!!",)),
(
r"quit",
(
"mom says i have to go eat dinner now :,( bye!!",
"awww u have to go?? see u next time!!",
"how to see u again soon! ^_^",
),
),
(
r"(.*)",
(
"ur funny! kekeke",
"boooooring! talk about something else! tell me wat u like!",
"do u like anime??",
"do u watch anime? i like sailor moon! ^_^",
"i wish i was a kitty!! kekekeke ^_^",
),
),
)
# Module-level Chat instance shared by iesha_chat(); built once at import time.
iesha_chatbot = Chat(pairs, reflections)
def iesha_chat():
    """Print the Iesha banner, then hand control to the interactive loop."""
    banner = (
        "Iesha the TeenBoT\n---------",
        "Talk to the program by typing in plain English, using normal upper-",
        'and lower-case letters and punctuation. Enter "quit" when done.',
        "=" * 72,
        "hi!! i'm iesha! who r u??!",
    )
    for line in banner:
        print(line)
    iesha_chatbot.converse()
def demo():
    """Run the interactive Iesha chatbot demo."""
    iesha_chat()
# Run the interactive demo when executed as a script.
if __name__ == "__main__":
    demo()

View File

@@ -0,0 +1,125 @@
# Natural Language Toolkit: Rude Chatbot
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Peter Spiller <pspiller@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from nltk.chat.util import Chat, reflections
# (regex pattern, candidate responses) table for the rude chatbot.
# Chat.respond() tries patterns top to bottom and picks one response at
# random, so the generic catch-all r"(.*)" entry must stay last.
pairs = (
    (
        r"We (.*)",
        (
            "What do you mean, 'we'?",
            "Don't include me in that!",
            "I wouldn't be so sure about that.",
        ),
    ),
    (
        r"You should (.*)",
        ("Don't tell me what to do, buddy.", "Really? I should, should I?"),
    ),
    (
        r"You\'re(.*)",
        (
            "More like YOU'RE %1!",
            "Hah! Look who's talking.",
            "Come over here and tell me I'm %1.",
        ),
    ),
    (
        r"You are(.*)",
        (
            "More like YOU'RE %1!",
            "Hah! Look who's talking.",
            "Come over here and tell me I'm %1.",
        ),
    ),
    (
        r"I can\'t(.*)",
        (
            "You do sound like the type who can't %1.",
            "Hear that splashing sound? That's my heart bleeding for you.",
            "Tell somebody who might actually care.",
        ),
    ),
    (
        r"I think (.*)",
        (
            "I wouldn't think too hard if I were you.",
            "You actually think? I'd never have guessed...",
        ),
    ),
    (
        r"I (.*)",
        (
            "I'm getting a bit tired of hearing about you.",
            "How about we talk about me instead?",
            "Me, me, me... Frankly, I don't care.",
        ),
    ),
    (
        r"How (.*)",
        (
            "How do you think?",
            "Take a wild guess.",
            "I'm not even going to dignify that with an answer.",
        ),
    ),
    (r"What (.*)", ("Do I look like an encyclopedia?", "Figure it out yourself.")),
    (
        r"Why (.*)",
        (
            "Why not?",
            "That's so obvious I thought even you'd have already figured it out.",
        ),
    ),
    (
        r"(.*)shut up(.*)",
        (
            "Make me.",
            "Getting angry at a feeble NLP assignment? Somebody's losing it.",
            "Say that again, I dare you.",
        ),
    ),
    (
        r"Shut up(.*)",
        (
            "Make me.",
            "Getting angry at a feeble NLP assignment? Somebody's losing it.",
            "Say that again, I dare you.",
        ),
    ),
    (
        r"Hello(.*)",
        ("Oh good, somebody else to talk to. Joy.", "'Hello'? How original..."),
    ),
    # Fallback: must remain the final entry.
    (
        r"(.*)",
        (
            "I'm getting bored here. Become more interesting.",
            "Either become more thrilling or get lost, buddy.",
            "Change the subject before I die of fatal boredom.",
        ),
    ),
)
# Module-level bot instance shared by rude_chat().
rude_chatbot = Chat(pairs, reflections)
def rude_chat():
    """Print the rude-chatbot banner, then hand control to the interactive loop."""
    banner = (
        "Talk to the program by typing in plain English, using normal upper-",
        'and lower-case letters and punctuation. Enter "quit" when done.',
        "=" * 72,
        "I suppose I should say hello.",
    )
    for line in banner:
        print(line)
    rude_chatbot.converse()
def demo():
    """Run the interactive rude chatbot demo."""
    rude_chat()
# Run the interactive demo when executed as a script.
if __name__ == "__main__":
    demo()

View File

@@ -0,0 +1,140 @@
# Natural Language Toolkit: Sun Tsu-Bot
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Sam Huston 2007
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Tsu bot responds to all queries with Sun Tsu sayings
Quoted from Sun Tsu's The Art of War
Translated by LIONEL GILES, M.A. 1910
Hosted by the Gutenberg Project
https://www.gutenberg.org/
"""
from nltk.chat.util import Chat, reflections
# (regex pattern, candidate responses) table for the Sun Tsu chatbot.
# After the "quit" and question rules, inputs are bucketed by their first
# character (digits, then alphabetic ranges) and answered with quotations
# from The Art of War; the final r"(.*)" entry is the fallback.
pairs = (
    (r"quit", ("Good-bye.", "Plan well", "May victory be your future")),
    (
        r"[^\?]*\?",
        (
            "Please consider whether you can answer your own question.",
            "Ask me no questions!",
        ),
    ),
    (
        r"[0-9]+(.*)",
        (
            "It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.",
            "There are five essentials for victory",
        ),
    ),
    (
        r"[A-Ca-c](.*)",
        (
            "The art of war is of vital importance to the State.",
            "All warfare is based on deception.",
            "If your opponent is secure at all points, be prepared for him. If he is in superior strength, evade him.",
            "If the campaign is protracted, the resources of the State will not be equal to the strain.",
            "Attack him where he is unprepared, appear where you are not expected.",
            "There is no instance of a country having benefited from prolonged warfare.",
        ),
    ),
    (
        r"[D-Fd-f](.*)",
        (
            "The skillful soldier does not raise a second levy, neither are his supply-wagons loaded more than twice.",
            "Bring war material with you from home, but forage on the enemy.",
            "In war, then, let your great object be victory, not lengthy campaigns.",
            "To fight and conquer in all your battles is not supreme excellence; supreme excellence consists in breaking the enemy's resistance without fighting.",
        ),
    ),
    (
        r"[G-Ig-i](.*)",
        (
            "Heaven signifies night and day, cold and heat, times and seasons.",
            "It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.",
            "The good fighters of old first put themselves beyond the possibility of defeat, and then waited for an opportunity of defeating the enemy.",
            "One may know how to conquer without being able to do it.",
        ),
    ),
    (
        r"[J-Lj-l](.*)",
        (
            "There are three ways in which a ruler can bring misfortune upon his army.",
            "By commanding the army to advance or to retreat, being ignorant of the fact that it cannot obey. This is called hobbling the army.",
            "By attempting to govern an army in the same way as he administers a kingdom, being ignorant of the conditions which obtain in an army. This causes restlessness in the soldier's minds.",
            "By employing the officers of his army without discrimination, through ignorance of the military principle of adaptation to circumstances. This shakes the confidence of the soldiers.",
            "There are five essentials for victory",
            "He will win who knows when to fight and when not to fight.",
            "He will win who knows how to handle both superior and inferior forces.",
            "He will win whose army is animated by the same spirit throughout all its ranks.",
            "He will win who, prepared himself, waits to take the enemy unprepared.",
            "He will win who has military capacity and is not interfered with by the sovereign.",
        ),
    ),
    (
        r"[M-Om-o](.*)",
        (
            "If you know the enemy and know yourself, you need not fear the result of a hundred battles.",
            "If you know yourself but not the enemy, for every victory gained you will also suffer a defeat.",
            "If you know neither the enemy nor yourself, you will succumb in every battle.",
            "The control of a large force is the same principle as the control of a few men: it is merely a question of dividing up their numbers.",
        ),
    ),
    (
        r"[P-Rp-r](.*)",
        (
            "Security against defeat implies defensive tactics; ability to defeat the enemy means taking the offensive.",
            "Standing on the defensive indicates insufficient strength; attacking, a superabundance of strength.",
            "He wins his battles by making no mistakes. Making no mistakes is what establishes the certainty of victory, for it means conquering an enemy that is already defeated.",
            "A victorious army opposed to a routed one, is as a pound's weight placed in the scale against a single grain.",
            "The onrush of a conquering force is like the bursting of pent-up waters into a chasm a thousand fathoms deep.",
        ),
    ),
    (
        r"[S-Us-u](.*)",
        (
            "What the ancients called a clever fighter is one who not only wins, but excels in winning with ease.",
            "Hence his victories bring him neither reputation for wisdom nor credit for courage.",
            "Hence the skillful fighter puts himself into a position which makes defeat impossible, and does not miss the moment for defeating the enemy.",
            "In war the victorious strategist only seeks battle after the victory has been won, whereas he who is destined to defeat first fights and afterwards looks for victory.",
            "There are not more than five musical notes, yet the combinations of these five give rise to more melodies than can ever be heard.",
            "Appear at points which the enemy must hasten to defend; march swiftly to places where you are not expected.",
        ),
    ),
    (
        r"[V-Zv-z](.*)",
        (
            "It is a matter of life and death, a road either to safety or to ruin.",
            "Hold out baits to entice the enemy. Feign disorder, and crush him.",
            "All men can see the tactics whereby I conquer, but what none can see is the strategy out of which victory is evolved.",
            "Do not repeat the tactics which have gained you one victory, but let your methods be regulated by the infinite variety of circumstances.",
            "So in war, the way is to avoid what is strong and to strike at what is weak.",
            "Just as water retains no constant shape, so in warfare there are no constant conditions.",
        ),
    ),
    # Fallback: must remain the final entry.
    (r"(.*)", ("Your statement insults me.", "")),
)
# Module-level bot instance shared by suntsu_chat().
suntsu_chatbot = Chat(pairs, reflections)
def suntsu_chat():
    """Print the Sun Tsu banner, then hand control to the interactive loop."""
    banner = (
        "Talk to the program by typing in plain English, using normal upper-",
        'and lower-case letters and punctuation. Enter "quit" when done.',
        "=" * 72,
        "You seek enlightenment?",
    )
    for line in banner:
        print(line)
    suntsu_chatbot.converse()
def demo():
    """Run the interactive Sun Tsu chatbot demo."""
    suntsu_chat()
# Run the interactive demo when executed as a script.
if __name__ == "__main__":
    demo()

View File

@@ -0,0 +1,124 @@
# Natural Language Toolkit: Chatbot Utilities
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
# Based on an Eliza implementation by Joe Strout <joe@strout.net>,
# Jeff Epler <jepler@inetnebr.com> and Jez Higgins <jez@jezuk.co.uk>.
import random
import re
# First-person <-> second-person rewrites applied by Chat._substitute() when
# echoing fragments of the user's input back inside a response.  Matching is
# case-insensitive, and longer keys are tried before their prefixes (e.g.
# "you are" before "you"), per Chat._compile_reflections().
reflections = {
    "i am": "you are",
    "i was": "you were",
    "i": "you",
    "i'm": "you are",
    "i'd": "you would",
    "i've": "you have",
    "i'll": "you will",
    "my": "your",
    "you are": "I am",
    "you were": "I was",
    "you've": "I have",
    "you'll": "I will",
    "your": "my",
    "yours": "mine",
    "you": "me",
    "me": "you",
}
class Chat:
    def __init__(self, pairs, reflections={}):
        """
        Initialize the chatbot.  Pairs is a list of patterns and responses.  Each
        pattern is a regular expression matching the user's statement or question,
        e.g. r'I like (.*)'.  For each such pattern a list of possible responses
        is given, e.g. ['Why do you like %1', 'Did you ever dislike %1'].  Material
        which is matched by parenthesized sections of the patterns (e.g. .*) is mapped to
        the numbered positions in the responses, e.g. %1.

        :type pairs: list of tuple
        :param pairs: The patterns and responses
        :type reflections: dict
        :param reflections: A mapping between first and second person expressions
        :rtype: None
        """
        # Patterns are matched case-insensitively, in the order supplied.
        self._pairs = [(re.compile(x, re.IGNORECASE), y) for (x, y) in pairs]
        self._reflections = reflections
        self._regex = self._compile_reflections()

    def _compile_reflections(self):
        """Build a single regex matching any reflection key as a whole word."""
        # Longest keys first so that e.g. "you are" wins over "you".
        sorted_refl = sorted(self._reflections, key=len, reverse=True)
        return re.compile(
            r"\b({})\b".format("|".join(map(re.escape, sorted_refl))), re.IGNORECASE
        )

    def _substitute(self, str):
        """
        Substitute words in the string, according to the specified reflections,
        e.g. "I'm" -> "you are"

        :type str: str
        :param str: The string to be mapped
        :rtype: str
        """
        # The input is lowercased first, so reflection keys are looked up
        # by their exact lowercase spelling.
        return self._regex.sub(
            lambda mo: self._reflections[mo.string[mo.start() : mo.end()]], str.lower()
        )

    def _wildcards(self, response, match):
        """Replace each %N marker in *response* with the reflected text of
        the N-th regex group of *match*.  Only single-digit references
        (%1 .. %9) are supported."""
        pos = response.find("%")
        while pos >= 0:
            num = int(response[pos + 1 : pos + 2])
            response = (
                response[:pos]
                + self._substitute(match.group(num))
                + response[pos + 2 :]
            )
            pos = response.find("%")
        return response

    def respond(self, str):
        """
        Generate a response to the user input.

        :type str: str
        :param str: The string to be mapped
        :rtype: str
        """
        # check each pattern
        for pattern, response in self._pairs:
            match = pattern.match(str)

            # did the pattern match?
            if match:
                resp = random.choice(response)  # pick a random response
                resp = self._wildcards(resp, match)  # process wildcards

                # fix munged punctuation at the end
                if resp[-2:] == "?.":
                    resp = resp[:-2] + "."
                if resp[-2:] == "??":
                    resp = resp[:-2] + "?"
                return resp
        # NOTE: implicitly returns None when no pattern matches; pair lists
        # are expected to end with a catch-all r"(.*)" entry.

    # Hold a conversation with a chatbot
    def converse(self, quit="quit"):
        """
        Read user input in a loop, printing a response to each line, until
        the user enters *quit* (or EOF is reached on stdin).
        """
        user_input = ""
        while user_input != quit:
            user_input = quit  # on EOF, fall through and end the loop
            try:
                user_input = input(">")
            except EOFError:
                print(user_input)
            if user_input:
                # Strip trailing "!" and "." before matching.  The extra
                # truthiness guard fixes an IndexError in the original when
                # the input consisted entirely of those characters.
                while user_input and user_input[-1] in "!.":
                    user_input = user_input[:-1]
                print(self.respond(user_input))

View File

@@ -0,0 +1,329 @@
# Natural Language Toolkit: Zen Chatbot
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Amy Holland <amyrh@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Zen Chatbot talks in gems of Zen wisdom.
This is a sample conversation with Zen Chatbot:
ZC: Welcome, my child.
me: Good afternoon.
ZC: Ask the question you have come to ask.
me: How can I achieve enlightenment?
ZC: How do you suppose?
me: Through meditation.
ZC: Form is emptiness, and emptiness form.
me: How can I empty my mind of worldly troubles?
ZC: Will an answer to that really help in your search for enlightenment?
me: Yes.
ZC: It is better to be right than to be certain.
me: I seek truth and wisdom.
ZC: The search for truth is a long journey.
me: Are you sure?
ZC: Maybe sure, maybe not sure.
The chatbot structure is based on that of chat.eliza. Thus, it uses
a translation table to convert from question to response
i.e. "I am" --> "you are"
Of course, since Zen Chatbot does not understand the meaning of any words,
responses are very limited. Zen Chatbot will usually answer very vaguely, or
respond to a question by asking a different question, in much the same way
as Eliza.
"""
from nltk.chat.util import Chat, reflections
# responses are matched top to bottom, so non-specific matches occur later
# for each match, a list of possible responses is provided
# responses are matched top to bottom, so non-specific matches occur later
# for each match, a list of possible responses is provided
responses = (
    # Zen Chatbot opens with the line "Welcome, my child." The usual
    # response will be a greeting problem: 'good' matches "good morning",
    # "good day" etc, but also "good grief!" and other sentences starting
    # with the word 'good' that may not be a greeting
    (
        r"(hello(.*))|(good [a-zA-Z]+)",
        (
            "The path to enlightenment is often difficult to see.",
            "Greetings. I sense your mind is troubled. Tell me of your troubles.",
            "Ask the question you have come to ask.",
            # fixed typo: "englightenment" -> "enlightenment"
            "Hello. Do you seek enlightenment?",
        ),
    ),
    # "I need" and "I want" can be followed by a thing (eg 'help')
    # or an action (eg 'to see you')
    #
    # This is a problem with this style of response -
    # person:  "I need you"
    # chatbot: "me can be achieved by hard work and dedication of the mind"
    # i.e. 'you' is not really a thing that can be mapped this way, so this
    # interpretation only makes sense for some inputs
    #
    (
        r"i need (.*)",
        (
            "%1 can be achieved by hard work and dedication of the mind.",
            "%1 is not a need, but a desire of the mind. Clear your mind of such concerns.",
            "Focus your mind on%1, and you will find what you need.",
        ),
    ),
    (
        r"i want (.*)",
        (
            "Desires of the heart will distract you from the path to enlightenment.",
            "Will%1 help you attain enlightenment?",
            "Is%1 a desire of the mind, or of the heart?",
        ),
    ),
    # why questions are separated into three types:
    # "why..I"   e.g. "why am I here?" "Why do I like cake?"
    # "why..you" e.g. "why are you here?" "Why won't you tell me?"
    # "why..."   e.g. "Why is the sky blue?"
    # problems:
    #     person:  "Why can't you tell me?"
    #     chatbot: "Are you sure I tell you?"
    #     - this style works for positives (e.g. "why do you like cake?")
    #       but does not work for negatives (e.g. "why don't you like cake?")
    (r"why (.*) i (.*)\?", ("You%1%2?", "Perhaps you only think you%1%2")),
    (r"why (.*) you(.*)\?", ("Why%1 you%2?", "%2 I%1", "Are you sure I%2?")),
    (r"why (.*)\?", ("I cannot tell you why%1.", "Why do you think %1?")),
    # e.g. "are you listening?", "are you a duck"
    (
        r"are you (.*)\?",
        ("Maybe%1, maybe not%1.", "Whether I am%1 or not is God's business."),
    ),
    # e.g. "am I a duck?", "am I going to die?"
    (
        r"am i (.*)\?",
        ("Perhaps%1, perhaps not%1.", "Whether you are%1 or not is not for me to say."),
    ),
    # what questions, e.g. "what time is it?"
    # problems:
    #     person:  "What do you want?"
    #     chatbot: "Seek truth, not what do me want."
    (r"what (.*)\?", ("Seek truth, not what%1.", "What%1 should not concern you.")),
    # how questions, e.g. "how do you do?"
    (
        r"how (.*)\?",
        (
            "How do you suppose?",
            "Will an answer to that really help in your search for enlightenment?",
            "Ask yourself not how, but why.",
        ),
    ),
    # can questions, e.g. "can you run?", "can you come over here please?"
    (
        r"can you (.*)\?",
        (
            "I probably can, but I may not.",
            "Maybe I can%1, and maybe I cannot.",
            "I can do all, and I can do nothing.",
        ),
    ),
    # can questions, e.g. "can I have some cake?", "can I know truth?"
    (
        r"can i (.*)\?",
        (
            "You can%1 if you believe you can%1, and have a pure spirit.",
            "Seek truth and you will know if you can%1.",
        ),
    ),
    # e.g. "It is raining" - implies the speaker is certain of a fact
    (
        r"it is (.*)",
        (
            "How can you be certain that%1, when you do not even know yourself?",
            "Whether it is%1 or not does not change the way the world is.",
        ),
    ),
    # e.g. "is there a doctor in the house?"
    (
        r"is there (.*)\?",
        ("There is%1 if you believe there is.", "It is possible that there is%1."),
    ),
    # e.g. "is it possible?", "is this true?"
    (r"is(.*)\?", ("%1 is not relevant.", "Does this matter?")),
    # non-specific question
    (
        r"(.*)\?",
        (
            "Do you think %1?",
            "You seek the truth. Does the truth seek you?",
            "If you intentionally pursue the answers to your questions, the answers become hard to see.",
            "The answer to your question cannot be told. It must be experienced.",
        ),
    ),
    # expression of hate of form "I hate you" or "Kelly hates cheese"
    (
        r"(.*) (hate[s]?)|(dislike[s]?)|(don\'t like)(.*)",
        (
            "Perhaps it is not about hating %2, but about hate from within.",
            "Weeds only grow when we dislike them",
            "Hate is a very strong emotion.",
        ),
    ),
    # statement containing the word 'truth'
    (
        r"(.*) truth(.*)",
        (
            "Seek truth, and truth will seek you.",
            "Remember, it is not the spoon which bends - only yourself.",
            "The search for truth is a long journey.",
        ),
    ),
    # desire to do an action
    # e.g. "I want to go shopping"
    (
        r"i want to (.*)",
        ("You may %1 if your heart truly desires to.", "You may have to %1."),
    ),
    # desire for an object
    # e.g. "I want a pony"
    (
        r"i want (.*)",
        (
            "Does your heart truly desire %1?",
            "Is this a desire of the heart, or of the mind?",
        ),
    ),
    # e.g. "I can't wait" or "I can't do this"
    (
        r"i can\'t (.*)",
        (
            "What we can and can't do is a limitation of the mind.",
            "There are limitations of the body, and limitations of the mind.",
            "Have you tried to%1 with a clear mind?",
        ),
    ),
    # "I think.." indicates uncertainty. e.g. "I think so."
    # problem: exceptions...
    # e.g. "I think, therefore I am"
    (
        r"i think (.*)",
        (
            "Uncertainty in an uncertain world.",
            "Indeed, how can we be certain of anything in such uncertain times.",
            "Are you not, in fact, certain that%1?",
        ),
    ),
    # "I feel...emotions/sick/light-headed..."
    (
        r"i feel (.*)",
        (
            # fixed: a missing comma accidentally concatenated the next two
            # strings into one response ("...mind.What do you believe...")
            "Your body and your emotions are both symptoms of your mind.",
            "What do you believe is the root of such feelings?",
            "Feeling%1 can be a sign of your state-of-mind.",
        ),
    ),
    # exclaimation mark indicating emotion
    # e.g. "Wow!" or "No!"
    (
        r"(.*)!",
        (
            "I sense that you are feeling emotional today.",
            "You need to calm your emotions.",
        ),
    ),
    # because [statement]
    # e.g. "because I said so"
    (
        r"because (.*)",
        (
            # fixed typo: "knowning" -> "knowing"
            "Does knowing the reasons behind things help you to understand"
            " the things themselves?",
            "If%1, what else must be true?",
        ),
    ),
    # yes or no - raise an issue of certainty/correctness
    (
        r"(yes)|(no)",
        (
            "Is there certainty in an uncertain world?",
            "It is better to be right than to be certain.",
        ),
    ),
    # sentence containing word 'love'
    (
        r"(.*)love(.*)",
        (
            "Think of the trees: they let the birds perch and fly with no intention to call them when they come, and no longing for their return when they fly away. Let your heart be like the trees.",
            "Free love!",
        ),
    ),
    # sentence containing word 'understand' - r
    (
        r"(.*)understand(.*)",
        (
            "If you understand, things are just as they are;"
            " if you do not understand, things are just as they are.",
            "Imagination is more important than knowledge.",
        ),
    ),
    # 'I', 'me', 'my' - person is talking about themself.
    # this breaks down when words contain these - eg 'Thyme', 'Irish'
    # NOTE(review): the un-parenthesized alternation binds loosely, so the
    # branches after the first are matched on their own (e.g. "(my)" alone,
    # anchored at the start) - confirm this is the intended behavior.
    (
        r"(.*)(me )|( me)|(my)|(mine)|(i)(.*)",
        (
            "'I', 'me', 'my'... these are selfish expressions.",
            "Have you ever considered that you might be a selfish person?",
            "Try to consider others, not just yourself.",
            "Think not just of yourself, but of others.",
        ),
    ),
    # 'you' starting a sentence
    # e.g. "you stink!"
    (
        r"you (.*)",
        ("My path is not of concern to you.", "I am but one, and you but one more."),
    ),
    # say goodbye with some extra Zen wisdom.
    (
        r"exit",
        (
            "Farewell. The obstacle is the path.",
            "Farewell. Life is a journey, not a destination.",
            # fixed typo: "knowning" -> "knowing"
            "Good bye. We are cups, constantly and quietly being filled."
            "\nThe trick is knowing how to tip ourselves over and let the beautiful stuff out.",
        ),
    ),
    # fall through case -
    # when stumped, respond with generic zen wisdom
    #
    (
        r"(.*)",
        (
            "When you're enlightened, every word is wisdom.",
            "Random talk is useless.",
            "The reverse side also has a reverse side.",
            "Form is emptiness, and emptiness is form.",
            "I pour out a cup of water. Is the cup empty?",
        ),
    ),
)
# Module-level bot instance shared by zen_chat().
zen_chatbot = Chat(responses, reflections)
def zen_chat():
    """Print the Zen Chatbot banner, then hand control to the interactive loop."""
    rule = "*" * 75
    banner = (
        rule,
        "Zen Chatbot!".center(75),
        rule,
        '"Look beyond mere words and letters - look into your mind"'.center(75),
        "* Talk your way to truth with Zen Chatbot.",
        "* Type 'quit' when you have had enough.",
        rule,
        "Welcome, my child.",
    )
    for line in banner:
        print(line)
    zen_chatbot.converse()
def demo():
    """Run the interactive Zen chatbot demo."""
    zen_chat()
# Run the interactive demo when executed as a script.
if __name__ == "__main__":
    demo()

View File

@@ -0,0 +1,205 @@
# Natural Language Toolkit: Chunkers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#
"""
Classes and interfaces for identifying non-overlapping linguistic
groups (such as base noun phrases) in unrestricted text. This task is
called "chunk parsing" or "chunking", and the identified groups are
called "chunks". The chunked text is represented using a shallow
tree called a "chunk structure." A chunk structure is a tree
containing tokens and chunks, where each chunk is a subtree containing
only tokens. For example, the chunk structure for base noun phrase
chunks in the sentence "I saw the big dog on the hill" is::
(SENTENCE:
(NP: <I>)
<saw>
(NP: <the> <big> <dog>)
<on>
(NP: <the> <hill>))
To convert a chunk structure back to a list of tokens, simply use the
chunk structure's ``leaves()`` method.
This module defines ``ChunkParserI``, a standard interface for
chunking texts; and ``RegexpChunkParser``, a regular-expression based
implementation of that interface. It also defines ``ChunkScore``, a
utility class for scoring chunk parsers.
RegexpChunkParser
=================
``RegexpChunkParser`` is an implementation of the chunk parser interface
that uses regular-expressions over tags to chunk a text. Its
``parse()`` method first constructs a ``ChunkString``, which encodes a
particular chunking of the input text. Initially, nothing is
chunked. ``parse.RegexpChunkParser`` then applies a sequence of
``RegexpChunkRule`` rules to the ``ChunkString``, each of which modifies
the chunking that it encodes. Finally, the ``ChunkString`` is
transformed back into a chunk structure, which is returned.
``RegexpChunkParser`` can only be used to chunk a single kind of phrase.
For example, you can use a ``RegexpChunkParser`` to chunk the noun
phrases in a text, or the verb phrases in a text; but you can not
use it to simultaneously chunk both noun phrases and verb phrases in
the same text. (This is a limitation of ``RegexpChunkParser``, not of
chunk parsers in general.)
RegexpChunkRules
----------------
A ``RegexpChunkRule`` is a transformational rule that updates the
chunking of a text by modifying its ``ChunkString``. Each
``RegexpChunkRule`` defines the ``apply()`` method, which modifies
the chunking encoded by a ``ChunkString``. The
``RegexpChunkRule`` class itself can be used to implement any
transformational rule based on regular expressions. There are
also a number of subclasses, which can be used to implement
simpler types of rules:
- ``ChunkRule`` chunks anything that matches a given regular
expression.
- ``StripRule`` strips anything that matches a given regular
expression.
- ``UnChunkRule`` will un-chunk any chunk that matches a given
regular expression.
- ``MergeRule`` can be used to merge two contiguous chunks.
- ``SplitRule`` can be used to split a single chunk into two
smaller chunks.
- ``ExpandLeftRule`` will expand a chunk to incorporate new
unchunked material on the left.
- ``ExpandRightRule`` will expand a chunk to incorporate new
unchunked material on the right.
Tag Patterns
~~~~~~~~~~~~
A ``RegexpChunkRule`` uses a modified version of regular
expression patterns, called "tag patterns". Tag patterns are
used to match sequences of tags. Examples of tag patterns are::
r'(<DT>|<JJ>|<NN>)+'
r'<NN>+'
r'<NN.*>'
The differences between regular expression patterns and tag
patterns are:
- In tag patterns, ``'<'`` and ``'>'`` act as parentheses; so
``'<NN>+'`` matches one or more repetitions of ``'<NN>'``, not
``'<NN'`` followed by one or more repetitions of ``'>'``.
- Whitespace in tag patterns is ignored. So
``'<DT> | <NN>'`` is equivalent to ``'<DT>|<NN>'``
- In tag patterns, ``'.'`` is equivalent to ``'[^{}<>]'``; so
``'<NN.*>'`` matches any single tag starting with ``'NN'``.
The function ``tag_pattern2re_pattern`` can be used to transform
a tag pattern to an equivalent regular expression pattern.
Efficiency
----------
Preliminary tests indicate that ``RegexpChunkParser`` can chunk at a
rate of about 300 tokens/second, with a moderately complex rule set.
There may be problems if ``RegexpChunkParser`` is used with more than
5,000 tokens at a time. In particular, evaluation of some regular
expressions may cause the Python regular expression engine to
exceed its maximum recursion depth. We have attempted to minimize
these problems, but it is impossible to avoid them completely. We
therefore recommend that you apply the chunk parser to a single
sentence at a time.
Emacs Tip
---------
If you evaluate the following elisp expression in emacs, it will
colorize a ``ChunkString`` when you use an interactive python shell
with emacs or xemacs ("C-c !")::
(let ()
(defconst comint-mode-font-lock-keywords
'(("<[^>]+>" 0 'font-lock-reference-face)
("[{}]" 0 'font-lock-function-name-face)))
(add-hook 'comint-mode-hook (lambda () (turn-on-font-lock))))
You can evaluate this code by copying it to a temporary buffer,
placing the cursor after the last close parenthesis, and typing
"``C-x C-e``". You should evaluate it before running the interactive
session. The change will last until you close emacs.
Unresolved Issues
-----------------
If we use the ``re`` module for regular expressions, Python's
regular expression engine generates "maximum recursion depth
exceeded" errors when processing very large texts, even for
regular expressions that should not require any recursion. We
therefore use the ``pre`` module instead. But note that ``pre``
does not include Unicode support, so this module will not work
with unicode strings. Note also that ``pre`` regular expressions
are not quite as advanced as ``re`` ones (e.g., no leftward
zero-length assertions).
:type CHUNK_TAG_PATTERN: regexp
:var CHUNK_TAG_PATTERN: A regular expression to test whether a tag
pattern is valid.
"""
from nltk.chunk.api import ChunkParserI
from nltk.chunk.named_entity import Maxent_NE_Chunker
from nltk.chunk.regexp import RegexpChunkParser, RegexpParser
from nltk.chunk.util import (
ChunkScore,
accuracy,
conllstr2tree,
conlltags2tree,
ieerstr2tree,
tagstr2tree,
tree2conllstr,
tree2conlltags,
)
def ne_chunker(fmt="multiclass"):
    """
    Load NLTK's currently recommended named entity chunker.

    :param fmt: model variant passed through to ``Maxent_NE_Chunker``;
        ``"multiclass"`` (the default) or ``"binary"`` (see ``ne_chunk``).
    """
    return Maxent_NE_Chunker(fmt)
def ne_chunk(tagged_tokens, binary=False):
    """
    Use NLTK's currently recommended named entity chunker to
    chunk the given list of tagged tokens.

    >>> from nltk.chunk import ne_chunk
    >>> from nltk.corpus import treebank
    >>> from pprint import pprint
    >>> pprint(ne_chunk(treebank.tagged_sents()[2][8:14])) # doctest: +NORMALIZE_WHITESPACE
    Tree('S', [('chairman', 'NN'), ('of', 'IN'), Tree('ORGANIZATION', [('Consolidated', 'NNP'), ('Gold', 'NNP'), ('Fields', 'NNP')]), ('PLC', 'NNP')])
    """
    # Select the binary (NE / not-NE) model when requested; otherwise fall
    # back to the default multiclass model.
    chunker = ne_chunker(fmt="binary") if binary else ne_chunker()
    return chunker.parse(tagged_tokens)
def ne_chunk_sents(tagged_sentences, binary=False):
    """
    Use NLTK's currently recommended named entity chunker to chunk the
    given list of tagged sentences, each consisting of a list of tagged tokens.
    """
    # Select the binary (NE / not-NE) model when requested; otherwise fall
    # back to the default multiclass model.
    chunker = ne_chunker(fmt="binary") if binary else ne_chunker()
    return chunker.parse_sents(tagged_sentences)

View File

@@ -0,0 +1,56 @@
# Natural Language Toolkit: Chunk parsing API
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
##//////////////////////////////////////////////////////
## Chunk Parser Interface
##//////////////////////////////////////////////////////
from nltk.chunk.util import ChunkScore
from nltk.internals import deprecated
from nltk.parse import ParserI
class ChunkParserI(ParserI):
    """
    A processing interface for identifying non-overlapping groups in
    unrestricted text.  Typically, chunk parsers are used to find base
    syntactic constituents, such as base noun phrases.  Unlike
    ``ParserI``, ``ChunkParserI`` guarantees that the ``parse()`` method
    will always generate a parse.
    """

    def parse(self, tokens):
        """
        Return the best chunk structure for the given tokens
        and return a tree.

        :param tokens: The list of (word, tag) tokens to be chunked.
        :type tokens: list(tuple)
        :rtype: Tree
        """
        raise NotImplementedError()

    @deprecated("Use accuracy(gold) instead.")
    def evaluate(self, gold):
        # Deprecated alias kept for backward compatibility; delegates to
        # accuracy().
        return self.accuracy(gold)

    def accuracy(self, gold):
        """
        Score the accuracy of the chunker against the gold standard.
        Strip the chunking from the gold standard text, rechunk it using
        the chunker, and return a ``ChunkScore`` object
        reflecting the performance of this chunk parser.

        :type gold: list(Tree)
        :param gold: The list of chunked sentences to score the chunker on.
        :rtype: ChunkScore
        """
        chunkscore = ChunkScore()
        for correct in gold:
            # Re-parse from the leaves (the un-chunked tokens) and compare
            # the result against the gold tree.
            chunkscore.score(correct, self.parse(correct.leaves()))
        return chunkscore

View File

@@ -0,0 +1,407 @@
# Natural Language Toolkit: Chunk parsing API
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Eric Kafe <kafe.eric@gmail.com> (tab-format models)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Named entity chunker
"""
import os
import re
from xml.etree import ElementTree as ET
from nltk.tag import ClassifierBasedTagger, pos_tag
try:
from nltk.classify import MaxentClassifier
except ImportError:
pass
from nltk.chunk.api import ChunkParserI
from nltk.chunk.util import ChunkScore
from nltk.data import find
from nltk.tokenize import word_tokenize
from nltk.tree import Tree
class NEChunkParserTagger(ClassifierBasedTagger):
    """
    The IOB tagger used by the chunk parser.
    """

    def __init__(self, train=None, classifier=None):
        # Delegate to ClassifierBasedTagger, wiring in our maxent
        # classifier builder and feature detector (via method override).
        ClassifierBasedTagger.__init__(
            self,
            train=train,
            classifier_builder=self._classifier_builder,
            classifier=classifier,
        )

    def _classifier_builder(self, train):
        # Train a maxent classifier over the feature dicts produced by
        # _feature_detector().
        return MaxentClassifier.train(
            # "megam" cannot be the default algorithm since it requires compiling with ocaml
            train,
            algorithm="iis",
            gaussian_prior_sigma=1,
            trace=2,
        )

    def _english_wordlist(self):
        # Lazily load and cache the basic-English wordlist on first use.
        try:
            wl = self._en_wordlist
        except AttributeError:
            from nltk.corpus import words

            self._en_wordlist = set(words.words("en-basic"))
            wl = self._en_wordlist
        return wl

    def _feature_detector(self, tokens, index, history):
        """Build the feature dict for the token at *index*, using the two
        neighbouring tokens on each side and the previously assigned tags
        in *history*.  Relies on the module-level ``simplify_pos`` and
        ``shape`` helpers (defined elsewhere in this file)."""
        word = tokens[index][0]
        pos = simplify_pos(tokens[index][1])
        # Context features degrade gracefully to None at sequence edges.
        if index == 0:
            prevword = prevprevword = None
            prevpos = prevprevpos = None
            prevshape = prevtag = prevprevtag = None
        elif index == 1:
            prevword = tokens[index - 1][0].lower()
            prevprevword = None
            prevpos = simplify_pos(tokens[index - 1][1])
            prevprevpos = None
            # NOTE(review): here the [0] takes only the first character of
            # the previous history entry, while the else-branch below uses
            # the whole entry -- inconsistent; confirm which is intended.
            prevtag = history[index - 1][0]
            prevshape = prevprevtag = None
        else:
            prevword = tokens[index - 1][0].lower()
            prevprevword = tokens[index - 2][0].lower()
            prevpos = simplify_pos(tokens[index - 1][1])
            prevprevpos = simplify_pos(tokens[index - 2][1])
            prevtag = history[index - 1]
            prevprevtag = history[index - 2]
            prevshape = shape(prevword)
        if index == len(tokens) - 1:
            nextword = nextnextword = None
            nextpos = nextnextpos = None
        elif index == len(tokens) - 2:
            nextword = tokens[index + 1][0].lower()
            # NOTE(review): next POS tags are only lowercased, not passed
            # through simplify_pos() like prevpos -- confirm intended.
            nextpos = tokens[index + 1][1].lower()
            nextnextword = None
            nextnextpos = None
        else:
            nextword = tokens[index + 1][0].lower()
            nextpos = tokens[index + 1][1].lower()
            nextnextword = tokens[index + 2][0].lower()
            nextnextpos = tokens[index + 2][1].lower()

        # 89.6  (presumably the accuracy achieved with this feature set)
        features = {
            "bias": True,
            "shape": shape(word),
            "wordlen": len(word),
            "prefix3": word[:3].lower(),
            "suffix3": word[-3:].lower(),
            "pos": pos,
            "word": word,
            "en-wordlist": (word in self._english_wordlist()),
            "prevtag": prevtag,
            "prevpos": prevpos,
            "nextpos": nextpos,
            "prevword": prevword,
            "nextword": nextword,
            "word+nextpos": f"{word.lower()}+{nextpos}",
            "pos+prevtag": f"{pos}+{prevtag}",
            "shape+prevtag": f"{prevshape}+{prevtag}",
        }

        return features
class NEChunkParser(ChunkParserI):
    """
    Named-entity chunker.

    Expected input: list of pos-tagged words.
    """

    def __init__(self, train):
        self._train(train)

    def parse(self, tokens):
        """
        Tag each (word, pos) token with an IOB label, then assemble the
        corresponding chunk tree.
        """
        return self._tagged_to_parse(self._tagger.tag(tokens))

    def _train(self, corpus):
        # Flatten each training tree into an IOB-tagged sequence before
        # handing it to the tagger.
        self._tagger = NEChunkParserTagger(
            train=[self._parse_to_tagged(sent) for sent in corpus]
        )

    def _tagged_to_parse(self, tagged_tokens):
        """
        Convert a list of (token, IOB-tag) pairs to a chunk-parse tree.
        """
        sent = Tree("S", [])
        for token, iob in tagged_tokens:
            if iob == "O":
                sent.append(token)
            elif iob.startswith("B-"):
                sent.append(Tree(iob[2:], [token]))
            elif iob.startswith("I-"):
                last = sent[-1] if sent else None
                if isinstance(last, Tree) and last.label() == iob[2:]:
                    # Continue the chunk opened by the previous token.
                    last.append(token)
                else:
                    # Dangling "I-" with no matching open chunk: start one.
                    sent.append(Tree(iob[2:], [token]))
        return sent

    @staticmethod
    def _parse_to_tagged(sent):
        """
        Convert a chunk-parse tree to a list of (token, IOB-tag) pairs.
        """
        result = []
        for node in sent:
            if not isinstance(node, Tree):
                result.append((node, "O"))
                continue
            if len(node) == 0:
                print("Warning -- empty chunk in sentence")
                continue
            label = node.label()
            result.append((node[0], f"B-{label}"))
            result.extend((tok, f"I-{label}") for tok in node[1:])
        return result
def shape(word):
    """
    Return a coarse orthographic "shape" class for *word*.

    :param word: The token string to classify.
    :type word: str
    :rtype: str
    :return: One of ``'number'``, ``'punct'``, ``'upcase'``,
        ``'downcase'``, ``'mixedcase'`` or ``'other'``.
    """
    # The number pattern is wrapped in a group so that the ``$`` anchor
    # applies to BOTH alternatives.  Previously the anchor bound only to
    # the second alternative, so e.g. "123abc" matched the unanchored
    # "[0-9]+(\.[0-9]*)?" prefix and was misclassified as a number.
    if re.match(r"([0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+)$", word, re.UNICODE):
        return "number"
    elif re.match(r"\W+$", word, re.UNICODE):
        return "punct"
    elif re.match(r"\w+$", word, re.UNICODE):
        if word.istitle():
            return "upcase"
        elif word.islower():
            return "downcase"
        else:
            return "mixedcase"
    else:
        return "other"
def simplify_pos(s):
    """Collapse a POS tag: any verb tag becomes "V"; any other tag keeps
    only the part before the first hyphen."""
    return "V" if s.startswith("V") else s.split("-")[0]
def postag_tree(tree):
    """
    Return a copy of *tree* in which every leaf word has been replaced
    by a (word, pos) pair, using ``pos_tag`` over the flattened
    leaf sequence.
    """
    # Part-of-speech tagging.
    tag_stream = (tag for (_, tag) in pos_tag(tree.leaves()))
    result = Tree("S", [])
    for node in tree:
        if isinstance(node, Tree):
            # Rebuild the chunk with tagged leaves.
            result.append(Tree(node.label(), [(leaf, next(tag_stream)) for leaf in node]))
        else:
            result.append((node, next(tag_stream)))
    return result
def load_ace_data(roots, fmt="binary", skip_bnews=True):
    """
    Walk each directory in *roots* and yield one chunk tree per ACE
    ``.sgm`` file found (via ``load_ace_file``).

    :param roots: Directories to search recursively.
    :param fmt: Passed through to ``load_ace_file`` ("binary" or "multiclass").
    :param skip_bnews: If true, skip directories ending in "bnews"
        (the broadcast-news portion of the corpus).
    """
    for root in roots:
        for root, dirs, files in os.walk(root):
            if root.endswith("bnews") and skip_bnews:
                continue
            for f in files:
                if f.endswith(".sgm"):
                    yield from load_ace_file(os.path.join(root, f), fmt)
def load_ace_file(textfile, fmt):
    """
    Parse a single ACE ``.sgm`` file plus its ``.tmx.rdc.xml``
    annotation file, and yield one chunk tree covering the <TEXT> body.

    :param textfile: Path to the ``.sgm`` text file.
    :param fmt: "binary" (every entity labeled "NE") or "multiclass"
        (entities labeled with their ACE entity type).
    :raises ValueError: If *fmt* is not a supported value.  (Raised
        eagerly now, instead of after all file I/O.)
    """
    if fmt not in ("binary", "multiclass"):
        raise ValueError("bad fmt value")

    print(f"  - {os.path.split(textfile)[1]}")
    annfile = textfile + ".tmx.rdc.xml"

    # Read the xml file, and get a list of entities
    entities = []
    with open(annfile) as infile:
        xml = ET.parse(infile).getroot()
    for entity in xml.findall("document/entity"):
        typ = entity.find("entity_type").text
        for mention in entity.findall("entity_mention"):
            if mention.get("TYPE") != "NAME":
                continue  # only NEs
            s = int(mention.find("head/charseq/start").text)
            e = int(mention.find("head/charseq/end").text) + 1
            entities.append((s, e, typ))

    # Read the text file, and mark the entities.
    with open(textfile) as infile:
        text = infile.read()

    # Strip XML tags, since they don't count towards the indices
    text = re.sub("<(?!/?TEXT)[^>]+>", "", text)

    # Blank out anything before/after <TEXT>, preserving char offsets.
    def subfunc(m):
        return " " * (m.end() - m.start() - 6)

    text = re.sub(r"[\s\S]*<TEXT>", subfunc, text)
    text = re.sub(r"</TEXT>[\s\S]*", "", text)

    # Simplify quotes
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    # Build the chunk tree.  The two output formats differed only in the
    # label given to each chunk, so a single pass handles both.
    i = 0
    toks = Tree("S", [])
    for s, e, typ in sorted(entities):
        if s < i:
            s = i  # Overlapping!  Deal with this better?
        if e <= s:
            continue
        toks.extend(word_tokenize(text[i:s]))
        label = "NE" if fmt == "binary" else typ
        toks.append(Tree(label, text[s:e].split()))
        i = e
    toks.extend(word_tokenize(text[i:]))
    yield toks
# This probably belongs in a more general-purpose location (as does
# the parse_to_tagged function).
def cmp_chunks(correct, guessed):
    """
    Print a token-by-token comparison of the IOB tags assigned by the
    gold standard and by the chunker, collapsing runs where both agree
    on "O" into a single "..." line.
    """
    correct = NEChunkParser._parse_to_tagged(correct)
    guessed = NEChunkParser._parse_to_tagged(guessed)
    ellipsis = False
    # The first unpacked word (from `correct`) was shadowed by the second
    # in the original; make the intent explicit by discarding it.
    for (_, ct), (w, gt) in zip(correct, guessed):
        if ct == gt == "O":
            if not ellipsis:
                print(f" {ct:15} {gt:15} {w}")
                # Fixed: the original format string mixed automatic and
                # manual field numbering ("{:15} {:15} {2}"), which raises
                # ValueError at runtime.
                print(" {:15} {:15} {}".format("...", "...", "..."))
                ellipsis = True
        else:
            ellipsis = False
            print(f" {ct:15} {gt:15} {w}")
# ======================================================================================
class Maxent_NE_Chunker(NEChunkParser):
    """
    Named-entity chunker backed by a pretrained maxent model loaded
    from the ``maxent_ne_chunker_tab`` data package.

    Expected input: list of pos-tagged words
    """

    def __init__(self, fmt="multiclass"):
        from nltk.data import find

        # fmt selects which pretrained model to load:
        # "binary" (NE / not-NE) or "multiclass" (typed entities).
        self._fmt = fmt
        self._tab_dir = find(f"chunkers/maxent_ne_chunker_tab/english_ace_{fmt}/")
        self.load_params()

    def load_params(self):
        # Load the tab-format weights/encoding and wrap them in a tagger.
        from nltk.classify.maxent import BinaryMaxentFeatureEncoding, load_maxent_params

        wgt, mpg, lab, aon = load_maxent_params(self._tab_dir)
        mc = MaxentClassifier(
            BinaryMaxentFeatureEncoding(lab, mpg, alwayson_features=aon), wgt
        )
        self._tagger = NEChunkParserTagger(classifier=mc)

    def save_params(self):
        # Save the current model parameters in tab format.
        # NOTE(review): writes to a hard-coded "/tmp" path, which is
        # POSIX-specific -- presumably intended for model-building use
        # only; confirm.
        from nltk.classify.maxent import save_maxent_params

        classif = self._tagger._classifier
        ecg = classif._encoding
        wgt = classif._weights
        mpg = ecg._mapping
        lab = ecg._labels
        aon = ecg._alwayson
        fmt = self._fmt
        save_maxent_params(wgt, mpg, lab, aon, tab_dir=f"/tmp/english_ace_{fmt}/")
def build_model(fmt="multiclass"):
    """Load the pretrained chunker for *fmt* and re-save its parameters
    in tab format (see ``Maxent_NE_Chunker.save_params``)."""
    chunker = Maxent_NE_Chunker(fmt)
    chunker.save_params()
    return chunker
# ======================================================================================
"""
2004 update: pickles are not supported anymore.
Deprecated:
def build_model(fmt="binary"):
print("Loading training data...")
train_paths = [
find("corpora/ace_data/ace.dev"),
find("corpora/ace_data/ace.heldout"),
find("corpora/ace_data/bbn.dev"),
find("corpora/ace_data/muc.dev"),
]
train_trees = load_ace_data(train_paths, fmt)
train_data = [postag_tree(t) for t in train_trees]
print("Training...")
cp = NEChunkParser(train_data)
del train_data
print("Loading eval data...")
eval_paths = [find("corpora/ace_data/ace.eval")]
eval_trees = load_ace_data(eval_paths, fmt)
eval_data = [postag_tree(t) for t in eval_trees]
print("Evaluating...")
chunkscore = ChunkScore()
for i, correct in enumerate(eval_data):
guess = cp.parse(correct.leaves())
chunkscore.score(correct, guess)
if i < 3:
cmp_chunks(correct, guess)
print(chunkscore)
outfilename = f"/tmp/ne_chunker_{fmt}.pickle"
print(f"Saving chunker to {outfilename}...")
with open(outfilename, "wb") as outfile:
pickle.dump(cp, outfile, -1)
return cp
"""
if __name__ == "__main__":
    # Rebuild and re-save both pretrained models in tab format.
    # NOTE(review): the original comment here read "Make sure that the
    # object has the right class name" -- presumably a leftover from the
    # pickle-based workflow described above; confirm.
    build_model("binary")
    build_model("multiclass")

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,642 @@
# Natural Language Toolkit: Chunk format conversions
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re
from nltk.metrics import accuracy as _accuracy
from nltk.tag.mapping import map_tag
from nltk.tag.util import str2tuple
from nltk.tree import Tree
##//////////////////////////////////////////////////////
## EVALUATION
##//////////////////////////////////////////////////////
def accuracy(chunker, gold):
    """
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it using
    the chunker, then compute the accuracy score.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """
    gold_tags = []
    test_tags = []
    for gold_tree in gold:
        # Re-parse the flattened (chunk-free) sentence, then compare the
        # two trees token-by-token in CoNLL IOB form.
        test_tree = chunker.parse(gold_tree.flatten())
        gold_tags.extend(tree2conlltags(gold_tree))
        test_tags.extend(tree2conlltags(test_tree))
    return _accuracy(gold_tags, test_tags)
# Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13
# -- statistics are evaluated only on demand, instead of at every sentence evaluation
#
# SB: use nltk.metrics for precision/recall scoring?
#
class ChunkScore:
    """
    A utility class for scoring chunk parsers.  ``ChunkScore`` can
    evaluate a chunk parser's output, based on a number of statistics
    (precision, recall, f-measure, missed chunks, incorrect chunks).
    It can also combine the scores from the parsing of multiple texts;
    this makes it significantly easier to evaluate a chunk parser that
    operates one sentence at a time.

    Texts are evaluated with the ``score`` method.  The results of
    evaluation can be accessed via a number of accessor methods, such
    as ``precision`` and ``f_measure``.  A typical use of the
    ``ChunkScore`` class is::

        >>> chunkscore = ChunkScore()           # doctest: +SKIP
        >>> for correct in correct_sentences:   # doctest: +SKIP
        ...     guess = chunkparser.parse(correct.leaves())   # doctest: +SKIP
        ...     chunkscore.score(correct, guess)              # doctest: +SKIP
        >>> print('F Measure:', chunkscore.f_measure())       # doctest: +SKIP
        F Measure: 0.823

    :ivar kwargs: Keyword arguments:

        - max_tp_examples: The maximum number actual examples of true
          positives to record.  This affects the ``correct`` member
          function: ``correct`` will not return more than this number
          of true positive examples.  This does *not* affect any of
          the numerical metrics (precision, recall, or f-measure)

        - max_fp_examples: The maximum number actual examples of false
          positives to record.  This affects the ``incorrect`` member
          function and the ``guessed`` member function: ``incorrect``
          will not return more than this number of examples, and
          ``guessed`` will not return more than this number of true
          positive examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure)

        - max_fn_examples: The maximum number actual examples of false
          negatives to record.  This affects the ``missed`` member
          function and the ``correct`` member function: ``missed``
          will not return more than this number of examples, and
          ``correct`` will not return more than this number of true
          negative examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure)

        - chunk_label: A regular expression indicating which chunks
          should be compared.  Defaults to ``'.*'`` (i.e., all chunks).

    :type _tp: list(Token)
    :ivar _tp: List of true positives
    :type _fp: list(Token)
    :ivar _fp: List of false positives
    :type _fn: list(Token)
    :ivar _fn: List of false negatives

    :type _tp_num: int
    :ivar _tp_num: Number of true positives
    :type _fp_num: int
    :ivar _fp_num: Number of false positives
    :type _fn_num: int
    :ivar _fn_num: Number of false negatives.
    """

    def __init__(self, **kwargs):
        # Sets of ((sentence-id, position), frozen-chunk) pairs.
        self._correct = set()
        self._guessed = set()
        self._tp = set()
        self._fp = set()
        self._fn = set()
        # NOTE(review): the max_*_examples limits are stored but not
        # consulted anywhere in this class -- confirm whether the
        # truncation described in the docstring was ever implemented.
        self._max_tp = kwargs.get("max_tp_examples", 100)
        self._max_fp = kwargs.get("max_fp_examples", 100)
        self._max_fn = kwargs.get("max_fn_examples", 100)
        self._chunk_label = kwargs.get("chunk_label", ".*")
        self._tp_num = 0
        self._fp_num = 0
        self._fn_num = 0
        self._count = 0  # number of sentences scored so far
        self._tags_correct = 0.0
        self._tags_total = 0.0

        self._measuresNeedUpdate = False

    def _updateMeasures(self):
        # Recompute tp/fp/fn lazily (patched for performance: statistics
        # are evaluated only on demand, not at every sentence).
        if self._measuresNeedUpdate:
            self._tp = self._guessed & self._correct
            self._fn = self._correct - self._guessed
            self._fp = self._guessed - self._correct
            self._tp_num = len(self._tp)
            self._fp_num = len(self._fp)
            self._fn_num = len(self._fn)
            self._measuresNeedUpdate = False

    def score(self, correct, guessed):
        """
        Given a correctly chunked sentence, score another chunked
        version of the same sentence.

        :type correct: chunk structure
        :param correct: The known-correct ("gold standard") chunked
            sentence.
        :type guessed: chunk structure
        :param guessed: The chunked sentence to be scored.
        """
        self._correct |= _chunksets(correct, self._count, self._chunk_label)
        self._guessed |= _chunksets(guessed, self._count, self._chunk_label)
        self._count += 1
        self._measuresNeedUpdate = True
        # Keep track of per-tag accuracy (if possible)
        try:
            correct_tags = tree2conlltags(correct)
            guessed_tags = tree2conlltags(guessed)
        except ValueError:
            # This exception case is for nested chunk structures,
            # where tree2conlltags will fail with a ValueError: "Tree
            # is too deeply nested to be printed in CoNLL format."
            correct_tags = guessed_tags = ()
        self._tags_total += len(correct_tags)
        self._tags_correct += sum(
            1 for (t, g) in zip(guessed_tags, correct_tags) if t == g
        )

    def accuracy(self):
        """
        Return the overall tag-based accuracy for all text that have
        been scored by this ``ChunkScore``, using the IOB (conll2000)
        tag encoding.

        :rtype: float
        """
        # Vacuous accuracy of 1 when nothing has been scored yet.
        if self._tags_total == 0:
            return 1
        return self._tags_correct / self._tags_total

    def precision(self):
        """
        Return the overall precision for all texts that have been
        scored by this ``ChunkScore``.

        :rtype: float
        """
        self._updateMeasures()
        div = self._tp_num + self._fp_num
        if div == 0:
            return 0
        else:
            return self._tp_num / div

    def recall(self):
        """
        Return the overall recall for all texts that have been
        scored by this ``ChunkScore``.

        :rtype: float
        """
        self._updateMeasures()
        div = self._tp_num + self._fn_num
        if div == 0:
            return 0
        else:
            return self._tp_num / div

    def f_measure(self, alpha=0.5):
        """
        Return the overall F measure for all texts that have been
        scored by this ``ChunkScore``.

        :param alpha: the relative weighting of precision and recall.
            Larger alpha biases the score towards the precision value,
            while smaller alpha biases the score towards the recall
            value.  ``alpha`` should have a value in the range [0,1].
        :type alpha: float
        :rtype: float
        """
        self._updateMeasures()
        p = self.precision()
        r = self.recall()
        if p == 0 or r == 0:  # what if alpha is 0 or 1?
            return 0
        return 1 / (alpha / p + (1 - alpha) / r)

    def missed(self):
        """
        Return the chunks which were included in the
        correct chunk structures, but not in the guessed chunk
        structures, listed in input order.

        :rtype: list of chunks
        """
        self._updateMeasures()
        chunks = list(self._fn)
        return [c[1] for c in chunks]  # discard position information

    def incorrect(self):
        """
        Return the chunks which were included in the guessed chunk structures,
        but not in the correct chunk structures, listed in input order.

        :rtype: list of chunks
        """
        self._updateMeasures()
        chunks = list(self._fp)
        return [c[1] for c in chunks]  # discard position information

    def correct(self):
        """
        Return the chunks which were included in the correct
        chunk structures, listed in input order.

        :rtype: list of chunks
        """
        chunks = list(self._correct)
        return [c[1] for c in chunks]  # discard position information

    def guessed(self):
        """
        Return the chunks which were included in the guessed
        chunk structures, listed in input order.

        :rtype: list of chunks
        """
        chunks = list(self._guessed)
        return [c[1] for c in chunks]  # discard position information

    def __len__(self):
        # Total number of gold-standard chunks seen (tp + fn).
        self._updateMeasures()
        return self._tp_num + self._fn_num

    def __repr__(self):
        """
        Return a concise representation of this ``ChunkScoring``.

        :rtype: str
        """
        return "<ChunkScoring of " + repr(len(self)) + " chunks>"

    def __str__(self):
        """
        Return a verbose representation of this ``ChunkScoring``.
        This representation includes the precision, recall, and
        f-measure scores.  For other information about the score,
        use the accessor methods (e.g., ``missed()`` and ``incorrect()``).

        :rtype: str
        """
        return (
            "ChunkParse score:\n"
            + f"    IOB Accuracy: {self.accuracy() * 100:5.1f}%%\n"
            + f"    Precision:    {self.precision() * 100:5.1f}%%\n"
            + f"    Recall:       {self.recall() * 100:5.1f}%%\n"
            + f"    F-Measure:    {self.f_measure() * 100:5.1f}%%"
        )
# extract chunks, and assign unique id, the absolute position of
# the first word of the chunk
def _chunksets(t, count, chunk_label):
    """Extract the chunks from *t* whose label matches *chunk_label*,
    pairing each with a unique id: (sentence counter, absolute position
    of the chunk's first word)."""
    position = 0
    found = set()
    for node in t:
        if not isinstance(node, Tree):
            # Unchunked token: advances the position by one word.
            position += 1
            continue
        if re.match(chunk_label, node.label()):
            found.add(((count, position), node.freeze()))
        position += len(node.leaves())
    return found
def tagstr2tree(
    s, chunk_label="NP", root_label="S", sep="/", source_tagset=None, target_tagset=None
):
    """
    Divide a string of bracketted tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :param sep: The word/tag separator; if None, tokens are appended
        untagged.
    :param source_tagset: If given together with *target_tagset*, each
        tag is mapped between the two tagsets via ``map_tag``.
    :param target_tagset: See *source_tagset*.
    :rtype: Tree
    :raises ValueError: On unbalanced or nested brackets.
    """
    # Matches a single bracket, or a maximal run of non-bracket,
    # non-whitespace characters (one word/tag token).
    WORD_OR_BRACKET = re.compile(r"\[|\]|[^\[\]\s]+")

    stack = [Tree(root_label, [])]
    for match in WORD_OR_BRACKET.finditer(s):
        text = match.group()
        if text[0] == "[":
            # Chunks may not nest: "[" is only legal at the top level.
            if len(stack) != 1:
                raise ValueError(f"Unexpected [ at char {match.start():d}")
            chunk = Tree(chunk_label, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text[0] == "]":
            if len(stack) != 2:
                raise ValueError(f"Unexpected ] at char {match.start():d}")
            stack.pop()
        else:
            if sep is None:
                stack[-1].append(text)
            else:
                word, tag = str2tuple(text, sep)
                if source_tagset and target_tagset:
                    tag = map_tag(source_tagset, target_tagset, tag)
                stack[-1].append((word, tag))

    # An unclosed chunk leaves an extra frame on the stack.
    if len(stack) != 1:
        raise ValueError(f"Expected ] at char {len(s):d}")
    return stack[0]
### CONLL
_LINE_RE = re.compile(r"(\S+)\s+(\S+)\s+([IOB])-?(\S+)?")
def conllstr2tree(s, chunk_types=("NP", "PP", "VP"), root_label="S"):
    """
    Return a chunk structure for a single sentence
    encoded in the given CONLL 2000 style string.
    This function converts a CoNLL IOB string into a tree.
    It uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: Tree
    :raises ValueError: If a non-blank line does not match the expected
        "word tag IOB-tag" format.
    """
    stack = [Tree(root_label, [])]

    for lineno, line in enumerate(s.split("\n")):
        if not line.strip():
            continue

        # Decode the line.
        match = _LINE_RE.match(line)
        if match is None:
            raise ValueError(f"Error on line {lineno:d}")
        (word, tag, state, chunk_type) = match.groups()

        # If it's a chunk type we don't care about, treat it as O.
        if chunk_types is not None and chunk_type not in chunk_types:
            state = "O"

        # For "Begin"/"Outside", finish any completed chunks -
        # also do so for "Inside" which don't match the previous token.
        mismatch_I = state == "I" and chunk_type != stack[-1].label()
        if state in "BO" or mismatch_I:
            if len(stack) == 2:
                stack.pop()

        # For "Begin", start a new chunk.
        if state == "B" or mismatch_I:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        # Add the new word token.
        stack[-1].append((word, tag))

    return stack[0]
def tree2conlltags(t):
    """
    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
    Convert a tree to the CoNLL IOB tag format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    :raises ValueError: If the tree contains a chunk nested inside
        another chunk, which cannot be expressed in CoNLL format.
    """
    tags = []
    for child in t:
        try:
            # EAFP: chunk subtrees have a label(); bare (word, tag)
            # leaves raise AttributeError and are handled below as "O".
            category = child.label()
            prefix = "B-"
            for contents in child:
                if isinstance(contents, Tree):
                    raise ValueError(
                        "Tree is too deeply nested to be printed in CoNLL format"
                    )
                tags.append((contents[0], contents[1], prefix + category))
                # Only the first token of a chunk is "B-"; the rest are "I-".
                prefix = "I-"
        except AttributeError:
            tags.append((child[0], child[1], "O"))
    return tags
def conlltags2tree(
    sentence, chunk_types=("NP", "PP", "VP"), root_label="S", strict=False
):
    """
    Convert the CoNLL IOB format to a tree.

    :param sentence: A list of ``(word, postag, chunktag)`` triples.
    :param chunk_types: NOTE(review): accepted for API symmetry with
        ``conllstr2tree`` but not referenced in this body -- every chunk
        tag is converted; confirm whether filtering was intended.
    :param root_label: The node label to use for the root.
    :param strict: If true, raise ValueError on ill-formed IOB sequences
        (missing chunk tag, or "I-" without a matching open chunk)
        instead of silently repairing them.
    :rtype: Tree
    :raises ValueError: On an unrecognized chunk tag, or on ill-formed
        sequences when *strict* is true.
    """
    tree = Tree(root_label, [])
    for word, postag, chunktag in sentence:
        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Treat as O
                tree.append((word, postag))
        elif chunktag.startswith("B-"):
            tree.append(Tree(chunktag[2:], [(word, postag)]))
        elif chunktag.startswith("I-"):
            if (
                len(tree) == 0
                or not isinstance(tree[-1], Tree)
                or tree[-1].label() != chunktag[2:]
            ):
                if strict:
                    raise ValueError("Bad conll tag sequence")
                else:
                    # Treat as B-*
                    tree.append(Tree(chunktag[2:], [(word, postag)]))
            else:
                tree[-1].append((word, postag))
        elif chunktag == "O":
            tree.append((word, postag))
        else:
            raise ValueError(f"Bad conll tag {chunktag!r}")
    return tree
def tree2conllstr(t):
    """
    Return a multiline string where each line contains a word, tag and IOB tag.
    Convert a tree to the CoNLL IOB string format

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: str
    """
    return "\n".join(" ".join(triple) for triple in tree2conlltags(t))
### IEER
_IEER_DOC_RE = re.compile(
r"<DOC>\s*"
r"(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?"
r"(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?"
r"(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?"
r"<BODY>\s*"
r"(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?"
r"<TEXT>(?P<text>.*?)</TEXT>\s*"
r"</BODY>\s*</DOC>\s*",
re.DOTALL,
)
_IEER_TYPE_RE = re.compile(r'<b_\w+\s+[^>]*?type="(?P<type>\w+)"')
def _ieer_read_text(s, root_label):
    """
    Parse the IEER-markup string *s* into a chunk tree rooted at
    *root_label*.  Entity spans are delimited by ``<b_...>``/``<e_...>``
    tags; all other tokens become leaves.
    """
    stack = [Tree(root_label, [])]
    # s will be None if there is no headline in the text
    # return the empty list in place of a Tree
    if s is None:
        return []
    for piece_m in re.finditer(r"<[^>]+>|[^\s<]+", s):
        piece = piece_m.group()
        try:
            if piece.startswith("<b_"):
                m = _IEER_TYPE_RE.match(piece)
                if m is None:
                    # NOTE(review): when the type attribute is missing this
                    # prints a diagnostic and then ``m.group`` below raises
                    # AttributeError, which is NOT caught by the handler
                    # for IndexError/ValueError -- confirm whether that
                    # crash is intended.
                    print("XXXX", piece)
                chunk = Tree(m.group("type"), [])
                stack[-1].append(chunk)
                stack.append(chunk)
            elif piece.startswith("<e_"):
                # Close the current entity; popping the root raises
                # IndexError, converted to ValueError below.
                stack.pop()
            #           elif piece.startswith('<'):
            #               print "ERROR:", piece
            #               raise ValueError # Unexpected HTML
            else:
                stack[-1].append(piece)
        except (IndexError, ValueError) as e:
            raise ValueError(
                f"Bad IEER string (error at character {piece_m.start():d})"
            ) from e
    if len(stack) != 1:
        raise ValueError("Bad IEER string")
    return stack[0]
def ieerstr2tree(
    s,
    chunk_types=[
        "LOCATION",
        "ORGANIZATION",
        "PERSON",
        "DURATION",
        "DATE",
        "CARDINAL",
        "PERCENT",
        "MONEY",
        "MEASURE",
    ],
    root_label="S",
):
    """
    Return a chunk structure containing the chunked tagged text that is
    encoded in the given IEER style string.
    Convert a string of chunked tagged text in the IEER named
    entity format into a chunk structure.  Chunks are of several
    types, LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL,
    PERCENT, MONEY, and MEASURE.

    :param chunk_types: NOTE(review): this parameter is not referenced
        anywhere in the body -- all entity types are converted; confirm
        whether filtering was intended.
    :rtype: Tree
    :return: A Tree when *s* has no <DOC> wrapper; otherwise a dict with
        keys "text", "docno", "doctype", "date_time" and "headline".
    """
    # Try looking for a single document.  If that doesn't work, then just
    # treat everything as if it was within the <TEXT>...</TEXT>.
    m = _IEER_DOC_RE.match(s)
    if m:
        return {
            "text": _ieer_read_text(m.group("text"), root_label),
            "docno": m.group("docno"),
            "doctype": m.group("doctype"),
            "date_time": m.group("date_time"),
            #'headline': m.group('headline')
            # we want to capture NEs in the headline too!
            "headline": _ieer_read_text(m.group("headline"), root_label),
        }
    else:
        return _ieer_read_text(s, root_label)
def demo():
    """Demonstrate the bracket-string and CoNLL conversion routines."""
    # Bracketed tagged string -> chunk tree.
    s = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
    import nltk

    t = nltk.chunk.tagstr2tree(s, chunk_label="NP")
    t.pprint()
    print()

    # CoNLL IOB string -> chunk tree, keeping only NP and PP chunks.
    s = """
These DT B-NP
research NN I-NP
protocols NNS I-NP
offer VBP B-VP
to TO B-PP
the DT B-NP
patient NN I-NP
not RB O
only RB O
the DT B-NP
very RB I-NP
best JJS I-NP
therapy NN I-NP
which WDT B-NP
we PRP B-NP
have VBP B-VP
established VBN I-VP
today NN B-NP
but CC B-NP
also RB I-NP
the DT B-NP
hope NN I-NP
of IN B-PP
something NN B-NP
still RB B-ADJP
better JJR I-ADJP
. . O
"""
    conll_tree = conllstr2tree(s, chunk_types=("NP", "PP"))
    conll_tree.pprint()

    # Demonstrate CoNLL output
    print("CoNLL output:")
    print(nltk.chunk.tree2conllstr(conll_tree))
    print()
if __name__ == "__main__":
demo()

View File

@@ -0,0 +1,101 @@
# Natural Language Toolkit: Classifiers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Classes and interfaces for labeling tokens with category labels (or
"class labels"). Typically, labels are represented with strings
(such as ``'health'`` or ``'sports'``). Classifiers can be used to
perform a wide range of classification tasks. For example,
classifiers can be used...
- to classify documents by topic
- to classify ambiguous words by which word sense is intended
- to classify acoustic signals by which phoneme they represent
- to classify sentences by their author
Features
========
In order to decide which category label is appropriate for a given
token, classifiers examine one or more 'features' of the token. These
"features" are typically chosen by hand, and indicate which aspects
of the token are relevant to the classification decision. For
example, a document classifier might use a separate feature for each
word, recording how often that word occurred in the document.
Featuresets
===========
The features describing a token are encoded using a "featureset",
which is a dictionary that maps from "feature names" to "feature
values". Feature names are unique strings that indicate what aspect
of the token is encoded by the feature. Examples include
``'prevword'``, for a feature whose value is the previous word; and
``'contains-word(library)'`` for a feature that is true when a document
contains the word ``'library'``. Feature values are typically
booleans, numbers, or strings, depending on which feature they
describe.
Featuresets are typically constructed using a "feature detector"
(also known as a "feature extractor"). A feature detector is a
function that takes a token (and sometimes information about its
context) as its input, and returns a featureset describing that token.
For example, the following feature detector converts a document
(stored as a list of words) to a featureset describing the set of
words included in the document:
>>> # Define a feature detector function.
>>> def document_features(document):
... return dict([('contains-word(%s)' % w, True) for w in document])
Feature detectors are typically applied to each token before it is fed
to the classifier:
>>> # Classify each Gutenberg document.
>>> from nltk.corpus import gutenberg
>>> for fileid in gutenberg.fileids(): # doctest: +SKIP
... doc = gutenberg.words(fileid) # doctest: +SKIP
... print(fileid, classifier.classify(document_features(doc))) # doctest: +SKIP
The parameters that a feature detector expects will vary, depending on
the task and the needs of the feature detector. For example, a
feature detector for word sense disambiguation (WSD) might take as its
input a sentence, and the index of a word that should be classified,
and return a featureset for that word. The following feature detector
for WSD includes features describing the left and right contexts of
the target word:
>>> def wsd_features(sentence, index):
... featureset = {}
... for i in range(max(0, index-3), index):
... featureset['left-context(%s)' % sentence[i]] = True
... for i in range(index, max(index+3, len(sentence))):
... featureset['right-context(%s)' % sentence[i]] = True
... return featureset
Training Classifiers
====================
Most classifiers are built by training them on a list of hand-labeled
examples, known as the "training set". Training sets are represented
as lists of ``(featuredict, label)`` tuples.
"""
from nltk.classify.api import ClassifierI, MultiClassifierI
from nltk.classify.decisiontree import DecisionTreeClassifier
from nltk.classify.maxent import (
BinaryMaxentFeatureEncoding,
ConditionalExponentialClassifier,
MaxentClassifier,
TypedMaxentFeatureEncoding,
)
from nltk.classify.megam import call_megam, config_megam
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify.positivenaivebayes import PositiveNaiveBayesClassifier
from nltk.classify.rte_classify import RTEFeatureExtractor, rte_classifier, rte_features
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify.senna import Senna
from nltk.classify.textcat import TextCat
from nltk.classify.util import accuracy, apply_features, log_likelihood
from nltk.classify.weka import WekaClassifier, config_weka

View File

@@ -0,0 +1,195 @@
# Natural Language Toolkit: Classifier Interface
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Interfaces for labeling tokens with category labels (or "class labels").
``ClassifierI`` is a standard interface for "single-category
classification", in which the set of categories is known, the number
of categories is finite, and each text belongs to exactly one
category.
``MultiClassifierI`` is a standard interface for "multi-category
classification", which is like single-category classification except
that each text belongs to zero or more categories.
"""
from nltk.internals import overridden
##//////////////////////////////////////////////////////
# { Classification Interfaces
##//////////////////////////////////////////////////////
class ClassifierI:
    """
    A processing interface for labeling tokens with a single category
    label (or "class").  Labels are typically strs or
    ints, but can be any immutable type.  The set of labels
    that the classifier chooses from must be fixed and finite.

    Subclasses must define:
      - ``labels()``
      - either ``classify()`` or ``classify_many()`` (or both)

    Subclasses may define:
      - either ``prob_classify()`` or ``prob_classify_many()`` (or both)
    """

    def labels(self):
        """
        :return: the list of category labels used by this classifier.
        :rtype: list of (immutable)
        """
        raise NotImplementedError()

    def classify(self, featureset):
        """
        :return: the most appropriate label for the given featureset.
        :rtype: label
        """
        # Fall back on the batch method if (and only if) a subclass
        # provides one.
        if not overridden(self.classify_many):
            raise NotImplementedError()
        return self.classify_many([featureset])[0]

    def prob_classify(self, featureset):
        """
        :return: a probability distribution over labels for the given
            featureset.
        :rtype: ProbDistI
        """
        if not overridden(self.prob_classify_many):
            raise NotImplementedError()
        return self.prob_classify_many([featureset])[0]

    def classify_many(self, featuresets):
        """
        Apply ``self.classify()`` to each element of ``featuresets``.  I.e.:

            return [self.classify(fs) for fs in featuresets]

        :rtype: list(label)
        """
        return list(map(self.classify, featuresets))

    def prob_classify_many(self, featuresets):
        """
        Apply ``self.prob_classify()`` to each element of ``featuresets``.  I.e.:

            return [self.prob_classify(fs) for fs in featuresets]

        :rtype: list(ProbDistI)
        """
        return list(map(self.prob_classify, featuresets))
class MultiClassifierI:
    """
    A processing interface for labeling tokens with zero or more
    category labels (or "labels").  Labels are typically strs or ints,
    but may be any immutable type.  The set of labels that the
    multi-classifier chooses from must be fixed and finite.

    Subclasses must define:
      - ``labels()``
      - either ``classify()`` or ``classify_many()`` (or both)

    Subclasses may define:
      - either ``prob_classify()`` or ``prob_classify_many()`` (or both)
    """

    def labels(self):
        """
        :return: the list of category labels used by this classifier.
        :rtype: list of (immutable)
        """
        raise NotImplementedError()

    def classify(self, featureset):
        """
        :return: the most appropriate set of labels for the given featureset.
        :rtype: set(label)
        """
        # Fall back on the batch implementation when the subclass
        # supplied one; otherwise the subclass must override this method.
        if not overridden(self.classify_many):
            raise NotImplementedError()
        return self.classify_many([featureset])[0]

    def prob_classify(self, featureset):
        """
        :return: a probability distribution over sets of labels for the
            given featureset.
        :rtype: ProbDistI
        """
        # Same delegation pattern as classify().
        if not overridden(self.prob_classify_many):
            raise NotImplementedError()
        return self.prob_classify_many([featureset])[0]

    def classify_many(self, featuresets):
        """
        Apply ``self.classify()`` to each element of ``featuresets``,
        returning the label sets in the same order.

        :rtype: list(set(label))
        """
        return list(map(self.classify, featuresets))

    def prob_classify_many(self, featuresets):
        """
        Apply ``self.prob_classify()`` to each element of ``featuresets``,
        returning the probability distributions in the same order.

        :rtype: list(ProbDistI)
        """
        return list(map(self.prob_classify, featuresets))
# # [XX] IN PROGRESS:
# class SequenceClassifierI:
# """
# A processing interface for labeling sequences of tokens with a
# single category label (or "class"). Labels are typically
# strs or ints, but can be any immutable type. The set
# of labels that the classifier chooses from must be fixed and
# finite.
# """
# def labels(self):
# """
# :return: the list of category labels used by this classifier.
# :rtype: list of (immutable)
# """
# raise NotImplementedError()
# def prob_classify(self, featureset):
# """
# Return a probability distribution over labels for the given
# featureset.
# If ``featureset`` is a list of featuresets, then return a
# corresponding list containing the probability distribution
# over labels for each of the given featuresets, where the
# *i*\ th element of this list is the most appropriate label for
# the *i*\ th element of ``featuresets``.
# """
# raise NotImplementedError()
# def classify(self, featureset):
# """
# Return the most appropriate label for the given featureset.
# If ``featureset`` is a list of featuresets, then return a
# corresponding list containing the most appropriate label for
# each of the given featuresets, where the *i*\ th element of
# this list is the most appropriate label for the *i*\ th element
# of ``featuresets``.
# """
# raise NotImplementedError()

View File

@@ -0,0 +1,349 @@
# Natural Language Toolkit: Decision Tree Classifiers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A classifier model that decides which label to assign to a token on
the basis of a tree structure, where branches correspond to conditions
on feature values, and leaves correspond to label assignments.
"""
from collections import defaultdict
from nltk.classify.api import ClassifierI
from nltk.probability import FreqDist, MLEProbDist, entropy
class DecisionTreeClassifier(ClassifierI):
    """
    A classifier that assigns a label to a featureset by walking a tree:
    each internal node tests the value of a single feature, and each
    leaf carries a label assignment.
    """

    def __init__(self, label, feature_name=None, decisions=None, default=None):
        """
        :param label: The most likely label for tokens that reach
            this node in the decision tree.  If this decision tree
            has no children, then this label will be assigned to
            any token that reaches this decision tree.
        :param feature_name: The name of the feature that this
            decision tree selects for.
        :param decisions: A dictionary mapping from feature values
            for the feature identified by ``feature_name`` to
            child decision trees.
        :param default: The child that will be used if the value of
            feature ``feature_name`` does not match any of the keys in
            ``decisions``.  This is used when constructing binary
            decision trees.
        """
        self._label = label
        self._fname = feature_name
        self._decisions = decisions
        self._default = default

    def labels(self):
        """Return the list of all labels assigned anywhere in this tree."""
        labels = [self._label]
        if self._decisions is not None:
            for dt in self._decisions.values():
                labels.extend(dt.labels())
        if self._default is not None:
            labels.extend(self._default.labels())
        return list(set(labels))

    def classify(self, featureset):
        """Return the label this tree assigns to ``featureset``."""
        # Decision leaf:
        if self._fname is None:
            return self._label
        # Decision tree: descend into the child matching this feature's
        # value; fall back on the default child, then on this node's label.
        fval = featureset.get(self._fname)
        if fval in self._decisions:
            return self._decisions[fval].classify(featureset)
        elif self._default is not None:
            return self._default.classify(featureset)
        else:
            return self._label

    def error(self, labeled_featuresets):
        """Return the fraction of ``labeled_featuresets`` that are misclassified."""
        errors = 0
        for featureset, label in labeled_featuresets:
            if self.classify(featureset) != label:
                errors += 1
        return errors / len(labeled_featuresets)

    def pretty_format(self, width=70, prefix="", depth=4):
        """
        Return a string containing a pretty-printed version of this
        decision tree.  Each line in this string corresponds to a
        single decision tree node or leaf, and indentation is used to
        display the structure of the decision tree.
        """
        # [xx] display default!!
        if self._fname is None:
            n = width - len(prefix) - 15
            return "{}{} {}\n".format(prefix, "." * n, self._label)
        s = ""
        for i, (fval, result) in enumerate(
            sorted(
                self._decisions.items(),
                key=lambda item: (item[0] in [None, False, True], str(item[0]).lower()),
            )
        ):
            hdr = f"{prefix}{self._fname}={fval}? "
            n = width - 15 - len(hdr)
            s += "{}{} {}\n".format(hdr, "." * (n), result._label)
            if result._fname is not None and depth > 1:
                s += result.pretty_format(width, prefix + " ", depth - 1)
        if self._default is not None:
            n = width - len(prefix) - 21
            s += "{}else: {} {}\n".format(prefix, "." * n, self._default._label)
            if self._default._fname is not None and depth > 1:
                s += self._default.pretty_format(width, prefix + " ", depth - 1)
        return s

    def pseudocode(self, prefix="", depth=4):
        """
        Return a string representation of this decision tree that
        expresses the decisions it makes as a nested set of pseudocode
        if statements.
        """
        if self._fname is None:
            return f"{prefix}return {self._label!r}\n"
        s = ""
        for fval, result in sorted(
            self._decisions.items(),
            key=lambda item: (item[0] in [None, False, True], str(item[0]).lower()),
        ):
            s += f"{prefix}if {self._fname} == {fval!r}: "
            if result._fname is not None and depth > 1:
                s += "\n" + result.pseudocode(prefix + " ", depth - 1)
            else:
                s += f"return {result._label!r}\n"
        if self._default is not None:
            # A single decision reads better as a negated test than an "else".
            if len(self._decisions) == 1:
                s += "{}if {} != {!r}: ".format(
                    prefix, self._fname, list(self._decisions.keys())[0]
                )
            else:
                s += f"{prefix}else: "
            if self._default._fname is not None and depth > 1:
                s += "\n" + self._default.pseudocode(prefix + " ", depth - 1)
            else:
                s += f"return {self._default._label!r}\n"
        return s

    def __str__(self):
        return self.pretty_format()

    @staticmethod
    def train(
        labeled_featuresets,
        entropy_cutoff=0.05,
        depth_cutoff=100,
        support_cutoff=10,
        binary=False,
        feature_values=None,
        verbose=False,
    ):
        """
        Build a decision tree classifier from ``labeled_featuresets``,
        greedily choosing the lowest-error stump at each level.

        :param binary: If true, then treat all feature/value pairs as
            individual binary features, rather than using a single n-way
            branch for each feature.
        """
        # Collect a list of all feature names.
        feature_names = set()
        for featureset, _label in labeled_featuresets:
            for fname in featureset:
                feature_names.add(fname)

        # Collect a list of the values each feature can take.
        if feature_values is None and binary:
            feature_values = defaultdict(set)
            for featureset, _label in labeled_featuresets:
                for fname, fval in featureset.items():
                    feature_values[fname].add(fval)

        # Start with a stump.
        if not binary:
            tree = DecisionTreeClassifier.best_stump(
                feature_names, labeled_featuresets, verbose
            )
        else:
            tree = DecisionTreeClassifier.best_binary_stump(
                feature_names, labeled_featuresets, feature_values, verbose
            )

        # Refine the stump.
        tree.refine(
            labeled_featuresets,
            entropy_cutoff,
            depth_cutoff - 1,
            support_cutoff,
            binary,
            feature_values,
            verbose,
        )

        # Return it
        return tree

    @staticmethod
    def leaf(labeled_featuresets):
        """Return a leaf classifier labeled with the majority label."""
        label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
        return DecisionTreeClassifier(label)

    @staticmethod
    def stump(feature_name, labeled_featuresets):
        """Return a depth-one tree that branches on ``feature_name``."""
        # Majority label for the root node (used for unseen feature values).
        label = FreqDist(label for (featureset, label) in labeled_featuresets).max()

        # Find the best label for each value.
        # BUGFIX: the loop variable must not be named ``label`` -- it used to
        # clobber the majority label above, so the root was silently labeled
        # with the *last* example's label instead.
        freqs = defaultdict(FreqDist)  # freq(label|value)
        for featureset, example_label in labeled_featuresets:
            feature_value = featureset.get(feature_name)
            freqs[feature_value][example_label] += 1

        decisions = {val: DecisionTreeClassifier(freqs[val].max()) for val in freqs}
        return DecisionTreeClassifier(label, feature_name, decisions)

    def refine(
        self,
        labeled_featuresets,
        entropy_cutoff,
        depth_cutoff,
        support_cutoff,
        binary=False,
        feature_values=None,
        verbose=False,
    ):
        """
        Recursively replace this stump's leaf decisions with trained
        subtrees wherever a branch still has high label entropy, enough
        supporting examples, and remaining depth budget.
        """
        if len(labeled_featuresets) <= support_cutoff:
            return
        if self._fname is None:
            return
        if depth_cutoff <= 0:
            return
        for fval in self._decisions:
            fval_featuresets = [
                (featureset, label)
                for (featureset, label) in labeled_featuresets
                if featureset.get(self._fname) == fval
            ]
            label_freqs = FreqDist(label for (featureset, label) in fval_featuresets)
            if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
                self._decisions[fval] = DecisionTreeClassifier.train(
                    fval_featuresets,
                    entropy_cutoff,
                    depth_cutoff,
                    support_cutoff,
                    binary,
                    feature_values,
                    verbose,
                )
        if self._default is not None:
            default_featuresets = [
                (featureset, label)
                for (featureset, label) in labeled_featuresets
                if featureset.get(self._fname) not in self._decisions
            ]
            label_freqs = FreqDist(label for (featureset, label) in default_featuresets)
            if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
                self._default = DecisionTreeClassifier.train(
                    default_featuresets,
                    entropy_cutoff,
                    depth_cutoff,
                    support_cutoff,
                    binary,
                    feature_values,
                    verbose,
                )

    @staticmethod
    def best_stump(feature_names, labeled_featuresets, verbose=False):
        """Return the lowest-error single-feature stump (or a plain leaf)."""
        best_stump = DecisionTreeClassifier.leaf(labeled_featuresets)
        best_error = best_stump.error(labeled_featuresets)
        for fname in feature_names:
            stump = DecisionTreeClassifier.stump(fname, labeled_featuresets)
            stump_error = stump.error(labeled_featuresets)
            if stump_error < best_error:
                best_error = stump_error
                best_stump = stump
        if verbose:
            # BUGFIX: wrap in str() -- when the best stump is a leaf,
            # _fname is None and "{:20}".format(None) raises TypeError.
            print(
                "best stump for {:6d} toks uses {:20} err={:6.4f}".format(
                    len(labeled_featuresets), str(best_stump._fname), best_error
                )
            )
        return best_stump

    @staticmethod
    def binary_stump(feature_name, feature_value, labeled_featuresets):
        """Return a binary stump testing ``feature_name == feature_value``."""
        # Majority label for the root node.
        label = FreqDist(label for (featureset, label) in labeled_featuresets).max()

        # Find the best label for each value.
        # BUGFIX: as in stump(), the loop variable must not shadow ``label``;
        # previously ``default`` below could end up as the last example's
        # label rather than the majority label.
        pos_fdist = FreqDist()
        neg_fdist = FreqDist()
        for featureset, example_label in labeled_featuresets:
            if featureset.get(feature_name) == feature_value:
                pos_fdist[example_label] += 1
            else:
                neg_fdist[example_label] += 1

        decisions = {}
        default = label
        # But hopefully we have observations!
        if pos_fdist.N() > 0:
            decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
        if neg_fdist.N() > 0:
            default = DecisionTreeClassifier(neg_fdist.max())

        return DecisionTreeClassifier(label, feature_name, decisions, default)

    @staticmethod
    def best_binary_stump(
        feature_names, labeled_featuresets, feature_values, verbose=False
    ):
        """Return the lowest-error binary stump over all (fname, fval) pairs."""
        best_stump = DecisionTreeClassifier.leaf(labeled_featuresets)
        best_error = best_stump.error(labeled_featuresets)
        for fname in feature_names:
            for fval in feature_values[fname]:
                stump = DecisionTreeClassifier.binary_stump(
                    fname, fval, labeled_featuresets
                )
                stump_error = stump.error(labeled_featuresets)
                if stump_error < best_error:
                    best_error = stump_error
                    best_stump = stump
        if verbose:
            if best_stump._decisions:
                descr = "{}={}".format(
                    best_stump._fname, list(best_stump._decisions.keys())[0]
                )
            else:
                descr = "(default)"
            print(
                "best stump for {:6d} toks uses {:20} err={:6.4f}".format(
                    len(labeled_featuresets), descr, best_error
                )
            )
        return best_stump
##//////////////////////////////////////////////////////
## Demo
##//////////////////////////////////////////////////////
def f(x):
    # Wrapper passed to names_demo() so the demo trains a *binary*
    # decision tree with progress reporting enabled.
    return DecisionTreeClassifier.train(x, verbose=True, binary=True)
def demo():
    """Train a binary decision tree on the names corpus and print both a
    pretty-printed rendering and a pseudocode rendering of the tree."""
    from nltk.classify.util import binary_names_demo_features, names_demo

    classifier = names_demo(
        f, binary_names_demo_features  # DecisionTreeClassifier.train,
    )
    print(classifier.pretty_format(depth=7))
    print(classifier.pseudocode(depth=7))


if __name__ == "__main__":
    demo()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,184 @@
# Natural Language Toolkit: Interface to Megam Classifier
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A set of functions used to interface with the external megam_ maxent
optimization package. Before megam can be used, you should tell NLTK where it
can find the megam binary, using the ``config_megam()`` function. Typical
usage:
>>> from nltk.classify import megam
>>> megam.config_megam() # pass path to megam if not found in PATH # doctest: +SKIP
[Found megam: ...]
Use with MaxentClassifier. Example below, see MaxentClassifier documentation
for details.
nltk.classify.MaxentClassifier.train(corpus, 'megam')
.. _megam: https://www.umiacs.umd.edu/~hal/megam/index.html
"""
import subprocess
from nltk.internals import find_binary
try:
import numpy
except ImportError:
numpy = None
######################################################################
# { Configuration
######################################################################
_megam_bin = None
def config_megam(bin=None):
    """
    Configure NLTK's interface to the ``megam`` maxent optimization
    package.

    :param bin: The full path to the ``megam`` binary.  If not specified,
        then nltk will search the system for a ``megam`` binary; and if
        one is not found, it will raise a ``LookupError`` exception.
    :type bin: str
    """
    global _megam_bin
    # megam is distributed under several binary names; accept any of them.
    candidate_names = ["megam.opt", "megam", "megam_686", "megam_i686.opt"]
    _megam_bin = find_binary(
        "megam",
        bin,
        env_vars=["MEGAM"],
        binary_names=candidate_names,
        url="https://www.umiacs.umd.edu/~hal/megam/index.html",
    )
######################################################################
# { Megam Interface Functions
######################################################################
def write_megam_file(train_toks, encoding, stream, bernoulli=True, explicit=True):
    """
    Generate an input file for ``megam`` based on the given corpus of
    classified tokens.

    :type train_toks: list(tuple(dict, str))
    :param train_toks: Training data, represented as a list of
        pairs, the first member of which is a feature dictionary,
        and the second of which is a classification label.

    :type encoding: MaxentFeatureEncodingI
    :param encoding: A feature encoding, used to convert featuresets
        into feature vectors.  May optionally implement a cost() method
        in order to assign different costs to different class predictions.

    :type stream: stream
    :param stream: The stream to which the megam input file should be
        written.

    :param bernoulli: If true, then use the 'bernoulli' format.  I.e.,
        all joint features have binary values, and are listed iff they
        are true.  Otherwise, list feature values explicitly.  If
        ``bernoulli=False``, then you must call ``megam`` with the
        ``-fvals`` option.

    :param explicit: If true, then use the 'explicit' format.  I.e.,
        list the features that would fire for any of the possible
        labels, for each token.  If ``explicit=True``, then you must
        call ``megam`` with the ``-explicit`` option.
    """
    # Look up the set of labels.
    labels = encoding.labels()
    label_index = {label: i for i, label in enumerate(labels)}

    # Write the file, which contains one line per instance.
    for featureset, label in train_toks:
        # Line prefix: per-label costs in the weighted multiclass case,
        # otherwise the numeric index of the correct label.
        if hasattr(encoding, "cost"):
            costs = (str(encoding.cost(featureset, label, l)) for l in labels)
            stream.write(":".join(costs))
        else:
            stream.write("%d" % label_index[label])

        if explicit:
            # Explicit format: for each possible label, list the features
            # that would fire if the instance were given that label.
            for candidate in labels:
                stream.write(" #")
                _write_megam_features(
                    encoding.encode(featureset, candidate), stream, bernoulli
                )
        else:
            # Implicit format: only the features firing for the actual label.
            _write_megam_features(encoding.encode(featureset, label), stream, bernoulli)

        # End of the instance.
        stream.write("\n")
def parse_megam_weights(s, features_count, explicit=True):
    """
    Given the stdout output generated by ``megam`` when training a
    model, return a ``numpy`` array containing the corresponding weight
    vector.  This function does not currently handle bias features.
    """
    if numpy is None:
        raise ValueError("This function requires that numpy be installed")
    assert explicit, "non-explicit not supported yet"
    weights = numpy.zeros(features_count, "d")
    # Each non-blank line is "<feature-id> <weight>".
    for line in s.strip().split("\n"):
        line = line.strip()
        if not line:
            continue
        fid, weight = line.split()
        weights[int(fid)] = float(weight)
    return weights
def _write_megam_features(vector, stream, bernoulli):
if not vector:
raise ValueError(
"MEGAM classifier requires the use of an " "always-on feature."
)
for fid, fval in vector:
if bernoulli:
if fval == 1:
stream.write(" %s" % fid)
elif fval != 0:
raise ValueError(
"If bernoulli=True, then all" "features must be binary."
)
else:
stream.write(f" {fid} {fval}")
def call_megam(args):
    """
    Call the ``megam`` binary with the given arguments.

    :param args: A list of command-line argument strings.
    :return: megam's standard output, decoded as a str.
    :raises TypeError: If ``args`` is a single string rather than a list.
    :raises OSError: If megam exits with a nonzero status.
    """
    if isinstance(args, str):
        raise TypeError("args should be a list of strings")
    if _megam_bin is None:
        # Locate the binary on first use.
        config_megam()

    # Call megam via a subprocess.
    # BUGFIX: stderr must be piped -- previously only stdout was captured,
    # so the ``print(stderr)`` below always printed ``None``.
    cmd = [_megam_bin] + args
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = p.communicate()

    # Check the return code.
    if p.returncode != 0:
        print()
        print(stderr.decode("utf-8", "replace"))
        raise OSError("megam command failed!")

    if isinstance(stdout, str):
        return stdout
    else:
        return stdout.decode("utf-8")

View File

@@ -0,0 +1,260 @@
# Natural Language Toolkit: Naive Bayes Classifiers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A classifier based on the Naive Bayes algorithm. In order to find the
probability for a label, this algorithm first uses the Bayes rule to
express P(label|features) in terms of P(label) and P(features|label):
| P(label) * P(features|label)
| P(label|features) = ------------------------------
| P(features)
The algorithm then makes the 'naive' assumption that all features are
independent, given the label:
| P(label) * P(f1|label) * ... * P(fn|label)
| P(label|features) = --------------------------------------------
| P(features)
Rather than computing P(features) explicitly, the algorithm just
calculates the numerator for each label, and normalizes them so they
sum to one:
| P(label) * P(f1|label) * ... * P(fn|label)
| P(label|features) = --------------------------------------------
| SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
"""
from collections import defaultdict
from nltk.classify.api import ClassifierI
from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist, sum_logs
##//////////////////////////////////////////////////////
## Naive Bayes Classifier
##//////////////////////////////////////////////////////
class NaiveBayesClassifier(ClassifierI):
    """
    A Naive Bayes classifier.  Naive Bayes classifiers are
    parameterized by two probability distributions:

    - P(label) gives the probability that an input will receive each
      label, given no information about the input's features.

    - P(fname=fval|label) gives the probability that a given feature
      (fname) will receive a given value (fval), given that the
      label (label).

    If the classifier encounters an input with a feature that has
    never been seen with any label, then rather than assigning a
    probability of 0 to all labels, it will ignore that feature.

    The feature value 'None' is reserved for unseen feature values;
    you generally should not use 'None' as a feature value for one of
    your own features.
    """

    def __init__(self, label_probdist, feature_probdist):
        """
        :param label_probdist: P(label), the probability distribution
            over labels.  It is expressed as a ``ProbDistI`` whose
            samples are labels.  I.e., P(label) =
            ``label_probdist.prob(label)``.

        :param feature_probdist: P(fname=fval|label), the probability
            distribution for feature values, given labels.  It is
            expressed as a dictionary whose keys are ``(label, fname)``
            pairs and whose values are ``ProbDistI`` objects over feature
            values.  I.e., P(fname=fval|label) =
            ``feature_probdist[label,fname].prob(fval)``.  If a given
            ``(label,fname)`` is not a key in ``feature_probdist``, then
            it is assumed that the corresponding P(fname=fval|label)
            is 0 for all values of ``fval``.
        """
        self._label_probdist = label_probdist
        self._feature_probdist = feature_probdist
        # The fixed, finite set of labels this classifier chooses from.
        self._labels = list(label_probdist.samples())

    def labels(self):
        """Return the list of labels this classifier chooses from."""
        return self._labels

    def classify(self, featureset):
        """Return the single most likely label for ``featureset``."""
        return self.prob_classify(featureset).max()

    def prob_classify(self, featureset):
        """
        Return a ``DictionaryProbDist`` giving P(label|featureset) for
        each label, computed in log space and normalized over labels.
        """
        # Discard any feature names that we've never seen before.
        # Otherwise, we'll just assign a probability of 0 to
        # everything.
        featureset = featureset.copy()
        for fname in list(featureset.keys()):
            for label in self._labels:
                if (label, fname) in self._feature_probdist:
                    break
            else:
                # print('Ignoring unseen feature %s' % fname)
                del featureset[fname]

        # Find the log probability of each label, given the features.
        # Start with the log probability of the label itself.
        logprob = {}
        for label in self._labels:
            logprob[label] = self._label_probdist.logprob(label)

        # Then add in the log probability of features given labels.
        for label in self._labels:
            for fname, fval in featureset.items():
                if (label, fname) in self._feature_probdist:
                    feature_probs = self._feature_probdist[label, fname]
                    logprob[label] += feature_probs.logprob(fval)
                else:
                    # nb: This case will never come up if the
                    # classifier was created by
                    # NaiveBayesClassifier.train().
                    logprob[label] += sum_logs([])  # = -INF.

        return DictionaryProbDist(logprob, normalize=True, log=True)

    def show_most_informative_features(self, n=10):
        """
        Print a table of the ``n`` most informative features, showing
        for each one the ratio between the highest and the lowest value
        of P(fname=fval|label) across labels.
        """
        # Determine the most relevant features, and display them.
        cpdist = self._feature_probdist
        print("Most Informative Features")

        for fname, fval in self.most_informative_features(n):

            def labelprob(l):
                # P(fname=fval|l), used only as a sort key below.
                return cpdist[l, fname].prob(fval)

            # Sort by negated probability, then reverse: labels[0] ends up
            # with the *lowest* P(fname=fval|label), labels[-1] the highest.
            labels = sorted(
                (l for l in self._labels if fval in cpdist[l, fname].samples()),
                key=lambda element: (-labelprob(element), element),
                reverse=True,
            )
            # A feature value seen with only one label carries no ratio.
            if len(labels) == 1:
                continue
            l0 = labels[0]
            l1 = labels[-1]
            if cpdist[l0, fname].prob(fval) == 0:
                ratio = "INF"
            else:
                ratio = "%8.1f" % (
                    cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval)
                )
            print(
                "%24s = %-14r %6s : %-6s = %s : 1.0"
                % (fname, fval, ("%s" % l1)[:6], ("%s" % l0)[:6], ratio)
            )

    def most_informative_features(self, n=100):
        """
        Return a list of the 'most informative' features used by this
        classifier.  For the purpose of this function, the
        informativeness of a feature ``(fname,fval)`` is equal to the
        highest value of P(fname=fval|label), for any label, divided by
        the lowest value of P(fname=fval|label), for any label:

        |  max[ P(fname=fval|label1) / P(fname=fval|label2) ]
        """
        # The result is memoized on first call; later calls (with any n)
        # just slice the cached list.
        if hasattr(self, "_most_informative_features"):
            return self._most_informative_features[:n]
        else:
            # The set of (fname, fval) pairs used by this classifier.
            features = set()
            # The max & min probability associated w/ each (fname, fval)
            # pair.  Maps (fname,fval) -> float.
            maxprob = defaultdict(float)
            minprob = defaultdict(lambda: 1.0)

            for (label, fname), probdist in self._feature_probdist.items():
                for fval in probdist.samples():
                    feature = (fname, fval)
                    features.add(feature)
                    p = probdist.prob(fval)
                    maxprob[feature] = max(p, maxprob[feature])
                    minprob[feature] = min(p, minprob[feature])
                    # A zero min would make the ratio infinite; drop it.
                    if minprob[feature] == 0:
                        features.discard(feature)

            # Convert features to a list, & sort it by how informative
            # features are.  (Smaller min/max ratio = more informative.)
            self._most_informative_features = sorted(
                features,
                key=lambda feature_: (
                    minprob[feature_] / maxprob[feature_],
                    feature_[0],
                    feature_[1] in [None, False, True],
                    str(feature_[1]).lower(),
                ),
            )
        return self._most_informative_features[:n]

    @classmethod
    def train(cls, labeled_featuresets, estimator=ELEProbDist):
        """
        :param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples ``(featureset, label)``.
        """
        label_freqdist = FreqDist()
        feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count up how many times each feature value occurred, given
        # the label and featurename.
        for featureset, label in labeled_featuresets:
            label_freqdist[label] += 1
            for fname, fval in featureset.items():
                # Increment freq(fval|label, fname)
                feature_freqdist[label, fname][fval] += 1
                # Record that fname can take the value fval.
                feature_values[fname].add(fval)
                # Keep a list of all feature names.
                fnames.add(fname)

        # If a feature didn't have a value given for an instance, then
        # we assume that it gets the implicit value 'None.'  This loop
        # counts up the number of 'missing' feature values for each
        # (label,fname) pair, and increments the count of the fval
        # 'None' by that amount.
        for label in label_freqdist:
            num_samples = label_freqdist[label]
            for fname in fnames:
                count = feature_freqdist[label, fname].N()
                # Only add a None key when necessary, i.e. if there are
                # any samples with feature 'fname' missing.
                if num_samples - count > 0:
                    feature_freqdist[label, fname][None] += num_samples - count
                    feature_values[fname].add(None)

        # Create the P(label) distribution
        label_probdist = estimator(label_freqdist)

        # Create the P(fval|label, fname) distribution
        feature_probdist = {}
        for (label, fname), freqdist in feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[label, fname] = probdist

        return cls(label_probdist, feature_probdist)
##//////////////////////////////////////////////////////
## Demo
##//////////////////////////////////////////////////////
def demo():
    """Train a NaiveBayesClassifier on the names corpus and print its
    most informative features."""
    from nltk.classify.util import names_demo

    classifier = names_demo(NaiveBayesClassifier.train)
    classifier.show_most_informative_features()


if __name__ == "__main__":
    demo()

View File

@@ -0,0 +1,180 @@
# Natural Language Toolkit: Positive Naive Bayes Classifier
#
# Copyright (C) 2012 NLTK Project
# Author: Alessandro Presta <alessandro.presta@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A variant of the Naive Bayes Classifier that performs binary classification with
partially-labeled training sets. In other words, assume we want to build a classifier
that assigns each example to one of two complementary classes (e.g., male names and
female names).
If we have a training set with labeled examples for both classes, we can use a
standard Naive Bayes Classifier. However, consider the case when we only have labeled
examples for one of the classes, and other, unlabeled, examples.
Then, assuming a prior distribution on the two labels, we can use the unlabeled set
to estimate the frequencies of the various features.
Let the two possible labels be 1 and 0, and let's say we only have examples labeled 1
and unlabeled examples. We are also given an estimate of P(1).
We compute P(feature|1) exactly as in the standard case.
To compute P(feature|0), we first estimate P(feature) from the unlabeled set (we are
assuming that the unlabeled examples are drawn according to the given prior distribution)
and then express the conditional probability as:
| P(feature) - P(feature|1) * P(1)
| P(feature|0) = ----------------------------------
| P(0)
Example:
>>> from nltk.classify import PositiveNaiveBayesClassifier
Some sentences about sports:
>>> sports_sentences = [ 'The team dominated the game',
... 'They lost the ball',
... 'The game was intense',
... 'The goalkeeper catched the ball',
... 'The other team controlled the ball' ]
Mixed topics, including sports:
>>> various_sentences = [ 'The President did not comment',
... 'I lost the keys',
... 'The team won the game',
... 'Sara has two kids',
... 'The ball went off the court',
... 'They had the ball for the whole game',
... 'The show is over' ]
The features of a sentence are simply the words it contains:
>>> def features(sentence):
... words = sentence.lower().split()
... return dict(('contains(%s)' % w, True) for w in words)
We use the sports sentences as positive examples, the mixed ones ad unlabeled examples:
>>> positive_featuresets = map(features, sports_sentences)
>>> unlabeled_featuresets = map(features, various_sentences)
>>> classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
... unlabeled_featuresets)
Is the following sentence about sports?
>>> classifier.classify(features('The cat is on the table'))
False
What about this one?
>>> classifier.classify(features('My team lost the game'))
True
"""
from collections import defaultdict
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist
##//////////////////////////////////////////////////////
## Positive Naive Bayes Classifier
##//////////////////////////////////////////////////////
class PositiveNaiveBayesClassifier(NaiveBayesClassifier):
    """
    A binary Naive Bayes classifier trained from positive and unlabeled
    examples only; see the module docstring for the derivation of
    P(feature|0) from the unlabeled data and the prior P(1).
    """

    @staticmethod
    def train(
        positive_featuresets,
        unlabeled_featuresets,
        positive_prob_prior=0.5,
        estimator=ELEProbDist,
    ):
        """
        :param positive_featuresets: An iterable of featuresets that are known as positive
            examples (i.e., their label is ``True``).

        :param unlabeled_featuresets: An iterable of featuresets whose label is unknown.

        :param positive_prob_prior: A prior estimate of the probability of the label
            ``True`` (default 0.5).
        """
        positive_feature_freqdist = defaultdict(FreqDist)
        unlabeled_feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count up how many times each feature value occurred in positive examples.
        num_positive_examples = 0
        for featureset in positive_featuresets:
            for fname, fval in featureset.items():
                positive_feature_freqdist[fname][fval] += 1
                feature_values[fname].add(fval)
                fnames.add(fname)
            num_positive_examples += 1

        # Count up how many times each feature value occurred in unlabeled examples.
        num_unlabeled_examples = 0
        for featureset in unlabeled_featuresets:
            for fname, fval in featureset.items():
                unlabeled_feature_freqdist[fname][fval] += 1
                feature_values[fname].add(fval)
                fnames.add(fname)
            num_unlabeled_examples += 1

        # If a feature didn't have a value given for an instance, then we assume that
        # it gets the implicit value 'None'.
        for fname in fnames:
            count = positive_feature_freqdist[fname].N()
            positive_feature_freqdist[fname][None] += num_positive_examples - count
            feature_values[fname].add(None)

        for fname in fnames:
            count = unlabeled_feature_freqdist[fname].N()
            unlabeled_feature_freqdist[fname][None] += num_unlabeled_examples - count
            feature_values[fname].add(None)

        negative_prob_prior = 1.0 - positive_prob_prior

        # Create the P(label) distribution.
        label_probdist = DictionaryProbDist(
            {True: positive_prob_prior, False: negative_prob_prior}
        )

        # Create the P(fval|label, fname) distribution.
        feature_probdist = {}
        for fname, freqdist in positive_feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[True, fname] = probdist

        for fname, freqdist in unlabeled_feature_freqdist.items():
            # Estimate P(feature) from the unlabeled set and solve
            # P(feature|0) = (P(feature) - P(feature|1)*P(1)) / P(0).
            global_probdist = estimator(freqdist, bins=len(feature_values[fname]))
            negative_feature_probs = {}
            for fval in feature_values[fname]:
                prob = (
                    global_probdist.prob(fval)
                    - positive_prob_prior * feature_probdist[True, fname].prob(fval)
                ) / negative_prob_prior
                # TODO: We need to add some kind of smoothing here, instead of
                # setting negative probabilities to zero and normalizing.
                negative_feature_probs[fval] = max(prob, 0.0)
            feature_probdist[False, fname] = DictionaryProbDist(
                negative_feature_probs, normalize=True
            )

        return PositiveNaiveBayesClassifier(label_probdist, feature_probdist)
##//////////////////////////////////////////////////////
## Demo
##//////////////////////////////////////////////////////
def demo():
    """Train a PositiveNaiveBayesClassifier on the partial-names demo data
    and display its most informative features."""
    from nltk.classify.util import partial_names_demo

    trained = partial_names_demo(PositiveNaiveBayesClassifier.train)
    trained.show_most_informative_features()

View File

@@ -0,0 +1,183 @@
# Natural Language Toolkit: RTE Classifier
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Simple classifier for RTE corpus.
It calculates the overlap in words and named entities between text and
hypothesis, and also whether there are words / named entities in the
hypothesis which fail to occur in the text, since this is an indicator that
the hypothesis is more informative than (i.e not entailed by) the text.
TO DO: better Named Entity classification
TO DO: add lemmatization
"""
from nltk.classify.maxent import MaxentClassifier
from nltk.classify.util import accuracy
from nltk.tokenize import RegexpTokenizer
class RTEFeatureExtractor:
    """
    This builds a bag of words for both the text and the hypothesis after
    throwing away some stopwords, then calculates overlap and difference.
    """

    def __init__(self, rtepair, stop=True, use_lemmatize=False):
        """
        :param rtepair: a ``RTEPair`` from which features should be extracted
        :param stop: if ``True``, stopwords are thrown away.
        :type stop: bool
        :param use_lemmatize: if ``True``, reduce tokens to their WordNet
            verb base form (via ``morphy``) before computing the word sets.
        :type use_lemmatize: bool
        """
        self.stop = stop
        # Small built-in stopword list; avoids a dependency on the
        # stopwords corpus.
        self.stopwords = {
            "a",
            "the",
            "it",
            "they",
            "of",
            "in",
            "to",
            "is",
            "have",
            "are",
            "were",
            "and",
            "very",
            ".",
            ",",
        }
        # Negation cues; exposed for callers building negation features.
        self.negwords = {"no", "not", "never", "failed", "rejected", "denied"}
        # Try to tokenize so that abbreviations, monetary amounts, email
        # addresses, URLs are single tokens.
        tokenizer = RegexpTokenizer(r"[\w.@:/]+|\w+|\$[\d.]+")
        # Get the set of word types for text and hypothesis
        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
        self.text_words = set(self.text_tokens)
        self.hyp_words = set(self.hyp_tokens)
        if use_lemmatize:
            self.text_words = {self._lemmatize(token) for token in self.text_tokens}
            self.hyp_words = {self._lemmatize(token) for token in self.hyp_tokens}
        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords
        # Precompute the three set relations used by the feature methods.
        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words

    def overlap(self, toktype, debug=False):
        """
        Compute the overlap between text and hypothesis.

        :param toktype: distinguish Named Entities from ordinary words
        :type toktype: 'ne' or 'word'
        :param debug: if ``True``, print the computed overlap set.
        :raises ValueError: if ``toktype`` is neither ``'ne'`` nor ``'word'``.
        """
        ne_overlap = {token for token in self._overlap if self._ne(token)}
        if toktype == "ne":
            if debug:
                print("ne overlap", ne_overlap)
            return ne_overlap
        elif toktype == "word":
            if debug:
                print("word overlap", self._overlap - ne_overlap)
            return self._overlap - ne_overlap
        else:
            raise ValueError("Type not recognized:'%s'" % toktype)

    def hyp_extra(self, toktype, debug=True):
        """
        Compute the extraneous material in the hypothesis.

        :param toktype: distinguish Named Entities from ordinary words
        :type toktype: 'ne' or 'word'
        :param debug: NOTE(review): accepted for symmetry with ``overlap``
            but never read here, and its default (True) differs — confirm
            before relying on it.
        :raises ValueError: if ``toktype`` is neither ``'ne'`` nor ``'word'``.
        """
        ne_extra = {token for token in self._hyp_extra if self._ne(token)}
        if toktype == "ne":
            return ne_extra
        elif toktype == "word":
            return self._hyp_extra - ne_extra
        else:
            raise ValueError("Type not recognized: '%s'" % toktype)

    @staticmethod
    def _ne(token):
        """
        This just assumes that words in all caps or titles are
        named entities.

        :type token: str
        """
        if token.istitle() or token.isupper():
            return True
        return False

    @staticmethod
    def _lemmatize(word):
        """
        Use morphy from WordNet to find the base form of verbs.
        """
        from nltk.corpus import wordnet as wn

        lemma = wn.morphy(word, pos=wn.VERB)
        if lemma is not None:
            return lemma
        return word
def rte_features(rtepair):
    """Return the standard RTE featureset (overlap, hypothesis-extra and
    negation counts) for a single text/hypothesis pair."""
    extractor = RTEFeatureExtractor(rtepair)
    return {
        "alwayson": True,
        "word_overlap": len(extractor.overlap("word")),
        "word_hyp_extra": len(extractor.hyp_extra("word")),
        "ne_overlap": len(extractor.overlap("ne")),
        "ne_hyp_extra": len(extractor.hyp_extra("ne")),
        "neg_txt": len(extractor.negwords & extractor.text_words),
        "neg_hyp": len(extractor.negwords & extractor.hyp_words),
    }
def rte_featurize(rte_pairs):
    """Map each RTE pair to a ``(featureset, label)`` tuple suitable for
    training or evaluating a classifier."""
    featurized = []
    for pair in rte_pairs:
        featurized.append((rte_features(pair), pair.value))
    return featurized
def rte_classifier(algorithm, sample_N=None):
    """
    Train a Maxent classifier on the RTE dev corpora and evaluate it on
    the RTE test corpora.

    :param algorithm: one of ``'megam'``, ``'GIS'`` or ``'IIS'``.
    :param sample_N: optional cap on the number of train/test pairs used.
    :return: the trained ``MaxentClassifier``.
    :raises ValueError: if ``algorithm`` is not supported.
    """
    # Fail fast on an unsupported algorithm, before the expensive corpus
    # load.  ValueError (a subclass of Exception) replaces the bare
    # Exception raised previously, so existing callers still catch it.
    if algorithm not in ("megam", "GIS", "IIS"):
        raise ValueError(
            "RTEClassifier only supports these algorithms:\n "
            "'megam', 'GIS', 'IIS'.\n"
        )

    from nltk.corpus import rte as rte_corpus

    train_set = rte_corpus.pairs(["rte1_dev.xml", "rte2_dev.xml", "rte3_dev.xml"])
    test_set = rte_corpus.pairs(["rte1_test.xml", "rte2_test.xml", "rte3_test.xml"])
    if sample_N is not None:
        train_set = train_set[:sample_N]
        test_set = test_set[:sample_N]
    featurized_train_set = rte_featurize(train_set)
    featurized_test_set = rte_featurize(test_set)
    # Train the classifier.  The original megam and GIS/IIS branches ran
    # identical code, so they are merged into one call.
    print("Training classifier...")
    clf = MaxentClassifier.train(featurized_train_set, algorithm)
    print("Testing classifier...")
    acc = accuracy(clf, featurized_test_set)
    print("Accuracy: %6.4f" % acc)
    return clf

View File

@@ -0,0 +1,143 @@
# Natural Language Toolkit: Interface to scikit-learn classifiers
#
# Author: Lars Buitinck <L.J.Buitinck@uva.nl>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
scikit-learn (https://scikit-learn.org) is a machine learning library for
Python. It supports many classification algorithms, including SVMs,
Naive Bayes, logistic regression (MaxEnt) and decision trees.
This package implements a wrapper around scikit-learn classifiers. To use this
wrapper, construct a scikit-learn estimator object, then use that to construct
a SklearnClassifier. E.g., to wrap a linear SVM with default settings:
>>> from sklearn.svm import LinearSVC
>>> from nltk.classify.scikitlearn import SklearnClassifier
>>> classif = SklearnClassifier(LinearSVC())
A scikit-learn classifier may include preprocessing steps when it's wrapped
in a Pipeline object. The following constructs and wraps a Naive Bayes text
classifier with tf-idf weighting and chi-square feature selection to get the
best 1000 features:
>>> from sklearn.feature_extraction.text import TfidfTransformer
>>> from sklearn.feature_selection import SelectKBest, chi2
>>> from sklearn.naive_bayes import MultinomialNB
>>> from sklearn.pipeline import Pipeline
>>> pipeline = Pipeline([('tfidf', TfidfTransformer()),
... ('chi2', SelectKBest(chi2, k=1000)),
... ('nb', MultinomialNB())])
>>> classif = SklearnClassifier(pipeline)
"""
from nltk.classify.api import ClassifierI
from nltk.probability import DictionaryProbDist
try:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
except ImportError:
pass
__all__ = ["SklearnClassifier"]
class SklearnClassifier(ClassifierI):
    """Adapter exposing a scikit-learn estimator through the nltk
    ``ClassifierI`` interface."""

    def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)

    def __repr__(self):
        return f"<SklearnClassifier({self._clf!r})>"

    def classify_many(self, featuresets):
        """Classify a batch of samples.

        :param featuresets: An iterable over featuresets, each a dict mapping
            strings to either numbers, booleans or strings.
        :return: The predicted class label for each input sample.
        :rtype: list
        """
        matrix = self._vectorizer.transform(featuresets)
        known_labels = self._encoder.classes_
        return [known_labels[index] for index in self._clf.predict(matrix)]

    def prob_classify_many(self, featuresets):
        """Compute per-class probabilities for a batch of samples.

        :param featuresets: An iterable over featuresets, each a dict mapping
            strings to either numbers, booleans or strings.
        :rtype: list of ``ProbDistI``
        """
        matrix = self._vectorizer.transform(featuresets)
        return [self._make_probdist(row) for row in self._clf.predict_proba(matrix)]

    def labels(self):
        """The class labels used by this classifier.

        :rtype: list
        """
        return list(self._encoder.classes_)

    def train(self, labeled_featuresets):
        """
        Train (fit) the scikit-learn estimator.

        :param labeled_featuresets: A list of ``(featureset, label)``
            where each ``featureset`` is a dict mapping strings to either
            numbers, booleans or strings.
        """
        featuresets, label_list = zip(*labeled_featuresets)
        matrix = self._vectorizer.fit_transform(featuresets)
        targets = self._encoder.fit_transform(label_list)
        self._clf.fit(matrix, targets)
        return self

    def _make_probdist(self, y_proba):
        # Pair each known class label with its predicted probability.
        known_labels = self._encoder.classes_
        return DictionaryProbDist(
            {label: prob for label, prob in zip(known_labels, y_proba)}
        )
if __name__ == "__main__":
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import BernoulliNB

    from nltk.classify.util import names_demo, names_demo_features

    # Bernoulli Naive Bayes is designed for binary classification. We set the
    # binarize option to False since we know we're passing boolean features.
    print("scikit-learn Naive Bayes:")
    naive_bayes_trainer = SklearnClassifier(BernoulliNB(binarize=False)).train
    names_demo(naive_bayes_trainer, features=names_demo_features)

    # The C parameter on logistic regression (MaxEnt) controls regularization.
    # The higher it's set, the less regularized the classifier is.
    print("\n\nscikit-learn logistic regression:")
    maxent_trainer = SklearnClassifier(LogisticRegression(C=1000)).train
    names_demo(maxent_trainer, features=names_demo_features)

View File

@@ -0,0 +1,175 @@
# Natural Language Toolkit: Senna Interface
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A general interface to the SENNA pipeline that supports any of the
operations specified in SUPPORTED_OPERATIONS.
Applying multiple operations at once has the speed advantage. For example,
Senna will automatically determine POS tags if you are extracting named
entities. Applying both of the operations will cost only the time of
extracting the named entities.
The SENNA pipeline has a fixed maximum size of the sentences that it can read.
By default it is 1024 token/sentence. If you have larger sentences, changing
the MAX_SENTENCE_SIZE value in SENNA_main.c should be considered and your
system specific binary should be rebuilt. Otherwise this could introduce
misalignment errors.
The input is:
- path to the directory that contains SENNA executables. If the path is incorrect,
Senna will automatically search for executable file specified in SENNA environment variable
- List of the operations needed to be performed.
- (optionally) the encoding of the input data (default:utf-8)
Note: Unit tests for this module can be found in test/unit/test_senna.py
>>> from nltk.classify import Senna
>>> pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner']) # doctest: +SKIP
>>> sent = 'Dusseldorf is an international business center'.split()
>>> [(token['word'], token['chk'], token['ner'], token['pos']) for token in pipeline.tag(sent)] # doctest: +SKIP
[('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'), ('is', 'B-VP', 'O', 'VBZ'), ('an', 'B-NP', 'O', 'DT'),
('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')]
"""
from os import environ, path, sep
from platform import architecture, system
from subprocess import PIPE, Popen
from nltk.tag.api import TaggerI
class Senna(TaggerI):
    # Operations the pipeline can be asked for; the order here fixes the
    # column order of SENNA's tab-separated output (see ``_map``).
    SUPPORTED_OPERATIONS = ["pos", "chk", "ner"]

    def __init__(self, senna_path, operations, encoding="utf-8"):
        """
        :param senna_path: path to the directory containing the SENNA executables.
        :param operations: the subset of ``SUPPORTED_OPERATIONS`` to run.
        :param encoding: encoding used for data exchanged with the SENNA
            subprocess (default: utf-8).
        """
        self._encoding = encoding
        self._path = path.normpath(senna_path) + sep
        # Verifies the existence of the executable on the self._path first
        exe_file_1 = self.executable(self._path)
        if not path.isfile(exe_file_1):
            # Check for the system environment
            if "SENNA" in environ:
                self._path = path.normpath(environ["SENNA"]) + sep
                exe_file_2 = self.executable(self._path)
                if not path.isfile(exe_file_2):
                    raise LookupError(
                        "Senna executable expected at %s or %s but not found"
                        % (exe_file_1, exe_file_2)
                    )
            # NOTE(review): if the executable is missing and the SENNA
            # environment variable is unset, no error is raised here;
            # ``tag_sents`` raises a LookupError later instead.
        self.operations = operations

    def executable(self, base_path):
        """
        The function that determines the system specific binary that should be
        used in the pipeline. In case, the system is not known the default senna binary will
        be used.
        """
        os_name = system()
        if os_name == "Linux":
            bits = architecture()[0]
            if bits == "64bit":
                return path.join(base_path, "senna-linux64")
            return path.join(base_path, "senna-linux32")
        if os_name == "Windows":
            return path.join(base_path, "senna-win32.exe")
        if os_name == "Darwin":
            return path.join(base_path, "senna-osx")
        # Unknown platform: fall back to the generic binary name.
        return path.join(base_path, "senna")

    def _map(self):
        """
        A method that calculates the order of the columns that SENNA pipeline
        will output the tags into. This depends on the operations being ordered.

        :return: dict mapping each requested operation to its (1-based)
            column index in SENNA's output; column 0 is the token itself.
        """
        _map = {}
        i = 1
        for operation in Senna.SUPPORTED_OPERATIONS:
            if operation in self.operations:
                _map[operation] = i
                i += 1
        return _map

    def tag(self, tokens):
        """
        Applies the specified operation(s) on a list of tokens.

        :return: a list of dicts, one per token, each holding the token
            under ``'word'`` plus one key per requested operation.
        """
        return self.tag_sents([tokens])[0]

    def tag_sents(self, sentences):
        """
        Applies the tag method over a list of sentences. This method will return a
        list of dictionaries. Every dictionary will contain a word with its
        calculated annotations/tags.

        :raises LookupError: if the SENNA executable cannot be found.
        :raises RuntimeError: if the SENNA subprocess exits with an error.
        :raises IndexError: if SENNA's output cannot be aligned with the
            input tokens (e.g. a sentence exceeded SENNA's size limit).
        """
        encoding = self._encoding
        if not path.isfile(self.executable(self._path)):
            raise LookupError(
                "Senna executable expected at %s but not found"
                % self.executable(self._path)
            )
        # Build the senna command to run the tagger
        _senna_cmd = [
            self.executable(self._path),
            "-path",
            self._path,
            "-usrtokens",
            "-iobtags",
        ]
        _senna_cmd.extend(["-" + op for op in self.operations])
        # Serialize the actual sentences to a temporary string
        _input = "\n".join(" ".join(x) for x in sentences) + "\n"
        if isinstance(_input, str) and encoding:
            _input = _input.encode(encoding)
        # Run the tagger and get the output
        p = Popen(_senna_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        (stdout, stderr) = p.communicate(input=_input)
        senna_output = stdout
        # Check the return code.
        if p.returncode != 0:
            raise RuntimeError("Senna command failed! Details: %s" % stderr)
        if encoding:
            senna_output = stdout.decode(encoding)
        # Output the tagged sentences.  SENNA emits one tab-separated line
        # per token and a blank line between sentences.
        map_ = self._map()
        tagged_sentences = [[]]
        sentence_index = 0
        token_index = 0
        for tagged_word in senna_output.strip().split("\n"):
            if not tagged_word:
                tagged_sentences.append([])
                sentence_index += 1
                token_index = 0
                continue
            tags = tagged_word.split("\t")
            result = {}
            for tag in map_:
                result[tag] = tags[map_[tag]].strip()
            try:
                # Use the caller's token rather than SENNA's echo of it.
                result["word"] = sentences[sentence_index][token_index]
            except IndexError as e:
                raise IndexError(
                    "Misalignment error occurred at sentence number %d. Possible reason"
                    " is that the sentence size exceeded the maximum size. Check the "
                    "documentation of Senna class for more information."
                    % sentence_index
                ) from e
            tagged_sentences[-1].append(result)
            token_index += 1
        return tagged_sentences

View File

@@ -0,0 +1,17 @@
# Natural Language Toolkit: SVM-based classifier
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Leon Derczynski <leon@dcs.shef.ac.uk>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
nltk.classify.svm was deprecated. For classification based
on support vector machines SVMs use nltk.classify.scikitlearn
(or `scikit-learn <https://scikit-learn.org>`_ directly).
"""
class SvmClassifier:
    """Placeholder for the removed SVM classifier; any attempt to
    instantiate it fails with a pointer to the replacement."""

    def __init__(self, *args, **kwargs):
        # The module docstring explains the scikit-learn based replacement.
        raise NotImplementedError(__doc__)

View File

@@ -0,0 +1,122 @@
# Natural Language Toolkit: Interface to TADM Classifier
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Joseph Frazee <jfrazee@mail.utexas.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import subprocess
import sys
from nltk.internals import find_binary
try:
import numpy
except ImportError:
pass
_tadm_bin = None
def config_tadm(bin=None):
    """
    Locate the ``tadm`` binary and remember its path in the module-level
    ``_tadm_bin`` variable.

    :param bin: explicit path to the binary; when ``None``, ``find_binary``
        searches the usual locations and the ``TADM`` environment variable.
    """
    global _tadm_bin
    located = find_binary(
        "tadm",
        bin,
        env_vars=["TADM"],
        binary_names=["tadm"],
        url="http://tadm.sf.net",
    )
    _tadm_bin = located
def write_tadm_file(train_toks, encoding, stream):
    """
    Generate an input file for ``tadm`` based on the given corpus of
    classified tokens.

    :type train_toks: list(tuple(dict, str))
    :param train_toks: Training data, represented as a list of
        pairs, the first member of which is a feature dictionary,
        and the second of which is a classification label.
    :type encoding: TadmEventMaxentFeatureEncoding
    :param encoding: A feature encoding, used to convert featuresets
        into feature vectors.
    :type stream: stream
    :param stream: The stream to which the ``tadm`` input file should be
        written.
    """
    # File format (see the tadm forum threads referenced upstream): for
    # each training token, first a line giving the number of candidate
    # labels, then one line per candidate:
    #     <is-correct> <num-features> <fid> <fval> ...
    labels = encoding.labels()
    header = "%d\n" % len(labels)
    for featureset, gold_label in train_toks:
        stream.write(header)
        for candidate in labels:
            vector = encoding.encode(featureset, candidate)
            feature_text = " ".join("%d %d" % pair for pair in vector)
            stream.write(
                "%d %d %s\n"
                % (int(gold_label == candidate), len(vector), feature_text)
            )
def parse_tadm_weights(paramfile):
    """
    Given the stdout output generated by ``tadm`` when training a
    model (one weight per line), return a ``numpy`` array of doubles
    containing the corresponding weight vector.
    """
    return numpy.array([float(line.strip()) for line in paramfile], "d")
def call_tadm(args):
    """
    Call the ``tadm`` binary with the given arguments.

    :type args: list(str)
    :param args: command-line arguments to pass to ``tadm``.
    :raises TypeError: if ``args`` is a single string instead of a list.
    :raises OSError: if the ``tadm`` command exits with a nonzero status.
    """
    if isinstance(args, str):
        raise TypeError("args should be a list of strings")
    if _tadm_bin is None:
        config_tadm()

    # Call tadm via a subprocess.  stdout is inherited so tadm's progress
    # output is still shown to the user; stderr is captured so that it can
    # be reported on failure.  (Previously stderr was printed but never
    # captured, so the failure path always printed ``None``.)
    cmd = [_tadm_bin] + args
    p = subprocess.Popen(cmd, stdout=sys.stdout, stderr=subprocess.PIPE)
    (stdout, stderr) = p.communicate()
    # Check the return code.
    if p.returncode != 0:
        print()
        print(stderr.decode(errors="replace"))
        raise OSError("tadm command failed!")
def names_demo():
    """Run the standard names-corpus demo using the TADM maxent trainer."""
    from nltk.classify.maxent import TadmMaxentClassifier
    from nltk.classify.util import names_demo as run_names_demo

    run_names_demo(TadmMaxentClassifier.train)
def encoding_demo():
    """Show how ``TadmEventMaxentFeatureEncoding`` encodes a tiny corpus,
    printing the tadm input file followed by a feature-id legend."""
    import sys

    from nltk.classify.maxent import TadmEventMaxentFeatureEncoding

    samples = [
        ({"f0": 1, "f1": 1, "f3": 1}, "A"),
        ({"f0": 1, "f2": 1, "f4": 1}, "B"),
        ({"f0": 2, "f2": 1, "f3": 1, "f4": 1}, "A"),
    ]
    enc = TadmEventMaxentFeatureEncoding.train(samples)
    write_tadm_file(samples, enc, sys.stdout)
    print()
    for feature_id in range(enc.length()):
        print("%s --> %d" % (enc.describe(feature_id), feature_id))
    print()
if __name__ == "__main__":
    # Run both demos when this module is executed as a script.
    encoding_demo()
    names_demo()

View File

@@ -0,0 +1,193 @@
# Natural Language Toolkit: Language ID module using TextCat algorithm
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A module for language identification using the TextCat algorithm.
An implementation of the text categorization algorithm
presented in Cavnar, W. B. and J. M. Trenkle,
"N-Gram-Based Text Categorization".
The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile languages and text-yet to
be identified-then compares using a distance measure.
Language n-grams are provided by the "An Crubadan"
project. A corpus reader was created separately to read
those files.
For details regarding the algorithm, see:
https://www.let.rug.nl/~vannoord/TextCat/textcat.pdf
For details about An Crubadan, see:
https://borel.slu.edu/crubadan/index.html
"""
from sys import maxsize
from nltk.util import trigrams
# Note: this is NOT "re" you're likely used to. The regex module
# is an alternative to the standard re module that supports
# Unicode codepoint properties with the \p{} syntax.
# You may have to "pip install regex"
try:
import regex as re
except ImportError:
re = None
######################################################################
## Language identification using TextCat
######################################################################
class TextCat:

    # Crubadan corpus reader; set in ``__init__``.
    _corpus = None
    # NOTE(review): appears unused anywhere in this class — confirm before
    # removing.  Also note this is a mutable class-level dict, shared by
    # all instances.
    fingerprints = {}
    # Sentinels marking word boundaries when building trigrams.
    _START_CHAR = "<"
    _END_CHAR = ">"
    # Per-language distances from the most recent ``guess_language`` call.
    last_distances = {}

    def __init__(self):
        """
        :raises OSError: if the third-party ``regex`` module is unavailable.
        """
        if not re:
            raise OSError(
                "classify.textcat requires the regex module that "
                "supports unicode. Try '$ pip install regex' and "
                "see https://pypi.python.org/pypi/regex for "
                "further details."
            )
        from nltk.corpus import crubadan

        self._corpus = crubadan
        # Load all language ngrams into cache
        for lang in self._corpus.langs():
            self._corpus.lang_freq(lang)

    def remove_punctuation(self, text):
        """Get rid of punctuation except apostrophes"""
        # \P{P} matches any non-punctuation codepoint (regex-module syntax).
        return re.sub(r"[^\P{P}\']+", "", text)

    def profile(self, text):
        """Create FreqDist of trigrams within text"""
        from nltk import FreqDist, word_tokenize

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)
        fingerprint = FreqDist()
        for t in tokens:
            # Pad each token with boundary sentinels before taking trigrams.
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = ["".join(tri) for tri in token_trigram_tuples]
            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1
        return fingerprint

    def calc_dist(self, lang, trigram, text_profile):
        """Calculate the "out-of-place" measure between the
        text and language profile for a single trigram"""
        lang_fd = self._corpus.lang_freq(lang)
        dist = 0
        if trigram in lang_fd:
            # Rank difference between the trigram's position in the two
            # profiles.  NOTE(review): assumes key iteration order reflects
            # frequency rank — confirm how FreqDist orders its keys.
            idx_lang_profile = list(lang_fd.keys()).index(trigram)
            idx_text = list(text_profile.keys()).index(trigram)
            dist = abs(idx_lang_profile - idx_text)
        else:
            # Arbitrary but should be larger than
            # any possible trigram file length
            # in terms of total lines
            dist = maxsize
        return dist

    def lang_dists(self, text):
        """Calculate the "out-of-place" measure between
        the text and all languages"""
        distances = {}
        profile = self.profile(text)
        # For all the languages
        for lang in self._corpus._all_lang_freq.keys():
            # Calculate distance metric for every trigram in
            # input text to be identified
            lang_dist = 0
            for trigram in profile:
                lang_dist += self.calc_dist(lang, trigram, profile)
            distances[lang] = lang_dist
        return distances

    def guess_language(self, text):
        """Find the language with the min distance
        to the text and return its ISO 639-3 code"""
        self.last_distances = self.lang_dists(text)
        return min(self.last_distances, key=self.last_distances.get)
#################################################
def demo():
    """
    Identify the language of UDHR snippets for a handful of languages,
    printing the detected ISO 639-3 code with a human-friendly name.
    """
    from nltk.corpus import udhr

    langs = [
        "Kurdish-UTF8",
        "Abkhaz-UTF8",
        "Farsi_Persian-UTF8",
        "Hindi-UTF8",
        "Hawaiian-UTF8",
        "Russian-UTF8",
        "Vietnamese-UTF8",
        "Serbian_Srpski-UTF8",
        "Esperanto-UTF8",
    ]
    friendly = {
        "kmr": "Northern Kurdish",
        "abk": "Abkhazian",
        "pes": "Iranian Persian",
        "hin": "Hindi",
        "haw": "Hawaiian",
        "rus": "Russian",
        "vie": "Vietnamese",
        "srp": "Serbian",
        "epo": "Esperanto",
    }
    tc = TextCat()
    for cur_lang in langs:
        # Get raw data from UDHR corpus
        raw_sentences = udhr.sents(cur_lang)
        # Build a sample from every sentence except the last one (the
        # original sampling deliberately stopped one short).
        pieces = []
        for idx in range(len(raw_sentences) - 1):
            pieces.append(" " + " ".join(raw_sentences[idx]))
        sample = "".join(pieces)
        # Try to detect what it is
        print("Language snippet: " + sample[0:140] + "...")
        guess = tc.guess_language(sample)
        print(f"Language detection: {guess} ({friendly[guess]})")
        print("#" * 140)
if __name__ == "__main__":
    # Run the language-identification demo when executed as a script.
    demo()

View File

@@ -0,0 +1,347 @@
# Natural Language Toolkit: Classifier Utility Functions
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Utility functions and classes for classifiers.
"""
import math
# from nltk.util import Deprecated
import nltk.classify.util # for accuracy & log_likelihood
from nltk.util import LazyMap
######################################################################
# { Helper Functions
######################################################################
# alternative name possibility: 'map_featurefunc()'?
# alternative name possibility: 'detect_features()'?
# alternative name possibility: 'map_featuredetect()'?
# or.. just have users use LazyMap directly?
def apply_features(feature_func, toks, labeled=None):
    """
    Use the ``LazyMap`` class to construct a lazy list-like
    object that is analogous to ``map(feature_func, toks)``.  In
    particular, if ``labeled=False``, then the returned list-like
    object's values are equal to::

        [feature_func(tok) for tok in toks]

    If ``labeled=True``, then the returned list-like object's values
    are equal to::

        [(feature_func(tok), label) for (tok, label) in toks]

    The primary purpose of this function is to avoid the memory
    overhead involved in storing all the featuresets for every token
    in a corpus.  Instead, these featuresets are constructed lazily,
    as-needed.  The reduction in memory overhead can be especially
    significant when the underlying list of tokens is itself lazy (as
    is the case with many corpus readers).

    :param feature_func: The function that will be applied to each
        token.  It should return a featureset -- i.e., a dict
        mapping feature names to feature values.
    :param toks: The list of tokens to which ``feature_func`` should be
        applied.  If ``labeled=False``, then the list elements will be
        passed directly to ``feature_func()``.  If ``labeled=True``,
        then the list elements should be tuples ``(tok,label)``, and
        ``tok`` will be passed to ``feature_func()``.
    :param labeled: If true, then ``toks`` contains labeled tokens --
        i.e., tuples of the form ``(tok, label)``.  (Default:
        auto-detect based on types.)
    """
    # Auto-detect: treat ``toks`` as labeled when its first element looks
    # like a ``(tok, label)`` pair.
    if labeled is None:
        labeled = toks and isinstance(toks[0], (tuple, list))
    if labeled:

        def lazy_func(labeled_token):
            return (feature_func(labeled_token[0]), labeled_token[1])

        return LazyMap(lazy_func, toks)
    else:
        return LazyMap(feature_func, toks)
def attested_labels(tokens):
    """
    :return: A tuple of all labels that are attested in the given list
        of tokens.
    :rtype: tuple of (immutable)
    :param tokens: The list of classified tokens from which to extract
        labels. A classified token has the form ``(token, label)``.
    :type tokens: list
    """
    seen = {label for (_, label) in tokens}
    return tuple(seen)
def log_likelihood(classifier, gold):
    """Return the log of the classifier's mean probability assigned to the
    gold labels in ``gold`` (a list of ``(featureset, label)`` pairs)."""
    featuresets = [fs for (fs, label) in gold]
    pdists = classifier.prob_classify_many(featuresets)
    probs = [pdist.prob(label) for ((fs, label), pdist) in zip(gold, pdists)]
    return math.log(sum(probs) / len(probs))
def accuracy(classifier, gold):
    """Return the fraction of ``gold`` items that the classifier labels
    correctly, or 0 when ``gold`` is empty."""
    predictions = classifier.classify_many([fs for (fs, label) in gold])
    matches = [label == pred for ((fs, label), pred) in zip(gold, predictions)]
    if not matches:
        return 0
    return sum(matches) / len(matches)
class CutoffChecker:
    """
    A helper class that implements cutoff checks based on number of
    iterations and log likelihood.

    Accuracy cutoffs are also implemented, but they're almost never
    a good idea to use.
    """

    def __init__(self, cutoffs):
        # Work on a private copy so the caller's dict is never mutated.
        # (Previously the normalizations below were applied to the caller's
        # dict instead of the copy, so ``self.cutoffs`` kept the raw values
        # and the caller's dict was clobbered.)
        self.cutoffs = cutoffs.copy()
        if "min_ll" in self.cutoffs:
            # Log likelihoods are <= 0, so the cutoff must be negative.
            self.cutoffs["min_ll"] = -abs(self.cutoffs["min_ll"])
        if "min_lldelta" in self.cutoffs:
            self.cutoffs["min_lldelta"] = abs(self.cutoffs["min_lldelta"])
        self.ll = None
        self.acc = None
        self.iter = 1

    def check(self, classifier, train_toks):
        """Return True if any configured cutoff has been reached for the
        given classifier on ``train_toks``."""
        cutoffs = self.cutoffs
        self.iter += 1
        if "max_iter" in cutoffs and self.iter >= cutoffs["max_iter"]:
            return True  # iteration cutoff.
        new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
        if math.isnan(new_ll):
            return True
        if "min_ll" in cutoffs or "min_lldelta" in cutoffs:
            if "min_ll" in cutoffs and new_ll >= cutoffs["min_ll"]:
                return True  # log likelihood cutoff
            if (
                "min_lldelta" in cutoffs
                and self.ll
                and ((new_ll - self.ll) <= abs(cutoffs["min_lldelta"]))
            ):
                return True  # log likelihood delta cutoff
            self.ll = new_ll
        if "max_acc" in cutoffs or "min_accdelta" in cutoffs:
            # Fixed: this previously recomputed log likelihood instead of
            # accuracy, so the accuracy cutoffs never saw an accuracy value.
            new_acc = nltk.classify.util.accuracy(classifier, train_toks)
            if "max_acc" in cutoffs and new_acc >= cutoffs["max_acc"]:
                return True  # accuracy cutoff
            if (
                "min_accdelta" in cutoffs
                and self.acc
                and ((new_acc - self.acc) <= abs(cutoffs["min_accdelta"]))
            ):
                return True  # accuracy delta cutoff
            self.acc = new_acc
        return False  # no cutoff reached.
######################################################################
# { Demos
######################################################################
def names_demo_features(name):
    """
    Build the standard demo featureset for a name: an always-on flag, the
    lowercased first and last letters, and per-letter count and presence
    features for a-z.
    """
    lowered = name.lower()
    feats = {
        "alwayson": True,
        "startswith": name[0].lower(),
        "endswith": name[-1].lower(),
    }
    for ch in "abcdefghijklmnopqrstuvwxyz":
        feats["count(%s)" % ch] = lowered.count(ch)
        feats["has(%s)" % ch] = ch in lowered
    return feats
def binary_names_demo_features(name):
    """
    Build a boolean/numeric featureset for a name: vowel start/end flags
    plus per-letter count, presence, start and end indicators for a-z.
    """
    lowered = name.lower()
    first, last = name[0].lower(), name[-1].lower()
    feats = {
        "alwayson": True,
        "startswith(vowel)": first in "aeiouy",
        "endswith(vowel)": last in "aeiouy",
    }
    for ch in "abcdefghijklmnopqrstuvwxyz":
        feats["count(%s)" % ch] = lowered.count(ch)
        feats["has(%s)" % ch] = ch in lowered
        feats["startswith(%s)" % ch] = ch == first
        feats["endswith(%s)" % ch] = ch == last
    return feats
def names_demo(trainer, features=names_demo_features):
    """
    Train and evaluate a gender classifier on the NLTK names corpus.

    :param trainer: a function mapping a list of ``(featureset, label)``
        pairs to a trained classifier.
    :param features: feature-extraction function applied to each name
        (default: ``names_demo_features``).
    :return: the trained classifier.
    """
    import random

    from nltk.corpus import names

    # Construct a list of classified names, using the names corpus.
    namelist = [(name, "male") for name in names.words("male.txt")] + [
        (name, "female") for name in names.words("female.txt")
    ]
    # Randomly split the names into a test & train set.
    # The fixed seed keeps the split reproducible across runs.
    random.seed(123456)
    random.shuffle(namelist)
    train = namelist[:5000]
    test = namelist[5000:5500]
    # Train up a classifier.
    print("Training classifier...")
    classifier = trainer([(features(n), g) for (n, g) in train])
    # Run the classifier on the test data.
    print("Testing classifier...")
    acc = accuracy(classifier, [(features(n), g) for (n, g) in test])
    print("Accuracy: %6.4f" % acc)
    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, g) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
        print()
        print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
        for (name, gender), pdist in list(zip(test, pdists))[:5]:
            if gender == "male":
                fmt = " %-15s *%6.4f %6.4f"
            else:
                fmt = " %-15s %6.4f *%6.4f"
            print(fmt % (name, pdist.prob("male"), pdist.prob("female")))
    except NotImplementedError:
        pass
    # Return the classifier
    return classifier
def partial_names_demo(trainer, features=names_demo_features):
    """
    Train and evaluate a positive/unlabeled gender classifier on the names
    corpus: male names serve as positive examples, a mixed male/female
    sample is unlabeled, and held-out names form the labeled test set.

    :param trainer: a function taking ``(positive, unlabeled)`` featureset
        iterables and returning a trained classifier.
    :param features: feature-extraction function applied to each name.
    :return: the trained classifier.
    """
    import random

    from nltk.corpus import names

    male_names = names.words("male.txt")
    female_names = names.words("female.txt")

    random.seed(654321)
    random.shuffle(male_names)
    random.shuffle(female_names)
    # Create a list of male names to be used as positive-labeled examples for training
    positive = map(features, male_names[:2000])
    # Create a list of male and female names to be used as unlabeled examples
    unlabeled = map(features, male_names[2000:2500] + female_names[:500])
    # Create a test set with correctly-labeled male and female names
    test = [(name, True) for name in male_names[2500:2750]] + [
        (name, False) for name in female_names[500:750]
    ]
    random.shuffle(test)
    # Train up a classifier.
    print("Training classifier...")
    classifier = trainer(positive, unlabeled)
    # Run the classifier on the test data.
    print("Testing classifier...")
    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
    print("Accuracy: %6.4f" % acc)
    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, m) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
        print()
        print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
        # Fixed: ``zip(...)`` objects are not subscriptable in Python 3, so
        # the old ``zip(test, pdists)[:5]`` raised TypeError; materialize
        # the pairs first (matching names_demo above).
        for (name, is_male), pdist in list(zip(test, pdists))[:5]:
            if is_male:
                fmt = " %-15s *%6.4f %6.4f"
            else:
                fmt = " %-15s %6.4f *%6.4f"
            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
    except NotImplementedError:
        pass
    # Return the classifier
    return classifier
_inst_cache = {}
def wsd_demo(trainer, word, features, n=1000):
    """
    Train and evaluate a word-sense-disambiguation classifier over the
    senseval instances for *word*; print accuracy and return the classifier.

    :param trainer: callable taking a list of (featureset, label) pairs and
        returning a classifier.
    :param word: senseval lexical item, e.g. "hard.pos".
    :param features: feature extractor mapping an instance to a featureset.
    :param n: maximum number of instances to use (capped at corpus size).
    :return: the trained classifier.
    """
    import random

    from nltk.corpus import senseval

    # Get the instances.
    print("Reading data...")
    global _inst_cache
    if word not in _inst_cache:
        _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
    instances = _inst_cache[word][:]
    if n > len(instances):
        n = len(instances)
    senses = list({label for (inst, label) in instances})
    print(" Senses: " + " ".join(senses))

    # Randomly split the names into a test & train set.
    print("Splitting into test & train...")
    random.seed(123456)
    random.shuffle(instances)
    train = instances[: int(0.8 * n)]
    test = instances[int(0.8 * n) : n]

    # Train up a classifier.
    print("Training classifier...")
    classifier = trainer([(features(i), l) for (i, l) in train])

    # Run the classifier on the test data.
    print("Testing classifier...")
    acc = accuracy(classifier, [(features(i), l) for (i, l) in test])
    print("Accuracy: %6.4f" % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        # Bug fix: the original iterated ``for (i, n) in test``, silently
        # clobbering the ``n`` parameter with a sense label.
        test_featuresets = [features(inst) for (inst, label) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((inst, gold), pdist) in zip(test, pdists)]
        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
def check_megam_config():
    """
    Verify that the MEGAM binary has been configured.

    Raises a NameError with setup instructions when the module-level
    ``_megam_bin`` name has not yet been created by ``nltk.config_megam()``.
    """
    try:
        _megam_bin
    except NameError as err:
        raise NameError(
            "Please configure your megam binary first, e.g.\n"
            ">>> nltk.config_megam('/usr/bin/local/megam')"
        ) from err

View File

@@ -0,0 +1,377 @@
# Natural Language Toolkit: Interface to Weka Classsifiers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Classifiers that make use of the external 'Weka' package.
"""
import os
import re
import subprocess
import tempfile
import time
import zipfile
from sys import stdin
from nltk.classify.api import ClassifierI
from nltk.internals import config_java, java
from nltk.probability import DictionaryProbDist
_weka_classpath = None
_weka_search = [
".",
"/usr/share/weka",
"/usr/local/share/weka",
"/usr/lib/weka",
"/usr/local/lib/weka",
]
def config_weka(classpath=None):
    """
    Configure NLTK's Weka interface by locating ``weka.jar``.

    Search order: an explicit *classpath* argument, then ``$WEKAHOME``,
    then the standard installation directories in ``_weka_search``.

    :param classpath: explicit path to ``weka.jar`` (takes precedence).
    :raises LookupError: if ``weka.jar`` cannot be located.
    """
    global _weka_classpath

    # Make sure java's configured first.
    config_java()

    if classpath is not None:
        _weka_classpath = classpath

    if _weka_classpath is None:
        # Bug fix: copy the module-level search list -- the original
        # aliased it, so each call with $WEKAHOME set prepended another
        # entry to the shared ``_weka_search`` list.
        searchpath = list(_weka_search)
        if "WEKAHOME" in os.environ:
            searchpath.insert(0, os.environ["WEKAHOME"])

        for path in searchpath:
            if os.path.exists(os.path.join(path, "weka.jar")):
                _weka_classpath = os.path.join(path, "weka.jar")
                version = _check_weka_version(_weka_classpath)
                if version:
                    print(f"[Found Weka: {_weka_classpath} (version {version})]")
                else:
                    # (removed a duplicate _check_weka_version() call whose
                    # result was discarded)
                    print("[Found Weka: %s]" % _weka_classpath)

    if _weka_classpath is None:
        raise LookupError(
            "Unable to find weka.jar! Use config_weka() "
            "or set the WEKAHOME environment variable. "
            "For more information about Weka, please see "
            "https://www.cs.waikato.ac.nz/ml/weka/"
        )
def _check_weka_version(jar):
try:
zf = zipfile.ZipFile(jar)
except (SystemExit, KeyboardInterrupt):
raise
except:
return None
try:
try:
return zf.read("weka/core/version.txt")
except KeyError:
return None
finally:
zf.close()
class WekaClassifier(ClassifierI):
    """
    Classifier that delegates to an external Weka model file.

    Featuresets are serialized to a temporary ARFF file and handed to the
    Weka command-line tools (run via ``nltk.internals.java``); Weka's
    textual output is then parsed back into labels or probability
    distributions.
    """

    def __init__(self, formatter, model_filename):
        """
        :param formatter: ARFF_Formatter used to serialize featuresets.
        :param model_filename: path of a trained Weka model on disk.
        """
        self._formatter = formatter
        self._model = model_filename

    def prob_classify_many(self, featuresets):
        # "-distribution" asks Weka to print per-class probabilities;
        # older Weka versions reject the flag (see _classify_many).
        return self._classify_many(featuresets, ["-p", "0", "-distribution"])

    def classify_many(self, featuresets):
        return self._classify_many(featuresets, ["-p", "0"])

    def _classify_many(self, featuresets, options):
        """Run Weka over *featuresets* and parse its stdout into results."""
        # Make sure we can find java & weka.
        config_weka()
        temp_dir = tempfile.mkdtemp()
        try:
            # Write the test data file.
            test_filename = os.path.join(temp_dir, "test.arff")
            self._formatter.write(test_filename, featuresets)
            # Call weka to classify the data.
            # NOTE(review): the Java class here is hard-coded to NaiveBayes
            # even though train() supports other classifiers -- confirm
            # whether "-l <model>" makes the class name irrelevant.
            cmd = [
                "weka.classifiers.bayes.NaiveBayes",
                "-l",
                self._model,
                "-T",
                test_filename,
            ] + options
            (stdout, stderr) = java(
                cmd,
                classpath=_weka_classpath,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            # Check if something went wrong:
            if stderr and not stdout:
                if "Illegal options: -distribution" in stderr:
                    raise ValueError(
                        "The installed version of weka does "
                        "not support probability distribution "
                        "output."
                    )
                else:
                    raise ValueError("Weka failed to generate output:\n%s" % stderr)
            # Parse weka's output.
            # NOTE(review): stdin.encoding can be None when stdin is not
            # attached to a terminal, which would make decode() raise --
            # verify against how this is invoked.
            return self.parse_weka_output(stdout.decode(stdin.encoding).split("\n"))
        finally:
            # Best-effort cleanup of the temporary ARFF file and directory.
            for f in os.listdir(temp_dir):
                os.remove(os.path.join(temp_dir, f))
            os.rmdir(temp_dir)

    def parse_weka_distribution(self, s):
        """Parse a Weka distribution string (e.g. ``"*0.9,0.1"``) into a
        DictionaryProbDist over this formatter's labels.  The "*" marks
        the predicted class and is discarded by the split pattern."""
        probs = [float(v) for v in re.split("[*,]+", s) if v.strip()]
        probs = dict(zip(self._formatter.labels(), probs))
        return DictionaryProbDist(probs)

    def parse_weka_output(self, lines):
        """Convert Weka's textual prediction output into a list of labels,
        or of DictionaryProbDists when a distribution column is present.

        :raises ValueError: for output layouts this parser does not know.
        """
        # Strip unwanted text from stdout
        for i, line in enumerate(lines):
            if line.strip().startswith("inst#"):
                lines = lines[i:]
                break
        if lines[0].split() == ["inst#", "actual", "predicted", "error", "prediction"]:
            # Plain predictions: the "predicted" column looks like "2:label".
            return [line.split()[2].split(":")[1] for line in lines[1:] if line.strip()]
        elif lines[0].split() == [
            "inst#",
            "actual",
            "predicted",
            "error",
            "distribution",
        ]:
            # Probability output: last column carries the distribution.
            return [
                self.parse_weka_distribution(line.split()[-1])
                for line in lines[1:]
                if line.strip()
            ]
        # is this safe:?
        elif re.match(r"^0 \w+ [01]\.[0-9]* \?\s*$", lines[0]):
            # Headerless single-prediction format emitted by some versions.
            return [line.split()[1] for line in lines if line.strip()]
        else:
            # Unknown layout: echo a sample for debugging, then fail loudly.
            for line in lines[:10]:
                print(line)
            raise ValueError(
                "Unhandled output format -- your version "
                "of weka may not be supported.\n"
                " Header: %s" % lines[0]
            )

    # [xx] full list of classifiers (some may be abstract?):
    # ADTree, AODE, BayesNet, ComplementNaiveBayes, ConjunctiveRule,
    # DecisionStump, DecisionTable, HyperPipes, IB1, IBk, Id3, J48,
    # JRip, KStar, LBR, LeastMedSq, LinearRegression, LMT, Logistic,
    # LogisticBase, M5Base, MultilayerPerceptron,
    # MultipleClassifiersCombiner, NaiveBayes, NaiveBayesMultinomial,
    # NaiveBayesSimple, NBTree, NNge, OneR, PaceRegression, PART,
    # PreConstructedLinearModel, Prism, RandomForest,
    # RandomizableClassifier, RandomTree, RBFNetwork, REPTree, Ridor,
    # RuleNode, SimpleLinearRegression, SimpleLogistic,
    # SingleClassifierEnhancer, SMO, SMOreg, UserClassifier, VFI,
    # VotedPerceptron, Winnow, ZeroR

    # Friendly aliases for commonly-used Weka classifier classes.
    _CLASSIFIER_CLASS = {
        "naivebayes": "weka.classifiers.bayes.NaiveBayes",
        "C4.5": "weka.classifiers.trees.J48",
        "log_regression": "weka.classifiers.functions.Logistic",
        "svm": "weka.classifiers.functions.SMO",
        "kstar": "weka.classifiers.lazy.KStar",
        "ripper": "weka.classifiers.rules.JRip",
    }

    @classmethod
    def train(
        cls,
        model_filename,
        featuresets,
        classifier="naivebayes",
        options=[],
        quiet=True,
    ):
        """
        Train a Weka model and return a WekaClassifier wrapping it.

        :param model_filename: path where Weka writes the trained model.
        :param featuresets: list of (featureset, label) training pairs.
        :param classifier: a key of ``_CLASSIFIER_CLASS`` or a
            fully-qualified Weka class name.
        :param options: extra command-line options passed to Weka.
            NOTE(review): mutable default, but never mutated here.
        :param quiet: when true, swallow Weka's stdout.
        :raises ValueError: for an unrecognized *classifier* name.
        """
        # Make sure we can find java & weka.
        config_weka()
        # Build an ARFF formatter.
        formatter = ARFF_Formatter.from_train(featuresets)
        temp_dir = tempfile.mkdtemp()
        try:
            # Write the training data file.
            train_filename = os.path.join(temp_dir, "train.arff")
            formatter.write(train_filename, featuresets)
            if classifier in cls._CLASSIFIER_CLASS:
                javaclass = cls._CLASSIFIER_CLASS[classifier]
            elif classifier in cls._CLASSIFIER_CLASS.values():
                javaclass = classifier
            else:
                raise ValueError("Unknown classifier %s" % classifier)
            # Train the weka model.
            cmd = [javaclass, "-d", model_filename, "-t", train_filename]
            cmd += list(options)
            if quiet:
                stdout = subprocess.PIPE
            else:
                stdout = None
            java(cmd, classpath=_weka_classpath, stdout=stdout)
            # Return the new classifier.
            return WekaClassifier(formatter, model_filename)
        finally:
            # Best-effort cleanup of the temporary training files.
            for f in os.listdir(temp_dir):
                os.remove(os.path.join(temp_dir, f))
            os.rmdir(temp_dir)
class ARFF_Formatter:
"""
Converts featuresets and labeled featuresets to ARFF-formatted
strings, appropriate for input into Weka.
Features and classes can be specified manually in the constructor, or may
be determined from data using ``from_train``.
"""
def __init__(self, labels, features):
"""
:param labels: A list of all class labels that can be generated.
:param features: A list of feature specifications, where
each feature specification is a tuple (fname, ftype);
and ftype is an ARFF type string such as NUMERIC or
STRING.
"""
self._labels = labels
self._features = features
def format(self, tokens):
"""Returns a string representation of ARFF output for the given data."""
return self.header_section() + self.data_section(tokens)
def labels(self):
"""Returns the list of classes."""
return list(self._labels)
def write(self, outfile, tokens):
"""Writes ARFF data to a file for the given data."""
if not hasattr(outfile, "write"):
outfile = open(outfile, "w")
outfile.write(self.format(tokens))
outfile.close()
@staticmethod
def from_train(tokens):
"""
Constructs an ARFF_Formatter instance with class labels and feature
types determined from the given data. Handles boolean, numeric and
string (note: not nominal) types.
"""
# Find the set of all attested labels.
labels = {label for (tok, label) in tokens}
# Determine the types of all features.
features = {}
for tok, label in tokens:
for fname, fval in tok.items():
if issubclass(type(fval), bool):
ftype = "{True, False}"
elif issubclass(type(fval), (int, float, bool)):
ftype = "NUMERIC"
elif issubclass(type(fval), str):
ftype = "STRING"
elif fval is None:
continue # can't tell the type.
else:
raise ValueError("Unsupported value type %r" % ftype)
if features.get(fname, ftype) != ftype:
raise ValueError("Inconsistent type for %s" % fname)
features[fname] = ftype
features = sorted(features.items())
return ARFF_Formatter(labels, features)
def header_section(self):
"""Returns an ARFF header as a string."""
# Header comment.
s = (
"% Weka ARFF file\n"
+ "% Generated automatically by NLTK\n"
+ "%% %s\n\n" % time.ctime()
)
# Relation name
s += "@RELATION rel\n\n"
# Input attribute specifications
for fname, ftype in self._features:
s += "@ATTRIBUTE %-30r %s\n" % (fname, ftype)
# Label attribute specification
s += "@ATTRIBUTE %-30r {%s}\n" % ("-label-", ",".join(self._labels))
return s
def data_section(self, tokens, labeled=None):
"""
Returns the ARFF data section for the given data.
:param tokens: a list of featuresets (dicts) or labelled featuresets
which are tuples (featureset, label).
:param labeled: Indicates whether the given tokens are labeled
or not. If None, then the tokens will be assumed to be
labeled if the first token's value is a tuple or list.
"""
# Check if the tokens are labeled or unlabeled. If unlabeled,
# then use 'None'
if labeled is None:
labeled = tokens and isinstance(tokens[0], (tuple, list))
if not labeled:
tokens = [(tok, None) for tok in tokens]
# Data section
s = "\n@DATA\n"
for tok, label in tokens:
for fname, ftype in self._features:
s += "%s," % self._fmt_arff_val(tok.get(fname))
s += "%s\n" % self._fmt_arff_val(label)
return s
def _fmt_arff_val(self, fval):
if fval is None:
return "?"
elif isinstance(fval, (bool, int)):
return "%s" % fval
elif isinstance(fval, float):
return "%r" % fval
else:
return "%r" % fval
if __name__ == "__main__":
    from nltk.classify.util import binary_names_demo_features, names_demo

    # Demo: train a J48 ("C4.5") decision tree on the names-gender demo
    # data, writing the serialized model to /tmp/name.model.
    def make_classifier(featuresets):
        return WekaClassifier.train("/tmp/name.model", featuresets, "C4.5")

    classifier = names_demo(make_classifier, binary_names_demo_features)

View File

@@ -0,0 +1,55 @@
# Natural Language Toolkit: NLTK Command-Line Interface
#
# Copyright (C) 2001-2025 NLTK Project
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import click
from tqdm import tqdm
from nltk import word_tokenize
from nltk.util import parallelize_preprocess
# Allow "-h" in addition to click's default "--help".
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])


@click.group(context_settings=CONTEXT_SETTINGS)
@click.version_option()
def cli():
    # Root command group; subcommands attach via @cli.command().
    # (Deliberately no docstring: click would display it as help text.)
    pass
@cli.command("tokenize")
@click.option(
    "--language",
    "-l",
    default="en",
    help="The language for the Punkt sentence tokenization.",
)
# NOTE(review): "-l" is declared twice (also used by --language above);
# one short flag shadows the other -- confirm the intended letters.
@click.option(
    "--preserve-line",
    "-l",
    default=True,
    is_flag=True,
    help="An option to keep the preserve the sentence and not sentence tokenize it.",
)
@click.option("--processes", "-j", default=1, help="No. of processes.")
@click.option("--encoding", "-e", default="utf8", help="Specify encoding of file.")
@click.option(
    "--delimiter", "-d", default=" ", help="Specify delimiter to join the tokens."
)
def tokenize_file(language, preserve_line, processes, encoding, delimiter):
    """This command tokenizes text stream using nltk.word_tokenize"""
    # NOTE(review): ``language`` and ``preserve_line`` are accepted but never
    # referenced below; word_tokenize runs with its defaults -- verify.
    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # If it's single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                for line in tqdm(fin.readlines()):
                    print(delimiter.join(word_tokenize(line)), end="\n", file=fout)
            else:
                # Fan the lines out across worker processes, preserving order.
                for outline in parallelize_preprocess(
                    word_tokenize, fin.readlines(), processes, progress_bar=True
                ):
                    print(delimiter.join(outline), end="\n", file=fout)

View File

@@ -0,0 +1,92 @@
# Natural Language Toolkit: Clusterers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
This module contains a number of basic clustering algorithms. Clustering
describes the task of discovering groups of similar items within a large
collection. It is also described as unsupervised machine learning, as the data
from which it learns is unannotated with class information, as is the case for
supervised learning. Annotated data is difficult and expensive to obtain in
the quantities required for the majority of supervised learning algorithms.
This problem, the knowledge acquisition bottleneck, is common to most natural
language processing tasks, thus fueling the need for quality unsupervised
approaches.
This module contains a k-means clusterer, E-M clusterer and a group average
agglomerative clusterer (GAAC). All these clusterers involve finding good
cluster groupings for a set of vectors in multi-dimensional space.
The K-means clusterer starts with k arbitrary chosen means then allocates each
vector to the cluster with the closest mean. It then recalculates the means of
each cluster as the centroid of the vectors in the cluster. This process
repeats until the cluster memberships stabilise. This is a hill-climbing
algorithm which may converge to a local maximum. Hence the clustering is
often repeated with random initial means and the most commonly occurring
output means are chosen.
The GAAC clusterer starts with each of the *N* vectors as singleton clusters.
It then iteratively merges pairs of clusters which have the closest centroids.
This continues until there is only one cluster. The order of merges gives rise
to a dendrogram - a tree with the earlier merges lower than later merges. The
membership of a given number of clusters *c*, *1 <= c <= N*, can be found by
cutting the dendrogram at depth *c*.
The Gaussian EM clusterer models the vectors as being produced by a mixture
of k Gaussian sources. The parameters of these sources (prior probability,
mean and covariance matrix) are then found to maximise the likelihood of the
given data. This is done with the expectation maximisation algorithm. It
starts with k arbitrarily chosen means, priors and covariance matrices. It
then calculates the membership probabilities for each vector in each of the
clusters - this is the 'E' step. The cluster parameters are then updated in
the 'M' step using the maximum likelihood estimate from the cluster membership
probabilities. This process continues until the likelihood of the data does
not significantly increase.
They all extend the ClusterI interface which defines common operations
available with each clusterer. These operations include:
- cluster: clusters a sequence of vectors
- classify: assign a vector to a cluster
- classification_probdist: give the probability distribution over cluster memberships
The current existing classifiers also extend cluster.VectorSpace, an
abstract class which allows for singular value decomposition (SVD) and vector
normalisation. SVD is used to reduce the dimensionality of the vector space in
such a manner as to preserve as much of the variation as possible, by
reparameterising the axes in order of variability and discarding all bar the
first d dimensions. Normalisation ensures that vectors fall in the unit
hypersphere.
Usage example (see also demo())::
from nltk import cluster
from nltk.cluster import euclidean_distance
from numpy import array
vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0]]]
# initialise the clusterer (will also assign the vectors to clusters)
clusterer = cluster.KMeansClusterer(2, euclidean_distance)
clusterer.cluster(vectors, True)
# classify a new vector
print(clusterer.classify(array([3, 3])))
Note that the vectors must use numpy array-like
objects. nltk_contrib.unimelb.tacohn.SparseArrays may be used for
efficiency when required.
"""
from nltk.cluster.em import EMClusterer
from nltk.cluster.gaac import GAAClusterer
from nltk.cluster.kmeans import KMeansClusterer
from nltk.cluster.util import (
Dendrogram,
VectorSpaceClusterer,
cosine_distance,
euclidean_distance,
)

View File

@@ -0,0 +1,74 @@
# Natural Language Toolkit: Clusterer Interfaces
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Porting: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from abc import ABCMeta, abstractmethod
from nltk.probability import DictionaryProbDist
class ClusterI(metaclass=ABCMeta):
    """
    Interface covering basic clustering functionality.
    """

    @abstractmethod
    def cluster(self, vectors, assign_clusters=False):
        """
        Assigns the vectors to clusters, learning the clustering parameters
        from the data. Returns a cluster identifier for each vector.
        """

    @abstractmethod
    def classify(self, token):
        """
        Classifies the token into a cluster, setting the token's CLUSTER
        parameter to that cluster identifier.
        """

    def likelihood(self, vector, label):
        """
        Returns the likelihood (a float) of the token having the
        corresponding cluster.

        The default implementation is a hard indicator: 1.0 when
        ``classify`` assigns *vector* to *label*, else 0.0.
        """
        return 1.0 if self.classify(vector) == label else 0.0

    def classification_probdist(self, vector):
        """
        Classifies the token into a cluster, returning
        a probability distribution over the cluster identifiers.
        """
        likelihoods = {}
        # Renamed from ``sum``, which shadowed the builtin of the same name.
        total = 0.0
        for cluster in self.cluster_names():
            likelihoods[cluster] = self.likelihood(vector, cluster)
            total += likelihoods[cluster]
        # Normalise so the likelihoods form a probability distribution.
        for cluster in self.cluster_names():
            likelihoods[cluster] /= total
        return DictionaryProbDist(likelihoods)

    @abstractmethod
    def num_clusters(self):
        """
        Returns the number of clusters.
        """

    def cluster_names(self):
        """
        Returns the names of the clusters.
        :rtype: list
        """
        return list(range(self.num_clusters()))

    def cluster_name(self, index):
        """
        Returns the names of the cluster at index.
        """
        return index

View File

@@ -0,0 +1,219 @@
# Natural Language Toolkit: Expectation Maximization Clusterer
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
try:
import numpy
except ImportError:
pass
from nltk.cluster.util import VectorSpaceClusterer
class EMClusterer(VectorSpaceClusterer):
    """
    The Gaussian EM clusterer models the vectors as being produced by
    a mixture of k Gaussian sources. The parameters of these sources
    (prior probability, mean and covariance matrix) are then found to
    maximise the likelihood of the given data. This is done with the
    expectation maximisation algorithm. It starts with k arbitrarily
    chosen means, priors and covariance matrices. It then calculates
    the membership probabilities for each vector in each of the
    clusters; this is the 'E' step. The cluster parameters are then
    updated in the 'M' step using the maximum likelihood estimate from
    the cluster membership probabilities. This process continues until
    the likelihood of the data does not significantly increase.
    """

    def __init__(
        self,
        initial_means,
        priors=None,
        covariance_matrices=None,
        conv_threshold=1e-6,
        bias=0.1,
        normalise=False,
        svd_dimensions=None,
    ):
        """
        Creates an EM clusterer with the given starting parameters,
        convergence threshold and vector mangling parameters.

        :param initial_means: the means of the gaussian cluster centers
        :type initial_means: [seq of] numpy array or seq of SparseArray
        :param priors: the prior probability for each cluster
        :type priors: numpy array or seq of float
        :param covariance_matrices: the covariance matrix for each cluster
        :type covariance_matrices: [seq of] numpy array
        :param conv_threshold: maximum change in likelihood before deemed
            convergent
        :type conv_threshold: int or float
        :param bias: variance bias used to ensure non-singular covariance
            matrices
        :type bias: float
        :param normalise: should vectors be normalised to length 1
        :type normalise: boolean
        :param svd_dimensions: number of dimensions to use in reducing vector
            dimensionsionality with SVD
        :type svd_dimensions: int
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._means = numpy.array(initial_means, numpy.float64)
        self._num_clusters = len(initial_means)
        self._conv_threshold = conv_threshold
        self._covariance_matrices = covariance_matrices
        self._priors = priors
        self._bias = bias

    def num_clusters(self):
        return self._num_clusters

    def cluster_vectorspace(self, vectors, trace=False):
        """Fit the Gaussian mixture to *vectors* by EM iteration."""
        assert len(vectors) > 0

        # set the parameters to initial values
        dimensions = len(vectors[0])
        means = self._means
        priors = self._priors
        # Bug fix: use ``is None`` -- ``priors`` may be a numpy array, whose
        # truth value is ambiguous (``not priors`` would raise ValueError).
        if priors is None:
            priors = self._priors = (
                numpy.ones(self._num_clusters, numpy.float64) / self._num_clusters
            )
        covariances = self._covariance_matrices
        if covariances is None:
            covariances = self._covariance_matrices = [
                numpy.identity(dimensions, numpy.float64)
                for i in range(self._num_clusters)
            ]

        # do the E and M steps until the likelihood plateaus
        lastl = self._loglikelihood(vectors, priors, means, covariances)
        converged = False

        while not converged:
            if trace:
                print("iteration; loglikelihood", lastl)
            # E-step, calculate hidden variables, h[i,j]
            h = numpy.zeros((len(vectors), self._num_clusters), numpy.float64)
            for i in range(len(vectors)):
                for j in range(self._num_clusters):
                    h[i, j] = priors[j] * self._gaussian(
                        means[j], covariances[j], vectors[i]
                    )
                h[i, :] /= sum(h[i, :])

            # M-step, update parameters - cvm, p, mean
            for j in range(self._num_clusters):
                new_covariance = numpy.zeros((dimensions, dimensions), numpy.float64)
                new_mean = numpy.zeros(dimensions, numpy.float64)
                sum_hj = 0.0
                for i in range(len(vectors)):
                    delta = vectors[i] - means[j]
                    new_covariance += h[i, j] * numpy.multiply.outer(delta, delta)
                    sum_hj += h[i, j]
                    new_mean += h[i, j] * vectors[i]
                covariances[j] = new_covariance / sum_hj
                means[j] = new_mean / sum_hj
                priors[j] = sum_hj / len(vectors)

                # bias term to stop covariance matrix being singular
                covariances[j] += self._bias * numpy.identity(dimensions, numpy.float64)

            # calculate likelihood - FIXME: may be broken
            loglikelihood = self._loglikelihood(vectors, priors, means, covariances)

            # check for convergence
            if abs(lastl - loglikelihood) < self._conv_threshold:
                converged = True
            lastl = loglikelihood

    def classify_vectorspace(self, vector):
        """Return the index of the cluster with highest posterior weight."""
        best = None
        for j in range(self._num_clusters):
            p = self._priors[j] * self._gaussian(
                self._means[j], self._covariance_matrices[j], vector
            )
            if not best or p > best[0]:
                best = (p, j)
        return best[1]

    def likelihood_vectorspace(self, vector, cluster):
        """Weighted Gaussian density of *vector* under *cluster*."""
        # Validates that *cluster* is a known cluster name (raises ValueError
        # otherwise); for this clusterer names are the indices themselves.
        cid = self.cluster_names().index(cluster)
        return self._priors[cluster] * self._gaussian(
            self._means[cluster], self._covariance_matrices[cluster], vector
        )

    def _gaussian(self, mean, cvm, x):
        """Multivariate normal density of ``x`` given (mean, cvm)."""
        m = len(mean)
        assert cvm.shape == (m, m), "bad sized covariance matrix, %s" % str(cvm.shape)
        try:
            det = numpy.linalg.det(cvm)
            inv = numpy.linalg.inv(cvm)
            a = det**-0.5 * (2 * numpy.pi) ** (-m / 2.0)
            dx = x - mean
            # (removed a stray debug ``print(dx, inv)`` left in the original)
            b = -0.5 * numpy.dot(numpy.dot(dx, inv), dx)
            return a * numpy.exp(b)
        except OverflowError:
            # happens when the exponent is negative infinity - i.e. b = 0
            # i.e. the inverse of cvm is huge (cvm is almost zero)
            return 0

    def _loglikelihood(self, vectors, priors, means, covariances):
        """Total log-likelihood of *vectors* under the current mixture."""
        llh = 0.0
        for vector in vectors:
            p = 0
            for j in range(len(priors)):
                p += priors[j] * self._gaussian(means[j], covariances[j], vector)
            llh += numpy.log(p)
        return llh

    def __repr__(self):
        return "<EMClusterer means=%s>" % list(self._means)
def demo():
    """
    Non-interactive demonstration of the clusterers with simple 2-D data.

    Fits a 2-source Gaussian mixture to three points, prints the learned
    parameters, then classifies a new vector and shows its cluster
    probability distribution.
    """
    from nltk import cluster

    # example from figure 14.10, page 519, Manning and Schutze
    vectors = [numpy.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]]
    # Two nearly identical starting means; EM pulls them apart as it fits.
    means = [[4, 2], [4, 2.01]]

    clusterer = cluster.EMClusterer(means, bias=0.1)
    clusters = clusterer.cluster(vectors, True, trace=True)
    print("Clustered:", vectors)
    print("As: ", clusters)
    print()

    # Report the fitted mixture parameters for each of the two clusters.
    for c in range(2):
        print("Cluster:", c)
        print("Prior: ", clusterer._priors[c])
        print("Mean: ", clusterer._means[c])
        print("Covar: ", clusterer._covariance_matrices[c])
        print()

    # classify a new vector
    vector = numpy.array([2, 2])
    print("classify(%s):" % vector, end=" ")
    print(clusterer.classify(vector))

    # show the classification probabilities
    vector = numpy.array([2, 2])
    print("classification_probdist(%s):" % vector)
    pdist = clusterer.classification_probdist(vector)
    for sample in pdist.samples():
        print(f"{sample} => {pdist.prob(sample) * 100:.0f}%")


if __name__ == "__main__":
    demo()

View File

@@ -0,0 +1,170 @@
# Natural Language Toolkit: Group Average Agglomerative Clusterer
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
try:
import numpy
except ImportError:
pass
from nltk.cluster.util import Dendrogram, VectorSpaceClusterer, cosine_distance
class GAAClusterer(VectorSpaceClusterer):
    """
    The Group Average Agglomerative starts with each of the N vectors as singleton
    clusters. It then iteratively merges pairs of clusters which have the
    closest centroids. This continues until there is only one cluster. The
    order of merges gives rise to a dendrogram: a tree with the earlier merges
    lower than later merges. The membership of a given number of clusters c, 1
    <= c <= N, can be found by cutting the dendrogram at depth c.
    This clusterer uses the cosine similarity metric only, which allows for
    efficient speed-up in the clustering process.
    """

    def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None):
        # num_clusters: how many clusters to cut the dendrogram into.
        # normalise / svd_dimensions are handled by VectorSpaceClusterer.
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._num_clusters = num_clusters
        self._dendrogram = None
        # NOTE(review): _groups_values is never read or written anywhere
        # else in this class -- appears vestigial.
        self._groups_values = None

    def cluster(self, vectors, assign_clusters=False, trace=False):
        # stores the merge order
        self._dendrogram = Dendrogram(
            [numpy.array(vector, numpy.float64) for vector in vectors]
        )
        return VectorSpaceClusterer.cluster(self, vectors, assign_clusters, trace)

    def cluster_vectorspace(self, vectors, trace=False):
        """Agglomeratively merge vectors until ``self._num_clusters``
        clusters remain, recording each merge in the dendrogram."""
        # variables describing the initial situation
        N = len(vectors)
        cluster_len = [1] * N
        cluster_count = N
        index_map = numpy.arange(N)

        # construct the similarity matrix
        # Only the strict upper triangle (i < j) holds real distances;
        # everything else stays +inf so argmin never picks it.
        dims = (N, N)
        dist = numpy.ones(dims, dtype=float) * numpy.inf
        for i in range(N):
            for j in range(i + 1, N):
                dist[i, j] = cosine_distance(vectors[i], vectors[j])

        while cluster_count > max(self._num_clusters, 1):
            i, j = numpy.unravel_index(dist.argmin(), dims)
            if trace:
                print("merging %d and %d" % (i, j))

            # update similarities for merging i and j
            self._merge_similarities(dist, cluster_len, i, j)

            # remove j
            dist[:, j] = numpy.inf
            dist[j, :] = numpy.inf

            # merge the clusters
            cluster_len[i] = cluster_len[i] + cluster_len[j]
            self._dendrogram.merge(index_map[i], index_map[j])
            cluster_count -= 1

            # update the index map to reflect the indexes if we
            # had removed j
            index_map[j + 1 :] -= 1
            # Mark slot j as removed (N is out of range for live clusters).
            index_map[j] = N

        self.update_clusters(self._num_clusters)

    def _merge_similarities(self, dist, cluster_len, i, j):
        # the new cluster i merged from i and j adopts the average of
        # i and j's similarity to each other cluster, weighted by the
        # number of points in the clusters i and j
        i_weight = cluster_len[i]
        j_weight = cluster_len[j]
        weight_sum = i_weight + j_weight

        # update for x<i
        dist[:i, i] = dist[:i, i] * i_weight + dist[:i, j] * j_weight
        dist[:i, i] /= weight_sum
        # update for i<x<j
        dist[i, i + 1 : j] = (
            dist[i, i + 1 : j] * i_weight + dist[i + 1 : j, j] * j_weight
        )
        # update for i<j<x
        dist[i, j + 1 :] = dist[i, j + 1 :] * i_weight + dist[j, j + 1 :] * j_weight
        # Single division covers both of the i<x slices updated above.
        dist[i, i + 1 :] /= weight_sum

    def update_clusters(self, num_clusters):
        """Recompute cluster centroids by cutting the dendrogram into
        *num_clusters* groups and averaging each group's vectors."""
        clusters = self._dendrogram.groups(num_clusters)
        self._centroids = []
        for cluster in clusters:
            assert len(cluster) > 0
            if self._should_normalise:
                centroid = self._normalise(cluster[0])
            else:
                centroid = numpy.array(cluster[0])
            for vector in cluster[1:]:
                if self._should_normalise:
                    centroid += self._normalise(vector)
                else:
                    centroid += vector
            centroid /= len(cluster)
            self._centroids.append(centroid)
        self._num_clusters = len(self._centroids)

    def classify_vectorspace(self, vector):
        # Nearest centroid by cosine distance.
        best = None
        for i in range(self._num_clusters):
            centroid = self._centroids[i]
            dist = cosine_distance(vector, centroid)
            if not best or dist < best[0]:
                best = (dist, i)
        return best[1]

    def dendrogram(self):
        """
        :return: The dendrogram representing the current clustering
        :rtype:  Dendrogram
        """
        return self._dendrogram

    def num_clusters(self):
        return self._num_clusters

    def __repr__(self):
        return "<GroupAverageAgglomerative Clusterer n=%d>" % self._num_clusters
def demo():
    """
    Non-interactive demonstration of the clusterers with simple 2-D data.

    Clusters six 2-D points into four groups, displays the dendrogram,
    then classifies a new vector.
    """
    from nltk.cluster import GAAClusterer

    # use a set of tokens with 2D indices
    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

    # test the GAAC clusterer with 4 clusters
    clusterer = GAAClusterer(4)
    clusters = clusterer.cluster(vectors, True)

    print("Clusterer:", clusterer)
    print("Clustered:", vectors)
    print("As:", clusters)
    print()

    # show the dendrogram
    clusterer.dendrogram().show()

    # classify a new vector
    vector = numpy.array([3, 3])
    print("classify(%s):" % vector, end=" ")
    print(clusterer.classify(vector))
    print()


if __name__ == "__main__":
    demo()

View File

@@ -0,0 +1,230 @@
# Natural Language Toolkit: K-Means Clusterer
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import copy
import random
import sys
try:
import numpy
except ImportError:
pass
from nltk.cluster.util import VectorSpaceClusterer
class KMeansClusterer(VectorSpaceClusterer):
"""
The K-means clusterer starts with k arbitrary chosen means then allocates
each vector to the cluster with the closest mean. It then recalculates the
means of each cluster as the centroid of the vectors in the cluster. This
process repeats until the cluster memberships stabilise. This is a
hill-climbing algorithm which may converge to a local maximum. Hence the
clustering is often repeated with random initial means and the most
commonly occurring output means are chosen.
"""
    def __init__(
        self,
        num_means,
        distance,
        repeats=1,
        conv_test=1e-6,
        initial_means=None,
        normalise=False,
        svd_dimensions=None,
        rng=None,
        avoid_empty_clusters=False,
    ):
        """
        :param  num_means:  the number of means to use (may use fewer)
        :type   num_means:  int
        :param  distance:   measure of distance between two vectors
        :type   distance:   function taking two vectors and returning a float
        :param  repeats:    number of randomised clustering trials to use
        :type   repeats:    int
        :param  conv_test:  maximum variation in mean differences before
                            deemed convergent
        :type   conv_test:  number
        :param  initial_means: set of k initial means
        :type   initial_means: sequence of vectors
        :param  normalise:  should vectors be normalised to length 1
        :type   normalise:  boolean
        :param svd_dimensions: number of dimensions to use in reducing vector
                               dimensionsionality with SVD
        :type svd_dimensions: int
        :param  rng:        random number generator (or None)
        :type   rng:        Random
        :param avoid_empty_clusters: include current centroid in computation
                                     of next one; avoids undefined behavior
                                     when clusters become empty
        :type avoid_empty_clusters: boolean
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._num_means = num_means
        self._distance = distance
        self._max_difference = conv_test
        # NOTE: these asserts are sanity checks, stripped under ``python -O``.
        assert not initial_means or len(initial_means) == num_means
        self._means = initial_means
        assert repeats >= 1
        # Fixed initial means make multiple randomised trials pointless.
        assert not (initial_means and repeats > 1)
        self._repeats = repeats
        # Fall back to a fresh, unseeded RNG when the caller supplies none.
        self._rng = rng if rng else random.Random()
        self._avoid_empty_clusters = avoid_empty_clusters
def cluster_vectorspace(self, vectors, trace=False):
if self._means and self._repeats > 1:
print("Warning: means will be discarded for subsequent trials")
meanss = []
for trial in range(self._repeats):
if trace:
print("k-means trial", trial)
if not self._means or trial > 1:
self._means = self._rng.sample(list(vectors), self._num_means)
self._cluster_vectorspace(vectors, trace)
meanss.append(self._means)
if len(meanss) > 1:
# sort the means first (so that different cluster numbering won't
# effect the distance comparison)
for means in meanss:
means.sort(key=sum)
# find the set of means that's minimally different from the others
min_difference = min_means = None
for i in range(len(meanss)):
d = 0
for j in range(len(meanss)):
if i != j:
d += self._sum_distances(meanss[i], meanss[j])
if min_difference is None or d < min_difference:
min_difference, min_means = d, meanss[i]
# use the best means
self._means = min_means
def _cluster_vectorspace(self, vectors, trace=False):
if self._num_means < len(vectors):
# perform k-means clustering
converged = False
while not converged:
# assign the tokens to clusters based on minimum distance to
# the cluster means
clusters = [[] for m in range(self._num_means)]
for vector in vectors:
index = self.classify_vectorspace(vector)
clusters[index].append(vector)
if trace:
print("iteration")
# for i in range(self._num_means):
# print ' mean', i, 'allocated', len(clusters[i]), 'vectors'
# recalculate cluster means by computing the centroid of each cluster
new_means = list(map(self._centroid, clusters, self._means))
# measure the degree of change from the previous step for convergence
difference = self._sum_distances(self._means, new_means)
if difference < self._max_difference:
converged = True
# remember the new means
self._means = new_means
def classify_vectorspace(self, vector):
# finds the closest cluster centroid
# returns that cluster's index
best_distance = best_index = None
for index in range(len(self._means)):
mean = self._means[index]
dist = self._distance(vector, mean)
if best_distance is None or dist < best_distance:
best_index, best_distance = index, dist
return best_index
def num_clusters(self):
if self._means:
return len(self._means)
else:
return self._num_means
def means(self):
"""
The means used for clustering.
"""
return self._means
def _sum_distances(self, vectors1, vectors2):
difference = 0.0
for u, v in zip(vectors1, vectors2):
difference += self._distance(u, v)
return difference
def _centroid(self, cluster, mean):
if self._avoid_empty_clusters:
centroid = copy.copy(mean)
for vector in cluster:
centroid += vector
return centroid / (1 + len(cluster))
else:
if not len(cluster):
sys.stderr.write("Error: no centroid defined for empty cluster.\n")
sys.stderr.write(
"Try setting argument 'avoid_empty_clusters' to True\n"
)
assert False
centroid = copy.copy(cluster[0])
for vector in cluster[1:]:
centroid += vector
return centroid / len(cluster)
def __repr__(self):
return "<KMeansClusterer means=%s repeats=%d>" % (self._means, self._repeats)
#################################################################################
def demo():
    """Cluster two small 2-d datasets with k-means and print the results."""
    # example from figure 14.9, page 517, Manning and Schutze
    from nltk.cluster import KMeansClusterer, euclidean_distance

    points = [numpy.array(p) for p in ((2, 1), (1, 3), (4, 7), (6, 7))]
    initial = [[4, 3], [5, 5]]

    clusterer = KMeansClusterer(2, euclidean_distance, initial_means=initial)
    assignments = clusterer.cluster(points, True, trace=True)

    print("Clustered:", points)
    print("As:", assignments)
    print("Means:", clusterer.means())
    print()

    points = [
        numpy.array(p) for p in ((3, 3), (1, 2), (4, 2), (4, 0), (2, 3), (3, 1))
    ]

    # test k-means using the euclidean distance metric, 2 means and repeat
    # clustering 10 times with random seeds
    clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
    assignments = clusterer.cluster(points, True)
    print("Clustered:", points)
    print("As:", assignments)
    print("Means:", clusterer.means())
    print()

    # classify a new vector
    query = numpy.array([3, 3])
    print("classify(%s):" % query, end=" ")
    print(clusterer.classify(query))
    print()


if __name__ == "__main__":
    demo()

View File

@@ -0,0 +1,300 @@
# Natural Language Toolkit: Clusterer Utilities
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Contributor: J Richard Snape
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import copy
from abc import abstractmethod
from math import sqrt
from sys import stdout
try:
import numpy
except ImportError:
pass
from nltk.cluster.api import ClusterI
class VectorSpaceClusterer(ClusterI):
    """
    Abstract clusterer which takes tokens and maps them into a vector space.
    Optionally performs singular value decomposition to reduce the
    dimensionality.
    """

    def __init__(self, normalise=False, svd_dimensions=None):
        """
        :param normalise: should vectors be normalised to length 1
        :type normalise: boolean
        :param svd_dimensions: number of dimensions to use in reducing vector
                               dimensionality with SVD
        :type svd_dimensions: int
        """
        # Transposed left-singular matrix from the SVD; set by cluster() and
        # reused by classify()/likelihood()/vector() to project new vectors
        # into the same reduced space.
        self._Tt = None
        self._should_normalise = normalise
        self._svd_dimensions = svd_dimensions

    def cluster(self, vectors, assign_clusters=False, trace=False):
        """
        Cluster the given vectors, first applying the optional normalisation
        and SVD dimensionality reduction configured in the constructor.

        :param vectors: the vectors to cluster
        :param assign_clusters: if True, return each vector's cluster name
        :param trace: passed through to the subclass's cluster_vectorspace
        """
        assert len(vectors) > 0

        # normalise the vectors
        if self._should_normalise:
            vectors = list(map(self._normalise, vectors))

        # use SVD to reduce the dimensionality
        if self._svd_dimensions and self._svd_dimensions < len(vectors[0]):
            # Decompose the (features x samples) matrix and keep only the top
            # svd_dimensions singular components.
            [u, d, vt] = numpy.linalg.svd(numpy.transpose(numpy.array(vectors)))
            S = d[: self._svd_dimensions] * numpy.identity(
                self._svd_dimensions, numpy.float64
            )
            T = u[:, : self._svd_dimensions]
            Dt = vt[: self._svd_dimensions, :]
            # Reduced-dimension representation of the input vectors.
            vectors = numpy.transpose(numpy.dot(S, Dt))
            self._Tt = numpy.transpose(T)

        # call abstract method to cluster the vectors
        self.cluster_vectorspace(vectors, trace)

        # assign the vectors to clusters
        if assign_clusters:
            return [self.classify(vector) for vector in vectors]

    @abstractmethod
    def cluster_vectorspace(self, vectors, trace):
        """
        Finds the clusters using the given set of vectors.
        """

    def classify(self, vector):
        # Apply the same normalisation/projection used during clustering
        # before delegating to the subclass.
        if self._should_normalise:
            vector = self._normalise(vector)
        if self._Tt is not None:
            vector = numpy.dot(self._Tt, vector)
        cluster = self.classify_vectorspace(vector)
        return self.cluster_name(cluster)

    @abstractmethod
    def classify_vectorspace(self, vector):
        """
        Returns the index of the appropriate cluster for the vector.
        """

    def likelihood(self, vector, label):
        # Project the vector exactly as in classify() before scoring.
        if self._should_normalise:
            vector = self._normalise(vector)
        if self._Tt is not None:
            vector = numpy.dot(self._Tt, vector)
        return self.likelihood_vectorspace(vector, label)

    def likelihood_vectorspace(self, vector, cluster):
        """
        Returns the likelihood of the vector belonging to the cluster.
        """
        # Default implementation: 1.0 for the predicted cluster, 0.0 otherwise.
        predicted = self.classify_vectorspace(vector)
        return 1.0 if cluster == predicted else 0.0

    def vector(self, vector):
        """
        Returns the vector after normalisation and dimensionality reduction
        """
        if self._should_normalise:
            vector = self._normalise(vector)
        if self._Tt is not None:
            vector = numpy.dot(self._Tt, vector)
        return vector

    def _normalise(self, vector):
        """
        Normalises the vector to unit length.
        """
        return vector / sqrt(numpy.dot(vector, vector))
def euclidean_distance(u, v):
    """Return the Euclidean distance between vectors ``u`` and ``v``.

    This equals the length (L2 norm) of the difference vector ``u - v``.
    """
    delta = u - v
    return sqrt(numpy.dot(delta, delta))
def cosine_distance(u, v):
    """Return 1 minus the cosine of the angle between vectors ``u`` and
    ``v``, i.e. ``1 - (u.v / |u||v|)``.
    """
    norm_product = sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v))
    return 1 - numpy.dot(u, v) / norm_product
class _DendrogramNode:
    """Tree node of a dendrogram.

    A leaf node holds an item as its value; an internal node holds the merge
    order (an int, assigned by Dendrogram.merge) and its child nodes.
    """

    def __init__(self, value, *children):
        self._value = value
        self._children = children

    def leaves(self, values=True):
        """Return the leaves under this node, left to right.

        :param values: if True return the leaf values, otherwise the leaf
            node objects themselves
        """
        if self._children:
            leaves = []
            for child in self._children:
                leaves.extend(child.leaves(values))
            return leaves
        elif values:
            return [self._value]
        else:
            return [self]

    def groups(self, n):
        """Partition the leaves into (up to) ``n`` groups by repeatedly
        undoing the most recent merge (highest merge value first).
        """
        queue = [(self._value, self)]

        while len(queue) < n:
            priority, node = queue.pop()
            if not node._children:
                # A leaf cannot be split any further: put it back and stop.
                # (Bug fix: this previously called the nonexistent
                # ``queue.push``, raising AttributeError whenever a leaf was
                # reached before n groups were formed.)
                queue.append((priority, node))
                break
            for child in node._children:
                if child._children:
                    queue.append((child._value, child))
                else:
                    # Leaves get priority 0 so they sort before any merge.
                    queue.append((0, child))

        # makes the earliest merges at the start, latest at the end
        queue.sort()

        groups = []
        for priority, node in queue:
            groups.append(node.leaves())
        return groups

    def __lt__(self, comparator):
        # Tie-break for sorting (priority, node) tuples with equal priority.
        return cosine_distance(self._value, comparator._value) < 0
class Dendrogram:
    """
    Represents a dendrogram, a tree with a specified branching order. This
    must be initialised with the leaf items, then iteratively call merge for
    each branch. This class constructs a tree representing the order of calls
    to the merge function.
    """

    def __init__(self, items=None):
        """
        :param items: the items at the leaves of the dendrogram
        :type items: sequence of (any)
        """
        # Bug fix: the default was a shared mutable ``[]``; use a None
        # sentinel instead (behaviour for all callers is unchanged).
        items = items if items is not None else []
        self._items = [_DendrogramNode(item) for item in items]
        self._original_items = copy.copy(self._items)
        # Merge counter; stored as the value of each internal node so the
        # merge order can be recovered later.
        self._merge = 1

    def merge(self, *indices):
        """
        Merges nodes at given indices in the dendrogram. The nodes will be
        combined which then replaces the first node specified. All other nodes
        involved in the merge will be removed.

        :param indices: indices of the items to merge (at least two)
        :type indices: seq of int
        """
        assert len(indices) >= 2
        node = _DendrogramNode(self._merge, *(self._items[i] for i in indices))
        self._merge += 1
        self._items[indices[0]] = node
        # NOTE(review): deleting in the given order shifts the indices of
        # later items; callers appear to pass indices in a safe (descending
        # tail) order -- confirm before relying on arbitrary orderings.
        for i in indices[1:]:
            del self._items[i]

    def groups(self, n):
        """
        Finds the n-groups of items (leaves) reachable from a cut at depth n.

        :param n: number of groups
        :type n: int
        """
        if len(self._items) > 1:
            root = _DendrogramNode(self._merge, *self._items)
        else:
            root = self._items[0]
        return root.groups(n)

    def show(self, leaf_labels=None):
        """
        Print the dendrogram in ASCII art to standard out.

        :param leaf_labels: an optional list of strings to use for labeling the
            leaves
        :type leaf_labels: list
        """
        # Bug fix: the default was a shared mutable ``[]``; ``None`` behaves
        # identically in the truthiness test below.

        # ASCII rendering characters
        JOIN, HLINK, VLINK = "+", "-", "|"

        # find the root (or create one)
        if len(self._items) > 1:
            root = _DendrogramNode(self._merge, *self._items)
        else:
            root = self._items[0]
        leaves = self._original_items

        if leaf_labels:
            last_row = leaf_labels
        else:
            last_row = ["%s" % leaf._value for leaf in leaves]

        # find the bottom row and the best cell width
        width = max(map(len, last_row)) + 1
        lhalf = width // 2
        rhalf = int(width - lhalf - 1)

        # display functions
        def format(centre, left=" ", right=" "):
            return f"{lhalf * left}{centre}{right * rhalf}"

        def display(str):
            stdout.write(str)

        # for each merge, top down
        queue = [(root._value, root)]
        verticals = [format(" ") for leaf in leaves]
        while queue:
            priority, node = queue.pop()
            child_left_leaf = list(map(lambda c: c.leaves(False)[0], node._children))
            indices = list(map(leaves.index, child_left_leaf))
            if child_left_leaf:
                min_idx = min(indices)
                max_idx = max(indices)
            for i in range(len(leaves)):
                if leaves[i] in child_left_leaf:
                    if i == min_idx:
                        display(format(JOIN, " ", HLINK))
                    elif i == max_idx:
                        display(format(JOIN, HLINK, " "))
                    else:
                        display(format(JOIN, HLINK, HLINK))
                    verticals[i] = format(VLINK)
                elif min_idx <= i <= max_idx:
                    display(format(HLINK, HLINK, HLINK))
                else:
                    display(verticals[i])
            display("\n")
            for child in node._children:
                if child._children:
                    queue.append((child._value, child))
            queue.sort()

            for vertical in verticals:
                display(vertical)
            display("\n")

        # finally, display the last line
        display("".join(item.center(width) for item in last_row))
        display("\n")

    def __repr__(self):
        if len(self._items) > 1:
            root = _DendrogramNode(self._merge, *self._items)
        else:
            root = self._items[0]
        leaves = root.leaves(False)
        return "<Dendrogram with %d leaves>" % len(leaves)

View File

@@ -0,0 +1,656 @@
# Natural Language Toolkit: Collections
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import bisect
from functools import total_ordering
from itertools import chain, islice
from nltk.internals import raise_unorderable_types, slice_bounds
##########################################################################
# Ordered Dictionary
##########################################################################
class OrderedDict(dict):
    """A dict that remembers key insertion order in a parallel ``_keys``
    list (this class predates the stdlib's ordered dicts).

    Supports two keyword-only configuration options passed to ``__init__``:
    ``keys`` (an explicit key ordering matching ``data``) and
    ``default_factory`` (a zero-argument callable used by ``__missing__``).
    """

    def __init__(self, data=None, **kwargs):
        # Note: keys() doubles as the initial-key computation here (it takes
        # optional data/keys arguments, unlike dict.keys).
        self._keys = self.keys(data, kwargs.get("keys"))
        self._default_factory = kwargs.get("default_factory")
        if data is None:
            dict.__init__(self)
        else:
            dict.__init__(self, data)

    def __delitem__(self, key):
        dict.__delitem__(self, key)
        # Keep the order list in sync with the mapping.
        self._keys.remove(key)

    def __getitem__(self, key):
        try:
            return dict.__getitem__(self, key)
        except KeyError:
            # Delegate missing keys to __missing__ (defaultdict-like).
            return self.__missing__(key)

    def __iter__(self):
        # Iterate keys in recorded insertion order.
        return (key for key in self.keys())

    def __missing__(self, key):
        # NOTE(review): when no default_factory is set but ``key`` IS in
        # ``_keys``, this falls through and calls ``None()`` -- presumably
        # that state is unreachable; confirm before relying on it.
        if not self._default_factory and key not in self._keys:
            raise KeyError()
        return self._default_factory()

    def __setitem__(self, key, item):
        dict.__setitem__(self, key, item)
        if key not in self._keys:
            self._keys.append(key)

    def clear(self):
        dict.clear(self)
        self._keys.clear()

    def copy(self):
        # NOTE(review): ``dict.copy(self)`` returns a plain dict, which does
        # not accept attribute assignment and would also alias ``_keys``;
        # this method looks broken as written -- verify against callers.
        d = dict.copy(self)
        d._keys = self._keys
        return d

    def items(self):
        # Pairs in insertion order (keys() and values() share the ordering).
        return zip(self.keys(), self.values())

    def keys(self, data=None, keys=None):
        # With arguments, compute the initial key order for __init__;
        # without arguments, return the recorded insertion order.
        if data:
            if keys:
                assert isinstance(keys, list)
                assert len(data) == len(keys)
                return keys
            else:
                assert (
                    isinstance(data, dict)
                    or isinstance(data, OrderedDict)
                    or isinstance(data, list)
                )
                if isinstance(data, dict) or isinstance(data, OrderedDict):
                    return data.keys()
                elif isinstance(data, list):
                    # data is a list of (key, value) pairs.
                    return [key for (key, value) in data]
        elif "_keys" in self.__dict__:
            return self._keys
        else:
            # Called from __init__ before _keys exists.
            return []

    def popitem(self):
        # Pop in LIFO order (most recently inserted key first).
        if not self._keys:
            raise KeyError()

        key = self._keys.pop()
        value = self[key]
        del self[key]
        return (key, value)

    def setdefault(self, key, failobj=None):
        dict.setdefault(self, key, failobj)
        if key not in self._keys:
            self._keys.append(key)

    def update(self, data):
        dict.update(self, data)
        # Append any genuinely new keys, preserving their order in ``data``.
        for key in self.keys(data):
            if key not in self._keys:
                self._keys.append(key)

    def values(self):
        # Values in insertion order (lazy map over the ordered keys).
        return map(self.get, self._keys)
######################################################################
# Lazy Sequences
######################################################################
@total_ordering
class AbstractLazySequence:
    """
    An abstract base class for read-only sequences whose values are
    computed as needed. Lazy sequences act like tuples -- they can be
    indexed, sliced, and iterated over; but they may not be modified.

    The most common application of lazy sequences in NLTK is for
    corpus view objects, which provide access to the contents of a
    corpus without loading the entire corpus into memory, by loading
    pieces of the corpus from disk as needed.

    The result of modifying a mutable element of a lazy sequence is
    undefined. In particular, the modifications made to the element
    may or may not persist, depending on whether and when the lazy
    sequence caches that element's value or reconstructs it from
    scratch.

    Subclasses are required to define two methods: ``__len__()``
    and ``iterate_from()``.
    """

    def __len__(self):
        """
        Return the number of tokens in the corpus file underlying this
        corpus view.
        """
        raise NotImplementedError("should be implemented by subclass")

    def iterate_from(self, start):
        """
        Return an iterator that generates the tokens in the corpus
        file underlying this corpus view, starting at the token number
        ``start``. If ``start>=len(self)``, then this iterator will
        generate no tokens.
        """
        raise NotImplementedError("should be implemented by subclass")

    def __getitem__(self, i):
        """
        Return the *i* th token in the corpus file underlying this
        corpus view. Negative indices and spans are both supported.
        """
        if isinstance(i, slice):
            # Slices are themselves lazy: no elements are fetched here.
            start, stop = slice_bounds(self, i)
            return LazySubsequence(self, start, stop)
        else:
            # Handle negative indices
            if i < 0:
                i += len(self)
            if i < 0:
                raise IndexError("index out of range")
            # Use iterate_from to extract it.
            try:
                return next(self.iterate_from(i))
            except StopIteration as e:
                raise IndexError("index out of range") from e

    def __iter__(self):
        """Return an iterator that generates the tokens in the corpus
        file underlying this corpus view."""
        return self.iterate_from(0)

    def count(self, value):
        """Return the number of times this list contains ``value``."""
        return sum(1 for elt in self if elt == value)

    def index(self, value, start=None, stop=None):
        """Return the index of the first occurrence of ``value`` in this
        list that is greater than or equal to ``start`` and less than
        ``stop``. Negative start and stop values are treated like negative
        slice bounds -- i.e., they count from the end of the list."""
        start, stop = slice_bounds(self, slice(start, stop))
        for i, elt in enumerate(islice(self, start, stop)):
            if elt == value:
                return i + start
        raise ValueError("index(x): x not in list")

    def __contains__(self, value):
        """Return true if this list contains ``value``."""
        return bool(self.count(value))

    def __add__(self, other):
        """Return a list concatenating self with other."""
        return LazyConcatenation([self, other])

    def __radd__(self, other):
        """Return a list concatenating other with self."""
        return LazyConcatenation([other, self])

    def __mul__(self, count):
        """Return a list concatenating self with itself ``count`` times."""
        return LazyConcatenation([self] * count)

    def __rmul__(self, count):
        """Return a list concatenating self with itself ``count`` times."""
        return LazyConcatenation([self] * count)

    # Maximum rendered length of __repr__ before truncation with "...".
    _MAX_REPR_SIZE = 60

    def __repr__(self):
        """
        Return a string representation for this corpus view that is
        similar to a list's representation; but if it would be more
        than 60 characters long, it is truncated.
        """
        pieces = []
        # Running estimate of the rendered length (brackets plus separators).
        length = 5
        for elt in self:
            pieces.append(repr(elt))
            length += len(pieces[-1]) + 2
            if length > self._MAX_REPR_SIZE and len(pieces) > 2:
                # Drop the element that overflowed and truncate.
                return "[%s, ...]" % ", ".join(pieces[:-1])
        return "[%s]" % ", ".join(pieces)

    def __eq__(self, other):
        # Equality requires the exact same type; materialises both sequences.
        return type(self) == type(other) and list(self) == list(other)

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        # total_ordering derives the remaining comparisons from this and __eq__.
        if type(other) != type(self):
            raise_unorderable_types("<", self, other)
        return list(self) < list(other)

    def __hash__(self):
        """
        :raise ValueError: Corpus view objects are unhashable.
        """
        raise ValueError("%s objects are unhashable" % self.__class__.__name__)
class LazySubsequence(AbstractLazySequence):
    """
    A subsequence produced by slicing a lazy sequence. This slice
    keeps a reference to its source sequence, and generates its values
    by looking them up in the source sequence.
    """

    MIN_SIZE = 100
    """
    The minimum size for which lazy slices should be created. If
    ``LazySubsequence()`` is called with a subsequence that is
    shorter than ``MIN_SIZE``, then a list of its elements is returned
    instead (see ``__new__``).
    """

    def __new__(cls, source, start, stop):
        """
        Construct a new slice from a given underlying sequence. The
        ``start`` and ``stop`` indices should be absolute indices --
        i.e., they should not be negative (for indexing from the back
        of a list) or greater than the length of ``source``.
        """
        # If the slice is small enough, just use a tuple.
        if stop - start < cls.MIN_SIZE:
            # Small slice: materialise eagerly; __init__ is then never run
            # on the returned list.
            return list(islice(source.iterate_from(start), stop - start))
        else:
            return object.__new__(cls)

    def __init__(self, source, start, stop):
        self._source = source
        # Absolute bounds of this slice within the source sequence.
        self._start = start
        self._stop = stop

    def __len__(self):
        return self._stop - self._start

    def iterate_from(self, start):
        # Offset into the source, capped so iteration stops at self._stop.
        return islice(
            self._source.iterate_from(start + self._start), max(0, len(self) - start)
        )
class LazyConcatenation(AbstractLazySequence):
    """
    A lazy sequence formed by concatenating a list of lists. This
    underlying list of lists may itself be lazy. ``LazyConcatenation``
    maintains an index that it uses to keep track of the relationship
    between offsets in the concatenated lists and offsets in the
    sublists.
    """

    def __init__(self, list_of_lists):
        self._list = list_of_lists
        # _offsets[i] is the flat index of the first element of sublist i;
        # the list is extended lazily as sublists are visited.
        self._offsets = [0]

    def __len__(self):
        # Force iteration past the last known offset so every sublist's
        # length has been recorded.
        if len(self._offsets) <= len(self._list):
            for _ in self.iterate_from(self._offsets[-1]):
                pass
        return self._offsets[-1]

    def iterate_from(self, start_index):
        # Find the sublist containing start_index, or resume from the last
        # sublist whose offset is known.
        if start_index < self._offsets[-1]:
            sublist_index = bisect.bisect_right(self._offsets, start_index) - 1
        else:
            sublist_index = len(self._offsets) - 1

        index = self._offsets[sublist_index]

        # Construct an iterator over the sublists.
        if isinstance(self._list, AbstractLazySequence):
            sublist_iter = self._list.iterate_from(sublist_index)
        else:
            sublist_iter = islice(self._list, sublist_index, None)

        for sublist in sublist_iter:
            if sublist_index == (len(self._offsets) - 1):
                # First visit to this sublist: record its end offset.
                assert (
                    index + len(sublist) >= self._offsets[-1]
                ), "offsets not monotonic increasing!"
                self._offsets.append(index + len(sublist))
            else:
                assert self._offsets[sublist_index + 1] == index + len(
                    sublist
                ), "inconsistent list value (num elts)"

            # Skip any leading elements before start_index in the first sublist.
            yield from sublist[max(0, start_index - index) :]

            index += len(sublist)
            sublist_index += 1
class LazyMap(AbstractLazySequence):
    """
    A lazy sequence whose elements are formed by applying a given
    function to each element in one or more underlying lists. The
    function is applied lazily -- i.e., when you read a value from the
    list, ``LazyMap`` will calculate that value by applying its
    function to the underlying lists' value(s). ``LazyMap`` is
    essentially a lazy version of the Python primitive function
    ``map``. In particular, the following two expressions are
    equivalent:

        >>> from nltk.collections import LazyMap
        >>> function = str
        >>> sequence = [1,2,3]
        >>> map(function, sequence) # doctest: +SKIP
        ['1', '2', '3']
        >>> list(LazyMap(function, sequence))
        ['1', '2', '3']

    Like the Python ``map`` primitive, if the source lists do not have
    equal size, then the value None will be supplied for the
    'missing' elements.

    Lazy maps can be useful for conserving memory, in cases where
    individual values take up a lot of space. This is especially true
    if the underlying list's values are constructed lazily, as is the
    case with many corpus readers.

    A typical example of a use case for this class is performing
    feature detection on the tokens in a corpus. Since featuresets
    are encoded as dictionaries, which can take up a lot of memory,
    using a ``LazyMap`` can significantly reduce memory usage when
    training and running classifiers.
    """

    def __init__(self, function, *lists, **config):
        """
        :param function: The function that should be applied to
            elements of ``lists``. It should take as many arguments
            as there are ``lists``.
        :param lists: The underlying lists.
        :param cache_size: Determines the size of the cache used
            by this lazy map. (default=5)
        """
        if not lists:
            raise TypeError("LazyMap requires at least two args")

        self._lists = lists
        self._func = function
        self._cache_size = config.get("cache_size", 5)
        self._cache = {} if self._cache_size > 0 else None

        # If you just take bool() of sum() here _all_lazy will be true just
        # in case n >= 1 list is an AbstractLazySequence. Presumably this
        # isn't what's intended.
        self._all_lazy = sum(
            isinstance(lst, AbstractLazySequence) for lst in lists
        ) == len(lists)

    def iterate_from(self, index):
        """Yield ``function(...)`` over the i-th elements of the underlying
        lists, starting at ``index``; exhausted lists contribute ``None``
        until every list is exhausted."""
        # Special case: one lazy sublist
        if len(self._lists) == 1 and self._all_lazy:
            for value in self._lists[0].iterate_from(index):
                yield self._func(value)
            return

        # Special case: one non-lazy sublist
        elif len(self._lists) == 1:
            while True:
                try:
                    yield self._func(self._lists[0][index])
                except IndexError:
                    return
                index += 1

        # Special case: n lazy sublists
        elif self._all_lazy:
            iterators = [lst.iterate_from(index) for lst in self._lists]
            while True:
                elements = []
                for iterator in iterators:
                    try:
                        elements.append(next(iterator))
                    except StopIteration:
                        # Bug fix: this was a bare ``except`` (self-flagged
                        # with a FIXME) that silently swallowed *every*
                        # exception; only an exhausted iterator should be
                        # padded with None.
                        elements.append(None)
                if elements == [None] * len(self._lists):
                    return
                yield self._func(*elements)
                index += 1

        # general case
        else:
            while True:
                try:
                    elements = [lst[index] for lst in self._lists]
                except IndexError:
                    # At least one list is exhausted; retry per-list so the
                    # others still contribute their values.
                    elements = [None] * len(self._lists)
                    for i, lst in enumerate(self._lists):
                        try:
                            elements[i] = lst[index]
                        except IndexError:
                            pass
                if elements == [None] * len(self._lists):
                    return
                yield self._func(*elements)
                index += 1

    def __getitem__(self, index):
        if isinstance(index, slice):
            # Slicing maps each underlying list and rewraps lazily.
            sliced_lists = [lst[index] for lst in self._lists]
            return LazyMap(self._func, *sliced_lists)
        else:
            # Handle negative indices
            if index < 0:
                index += len(self)
            if index < 0:
                raise IndexError("index out of range")
            # Check the cache
            if self._cache is not None and index in self._cache:
                return self._cache[index]
            # Calculate the value
            try:
                val = next(self.iterate_from(index))
            except StopIteration as e:
                raise IndexError("index out of range") from e
            # Update the cache
            if self._cache is not None:
                if len(self._cache) > self._cache_size:
                    self._cache.popitem()  # discard random entry
                self._cache[index] = val
            # Return the value
            return val

    def __len__(self):
        # The map is as long as the longest list (shorter lists pad with None).
        return max(len(lst) for lst in self._lists)
class LazyZip(LazyMap):
    """
    A lazy analogue of the builtin ``zip``: a sequence of tuples holding
    the i-th element of each argument sequence, truncated to the length of
    the shortest sequence. Each tuple is only constructed when it is read.

        >>> from nltk.collections import LazyZip
        >>> list(LazyZip([1, 2, 3], ['a', 'b', 'c']))
        [(1, 'a'), (2, 'b'), (3, 'c')]
        >>> sequences = [[1, 2, 3], ['a', 'b', 'c'], [6, 7, 8, 9]]
        >>> list(zip(*sequences)) == list(LazyZip(*sequences))
        True

    Useful for conserving memory when combining long sequences, e.g. long
    runs of gold-standard and predicted labels when computing accuracy,
    since no additional materialised list of tuples is created.
    """

    def __init__(self, *lists):
        """
        :param lists: the underlying lists
        :type lists: list(list)
        """
        # Each zipped element is simply the tuple of per-list values.
        super().__init__(lambda *elts: elts, *lists)

    def iterate_from(self, index):
        # Delegate to LazyMap, but stop at the shortest list's length
        # instead of padding with None.
        source = LazyMap.iterate_from(self, index)
        while index < len(self):
            yield next(source)
            index += 1

    def __len__(self):
        # Truncated to the shortest underlying list, like builtin zip.
        return min(map(len, self._lists))
class LazyEnumerate(LazyZip):
    """
    A lazy analogue of the builtin ``enumerate``: a sequence of
    ``(index, value)`` tuples over an underlying list, where each tuple is
    only constructed when it is read.

        >>> from nltk.collections import LazyEnumerate
        >>> list(LazyEnumerate(['first', 'second', 'third']))
        [(0, 'first'), (1, 'second'), (2, 'third')]

    Useful for obtaining an indexed view of a particularly long sequence
    without materialising an additional list of tuples, which can
    significantly reduce memory usage.
    """

    def __init__(self, lst):
        """
        :param lst: the underlying list
        :type lst: list
        """
        # Zip each position with its element; both sequences have length
        # len(lst), so LazyZip's truncation changes nothing.
        super().__init__(range(len(lst)), lst)
class LazyIteratorList(AbstractLazySequence):
    """
    Wraps an iterator, loading its elements on demand
    and making them subscriptable.
    __repr__ displays only the first few elements.
    """

    def __init__(self, it, known_len=None):
        """
        :param it: the iterator to wrap
        :param known_len: the iterator's length, if already known
        """
        self._it = it
        self._len = known_len
        # Elements consumed from the iterator so far, in order.
        self._cache = []

    def __len__(self):
        if self._len:
            return self._len
        # Length unknown: exhaust the underlying iterator to learn it.
        for _ in self.iterate_from(len(self._cache)):
            pass
        self._len = len(self._cache)
        return self._len

    def iterate_from(self, start):
        """Create a new iterator over this list starting at the given offset."""
        # Fill the cache up to the requested offset.
        while len(self._cache) < start:
            self._cache.append(next(self._it))
        # Serve cached values first; re-check the cache length each step in
        # case another iterator over this list extends it concurrently.
        pos = start
        while pos < len(self._cache):
            yield self._cache[pos]
            pos += 1
        # Then pull the remainder from the source iterator, caching as we go.
        for value in self._it:
            self._cache.append(value)
            yield value

    def __add__(self, other):
        """Return a list concatenating self with other."""
        return type(self)(chain(self, other))

    def __radd__(self, other):
        """Return a list concatenating other with self."""
        return type(self)(chain(other, self))
######################################################################
# Trie Implementation
######################################################################
class Trie(dict):
    """A Trie implementation for strings, built on nested dicts."""

    # Key marking that a complete string terminates at a node.
    LEAF = True

    def __init__(self, strings=None):
        """Build a Trie, inserting each of ``strings`` if provided.

        :param strings: List of strings to insert into the trie
            (Default is ``None``)
        :type strings: list(str)
        """
        super().__init__()
        for string in strings or ():
            self.insert(string)

    def insert(self, string):
        """Insert ``string`` into the Trie.

        :param string: String to insert into the trie
        :type string: str

        :Example:

        >>> from nltk.collections import Trie
        >>> trie = Trie(["abc", "def"])
        >>> expected = {'a': {'b': {'c': {True: None}}}, \
                        'd': {'e': {'f': {True: None}}}}
        >>> trie == expected
        True
        """
        if not string:
            # Empty suffix: mark that a complete string ends at this node.
            self[Trie.LEAF] = None
        else:
            # Recurse into the child for the first character; __missing__
            # creates the child node on first access.
            head, tail = string[0], string[1:]
            self[head].insert(tail)

    def __missing__(self, key):
        # Auto-vivify child nodes so insert() never needs an existence check.
        child = Trie()
        self[key] = child
        return child

View File

@@ -0,0 +1,412 @@
# Natural Language Toolkit: Collocations and Association Measures
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Joel Nothman <jnothman@student.usyd.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#
"""
Tools to identify collocations --- words that often appear consecutively
--- within corpora. They may also be used to find other associations between
word occurrences.
See Manning and Schutze ch. 5 at https://nlp.stanford.edu/fsnlp/promo/colloc.pdf
and the Text::NSP Perl package at http://ngram.sourceforge.net
Finding collocations requires first calculating the frequencies of words and
their appearance in the context of other words. Often the collection of words
will then requiring filtering to only retain useful content terms. Each ngram
of words may then be scored according to some association measure, in order
to determine the relative likelihood of each ngram being a collocation.
The ``BigramCollocationFinder`` and ``TrigramCollocationFinder`` classes provide
these functionalities, dependent on being provided a function which scores a
ngram given appropriate frequency counts. A number of standard association
measures are provided in bigram_measures and trigram_measures.
"""
# Possible TODOs:
# - consider the distinction between f(x,_) and f(x) and whether our
# approximation is good enough for fragmented data, and mention it
# - add a n-gram collocation finder with measures which only utilise n-gram
# and unigram counts (raw_freq, pmi, student_t)
import itertools as _itertools
# these two unused imports are referenced in collocations.doctest
from nltk.metrics import (
BigramAssocMeasures,
ContingencyMeasures,
QuadgramAssocMeasures,
TrigramAssocMeasures,
)
from nltk.metrics.spearman import ranks_from_scores, spearman_correlation
from nltk.probability import FreqDist
from nltk.util import ngrams
class AbstractCollocationFinder:
    """
    An abstract base class for collocation finders whose purpose is to
    collect collocation candidate frequencies, filter and rank them.

    As a minimum, collocation finders require the frequencies of each
    word in a corpus, and the joint frequency of word tuples. This data
    should be provided through nltk.probability.FreqDist objects or an
    identical interface.
    """

    def __init__(self, word_fd, ngram_fd):
        """
        :param word_fd: frequency distribution of individual words
        :param ngram_fd: frequency distribution of candidate ngram tuples
        """
        self.word_fd = word_fd
        self.N = word_fd.N()  # total number of word tokens observed
        self.ngram_fd = ngram_fd

    @classmethod
    def _build_new_documents(
        cls, documents, window_size, pad_left=False, pad_right=False, pad_symbol=None
    ):
        """
        Concatenate *documents*, inserting ``window_size - 1`` copies of
        *pad_symbol* on the requested side of each document so that ngram
        windows do not straddle a document boundary.
        """
        padding = (pad_symbol,) * (window_size - 1)
        if pad_right:
            return _itertools.chain.from_iterable(
                _itertools.chain(doc, padding) for doc in documents
            )
        if pad_left:
            return _itertools.chain.from_iterable(
                _itertools.chain(padding, doc) for doc in documents
            )
        # Bug fix: with neither flag set this used to fall off the end and
        # return None, breaking any caller that iterated the result.
        # Return the plain (unpadded) concatenation instead.
        return _itertools.chain.from_iterable(documents)

    @classmethod
    def from_documents(cls, documents):
        """Constructs a collocation finder given a collection of documents,
        each of which is a list (or iterable) of tokens.
        """
        return cls.from_words(
            cls._build_new_documents(documents, cls.default_ws, pad_right=True)
        )

    @staticmethod
    def _ngram_freqdist(words, n):
        """Return a FreqDist over all contiguous length-``n`` ngrams of *words*.

        Bug fix: the previous upper bound of ``len(words) - 1`` is only
        correct for n == 2; for n > 2 it emitted truncated tuples near the
        end of the sequence, and for n == 1 it skipped the final word.
        ``len(words) - n + 1`` yields exactly the full-length ngrams.
        """
        return FreqDist(tuple(words[i : i + n]) for i in range(len(words) - n + 1))

    def _apply_filter(self, fn=lambda ngram, freq: False):
        """Generic filter removes ngrams from the frequency distribution
        if the function returns True when passed an ngram tuple.
        """
        tmp_ngram = FreqDist()
        for ngram, freq in self.ngram_fd.items():
            if not fn(ngram, freq):
                tmp_ngram[ngram] = freq
        self.ngram_fd = tmp_ngram

    def apply_freq_filter(self, min_freq):
        """Removes candidate ngrams which have frequency less than min_freq."""
        self._apply_filter(lambda ng, freq: freq < min_freq)

    def apply_ngram_filter(self, fn):
        """Removes candidate ngrams (w1, w2, ...) where fn(w1, w2, ...)
        evaluates to True.
        """
        self._apply_filter(lambda ng, f: fn(*ng))

    def apply_word_filter(self, fn):
        """Removes candidate ngrams (w1, w2, ...) where any of (fn(w1), fn(w2),
        ...) evaluates to True.
        """
        self._apply_filter(lambda ng, f: any(fn(w) for w in ng))

    def _score_ngrams(self, score_fn):
        """Generates (ngram, score) pairs as determined by the scoring
        function provided.  Ngrams whose score is None are dropped.
        """
        for tup in self.ngram_fd:
            score = self.score_ngram(score_fn, *tup)
            if score is not None:
                yield tup, score

    def score_ngrams(self, score_fn):
        """Returns a sequence of (ngram, score) pairs ordered from highest to
        lowest score, as determined by the scoring function provided.
        """
        # Sort by descending score; ties broken by the ngram itself so the
        # ordering is deterministic.
        return sorted(self._score_ngrams(score_fn), key=lambda t: (-t[1], t[0]))

    def nbest(self, score_fn, n):
        """Returns the top n ngrams when scored by the given function."""
        return [p for p, s in self.score_ngrams(score_fn)[:n]]

    def above_score(self, score_fn, min_score):
        """Returns a sequence of ngrams, ordered by decreasing score, whose
        scores each exceed the given minimum score.
        """
        for ngram, score in self.score_ngrams(score_fn):
            if score > min_score:
                yield ngram
            else:
                # score_ngrams is sorted descending, so we may stop early.
                break
class BigramCollocationFinder(AbstractCollocationFinder):
    """A tool for the finding and ranking of bigram collocations or other
    association measures. It is often useful to use from_words() rather than
    constructing an instance directly.
    """

    # Window width used by AbstractCollocationFinder.from_documents.
    default_ws = 2

    def __init__(self, word_fd, bigram_fd, window_size=2):
        """Construct a BigramCollocationFinder, given FreqDists for
        appearances of words and (possibly non-contiguous) bigrams.
        """
        AbstractCollocationFinder.__init__(self, word_fd, bigram_fd)
        self.window_size = window_size

    @classmethod
    def from_words(cls, words, window_size=2):
        """Construct a BigramCollocationFinder for all bigrams in the given
        sequence.  When window_size > 2, count non-contiguous bigrams, in the
        style of Church and Hanks's (1990) association ratio.
        """
        if window_size < 2:
            raise ValueError("Specify window_size at least 2")
        unigram_counts = FreqDist()
        pair_counts = FreqDist()
        for window in ngrams(words, window_size, pad_right=True):
            head = window[0]
            if head is None:
                # Right-padding marker: no real head word in this window.
                continue
            unigram_counts[head] += 1
            for other in window[1:]:
                if other is not None:
                    pair_counts[(head, other)] += 1
        return cls(unigram_counts, pair_counts, window_size=window_size)

    def score_ngram(self, score_fn, w1, w2):
        """Returns the score for a given bigram using the given scoring
        function.  Following Church and Hanks (1990), counts are scaled by
        a factor of 1/(window_size - 1).
        """
        scaled_joint = self.ngram_fd[(w1, w2)] / (self.window_size - 1.0)
        if not scaled_joint:
            return None
        marginals = (self.word_fd[w1], self.word_fd[w2])
        return score_fn(scaled_joint, marginals, self.N)
class TrigramCollocationFinder(AbstractCollocationFinder):
    """A tool for the finding and ranking of trigram collocations or other
    association measures. It is often useful to use from_words() rather than
    constructing an instance directly.
    """

    default_ws = 3

    def __init__(self, word_fd, bigram_fd, wildcard_fd, trigram_fd):
        """Construct a TrigramCollocationFinder, given FreqDists for
        appearances of words, bigrams, two words with any word between them,
        and trigrams.
        """
        AbstractCollocationFinder.__init__(self, word_fd, trigram_fd)
        self.wildcard_fd = wildcard_fd
        self.bigram_fd = bigram_fd

    @classmethod
    def from_words(cls, words, window_size=3):
        """Construct a TrigramCollocationFinder for all trigrams in the given
        sequence.
        """
        if window_size < 3:
            raise ValueError("Specify window_size at least 3")
        unigrams = FreqDist()
        skip_pairs = FreqDist()  # (w1, w3): first and third, any word between
        lead_pairs = FreqDist()  # (w1, w2): leading bigram
        triples = FreqDist()
        for window in ngrams(words, window_size, pad_right=True):
            head = window[0]
            if head is None:
                continue
            for second, third in _itertools.combinations(window[1:], 2):
                unigrams[head] += 1
                if second is None:
                    continue
                lead_pairs[(head, second)] += 1
                if third is None:
                    continue
                skip_pairs[(head, third)] += 1
                triples[(head, second, third)] += 1
        return cls(unigrams, lead_pairs, skip_pairs, triples)

    def bigram_finder(self):
        """Constructs a bigram collocation finder with the bigram and unigram
        data from this finder. Note that this does not include any filtering
        applied to this finder.
        """
        return BigramCollocationFinder(self.word_fd, self.bigram_fd)

    def score_ngram(self, score_fn, w1, w2, w3):
        """Returns the score for a given trigram using the given scoring
        function.
        """
        joint = self.ngram_fd[(w1, w2, w3)]
        if not joint:
            return None
        pair_counts = (
            self.bigram_fd[(w1, w2)],
            self.wildcard_fd[(w1, w3)],
            self.bigram_fd[(w2, w3)],
        )
        single_counts = (self.word_fd[w1], self.word_fd[w2], self.word_fd[w3])
        return score_fn(joint, pair_counts, single_counts, self.N)
class QuadgramCollocationFinder(AbstractCollocationFinder):
    """A tool for the finding and ranking of quadgram collocations or other association measures.
    It is often useful to use from_words() rather than constructing an instance directly.
    """

    default_ws = 4

    def __init__(self, word_fd, quadgram_fd, ii, iii, ixi, ixxi, iixi, ixii):
        """Construct a QuadgramCollocationFinder, given FreqDists for appearances of words,
        bigrams, trigrams, two words with one word and two words between them, three words
        with a word between them in both variations.
        """
        AbstractCollocationFinder.__init__(self, word_fd, quadgram_fd)
        # Attribute names encode the window pattern: 'i' = counted word,
        # 'x' = wildcard position.
        self.iii = iii
        self.ii = ii
        self.ixi = ixi
        self.ixxi = ixxi
        self.iixi = iixi
        self.ixii = ixii

    @classmethod
    def from_words(cls, words, window_size=4):
        """Construct a QuadgramCollocationFinder for all quadgrams in the
        given sequence, counting every sub-pattern needed for scoring.
        """
        if window_size < 4:
            raise ValueError("Specify window_size at least 4")
        unigrams = FreqDist()  # ixxx
        quads = FreqDist()  # iiii
        pairs = FreqDist()  # ii
        triples = FreqDist()  # iii
        skip_one = FreqDist()  # ixi
        skip_two = FreqDist()  # ixxi
        pair_then_skip = FreqDist()  # iixi
        skip_then_pair = FreqDist()  # ixii
        for window in ngrams(words, window_size, pad_right=True):
            head = window[0]
            if head is None:
                continue
            for second, third, fourth in _itertools.combinations(window[1:], 3):
                unigrams[head] += 1
                if second is None:
                    continue
                pairs[(head, second)] += 1
                if third is None:
                    continue
                triples[(head, second, third)] += 1
                skip_one[(head, third)] += 1
                if fourth is None:
                    continue
                quads[(head, second, third, fourth)] += 1
                skip_two[(head, fourth)] += 1
                skip_then_pair[(head, third, fourth)] += 1
                pair_then_skip[(head, second, fourth)] += 1
        return cls(
            unigrams, quads, pairs, triples, skip_one, skip_two, pair_then_skip, skip_then_pair
        )

    def score_ngram(self, score_fn, w1, w2, w3, w4):
        """Returns the score for a given quadgram using the given scoring
        function.
        """
        joint = self.ngram_fd[(w1, w2, w3, w4)]
        if not joint:
            return None
        triple_counts = (
            self.iii[(w1, w2, w3)],
            self.iixi[(w1, w2, w4)],
            self.ixii[(w1, w3, w4)],
            self.iii[(w2, w3, w4)],
        )
        pair_counts = (
            self.ii[(w1, w2)],
            self.ixi[(w1, w3)],
            self.ixxi[(w1, w4)],
            self.ixi[(w2, w4)],
            self.ii[(w3, w4)],
            self.ii[(w2, w3)],
        )
        single_counts = (
            self.word_fd[w1],
            self.word_fd[w2],
            self.word_fd[w3],
            self.word_fd[w4],
        )
        return score_fn(joint, triple_counts, pair_counts, single_counts, self.N)
def demo(scorer=None, compare_scorer=None):
    """Finds bigram collocations in the files of the WebText corpus."""
    from nltk.corpus import stopwords, webtext
    from nltk.metrics import (
        BigramAssocMeasures,
        ranks_from_scores,
        spearman_correlation,
    )

    if scorer is None:
        scorer = BigramAssocMeasures.likelihood_ratio
    if compare_scorer is None:
        compare_scorer = BigramAssocMeasures.raw_freq

    ignored_words = stopwords.words("english")

    def word_filter(w):
        # Drop very short tokens and English stopwords.
        return len(w) < 3 or w.lower() in ignored_words

    for fileid in webtext.fileids():
        tokens = [token.lower() for token in webtext.words(fileid)]
        finder = BigramCollocationFinder.from_words(tokens)
        finder.apply_freq_filter(3)
        finder.apply_word_filter(word_filter)
        # Rank correlation between the chosen scorer and the baseline.
        corr = spearman_correlation(
            ranks_from_scores(finder.score_ngrams(scorer)),
            ranks_from_scores(finder.score_ngrams(compare_scorer)),
        )
        print(fileid)
        print("\t", [" ".join(pair) for pair in finder.nbest(scorer, 15)])
        print(f"\t Correlation to {compare_scorer.__name__}: {corr:0.4f}")
# Slows down loading too much
# bigram_measures = BigramAssocMeasures()
# trigram_measures = TrigramAssocMeasures()
if __name__ == "__main__":
    import sys

    from nltk.metrics import BigramAssocMeasures

    # Resolve the measure names from the command line with getattr rather
    # than eval(): evaluating raw argv strings would execute arbitrary code.
    # A missing argument still yields None (demo picks its defaults), and an
    # unknown measure name raises AttributeError, as the eval form did.
    try:
        scorer = getattr(BigramAssocMeasures, sys.argv[1])
    except IndexError:
        scorer = None
    try:
        compare_scorer = getattr(BigramAssocMeasures, sys.argv[2])
    except IndexError:
        compare_scorer = None

    demo(scorer, compare_scorer)
# Names exported by ``from nltk.collocations import *``.
__all__ = [
    "BigramCollocationFinder",
    "TrigramCollocationFinder",
    "QuadgramCollocationFinder",
]

View File

@@ -0,0 +1,38 @@
# Natural Language Toolkit: Compatibility
#
# Copyright (C) 2001-2025 NLTK Project
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import os
from functools import wraps
# ======= Compatibility for datasets that care about Python versions ========
# The following datasets have a /PY3 subdirectory containing
# a full copy of the data which has been re-encoded or repickled.
DATA_UPDATES = []

_PY3_DATA_UPDATES = [os.path.join(*path_list) for path_list in DATA_UPDATES]


def add_py3_data(path):
    """Return *path* with "/PY3" spliced in after a matching data directory.

    A ``.zip`` suffix immediately following the match is kept before the
    inserted "/PY3" segment.  Paths that already contain "/PY3", or match
    no entry of ``_PY3_DATA_UPDATES``, are returned unchanged.

    Bug fix: the original tested ``str(path)`` but then called
    ``path.index`` and sliced the raw argument, which raised
    AttributeError/TypeError for non-str path-like arguments; work on a
    single str copy throughout instead.
    """
    text = str(path)
    for item in _PY3_DATA_UPDATES:
        if item in text and "/PY3" not in text:
            pos = text.index(item) + len(item)
            if text[pos : pos + 4] == ".zip":
                pos += 4
            return text[:pos] + "/PY3" + text[pos:]
    return path


# for use in adding /PY3 to the second (filename) argument
# of the file pointers in data.py
def py3_data(init_func):
    """Decorator that rewrites the second positional argument of
    *init_func* through :func:`add_py3_data` before delegating.
    """

    def _decorator(*args, **kwargs):
        args = (args[0], add_py3_data(args[1])) + args[2:]
        return init_func(*args, **kwargs)

    return wraps(init_func)(_decorator)

View File

@@ -0,0 +1,551 @@
# Natural Language Toolkit: Corpus Readers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
# TODO this docstring isn't up-to-date!
"""
NLTK corpus readers. The modules in this package provide functions
that can be used to read corpus files in a variety of formats. These
functions can be used to read both the corpus files that are
distributed in the NLTK corpus package, and corpus files that are part
of external corpora.
Available Corpora
=================
Please see https://www.nltk.org/nltk_data/ for a complete list.
Install corpora using nltk.download().
Corpus Reader Functions
=======================
Each corpus module defines one or more "corpus reader functions",
which can be used to read documents from that corpus. These functions
take an argument, ``item``, which is used to indicate which document
should be read from the corpus:
- If ``item`` is one of the unique identifiers listed in the corpus
module's ``items`` variable, then the corresponding document will
be loaded from the NLTK corpus package.
- If ``item`` is a filename, then that file will be read.
Additionally, corpus reader functions can be given lists of item
names; in which case, they will return a concatenation of the
corresponding documents.
Corpus reader functions are named based on the type of information
they return. Some common examples, and their return types, are:
- words(): list of str
- sents(): list of (list of str)
- paras(): list of (list of (list of str))
- tagged_words(): list of (str,str) tuple
- tagged_sents(): list of (list of (str,str))
- tagged_paras(): list of (list of (list of (str,str)))
- chunked_sents(): list of (Tree w/ (str,str) leaves)
- parsed_sents(): list of (Tree with str leaves)
- parsed_paras(): list of (list of (Tree with str leaves))
- xml(): A single xml ElementTree
- raw(): unprocessed corpus contents
For example, to read a list of the words in the Brown Corpus, use
``nltk.corpus.brown.words()``:
>>> from nltk.corpus import brown
>>> print(", ".join(brown.words())) # doctest: +ELLIPSIS
The, Fulton, County, Grand, Jury, said, ...
"""
import re
from nltk.corpus.reader import *
from nltk.corpus.util import LazyCorpusLoader
from nltk.tokenize import RegexpTokenizer
abc: PlaintextCorpusReader = LazyCorpusLoader(
"abc",
PlaintextCorpusReader,
r"(?!\.).*\.txt",
encoding=[("science", "latin_1"), ("rural", "utf8")],
)
alpino: AlpinoCorpusReader = LazyCorpusLoader(
"alpino", AlpinoCorpusReader, tagset="alpino"
)
bcp47: BCP47CorpusReader = LazyCorpusLoader(
"bcp47", BCP47CorpusReader, r"(cldr|iana)/*"
)
brown: CategorizedTaggedCorpusReader = LazyCorpusLoader(
"brown",
CategorizedTaggedCorpusReader,
r"c[a-z]\d\d",
cat_file="cats.txt",
tagset="brown",
encoding="ascii",
)
cess_cat: BracketParseCorpusReader = LazyCorpusLoader(
"cess_cat",
BracketParseCorpusReader,
r"(?!\.).*\.tbf",
tagset="unknown",
encoding="ISO-8859-15",
)
cess_esp: BracketParseCorpusReader = LazyCorpusLoader(
"cess_esp",
BracketParseCorpusReader,
r"(?!\.).*\.tbf",
tagset="unknown",
encoding="ISO-8859-15",
)
cmudict: CMUDictCorpusReader = LazyCorpusLoader(
"cmudict", CMUDictCorpusReader, ["cmudict"]
)
comtrans: AlignedCorpusReader = LazyCorpusLoader(
"comtrans", AlignedCorpusReader, r"(?!\.).*\.txt"
)
comparative_sentences: ComparativeSentencesCorpusReader = LazyCorpusLoader(
"comparative_sentences",
ComparativeSentencesCorpusReader,
r"labeledSentences\.txt",
encoding="latin-1",
)
conll2000: ConllChunkCorpusReader = LazyCorpusLoader(
"conll2000",
ConllChunkCorpusReader,
["train.txt", "test.txt"],
("NP", "VP", "PP"),
tagset="wsj",
encoding="ascii",
)
conll2002: ConllChunkCorpusReader = LazyCorpusLoader(
"conll2002",
ConllChunkCorpusReader,
r".*\.(test|train).*",
("LOC", "PER", "ORG", "MISC"),
encoding="utf-8",
)
conll2007: DependencyCorpusReader = LazyCorpusLoader(
"conll2007",
DependencyCorpusReader,
r".*\.(test|train).*",
encoding=[("eus", "ISO-8859-2"), ("esp", "utf8")],
)
crubadan: CrubadanCorpusReader = LazyCorpusLoader(
"crubadan", CrubadanCorpusReader, r".*\.txt"
)
dependency_treebank: DependencyCorpusReader = LazyCorpusLoader(
"dependency_treebank", DependencyCorpusReader, r".*\.dp", encoding="ascii"
)
extended_omw: CorpusReader = LazyCorpusLoader(
"extended_omw", CorpusReader, r".*/wn-[a-z\-]*\.tab", encoding="utf8"
)
floresta: BracketParseCorpusReader = LazyCorpusLoader(
"floresta",
BracketParseCorpusReader,
r"(?!\.).*\.ptb",
"#",
tagset="unknown",
encoding="ISO-8859-15",
)
framenet15: FramenetCorpusReader = LazyCorpusLoader(
"framenet_v15",
FramenetCorpusReader,
[
"frRelation.xml",
"frameIndex.xml",
"fulltextIndex.xml",
"luIndex.xml",
"semTypes.xml",
],
)
framenet: FramenetCorpusReader = LazyCorpusLoader(
"framenet_v17",
FramenetCorpusReader,
[
"frRelation.xml",
"frameIndex.xml",
"fulltextIndex.xml",
"luIndex.xml",
"semTypes.xml",
],
)
gazetteers: WordListCorpusReader = LazyCorpusLoader(
"gazetteers", WordListCorpusReader, r"(?!LICENSE|\.).*\.txt", encoding="ISO-8859-2"
)
genesis: PlaintextCorpusReader = LazyCorpusLoader(
"genesis",
PlaintextCorpusReader,
r"(?!\.).*\.txt",
encoding=[
("finnish|french|german", "latin_1"),
("swedish", "cp865"),
(".*", "utf_8"),
],
)
gutenberg: PlaintextCorpusReader = LazyCorpusLoader(
"gutenberg", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
)
ieer: IEERCorpusReader = LazyCorpusLoader("ieer", IEERCorpusReader, r"(?!README|\.).*")
inaugural: PlaintextCorpusReader = LazyCorpusLoader(
"inaugural", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
)
# [XX] This should probably just use TaggedCorpusReader:
indian: IndianCorpusReader = LazyCorpusLoader(
"indian", IndianCorpusReader, r"(?!\.).*\.pos", tagset="unknown", encoding="utf8"
)
jeita: ChasenCorpusReader = LazyCorpusLoader(
"jeita", ChasenCorpusReader, r".*\.chasen", encoding="utf-8"
)
knbc: KNBCorpusReader = LazyCorpusLoader(
"knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
)
lin_thesaurus: LinThesaurusCorpusReader = LazyCorpusLoader(
"lin_thesaurus", LinThesaurusCorpusReader, r".*\.lsp"
)
mac_morpho: MacMorphoCorpusReader = LazyCorpusLoader(
"mac_morpho",
MacMorphoCorpusReader,
r"(?!\.).*\.txt",
tagset="unknown",
encoding="latin-1",
)
machado: PortugueseCategorizedPlaintextCorpusReader = LazyCorpusLoader(
"machado",
PortugueseCategorizedPlaintextCorpusReader,
r"(?!\.).*\.txt",
cat_pattern=r"([a-z]*)/.*",
encoding="latin-1",
)
masc_tagged: CategorizedTaggedCorpusReader = LazyCorpusLoader(
"masc_tagged",
CategorizedTaggedCorpusReader,
r"(spoken|written)/.*\.txt",
cat_file="categories.txt",
tagset="wsj",
encoding="utf-8",
sep="_",
)
movie_reviews: CategorizedPlaintextCorpusReader = LazyCorpusLoader(
"movie_reviews",
CategorizedPlaintextCorpusReader,
r"(?!\.).*\.txt",
cat_pattern=r"(neg|pos)/.*",
encoding="ascii",
)
multext_east: MTECorpusReader = LazyCorpusLoader(
"mte_teip5", MTECorpusReader, r"(oana).*\.xml", encoding="utf-8"
)
names: WordListCorpusReader = LazyCorpusLoader(
"names", WordListCorpusReader, r"(?!\.).*\.txt", encoding="ascii"
)
nps_chat: NPSChatCorpusReader = LazyCorpusLoader(
"nps_chat", NPSChatCorpusReader, r"(?!README|\.).*\.xml", tagset="wsj"
)
opinion_lexicon: OpinionLexiconCorpusReader = LazyCorpusLoader(
"opinion_lexicon",
OpinionLexiconCorpusReader,
r"(\w+)\-words\.txt",
encoding="ISO-8859-2",
)
ppattach: PPAttachmentCorpusReader = LazyCorpusLoader(
"ppattach", PPAttachmentCorpusReader, ["training", "test", "devset"]
)
product_reviews_1: ReviewsCorpusReader = LazyCorpusLoader(
"product_reviews_1", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
)
product_reviews_2: ReviewsCorpusReader = LazyCorpusLoader(
"product_reviews_2", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
)
pros_cons: ProsConsCorpusReader = LazyCorpusLoader(
"pros_cons",
ProsConsCorpusReader,
r"Integrated(Cons|Pros)\.txt",
cat_pattern=r"Integrated(Cons|Pros)\.txt",
encoding="ISO-8859-2",
)
ptb: CategorizedBracketParseCorpusReader = (
LazyCorpusLoader( # Penn Treebank v3: WSJ and Brown portions
"ptb",
CategorizedBracketParseCorpusReader,
r"(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG",
cat_file="allcats.txt",
tagset="wsj",
)
)
qc: StringCategoryCorpusReader = LazyCorpusLoader(
"qc", StringCategoryCorpusReader, ["train.txt", "test.txt"], encoding="ISO-8859-2"
)
reuters: CategorizedPlaintextCorpusReader = LazyCorpusLoader(
"reuters",
CategorizedPlaintextCorpusReader,
"(training|test).*",
cat_file="cats.txt",
encoding="ISO-8859-2",
)
rte: RTECorpusReader = LazyCorpusLoader("rte", RTECorpusReader, r"(?!\.).*\.xml")
senseval: SensevalCorpusReader = LazyCorpusLoader(
"senseval", SensevalCorpusReader, r"(?!\.).*\.pos"
)
sentence_polarity: CategorizedSentencesCorpusReader = LazyCorpusLoader(
"sentence_polarity",
CategorizedSentencesCorpusReader,
r"rt-polarity\.(neg|pos)",
cat_pattern=r"rt-polarity\.(neg|pos)",
encoding="utf-8",
)
sentiwordnet: SentiWordNetCorpusReader = LazyCorpusLoader(
"sentiwordnet", SentiWordNetCorpusReader, "SentiWordNet_3.0.0.txt", encoding="utf-8"
)
shakespeare: XMLCorpusReader = LazyCorpusLoader(
"shakespeare", XMLCorpusReader, r"(?!\.).*\.xml"
)
sinica_treebank: SinicaTreebankCorpusReader = LazyCorpusLoader(
"sinica_treebank",
SinicaTreebankCorpusReader,
["parsed"],
tagset="unknown",
encoding="utf-8",
)
state_union: PlaintextCorpusReader = LazyCorpusLoader(
"state_union", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="ISO-8859-2"
)
stopwords: WordListCorpusReader = LazyCorpusLoader(
"stopwords", WordListCorpusReader, r"(?!README|\.).*", encoding="utf8"
)
subjectivity: CategorizedSentencesCorpusReader = LazyCorpusLoader(
"subjectivity",
CategorizedSentencesCorpusReader,
r"(quote.tok.gt9|plot.tok.gt9)\.5000",
cat_map={"quote.tok.gt9.5000": ["subj"], "plot.tok.gt9.5000": ["obj"]},
encoding="latin-1",
)
swadesh: SwadeshCorpusReader = LazyCorpusLoader(
"swadesh", SwadeshCorpusReader, r"(?!README|\.).*", encoding="utf8"
)
swadesh110: PanlexSwadeshCorpusReader = LazyCorpusLoader(
"panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh110/.*\.txt", encoding="utf8"
)
swadesh207: PanlexSwadeshCorpusReader = LazyCorpusLoader(
"panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh207/.*\.txt", encoding="utf8"
)
switchboard: SwitchboardCorpusReader = LazyCorpusLoader(
"switchboard", SwitchboardCorpusReader, tagset="wsj"
)
timit: TimitCorpusReader = LazyCorpusLoader("timit", TimitCorpusReader)
timit_tagged: TimitTaggedCorpusReader = LazyCorpusLoader(
"timit", TimitTaggedCorpusReader, r".+\.tags", tagset="wsj", encoding="ascii"
)
toolbox: ToolboxCorpusReader = LazyCorpusLoader(
"toolbox", ToolboxCorpusReader, r"(?!.*(README|\.)).*\.(dic|txt)"
)
treebank: BracketParseCorpusReader = LazyCorpusLoader(
"treebank/combined",
BracketParseCorpusReader,
r"wsj_.*\.mrg",
tagset="wsj",
encoding="ascii",
)
treebank_chunk: ChunkedCorpusReader = LazyCorpusLoader(
"treebank/tagged",
ChunkedCorpusReader,
r"wsj_.*\.pos",
sent_tokenizer=RegexpTokenizer(r"(?<=/\.)\s*(?![^\[]*\])", gaps=True),
para_block_reader=tagged_treebank_para_block_reader,
tagset="wsj",
encoding="ascii",
)
treebank_raw: PlaintextCorpusReader = LazyCorpusLoader(
"treebank/raw", PlaintextCorpusReader, r"wsj_.*", encoding="ISO-8859-2"
)
twitter_samples: TwitterCorpusReader = LazyCorpusLoader(
"twitter_samples", TwitterCorpusReader, r".*\.json"
)
udhr: UdhrCorpusReader = LazyCorpusLoader("udhr", UdhrCorpusReader)
udhr2: PlaintextCorpusReader = LazyCorpusLoader(
"udhr2", PlaintextCorpusReader, r".*\.txt", encoding="utf8"
)
universal_treebanks: ConllCorpusReader = LazyCorpusLoader(
"universal_treebanks_v20",
ConllCorpusReader,
r".*\.conll",
columntypes=(
"ignore",
"words",
"ignore",
"ignore",
"pos",
"ignore",
"ignore",
"ignore",
"ignore",
"ignore",
),
)
verbnet: VerbnetCorpusReader = LazyCorpusLoader(
"verbnet", VerbnetCorpusReader, r"(?!\.).*\.xml"
)
webtext: PlaintextCorpusReader = LazyCorpusLoader(
"webtext", PlaintextCorpusReader, r"(?!README|\.).*\.txt", encoding="ISO-8859-2"
)
wordnet: WordNetCorpusReader = LazyCorpusLoader(
"wordnet",
WordNetCorpusReader,
LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
)
## Use the following template to add a custom Wordnet package.
## Just uncomment, and replace the identifier (my_wordnet) in two places:
##
# my_wordnet: WordNetCorpusReader = LazyCorpusLoader(
# "my_wordnet",
# WordNetCorpusReader,
# LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
# )
wordnet31: WordNetCorpusReader = LazyCorpusLoader(
"wordnet31",
WordNetCorpusReader,
LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
)
wordnet2021: WordNetCorpusReader = LazyCorpusLoader(
# Obsolete, use english_wordnet instead.
"wordnet2021",
WordNetCorpusReader,
LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
)
wordnet2022: WordNetCorpusReader = LazyCorpusLoader(
# Obsolete, use english_wordnet instead.
"wordnet2022",
WordNetCorpusReader,
LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
)
english_wordnet: WordNetCorpusReader = LazyCorpusLoader(
# Latest Open English Wordnet
"english_wordnet",
WordNetCorpusReader,
LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
)
wordnet_ic: WordNetICCorpusReader = LazyCorpusLoader(
"wordnet_ic", WordNetICCorpusReader, r".*\.dat"
)
words: WordListCorpusReader = LazyCorpusLoader(
"words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii"
)
# defined after treebank
propbank: PropbankCorpusReader = LazyCorpusLoader(
"propbank",
PropbankCorpusReader,
"prop.txt",
r"frames/.*\.xml",
"verbs.txt",
lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
treebank,
) # Must be defined *after* treebank corpus.
nombank: NombankCorpusReader = LazyCorpusLoader(
"nombank.1.0",
NombankCorpusReader,
"nombank.1.0",
r"frames/.*\.xml",
"nombank.1.0.words",
lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
treebank,
) # Must be defined *after* treebank corpus.
propbank_ptb: PropbankCorpusReader = LazyCorpusLoader(
"propbank",
PropbankCorpusReader,
"prop.txt",
r"frames/.*\.xml",
"verbs.txt",
lambda filename: filename.upper(),
ptb,
) # Must be defined *after* ptb corpus.
nombank_ptb: NombankCorpusReader = LazyCorpusLoader(
"nombank.1.0",
NombankCorpusReader,
"nombank.1.0",
r"frames/.*\.xml",
"nombank.1.0.words",
lambda filename: filename.upper(),
ptb,
) # Must be defined *after* ptb corpus.
semcor: SemcorCorpusReader = LazyCorpusLoader(
"semcor", SemcorCorpusReader, r"brown./tagfiles/br-.*\.xml", wordnet
) # Must be defined *after* wordnet corpus.
nonbreaking_prefixes: NonbreakingPrefixesCorpusReader = LazyCorpusLoader(
"nonbreaking_prefixes",
NonbreakingPrefixesCorpusReader,
r"(?!README|\.).*",
encoding="utf8",
)
perluniprops: UnicharsCorpusReader = LazyCorpusLoader(
"perluniprops",
UnicharsCorpusReader,
r"(?!README|\.).*",
nltk_data_subdir="misc",
encoding="utf8",
)
# mwa_ppdb = LazyCorpusLoader(
# 'mwa_ppdb', MWAPPDBCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')
# See https://github.com/nltk/nltk/issues/1579
# and https://github.com/nltk/nltk/issues/1716
#
# pl196x = LazyCorpusLoader(
# 'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml',
# cat_file='cats.txt', textid_file='textids.txt', encoding='utf8')
#
# ipipan = LazyCorpusLoader(
# 'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml')
#
# nkjp = LazyCorpusLoader(
# 'nkjp', NKJPCorpusReader, r'', encoding='utf8')
#
# panlex_lite = LazyCorpusLoader(
# 'panlex_lite', PanLexLiteCorpusReader)
#
# ycoe = LazyCorpusLoader(
# 'ycoe', YCOECorpusReader)
#
# corpus not available with NLTK; these lines caused help(nltk.corpus) to break
# hebrew_treebank = LazyCorpusLoader(
# 'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')
# FIXME: override any imported demo from various corpora, see https://github.com/nltk/nltk/issues/2116
def demo():
    """Invoke the ``demo()`` function of the standard corpus readers.

    NOTE(review): each call accesses corpus data on disk via the lazy
    loaders defined above; presumably every listed reader still exposes a
    ``demo`` attribute — confirm before re-enabling in __main__.
    """
    # This is out-of-date:
    abc.demo()
    brown.demo()
    #    chat80.demo()
    cmudict.demo()
    conll2000.demo()
    conll2002.demo()
    genesis.demo()
    gutenberg.demo()
    ieer.demo()
    inaugural.demo()
    indian.demo()
    names.demo()
    ppattach.demo()
    senseval.demo()
    shakespeare.demo()
    sinica_treebank.demo()
    state_union.demo()
    stopwords.demo()
    timit.demo()
    toolbox.demo()
    treebank.demo()
    udhr.demo()
    webtext.demo()
    words.demo()
if __name__ == "__main__":
    # demo() is intentionally left disabled here; see the out-of-date note
    # in demo() above.
    # demo()
    pass

View File

@@ -0,0 +1,56 @@
# Natural Language Toolkit: Europarl Corpus Readers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Nitin Madnani <nmadnani@umiacs.umd.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re
from nltk.corpus.reader import *
from nltk.corpus.util import LazyCorpusLoader
# Create a new corpus reader instance for each European language
danish: EuroparlCorpusReader = LazyCorpusLoader(
"europarl_raw/danish", EuroparlCorpusReader, r"ep-.*\.da", encoding="utf-8"
)
dutch: EuroparlCorpusReader = LazyCorpusLoader(
"europarl_raw/dutch", EuroparlCorpusReader, r"ep-.*\.nl", encoding="utf-8"
)
english: EuroparlCorpusReader = LazyCorpusLoader(
"europarl_raw/english", EuroparlCorpusReader, r"ep-.*\.en", encoding="utf-8"
)
finnish: EuroparlCorpusReader = LazyCorpusLoader(
"europarl_raw/finnish", EuroparlCorpusReader, r"ep-.*\.fi", encoding="utf-8"
)
french: EuroparlCorpusReader = LazyCorpusLoader(
"europarl_raw/french", EuroparlCorpusReader, r"ep-.*\.fr", encoding="utf-8"
)
german: EuroparlCorpusReader = LazyCorpusLoader(
"europarl_raw/german", EuroparlCorpusReader, r"ep-.*\.de", encoding="utf-8"
)
greek: EuroparlCorpusReader = LazyCorpusLoader(
"europarl_raw/greek", EuroparlCorpusReader, r"ep-.*\.el", encoding="utf-8"
)
italian: EuroparlCorpusReader = LazyCorpusLoader(
"europarl_raw/italian", EuroparlCorpusReader, r"ep-.*\.it", encoding="utf-8"
)
portuguese: EuroparlCorpusReader = LazyCorpusLoader(
"europarl_raw/portuguese", EuroparlCorpusReader, r"ep-.*\.pt", encoding="utf-8"
)
spanish: EuroparlCorpusReader = LazyCorpusLoader(
"europarl_raw/spanish", EuroparlCorpusReader, r"ep-.*\.es", encoding="utf-8"
)
swedish: EuroparlCorpusReader = LazyCorpusLoader(
"europarl_raw/swedish", EuroparlCorpusReader, r"ep-.*\.sv", encoding="utf-8"
)

View File

@@ -0,0 +1,186 @@
# Natural Language Toolkit: Corpus Readers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
NLTK corpus readers. The modules in this package provide functions
that can be used to read corpus fileids in a variety of formats. These
functions can be used to read both the corpus fileids that are
distributed in the NLTK corpus package, and corpus fileids that are part
of external corpora.
Corpus Reader Functions
=======================
Each corpus module defines one or more "corpus reader functions",
which can be used to read documents from that corpus. These functions
take an argument, ``item``, which is used to indicate which document
should be read from the corpus:
- If ``item`` is one of the unique identifiers listed in the corpus
module's ``items`` variable, then the corresponding document will
be loaded from the NLTK corpus package.
- If ``item`` is a fileid, then that file will be read.
Additionally, corpus reader functions can be given lists of item
names; in which case, they will return a concatenation of the
corresponding documents.
Corpus reader functions are named based on the type of information
they return. Some common examples, and their return types, are:
- words(): list of str
- sents(): list of (list of str)
- paras(): list of (list of (list of str))
- tagged_words(): list of (str,str) tuple
- tagged_sents(): list of (list of (str,str))
- tagged_paras(): list of (list of (list of (str,str)))
- chunked_sents(): list of (Tree w/ (str,str) leaves)
- parsed_sents(): list of (Tree with str leaves)
- parsed_paras(): list of (list of (Tree with str leaves))
- xml(): A single xml ElementTree
- raw(): unprocessed corpus contents
For example, to read a list of the words in the Brown Corpus, use
``nltk.corpus.brown.words()``:
>>> from nltk.corpus import brown
>>> print(", ".join(brown.words()[:6])) # only first 6 words
The, Fulton, County, Grand, Jury, said
isort:skip_file
"""
from nltk.corpus.reader.plaintext import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
from nltk.corpus.reader.tagged import *
from nltk.corpus.reader.cmudict import *
from nltk.corpus.reader.conll import *
from nltk.corpus.reader.chunked import *
from nltk.corpus.reader.wordlist import *
from nltk.corpus.reader.xmldocs import *
from nltk.corpus.reader.ppattach import *
from nltk.corpus.reader.senseval import *
from nltk.corpus.reader.ieer import *
from nltk.corpus.reader.sinica_treebank import *
from nltk.corpus.reader.bracket_parse import *
from nltk.corpus.reader.indian import *
from nltk.corpus.reader.toolbox import *
from nltk.corpus.reader.timit import *
from nltk.corpus.reader.ycoe import *
from nltk.corpus.reader.rte import *
from nltk.corpus.reader.string_category import *
from nltk.corpus.reader.propbank import *
from nltk.corpus.reader.verbnet import *
from nltk.corpus.reader.bnc import *
from nltk.corpus.reader.nps_chat import *
from nltk.corpus.reader.wordnet import *
from nltk.corpus.reader.switchboard import *
from nltk.corpus.reader.dependency import *
from nltk.corpus.reader.nombank import *
from nltk.corpus.reader.ipipan import *
from nltk.corpus.reader.pl196x import *
from nltk.corpus.reader.knbc import *
from nltk.corpus.reader.chasen import *
from nltk.corpus.reader.childes import *
from nltk.corpus.reader.aligned import *
from nltk.corpus.reader.lin import *
from nltk.corpus.reader.semcor import *
from nltk.corpus.reader.framenet import *
from nltk.corpus.reader.udhr import *
from nltk.corpus.reader.bnc import *
from nltk.corpus.reader.sentiwordnet import *
from nltk.corpus.reader.twitter import *
from nltk.corpus.reader.nkjp import *
from nltk.corpus.reader.crubadan import *
from nltk.corpus.reader.mte import *
from nltk.corpus.reader.reviews import *
from nltk.corpus.reader.opinion_lexicon import *
from nltk.corpus.reader.pros_cons import *
from nltk.corpus.reader.categorized_sents import *
from nltk.corpus.reader.comparative_sents import *
from nltk.corpus.reader.panlex_lite import *
from nltk.corpus.reader.panlex_swadesh import *
from nltk.corpus.reader.bcp47 import *
# Make sure that nltk.corpus.reader.bracket_parse gives the module, not
# the function bracket_parse() defined in nltk.tree:
from nltk.corpus.reader import bracket_parse
# Public API of nltk.corpus.reader.  Fix: the original list contained
# "BNCCorpusReader" twice; the duplicate entry has been removed.
__all__ = [
    "CorpusReader",
    "CategorizedCorpusReader",
    "PlaintextCorpusReader",
    "find_corpus_fileids",
    "TaggedCorpusReader",
    "CMUDictCorpusReader",
    "ConllChunkCorpusReader",
    "WordListCorpusReader",
    "PPAttachmentCorpusReader",
    "SensevalCorpusReader",
    "IEERCorpusReader",
    "ChunkedCorpusReader",
    "SinicaTreebankCorpusReader",
    "BracketParseCorpusReader",
    "IndianCorpusReader",
    "ToolboxCorpusReader",
    "TimitCorpusReader",
    "YCOECorpusReader",
    "MacMorphoCorpusReader",
    "SyntaxCorpusReader",
    "AlpinoCorpusReader",
    "RTECorpusReader",
    "StringCategoryCorpusReader",
    "EuroparlCorpusReader",
    "CategorizedBracketParseCorpusReader",
    "CategorizedTaggedCorpusReader",
    "CategorizedPlaintextCorpusReader",
    "PortugueseCategorizedPlaintextCorpusReader",
    "tagged_treebank_para_block_reader",
    "PropbankCorpusReader",
    "VerbnetCorpusReader",
    "BNCCorpusReader",
    "ConllCorpusReader",
    "XMLCorpusReader",
    "NPSChatCorpusReader",
    "SwadeshCorpusReader",
    "WordNetCorpusReader",
    "WordNetICCorpusReader",
    "SwitchboardCorpusReader",
    "DependencyCorpusReader",
    "NombankCorpusReader",
    "IPIPANCorpusReader",
    "Pl196xCorpusReader",
    "TEICorpusView",
    "KNBCorpusReader",
    "ChasenCorpusReader",
    "CHILDESCorpusReader",
    "AlignedCorpusReader",
    "TimitTaggedCorpusReader",
    "LinThesaurusCorpusReader",
    "SemcorCorpusReader",
    "FramenetCorpusReader",
    "UdhrCorpusReader",
    "SentiWordNetCorpusReader",
    "SentiSynset",
    "TwitterCorpusReader",
    "NKJPCorpusReader",
    "CrubadanCorpusReader",
    "MTECorpusReader",
    "ReviewsCorpusReader",
    "OpinionLexiconCorpusReader",
    "ProsConsCorpusReader",
    "CategorizedSentencesCorpusReader",
    "ComparativeSentencesCorpusReader",
    "PanLexLiteCorpusReader",
    "NonbreakingPrefixesCorpusReader",
    "UnicharsCorpusReader",
    "MWAPPDBCorpusReader",
    "PanlexSwadeshCorpusReader",
    "BCP47CorpusReader",
]

View File

@@ -0,0 +1,154 @@
# Natural Language Toolkit: Aligned Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# URL: <https://www.nltk.org/>
# Author: Steven Bird <stevenbird1@gmail.com>
# For license information, see LICENSE.TXT
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import (
StreamBackedCorpusView,
concat,
read_alignedsent_block,
)
from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer
from nltk.translate import AlignedSent, Alignment
class AlignedCorpusReader(CorpusReader):
    """
    Reader for corpora of word-aligned sentences.  Tokens are assumed
    to be separated by whitespace.  Sentences begin on separate lines.
    """

    def __init__(
        self,
        root,
        fileids,
        sep="/",
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        alignedsent_block_reader=read_alignedsent_block,
        encoding="latin1",
    ):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param sep: Token separator (stored for interface compatibility;
            not used by the default block reader).
        :param word_tokenizer: Tokenizer used to split sentence strings
            into word tokens.
        :param sent_tokenizer: Tokenizer used to split a block into
            sentence strings (by default, one sentence per line).
        :param alignedsent_block_reader: Function reading one
            aligned-sentence block from a stream.
        :param encoding: Default character encoding of the corpus files.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader

    def _view(self, fileids, aligned, group_by_sent):
        # Shared helper: words(), sents() and aligned_sents() differed only
        # in the two boolean flags passed to AlignedSentCorpusView, so the
        # view construction is factored out here.
        return concat(
            [
                AlignedSentCorpusView(
                    fileid,
                    enc,
                    aligned,
                    group_by_sent,
                    self._word_tokenizer,
                    self._sent_tokenizer,
                    self._alignedsent_block_reader,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._view(fileids, aligned=False, group_by_sent=False)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._view(fileids, aligned=False, group_by_sent=True)

    def aligned_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of AlignedSent objects.
        :rtype: list(AlignedSent)
        """
        return self._view(fileids, aligned=True, group_by_sent=True)
class AlignedSentCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for aligned sentences.
    ``AlignedSentCorpusView`` objects are typically created by
    ``AlignedCorpusReader`` (not directly by nltk users).
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        aligned,
        group_by_sent,
        word_tokenizer,
        sent_tokenizer,
        alignedsent_block_reader,
    ):
        # Remember how each block should be post-processed before
        # delegating stream handling to the base class.
        self._aligned = aligned
        self._group_by_sent = group_by_sent
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Read one aligned-sentence block and return it as a one-item list
        (or a plain token list when neither grouping nor alignment is on)."""
        tokenized = []
        for block_str in self._alignedsent_block_reader(stream):
            for sent_str in self._sent_tokenizer.tokenize(block_str):
                tokenized.append(self._word_tokenizer.tokenize(sent_str))
        if self._aligned:
            # The third line of the block is the alignment string; re-join
            # its tokens, since it should not have been word-tokenized in
            # the first place (kludge inherited from the original code).
            tokenized[2] = Alignment.fromstring(" ".join(tokenized[2]))
            return [AlignedSent(*tokenized)]
        if self._group_by_sent:
            return [tokenized[0]]
        return tokenized[0]

View File

@@ -0,0 +1,517 @@
# Natural Language Toolkit: API for Corpus Readers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
API for corpus readers.
"""
import os
import re
from collections import defaultdict
from itertools import chain
from nltk.corpus.reader.util import *
from nltk.data import FileSystemPathPointer, PathPointer, ZipFilePathPointer
class CorpusReader:
    """
    A base class for "corpus reader" classes, each of which can be
    used to read a specific corpus format.  Each individual corpus
    reader instance is used to read a specific corpus, consisting of
    one or more files under a common root directory.  Each file is
    identified by its ``file identifier``, which is the relative path
    to the file from the root directory.

    A separate subclass is defined for each corpus format.  These
    subclasses define one or more methods that provide 'views' on the
    corpus contents, such as ``words()`` (for a list of words) and
    ``parsed_sents()`` (for a list of parsed sentences).  Called with
    no arguments, these methods will return the contents of the entire
    corpus.  For most corpora, these methods define one or more
    selection arguments, such as ``fileids`` or ``categories``, which can
    be used to select which portion of the corpus should be returned.
    """

    def __init__(self, root, fileids, encoding="utf8", tagset=None):
        """
        :type root: PathPointer or str
        :param root: A path pointer identifying the root directory for
            this corpus.  If a string is specified, then it will be
            converted to a ``PathPointer`` automatically.
        :param fileids: A list of the files that make up this corpus.
            This list can either be specified explicitly, as a list of
            strings; or implicitly, as a regular expression over file
            paths.  The absolute path for each file will be constructed
            by joining the reader's root to each file name.
        :param encoding: The default unicode encoding for the files
            that make up the corpus.  The value of ``encoding`` can be any
            of the following:

            - A string: ``encoding`` is the encoding name for all files.
            - A dictionary: ``encoding[file_id]`` is the encoding
              name for the file whose identifier is ``file_id``.  If
              ``file_id`` is not in ``encoding``, then the file
              contents will be processed using non-unicode byte strings.
            - A list: ``encoding`` should be a list of ``(regexp, encoding)``
              tuples.  The encoding for a file whose identifier is ``file_id``
              will be the ``encoding`` value for the first tuple whose
              ``regexp`` matches the ``file_id``.  If no tuple's ``regexp``
              matches the ``file_id``, the file contents will be processed
              using non-unicode byte strings.
            - None: the file contents of all files will be
              processed using non-unicode byte strings.
        :param tagset: The name of the tagset used by this corpus, to be used
            for normalizing or converting the POS tags returned by the
            ``tagged_...()`` methods.
        """
        # Convert the root to a path pointer, if necessary.
        if isinstance(root, str) and not isinstance(root, PathPointer):
            # The trailing "|" adds an empty alternative, so this regex
            # matches *any* string and ``m`` is never None; ``zipfile`` is
            # only non-empty when the path contains a ".zip" component.
            m = re.match(r"(.*\.zip)/?(.*)$|", root)
            zipfile, zipentry = m.groups()
            if zipfile:
                root = ZipFilePathPointer(zipfile, zipentry)
            else:
                root = FileSystemPathPointer(root)
        elif not isinstance(root, PathPointer):
            raise TypeError("CorpusReader: expected a string or a PathPointer")
        # If `fileids` is a regexp, then expand it.
        if isinstance(fileids, str):
            fileids = find_corpus_fileids(root, fileids)
        self._fileids = fileids
        """A list of the relative paths for the fileids that make up
        this corpus."""
        self._root = root
        """The root directory for this corpus."""
        # Default names of the corpus metadata files, read on demand by
        # readme(), license() and citation() below.
        self._readme = "README"
        self._license = "LICENSE"
        self._citation = "citation.bib"
        # If encoding was specified as a list of regexps, then convert
        # it to a dictionary.  First matching regexp wins; fileids with
        # no match are simply absent from the dict (i.e. byte strings).
        if isinstance(encoding, list):
            encoding_dict = {}
            for fileid in self._fileids:
                for x in encoding:
                    (regexp, enc) = x
                    if re.match(regexp, fileid):
                        encoding_dict[fileid] = enc
                        break
            encoding = encoding_dict
        self._encoding = encoding
        """The default unicode encoding for the fileids that make up
           this corpus.  If ``encoding`` is None, then the file
           contents are processed using byte strings."""
        self._tagset = tagset

    def __repr__(self):
        # Show the zip entry for zip-backed corpora, otherwise the
        # filesystem path.
        if isinstance(self._root, ZipFilePathPointer):
            path = f"{self._root.zipfile.filename}/{self._root.entry}"
        else:
            path = "%s" % self._root.path
        return f"<{self.__class__.__name__} in {path!r}>"

    def ensure_loaded(self):
        """
        Load this corpus (if it has not already been loaded).  This is
        used by LazyCorpusLoader as a simple method that can be used to
        make sure a corpus is loaded -- e.g., in case a user wants to
        do help(some_corpus).
        """
        pass  # no need to actually do anything.

    def readme(self):
        """
        Return the contents of the corpus README file, if it exists.
        """
        with self.open(self._readme) as f:
            return f.read()

    def license(self):
        """
        Return the contents of the corpus LICENSE file, if it exists.
        """
        with self.open(self._license) as f:
            return f.read()

    def citation(self):
        """
        Return the contents of the corpus citation.bib file, if it exists.
        """
        with self.open(self._citation) as f:
            return f.read()

    def fileids(self):
        """
        Return a list of file identifiers for the fileids that make up
        this corpus.
        """
        return self._fileids

    def abspath(self, fileid):
        """
        Return the absolute path for the given file.

        :type fileid: str
        :param fileid: The file identifier for the file whose path
            should be returned.
        :rtype: PathPointer
        """
        return self._root.join(fileid)

    def abspaths(self, fileids=None, include_encoding=False, include_fileid=False):
        """
        Return a list of the absolute paths for all fileids in this corpus;
        or for the given list of fileids, if specified.

        :type fileids: None or str or list
        :param fileids: Specifies the set of fileids for which paths should
            be returned.  Can be None, for all fileids; a list of
            file identifiers, for a specified set of fileids; or a single
            file identifier, for a single file.  Note that the return
            value is always a list of paths, even if ``fileids`` is a
            single file identifier.
        :param include_encoding: If true, then return a list of
            ``(path_pointer, encoding)`` tuples.
        :param include_fileid: If true, also include the fileid in each
            returned tuple (after the encoding, when both flags are set).
        :rtype: list(PathPointer)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        paths = [self._root.join(f) for f in fileids]
        # Tuple layout depends on which optional fields were requested.
        if include_encoding and include_fileid:
            return list(zip(paths, [self.encoding(f) for f in fileids], fileids))
        elif include_fileid:
            return list(zip(paths, fileids))
        elif include_encoding:
            return list(zip(paths, [self.encoding(f) for f in fileids]))
        else:
            return paths

    def raw(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        contents = []
        for f in fileids:
            with self.open(f) as fp:
                contents.append(fp.read())
        # concat() joins the per-file contents into a single value.
        return concat(contents)

    def open(self, file):
        """
        Return an open stream that can be used to read the given file.
        If the file's encoding is not None, then the stream will
        automatically decode the file's contents into unicode.

        :param file: The file identifier of the file to read.
        """
        encoding = self.encoding(file)
        stream = self._root.join(file).open(encoding)
        return stream

    def encoding(self, file):
        """
        Return the unicode encoding for the given corpus file, if known.
        If the encoding is unknown, or if the given file should be
        processed using byte strings (str), then return None.
        """
        # A dict encoding is per-file (missing -> None); anything else
        # (a string or None) applies uniformly to every file.
        if isinstance(self._encoding, dict):
            return self._encoding.get(file)
        else:
            return self._encoding

    def _get_root(self):
        return self._root

    root = property(
        _get_root,
        doc="""
        The directory where this corpus is stored.

        :type: PathPointer""",
    )
######################################################################
# { Corpora containing categorized items
######################################################################
class CategorizedCorpusReader:
    """
    A mixin class used to aid in the implementation of corpus readers
    for categorized corpora.  This class defines the method
    ``categories()``, which returns a list of the categories for the
    corpus or for a specified set of fileids; and overrides ``fileids()``
    to take a ``categories`` argument, restricting the set of fileids to
    be returned.

    Subclasses are expected to:

    - Call ``__init__()`` to set up the mapping.
    - Override all view methods to accept a ``categories`` parameter,
      which can be used *instead* of the ``fileids`` parameter, to
      select which fileids should be included in the returned view.
    """

    def __init__(self, kwargs):
        """
        Initialize this mapping based on keyword arguments, as
        follows:

        - cat_pattern: A regular expression pattern used to find the
          category for each file identifier.  The pattern will be
          applied to each file identifier, and the first matching
          group will be used as the category label for that file.
        - cat_map: A dictionary, mapping from file identifiers to
          category labels.
        - cat_file: The name of a file that contains the mapping
          from file identifiers to categories.  The argument
          ``cat_delimiter`` can be used to specify a delimiter.

        The corresponding argument will be deleted from ``kwargs``.  If
        more than one argument is specified, an exception will be
        raised.
        """
        # The mapping is built lazily by _init(); until then, only the
        # configuration (pattern, map, or file) is stored.
        self._f2c = None  #: file-to-category mapping
        self._c2f = None  #: category-to-file mapping
        self._pattern = None  #: regexp specifying the mapping
        self._map = None  #: dict specifying the mapping
        self._file = None  #: fileid of file containing the mapping
        self._delimiter = None  #: delimiter for ``self._file``
        # Exactly one of cat_pattern / cat_map / cat_file must be given;
        # the chosen one is removed from kwargs so the remaining kwargs
        # can be forwarded to the concrete reader's constructor.
        if "cat_pattern" in kwargs:
            self._pattern = kwargs["cat_pattern"]
            del kwargs["cat_pattern"]
        elif "cat_map" in kwargs:
            self._map = kwargs["cat_map"]
            del kwargs["cat_map"]
        elif "cat_file" in kwargs:
            self._file = kwargs["cat_file"]
            del kwargs["cat_file"]
            if "cat_delimiter" in kwargs:
                self._delimiter = kwargs["cat_delimiter"]
                del kwargs["cat_delimiter"]
        else:
            raise ValueError(
                "Expected keyword argument cat_pattern or " "cat_map or cat_file."
            )
        # Since the selected argument was deleted above, any of these
        # still present means the caller supplied more than one.
        if "cat_pattern" in kwargs or "cat_map" in kwargs or "cat_file" in kwargs:
            raise ValueError(
                "Specify exactly one of: cat_pattern, " "cat_map, cat_file."
            )

    def _init(self):
        # Build the bidirectional file<->category maps from whichever
        # configuration was supplied to __init__().
        self._f2c = defaultdict(set)
        self._c2f = defaultdict(set)
        if self._pattern is not None:
            # Category = first group of the pattern matched on the fileid.
            for file_id in self._fileids:
                category = re.match(self._pattern, file_id).group(1)
                self._add(file_id, category)
        elif self._map is not None:
            for file_id, categories in self._map.items():
                for category in categories:
                    self._add(file_id, category)
        elif self._file is not None:
            # Mapping file format: <file_id><delim><cat>[<delim><cat>...]
            with self.open(self._file) as f:
                for line in f.readlines():
                    line = line.strip()
                    file_id, categories = line.split(self._delimiter, 1)
                    if file_id not in self.fileids():
                        raise ValueError(
                            "In category mapping file %s: %s "
                            "not found" % (self._file, file_id)
                        )
                    for category in categories.split(self._delimiter):
                        self._add(file_id, category)

    def _add(self, file_id, category):
        # Record one (file, category) association in both maps.
        self._f2c[file_id].add(category)
        self._c2f[category].add(file_id)

    def categories(self, fileids=None):
        """
        Return a list of the categories that are defined for this corpus,
        or for the file(s) if it is given.
        """
        if self._f2c is None:
            self._init()
        if fileids is None:
            return sorted(self._c2f)
        if isinstance(fileids, str):
            fileids = [fileids]
        return sorted(set.union(*(self._f2c[d] for d in fileids)))

    def fileids(self, categories=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that make up the given category(s) if specified.
        """
        if categories is None:
            return super().fileids()
        elif isinstance(categories, str):
            if self._f2c is None:
                self._init()
            if categories in self._c2f:
                return sorted(self._c2f[categories])
            else:
                raise ValueError("Category %s not found" % categories)
        else:
            if self._f2c is None:
                self._init()
            return sorted(set.union(*(self._c2f[c] for c in categories)))

    def _resolve(self, fileids, categories):
        # Translate a categories selection into a fileids selection;
        # the two arguments are mutually exclusive.
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")
        if categories is not None:
            return self.fileids(categories)
        else:
            return fileids

    # Category-aware wrappers around the concrete reader's view methods.
    def raw(self, fileids=None, categories=None):
        return super().raw(self._resolve(fileids, categories))

    def words(self, fileids=None, categories=None):
        return super().words(self._resolve(fileids, categories))

    def sents(self, fileids=None, categories=None):
        return super().sents(self._resolve(fileids, categories))

    def paras(self, fileids=None, categories=None):
        return super().paras(self._resolve(fileids, categories))
######################################################################
# { Treebank readers
######################################################################
# [xx] is it worth it to factor this out?
class SyntaxCorpusReader(CorpusReader):
    """
    An abstract base class for reading corpora consisting of
    syntactically parsed text.  Subclasses should define:

    - ``__init__``, which specifies the location of the corpus
      and a method for detecting the sentence blocks in corpus files.
    - ``_read_block``, which reads a block from the input stream.
    - ``_word``, which takes a block and returns a list of list of words.
    - ``_tag``, which takes a block and returns a list of list of tagged
      words.
    - ``_parse``, which takes a block and returns a list of parsed
      sentences.
    """

    # --- Hooks that concrete subclasses must implement -----------------
    def _parse(self, s):
        raise NotImplementedError()

    def _word(self, s):
        raise NotImplementedError()

    def _tag(self, s):
        raise NotImplementedError()

    def _read_block(self, stream):
        raise NotImplementedError()

    # --- Public views ---------------------------------------------------
    def parsed_sents(self, fileids=None):
        """Return the given file(s) as a list of parsed sentences."""
        reader = self._read_parsed_sent_block
        return concat(
            [
                StreamBackedCorpusView(fileid, reader, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )

    def tagged_sents(self, fileids=None, tagset=None):
        """Return the given file(s) as a list of tagged sentences."""
        # Close over ``tagset`` so the view's block reader takes only
        # the stream argument, as StreamBackedCorpusView requires.
        def reader(stream):
            return self._read_tagged_sent_block(stream, tagset)

        return concat(
            [
                StreamBackedCorpusView(fileid, reader, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )

    def sents(self, fileids=None):
        """Return the given file(s) as a list of sentences (word lists)."""
        reader = self._read_sent_block
        return concat(
            [
                StreamBackedCorpusView(fileid, reader, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )

    def tagged_words(self, fileids=None, tagset=None):
        """Return the given file(s) as a flat list of (word, tag) tuples."""
        def reader(stream):
            return self._read_tagged_word_block(stream, tagset)

        return concat(
            [
                StreamBackedCorpusView(fileid, reader, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """Return the given file(s) as a flat list of word strings."""
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_word_block, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )

    # ------------------------------------------------------------
    # { Block Readers
    # Each block reader is built on the sentence-level readers; the
    # word-level variants simply flatten one block of sentences.

    def _read_word_block(self, stream):
        return list(chain.from_iterable(self._read_sent_block(stream)))

    def _read_tagged_word_block(self, stream, tagset=None):
        return list(chain.from_iterable(self._read_tagged_sent_block(stream, tagset)))

    def _read_sent_block(self, stream):
        # filter(None, ...) drops sentences that _word() maps to an
        # empty/None result.
        return list(filter(None, [self._word(t) for t in self._read_block(stream)]))

    def _read_tagged_sent_block(self, stream, tagset=None):
        return list(
            filter(None, [self._tag(t, tagset) for t in self._read_block(stream)])
        )

    def _read_parsed_sent_block(self, stream):
        return list(filter(None, [self._parse(t) for t in self._read_block(stream)]))

    # } End of Block Readers
    # ------------------------------------------------------------

View File

@@ -0,0 +1,218 @@
# Natural Language Toolkit: BCP-47 language tags
#
# Copyright (C) 2022-2023 NLTK Project
# Author: Eric Kafe <kafe.eric@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re
from warnings import warn
from xml.etree import ElementTree as et
from nltk.corpus.reader import CorpusReader
class BCP47CorpusReader(CorpusReader):
    """
    Parse BCP-47 composite language tags

    Supports all the main subtags, and the 'u-sd' extension:

    >>> from nltk.corpus import bcp47
    >>> bcp47.name('oc-gascon-u-sd-fr64')
    'Occitan (post 1500): Gascon: Pyrénées-Atlantiques'

    Can load a conversion table to Wikidata Q-codes:

    >>> bcp47.load_wiki_q()
    >>> bcp47.wiki_q['en-GI-spanglis']
    'Q79388'
    """

    def __init__(self, root, fileids):
        """Read the BCP-47 database"""
        super().__init__(root, fileids)
        self.langcode = {}
        with self.open("iana/language-subtag-registry.txt") as fp:
            # Registry records are separated by "%%" lines.
            self.db = self.data_dict(fp.read().split("%%\n"))
        with self.open("cldr/common-subdivisions-en.xml") as fp:
            self.subdiv = self.subdiv_dict(
                et.parse(fp).iterfind("localeDisplayNames/subdivisions/subdivision")
            )
        self.morphology()

    def load_wiki_q(self):
        """Load conversion table to Wikidata Q-codes (only if needed)"""
        with self.open("cldr/tools-cldr-rdf-external-entityToCode.tsv") as fp:
            # Skip the TSV header line.
            self.wiki_q = self.wiki_dict(fp.read().strip().split("\n")[1:])

    def wiki_dict(self, lines):
        """Convert Wikidata list of Q-codes to a BCP-47 dictionary"""
        return {
            pair[1]: pair[0].split("/")[-1]
            for pair in [line.strip().split("\t") for line in lines]
        }

    def subdiv_dict(self, subdivs):
        """Convert the CLDR subdivisions list to a dictionary"""
        return {sub.attrib["type"]: sub.text for sub in subdivs}

    def morphology(self):
        """Set up the casing rules and format regexps for each subtag type."""
        self.casing = {
            "language": str.lower,
            "extlang": str.lower,
            "script": str.title,
            "region": str.upper,
            "variant": str.lower,
        }
        dig = "[0-9]"
        low = "[a-z]"
        up = "[A-Z]"
        alnum = "[a-zA-Z0-9]"
        # Patterns follow RFC 5646 subtag syntax (e.g. a region is two
        # letters or three digits).
        self.format = {
            "language": re.compile(f"{low*3}?"),
            "extlang": re.compile(f"{low*3}"),
            "script": re.compile(f"{up}{low*3}"),
            "region": re.compile(f"({up*2})|({dig*3})"),
            "variant": re.compile(f"{alnum*4}{(alnum+'?')*4}"),
            "singleton": re.compile(f"{low}"),
        }

    def data_dict(self, records):
        """Convert the BCP-47 language subtag registry to a dictionary"""
        # The first record holds only the registry's file date.
        self.version = records[0].replace("File-Date:", "").strip()
        dic = {}
        dic["deprecated"] = {}
        for label in [
            "language",
            "extlang",
            "script",
            "region",
            "variant",
            "redundant",
            "grandfathered",
        ]:
            dic["deprecated"][label] = {}
        for record in records[1:]:
            fields = [field.split(": ") for field in record.strip().split("\n")]
            typ = fields[0][1]  # value of the "Type" field
            tag = fields[1][1]  # value of the "Subtag"/"Tag" field
            if typ not in dic:
                dic[typ] = {}
            subfields = {}
            for field in fields[2:]:
                if len(field) == 2:
                    [key, val] = field
                    if key not in subfields:
                        subfields[key] = [val]
                    else:  # multiple value
                        subfields[key].append(val)
                else:  # multiline field: continuation of the previous value
                    subfields[key][-1] += " " + field[0].strip()
                if (
                    "Deprecated" not in record
                    and typ == "language"
                    and key == "Description"
                ):
                    # Reverse index: language name -> subtag code.
                    self.langcode[subfields[key][-1]] = tag
            for key in subfields:
                if len(subfields[key]) == 1:  # single value
                    subfields[key] = subfields[key][0]
            if "Deprecated" in record:
                dic["deprecated"][typ][tag] = subfields
            else:
                dic[typ][tag] = subfields
        return dic

    def val2str(self, val):
        """Return only first value"""
        if isinstance(val, list):  # was: type(val) == list
            # val = "/".join(val) # Concatenate all values
            val = val[0]
        return val

    def lang2str(self, lg_record):
        """Concatenate subtag values"""
        name = f"{lg_record['language']}"
        for label in ["extlang", "script", "region", "variant", "extension"]:
            if label in lg_record:
                name += f": {lg_record[label]}"
        return name

    def parse_tag(self, tag):
        """Convert a BCP-47 tag to a dictionary of labelled subtags"""
        subtags = tag.split("-")
        lang = {}
        # "variant" appears twice because a tag may carry two variants.
        labels = ["language", "extlang", "script", "region", "variant", "variant"]
        while subtags and labels:
            subtag = subtags.pop(0)
            found = False
            while labels:
                label = labels.pop(0)
                subtag = self.casing[label](subtag)
                if self.format[label].fullmatch(subtag):
                    if subtag in self.db[label]:
                        found = True
                        valstr = self.val2str(self.db[label][subtag]["Description"])
                        if label == "variant" and label in lang:
                            lang[label] += ": " + valstr
                        else:
                            lang[label] = valstr
                        break
                    elif subtag in self.db["deprecated"][label]:
                        found = True
                        note = f"The {subtag!r} {label} code is deprecated"
                        if "Preferred-Value" in self.db["deprecated"][label][subtag]:
                            prefer = self.db["deprecated"][label][subtag][
                                "Preferred-Value"
                            ]
                            # BUGFIX: quotes were garbled ("', prefer '...'");
                            # use !r formatting, consistent with name().
                            note += f", prefer {self.val2str(prefer)!r}"
                        lang[label] = self.val2str(
                            self.db["deprecated"][label][subtag]["Description"]
                        )
                        warn(note)
                        break
            if not found:
                # Guard against tags that end in "-u" or "-u-sd" with no
                # value, which previously raised IndexError.
                if (
                    subtag == "u" and len(subtags) >= 2 and subtags[0] == "sd"
                ):  # CLDR regional subdivisions
                    sd = subtags[1]
                    if sd in self.subdiv:
                        ext = self.subdiv[sd]
                    else:
                        # BUGFIX: was f"<Unknown subdivision: {ext}>", which
                        # read ``ext`` before assignment (UnboundLocalError).
                        ext = f"<Unknown subdivision: {sd}>"
                else:  # other extension subtags are not supported yet
                    ext = f"{subtag}{''.join(['-'+ext for ext in subtags])}".lower()
                    if not self.format["singleton"].fullmatch(subtag):
                        ext = f"<Invalid extension: {ext}>"
                    warn(ext)
                lang["extension"] = ext
                subtags = []
        return lang

    def name(self, tag):
        """
        Convert a BCP-47 tag to a colon-separated string of subtag names

        >>> from nltk.corpus import bcp47
        >>> bcp47.name('ca-Latn-ES-valencia')
        'Catalan: Latin: Spain: Valencian'
        """
        # Whole-tag lookups first: "redundant" and "grandfathered" tags
        # are registered as complete tags, not per-subtag.
        for label in ["redundant", "grandfathered"]:
            val = None
            if tag in self.db[label]:
                val = f"{self.db[label][tag]['Description']}"
                note = f"The {tag!r} code is {label}"
            elif tag in self.db["deprecated"][label]:
                val = f"{self.db['deprecated'][label][tag]['Description']}"
                note = f"The {tag!r} code is {label} and deprecated"
                if "Preferred-Value" in self.db["deprecated"][label][tag]:
                    prefer = self.db["deprecated"][label][tag]["Preferred-Value"]
                    note += f", prefer {self.val2str(prefer)!r}"
            if val:
                warn(note)
                return val
        try:
            return self.lang2str(self.parse_tag(tag))
        except Exception:  # was a bare except:; don't swallow SystemExit etc.
            warn(f"Tag {tag!r} was not recognized")
            return None

View File

@@ -0,0 +1,265 @@
# Natural Language Toolkit: BNC Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""Corpus reader for the XML version of the British National Corpus."""
from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader, XMLCorpusView
class BNCCorpusReader(XMLCorpusReader):
    r"""Corpus reader for the XML version of the British National Corpus.

    For access to the complete XML data structure, use the ``xml()``
    method.  For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.

    You can obtain the full version of the BNC corpus at
    https://www.ota.ox.ac.uk/desc/2554

    If you extracted the archive to a directory called `BNC`, then you can
    instantiate the reader as::

        BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
    """

    def __init__(self, root, fileids, lazy=True):
        # ``lazy`` selects between stream-backed views (BNCWordView) and
        # eagerly-built lists (_words) in _views() below.
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy

    def words(self, fileids=None, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)

        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        return self._views(fileids, False, None, strip_space, stem)

    def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        tag = "c5" if c5 else "pos"
        return self._views(fileids, False, tag, strip_space, stem)

    def sents(self, fileids=None, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))

        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        return self._views(fileids, True, None, strip_space, stem)

    def tagged_sents(self, fileids=None, c5=False, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        tag = "c5" if c5 else "pos"
        return self._views(
            fileids, sent=True, tag=tag, strip_space=strip_space, stem=stem
        )

    def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False):
        """A helper function that instantiates BNCWordViews or the list of words/sentences."""
        # NOTE: callers pass tag as None, "c5" or "pos"; the ``False``
        # default is a historical quirk and behaves like None downstream.
        f = BNCWordView if self._lazy else self._words
        return concat(
            [
                f(fileid, sent, tag, strip_space, stem)
                for fileid in self.abspaths(fileids)
            ]
        )

    def _words(self, fileid, bracket_sent, tag, strip_space, stem):
        """
        Helper used to implement the view methods -- returns a list of
        words or a list of sentences, optionally tagged.

        :param fileid: The name of the underlying file.
        :param bracket_sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        result = []
        xmldoc = ElementTree.parse(fileid).getroot()
        for xmlsent in xmldoc.findall(".//s"):
            sent = []
            for xmlword in _all_xmlwords_in(xmlsent):
                word = xmlword.text
                if not word:
                    word = ""  # fixes issue 337?
                if strip_space or stem:
                    word = word.strip()
                if stem:
                    # Prefer the headword ("hw") attribute when stemming.
                    word = xmlword.get("hw", word)
                if tag == "c5":
                    word = (word, xmlword.get("c5"))
                elif tag == "pos":
                    # Simplified tag, falling back to the c5 tag if absent.
                    word = (word, xmlword.get("pos", xmlword.get("c5")))
                sent.append(word)
            if bracket_sent:
                # Wrap in BNCSentence to preserve the sentence number.
                result.append(BNCSentence(xmlsent.attrib["n"], sent))
            else:
                result.extend(sent)
        assert None not in result
        return result
def _all_xmlwords_in(elt, result=None):
if result is None:
result = []
for child in elt:
if child.tag in ("c", "w"):
result.append(child)
else:
_all_xmlwords_in(child, result)
return result
class BNCSentence(list):
    """
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (the ``n`` attribute from the XML).
    """

    def __init__(self, num, items):
        super().__init__(items)
        self.num = num
class BNCWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the BNC corpus.
    """

    # Non-token elements that may appear inside a sentence and are skipped
    # by handle_sent().
    tags_to_ignore = {
        "pb",
        "gap",
        "vocal",
        "event",
        "unclear",
        "shift",
        "pause",
        "align",
    }
    """These tags are ignored. For their description refer to the
    technical documentation, for example,
    http://www.natcorp.ox.ac.uk/docs/URG/ref-vocal.html
    """

    def __init__(self, fileid, sent, tag, strip_space, stem):
        """
        :param fileid: The name of the underlying file.
        :param sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        # Stream whole <s> elements when sentence bracketing is requested,
        # otherwise stream the individual <c>/<w> token elements.
        if sent:
            tagspec = ".*/s"
        else:
            tagspec = ".*/s/(.*/)?(c|w)"
        self._sent = sent
        self._tag = tag
        self._strip_space = strip_space
        self._stem = stem
        self.title = None  #: Title of the document.
        self.author = None  #: Author of the document.
        self.editor = None  #: Editor
        self.resps = None  #: Statement of responsibility
        XMLCorpusView.__init__(self, fileid, tagspec)
        # Read in a tasty header: open the stream once just to pull the
        # teiHeader metadata, then close it again.
        self._open()
        self.read_block(self._stream, ".*/teiHeader$", self.handle_header)
        self.close()
        # Reset tag context so the header read above does not affect
        # subsequent block reads.
        self._tag_context = {0: ()}

    def handle_header(self, elt, context):
        # Set up some metadata! Multiple entries of a kind are joined
        # with newlines.
        titles = elt.findall("titleStmt/title")
        if titles:
            self.title = "\n".join(title.text.strip() for title in titles)
        authors = elt.findall("titleStmt/author")
        if authors:
            self.author = "\n".join(author.text.strip() for author in authors)
        editors = elt.findall("titleStmt/editor")
        if editors:
            self.editor = "\n".join(editor.text.strip() for editor in editors)
        resps = elt.findall("titleStmt/respStmt")
        if resps:
            self.resps = "\n\n".join(
                "\n".join(resp_elt.text.strip() for resp_elt in resp) for resp in resps
            )

    def handle_elt(self, elt, context):
        # Dispatch on the mode chosen at construction time.
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        """Convert one <w>/<c> element to a word string or (word, tag) pair,
        mirroring the eager path in the reader's ``_words`` helper."""
        word = elt.text
        if not word:
            word = ""  # fixes issue 337?
        if self._strip_space or self._stem:
            word = word.strip()
        if self._stem:
            # "hw" carries the headword (stem); fall back to the surface form.
            word = elt.get("hw", word)
        if self._tag == "c5":
            word = (word, elt.get("c5"))
        elif self._tag == "pos":
            # Simplified tag, falling back to the c5 tag when absent.
            word = (word, elt.get("pos", elt.get("c5")))
        return word

    def handle_sent(self, elt):
        """Convert one <s> element to a BNCSentence, flattening wrapper
        elements and skipping the non-token tags in ``tags_to_ignore``."""
        sent = []
        for child in elt:
            if child.tag in ("mw", "hi", "corr", "trunc"):
                # Wrapper elements: their children are the actual tokens.
                sent += [self.handle_word(w) for w in child]
            elif child.tag in ("w", "c"):
                sent.append(self.handle_word(child))
            elif child.tag not in self.tags_to_ignore:
                raise ValueError("Unexpected element %s" % child.tag)
        # "n" is the sentence-identifier attribute in the BNC XML.
        return BNCSentence(elt.attrib["n"], sent)

View File

@@ -0,0 +1,237 @@
# Natural Language Toolkit: Penn Treebank Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for corpora that consist of parenthesis-delineated parse trees.
"""
import sys
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tag import map_tag
from nltk.tree import Tree
# we use [^\s()]+ instead of \S+? to avoid matching ()
# "(index pos word)" leaves as emitted by AlpinoCorpusReader._normalize with
# ordered=True; groups: (begin-position, POS tag, word).
SORTTAGWRD = re.compile(r"\((\d+) ([^\s()]+) ([^\s()]+)\)")
# "(tag word)" leaves; groups: (POS tag, word).
TAGWORD = re.compile(r"\(([^\s()]+) ([^\s()]+)\)")
# "(tag word)" leaves, capturing only the word.
WORD = re.compile(r"\([^\s()]+ ([^\s()]+)\)")
# Leading "( (" -- a tree wrapped in an extra empty top node.
# NOTE(review): not referenced in the visible code; verify callers elsewhere.
EMPTY_BRACKETS = re.compile(r"\s*\(\s*\(")
class BracketParseCorpusReader(SyntaxCorpusReader):
    """
    Reader for corpora that consist of parenthesis-delineated parse trees,
    like those found in the "combined" section of the Penn Treebank,
    e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))".
    """

    def __init__(
        self,
        root,
        fileids,
        comment_char=None,
        detect_blocks="unindented_paren",
        encoding="utf8",
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param comment_char: The character which can appear at the start of
            a line to indicate that the rest of the line is a comment.
        :param detect_blocks: The method that is used to find blocks
            in the corpus; can be 'unindented_paren' (every unindented
            parenthesis starts a new parse) or 'sexpr' (brackets are
            matched).
        :param tagset: The name of the tagset used by this corpus, to be used
            for normalizing or converting the POS tags returned by the
            ``tagged_...()`` methods.
        """
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)
        self._comment_char = comment_char
        self._detect_blocks = detect_blocks
        self._tagset = tagset

    def _read_block(self, stream):
        """Read one block of raw tree strings from *stream*, using the
        block-detection strategy chosen at construction time."""
        if self._detect_blocks == "sexpr":
            return read_sexpr_block(stream, comment_char=self._comment_char)
        elif self._detect_blocks == "blankline":
            return read_blankline_block(stream)
        elif self._detect_blocks == "unindented_paren":
            # Tokens start with unindented left parens.
            toks = read_regexp_block(stream, start_re=r"^\(")
            # Strip any comments out of the tokens.
            if self._comment_char:
                toks = [
                    re.sub("(?m)^%s.*" % re.escape(self._comment_char), "", tok)
                    for tok in toks
                ]
            return toks
        else:
            # An explicit error instead of ``assert 0``: assertions are
            # stripped under ``python -O`` and would silently return None.
            raise ValueError("bad block type %r" % (self._detect_blocks,))

    def _normalize(self, t):
        """Canonicalize a raw tree string before it is parsed."""
        # Replace leaves of the form (!), (,), with (! !), (, ,)
        t = re.sub(r"\((.)\)", r"(\1 \1)", t)
        # Replace leaves of the form (tag word root) with (tag word)
        t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
        return t

    def _parse(self, t):
        """Parse one tree string into a ``Tree``, attempting simple
        recovery (closing unbalanced parens, then a flat parse) when the
        string is malformed."""
        try:
            tree = Tree.fromstring(self._normalize(t))
            # If there's an empty node at the top, strip it off
            if tree.label() == "" and len(tree) == 1:
                return tree[0]
            else:
                return tree

        except ValueError as e:
            sys.stderr.write("Bad tree detected; trying to recover...\n")
            # Try to recover, if we can:
            if e.args == ("mismatched parens",):
                for n in range(1, 5):
                    try:
                        # BUGFIX: must be Tree.fromstring, not Tree(...).
                        # Tree's constructor takes (label, children); passing
                        # a single string raised TypeError, which escaped the
                        # ValueError handler below and broke recovery.
                        v = Tree.fromstring(self._normalize(t + ")" * n))
                        sys.stderr.write(
                            "  Recovered by adding %d close " "paren(s)\n" % n
                        )
                        return v
                    except ValueError:
                        pass
            # Try something else:
            sys.stderr.write("  Recovered by returning a flat parse.\n")
            # sys.stderr.write(' '.join(t.split())+'\n')
            return Tree("S", self._tag(t))

    def _tag(self, t, tagset=None):
        """Return the (word, tag) pairs of one tree string, optionally
        converted to *tagset* via ``map_tag``."""
        tagged_sent = [(w, p) for (p, w) in TAGWORD.findall(self._normalize(t))]
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (w, p) in tagged_sent
            ]
        return tagged_sent

    def _word(self, t):
        """Return the word tokens of one tree string."""
        return WORD.findall(self._normalize(t))
class CategorizedBracketParseCorpusReader(
    CategorizedCorpusReader, BracketParseCorpusReader
):
    """
    A reader for parsed corpora whose documents are
    divided into categories based on their file identifiers.

    :author: Nathan Schneider <nschneid@cs.cmu.edu>
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to the
        ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``BracketParseCorpusReader`` constructor.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        BracketParseCorpusReader.__init__(self, *args, **kwargs)

    def tagged_words(self, fileids=None, categories=None, tagset=None):
        resolved = self._resolve(fileids, categories)
        return super().tagged_words(resolved, tagset)

    def tagged_sents(self, fileids=None, categories=None, tagset=None):
        resolved = self._resolve(fileids, categories)
        return super().tagged_sents(resolved, tagset)

    def tagged_paras(self, fileids=None, categories=None, tagset=None):
        resolved = self._resolve(fileids, categories)
        return super().tagged_paras(resolved, tagset)

    def parsed_words(self, fileids=None, categories=None):
        resolved = self._resolve(fileids, categories)
        return super().parsed_words(resolved)

    def parsed_sents(self, fileids=None, categories=None):
        resolved = self._resolve(fileids, categories)
        return super().parsed_sents(resolved)

    def parsed_paras(self, fileids=None, categories=None):
        resolved = self._resolve(fileids, categories)
        return super().parsed_paras(resolved)
class AlpinoCorpusReader(BracketParseCorpusReader):
    """
    Reader for the Alpino Dutch Treebank.

    This corpus has a lexical breakdown structure embedded, as read by
    ``_parse``.  Unfortunately this puts punctuation and some other words
    out of the sentence order in the xml element tree.  This is no good
    for ``tag_`` and ``word_``, so ``_tag`` and ``_word`` are overridden
    to use a non-default new parameter 'ordered' of the overridden
    ``_normalize`` function.  The ``_parse`` function can then remain
    untouched.
    """

    def __init__(self, root, encoding="ISO-8859-1", tagset=None):
        """
        :param root: The root directory for this corpus.
        :param encoding: The encoding used by the corpus file(s).
        :param tagset: The name of the tagset used by this corpus.
        """
        BracketParseCorpusReader.__init__(
            self,
            root,
            r"alpino\.xml",
            detect_blocks="blankline",
            encoding=encoding,
            tagset=tagset,
        )

    def _normalize(self, t, ordered=False):
        """Normalize the xml sentence element in t.

        The sentence elements <alpino_ds>, although embedded in a few
        overall xml elements, are separated by blank lines.  That's how
        the reader can deliver them one at a time.
        Each sentence has a few category subnodes that are of no use to us.
        The remaining word nodes may or may not appear in the proper order.
        Each word node has attributes, among which:

        - begin : the position of the word in the sentence
        - pos : Part of Speech: the Tag
        - word : the actual word

        The return value is a string with all xml elements replaced by
        clauses: either a cat clause with nested clauses, or a word clause.
        The order of the bracket clauses closely follows the xml.
        If ordered == True, the word clauses include an order sequence number.
        If ordered == False, the word clauses only have pos and word parts.
        """
        if t[:10] != "<alpino_ds":
            return ""
        # convert XML to sexpr notation
        t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t)
        if ordered:
            # BUGFIX: the pattern previously read "<node. *?begin=", which
            # only matched when "begin" was the very first attribute after
            # "<node"; "<node .*?begin=" finds it wherever it occurs,
            # mirroring the unordered branch below.
            t = re.sub(
                r' <node .*?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>',
                r"(\1 \2 \3)",
                t,
            )
        else:
            t = re.sub(r' <node .*?pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2)", t)
        t = re.sub(r" </node>", r")", t)
        t = re.sub(r"<sentence>.*</sentence>", r"", t)
        t = re.sub(r"</?alpino_ds.*>", r"", t)
        return t

    def _tag(self, t, tagset=None):
        """Return (word, tag) pairs in true sentence order, by sorting the
        leaves on their embedded begin-position."""
        tagged_sent = [
            (int(o), w, p)
            for (o, p, w) in SORTTAGWRD.findall(self._normalize(t, ordered=True))
        ]
        tagged_sent.sort()
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (o, w, p) in tagged_sent
            ]
        else:
            tagged_sent = [(w, p) for (o, w, p) in tagged_sent]
        return tagged_sent

    def _word(self, t):
        """Return a correctly ordered list of words."""
        tagged_sent = self._tag(t)
        return [w for (w, p) in tagged_sent]

View File

@@ -0,0 +1,168 @@
# Natural Language Toolkit: Categorized Sentences Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader structured for corpora that contain one instance on each row.
This CorpusReader is specifically used for the Subjectivity Dataset and the
Sentence Polarity Dataset.
- Subjectivity Dataset information -
Authors: Bo Pang and Lillian Lee.
Url: https://www.cs.cornell.edu/people/pabo/movie-review-data
Distributed with permission.
Related papers:
- Bo Pang and Lillian Lee. "A Sentimental Education: Sentiment Analysis Using
Subjectivity Summarization Based on Minimum Cuts". Proceedings of the ACL,
2004.
- Sentence Polarity Dataset information -
Authors: Bo Pang and Lillian Lee.
Url: https://www.cs.cornell.edu/people/pabo/movie-review-data
Related papers:
- Bo Pang and Lillian Lee. "Seeing stars: Exploiting class relationships for
sentiment categorization with respect to rating scales". Proceedings of the
ACL, 2005.
"""
from nltk.corpus.reader.api import *
from nltk.tokenize import *
class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    A reader for corpora in which each row represents a single instance, mainly
    a sentence. Instances are divided into categories based on their file
    identifiers (see CategorizedCorpusReader).
    Since many corpora allow rows that contain more than one sentence, it is
    possible to specify a sentence tokenizer to retrieve all sentences instead
    of all rows.

    Examples using the Subjectivity Dataset:

    >>> from nltk.corpus import subjectivity
    >>> subjectivity.sents()[23] # doctest: +NORMALIZE_WHITESPACE
    ['television', 'made', 'him', 'famous', ',', 'but', 'his', 'biggest', 'hits',
    'happened', 'off', 'screen', '.']
    >>> subjectivity.categories()
    ['obj', 'subj']
    >>> subjectivity.words(categories='subj')
    ['smart', 'and', 'alert', ',', 'thirteen', ...]

    Examples using the Sentence Polarity Dataset:

    >>> from nltk.corpus import sentence_polarity
    >>> sentence_polarity.sents() # doctest: +NORMALIZE_WHITESPACE
    [['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish',
    'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find',
    'it', 'funny', '.'], ...]
    >>> sentence_polarity.categories()
    ['neg', 'pos']
    """

    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=None,
        encoding="utf8",
        **kwargs
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param sent_tokenizer: a tokenizer for breaking paragraphs into sentences.
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
        # NOTE: the default word_tokenizer instance is shared across readers;
        # tokenizers are stateless, so sharing it is safe.
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer

    def _resolve_fileids(self, fileids, categories):
        """Map a (fileids, categories) pair to a concrete list of fileids.
        (Factors out resolution logic previously duplicated in ``sents``
        and ``words``.)"""
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            return self._fileids
        if isinstance(fileids, str):
            return [fileids]
        return fileids

    def sents(self, fileids=None, categories=None):
        """
        Return all sentences in the corpus or in the specified file(s).

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences have
            to be returned.
        :return: the given file(s) as a list of sentences.
            Each sentence is tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        """
        fileids = self._resolve_fileids(fileids, categories)
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None, categories=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        file(s).

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have to
            be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        fileids = self._resolve_fileids(fileids, categories)
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_sent_block(self, stream):
        """Read up to 20 lines from *stream* and tokenize them into
        sentences (one per row, or several when a sent_tokenizer is set)."""
        sents = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # BUGFIX: at EOF readline() returns "" forever, so the old
                # ``continue`` merely spun through the remaining iterations;
                # ``break`` is behaviorally identical and stops immediately.
                break
            if self._sent_tokenizer:
                sents.extend(
                    [
                        self._word_tokenizer.tokenize(sent)
                        for sent in self._sent_tokenizer.tokenize(line)
                    ]
                )
            else:
                sents.append(self._word_tokenizer.tokenize(line))
        return sents

    def _read_word_block(self, stream):
        """Read one sentence block and flatten it into a list of words."""
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words

View File

@@ -0,0 +1,154 @@
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Masato Hagiwara <hagisan@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import sys
from nltk.corpus.reader import util
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
class ChasenCorpusReader(CorpusReader):
    """Corpus reader for ChaSen-format files (one tab-separated
    morphological analysis per line, sentences terminated by ``EOS``),
    e.g. the JEITA corpus."""

    def __init__(self, root, fileids, encoding="utf8", sent_splitter=None):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param encoding: The encoding used by the corpus file(s).
        :param sent_splitter: an optional predicate on ``(word, tags)``
            tuples marking additional sentence boundaries.
        """
        self._sent_splitter = sent_splitter
        CorpusReader.__init__(self, root, fileids, encoding)

    def _views(self, fileids, tagged, group_by_sent, group_by_para):
        """Build one ChasenCorpusView per file and concatenate them.
        (Factors out the construction previously copy-pasted across the
        six public accessors below.)"""
        return concat(
            [
                ChasenCorpusView(
                    fileid,
                    enc,
                    tagged,
                    group_by_sent,
                    group_by_para,
                    self._sent_splitter,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        return self._views(fileids, False, False, False)

    def tagged_words(self, fileids=None):
        return self._views(fileids, True, False, False)

    def sents(self, fileids=None):
        return self._views(fileids, False, True, False)

    def tagged_sents(self, fileids=None):
        return self._views(fileids, True, True, False)

    def paras(self, fileids=None):
        return self._views(fileids, False, True, True)

    def tagged_paras(self, fileids=None):
        return self._views(fileids, True, True, True)
class ChasenCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for ChasenReader. Similar to ``TaggedCorpusView``,
    but this'll use fixed sets of word and sentence tokenizer.
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        sent_splitter=None,
    ):
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._sent_splitter = sent_splitter
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def _flush_sent(self, sent, para):
        """Append a finished sentence to *para*, honouring the tagged /
        group_by_sent settings.  (Factors out logic that was previously
        duplicated twice inside ``read_block``.)"""
        if not self._tagged:
            sent = [w for (w, t) in sent]
        if self._group_by_sent:
            para.append(sent)
        else:
            para.extend(sent)

    def read_block(self, stream):
        """Reads one paragraph at a time."""
        block = []
        for para_str in read_regexp_block(stream, r".", r"^EOS\n"):
            para = []
            sent = []
            for line in para_str.splitlines():
                _eos = line.strip() == "EOS"
                _cells = line.split("\t")
                # (surface form, remaining tab-joined analysis fields)
                w = (_cells[0], "\t".join(_cells[1:]))
                if not _eos:
                    sent.append(w)
                # Sentence ends at EOS, or wherever the optional splitter
                # predicate fires.
                if _eos or (self._sent_splitter and self._sent_splitter(w)):
                    self._flush_sent(sent, para)
                    sent = []
            # Flush a trailing sentence with no terminator.
            if len(sent) > 0:
                self._flush_sent(sent, para)
            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)
        return block
def demo():
    """Print a sample of words and tagged sentences from the JEITA corpus."""
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
    print("/".join(jeita.words()[22100:22140]))

    formatted_sents = (
        "\n".join("{}/{}".format(w[0], w[1].split("\t")[2]) for w in sent)
        for sent in jeita.tagged_sents()[2170:2173]
    )
    print("\nEOS\n".join(formatted_sents))
def test():
    """Smoke-test that tagged words decode their tag field to ``str``."""
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
    first = jeita.tagged_words()[0]
    assert isinstance(first[1], str)
if __name__ == "__main__":
    # Run the JEITA demo, then the regression smoke test, when executed
    # as a script.
    demo()
    test()

View File

@@ -0,0 +1,630 @@
# CHILDES XML Corpus Reader
# Copyright (C) 2001-2025 NLTK Project
# Author: Tomonori Nagano <tnagano@gc.cuny.edu>
# Alexis Dimitriadis <A.Dimitriadis@uu.nl>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for the XML version of the CHILDES corpus.
"""
__docformat__ = "epytext en"
import re
from collections import defaultdict
from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader
from nltk.util import LazyConcatenation, LazyMap, flatten
# to resolve the namespace issue
# XML namespace of all TalkBank/CHILDES elements; the findall() queries below
# embed it as "{%s}" % NS (ElementTree's fully-qualified tag syntax).
NS = "http://www.talkbank.org/ns/talkbank"
class CHILDESCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the XML version of the CHILDES corpus.

    The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML
    version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``.
    Copy the needed parts of the CHILDES XML corpus into the NLTK data directory
    (``nltk_data/corpora/CHILDES/``).

    For access to the file text use the usual nltk functions,
    ``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``.
    """

    def __init__(self, root, fileids, lazy=True):
        """
        :param root: The root directory of the corpus.
        :param fileids: A list or regexp specifying the fileids in the corpus.
        :param lazy: If true (default), the accessor methods return lazy
            sequences; otherwise they materialize full lists.
        """
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy
def words(
self,
fileids=None,
speaker="ALL",
stem=False,
relation=False,
strip_space=True,
replace=False,
):
"""
:return: the given file(s) as a list of words
:rtype: list(str)
:param speaker: If specified, select specific speaker(s) defined
in the corpus. Default is 'ALL' (all participants). Common choices
are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
researchers)
:param stem: If true, then use word stems instead of word strings.
:param relation: If true, then return tuples of (stem, index,
dependent_index)
:param strip_space: If true, then strip trailing spaces from word
tokens. Otherwise, leave the spaces on the tokens.
:param replace: If true, then use the replaced (intended) word instead
of the original word (e.g., 'wat' will be replaced with 'watch')
"""
sent = None
pos = False
if not self._lazy:
return [
self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
for fileid in self.abspaths(fileids)
]
get_words = lambda fileid: self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
def tagged_words(
self,
fileids=None,
speaker="ALL",
stem=False,
relation=False,
strip_space=True,
replace=False,
):
"""
:return: the given file(s) as a list of tagged
words and punctuation symbols, encoded as tuples
``(word,tag)``.
:rtype: list(tuple(str,str))
:param speaker: If specified, select specific speaker(s) defined
in the corpus. Default is 'ALL' (all participants). Common choices
are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
researchers)
:param stem: If true, then use word stems instead of word strings.
:param relation: If true, then return tuples of (stem, index,
dependent_index)
:param strip_space: If true, then strip trailing spaces from word
tokens. Otherwise, leave the spaces on the tokens.
:param replace: If true, then use the replaced (intended) word instead
of the original word (e.g., 'wat' will be replaced with 'watch')
"""
sent = None
pos = True
if not self._lazy:
return [
self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
for fileid in self.abspaths(fileids)
]
get_words = lambda fileid: self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
def sents(
self,
fileids=None,
speaker="ALL",
stem=False,
relation=None,
strip_space=True,
replace=False,
):
"""
:return: the given file(s) as a list of sentences or utterances, each
encoded as a list of word strings.
:rtype: list(list(str))
:param speaker: If specified, select specific speaker(s) defined
in the corpus. Default is 'ALL' (all participants). Common choices
are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
researchers)
:param stem: If true, then use word stems instead of word strings.
:param relation: If true, then return tuples of ``(str,pos,relation_list)``.
If there is manually-annotated relation info, it will return
tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
:param strip_space: If true, then strip trailing spaces from word
tokens. Otherwise, leave the spaces on the tokens.
:param replace: If true, then use the replaced (intended) word instead
of the original word (e.g., 'wat' will be replaced with 'watch')
"""
sent = True
pos = False
if not self._lazy:
return [
self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
for fileid in self.abspaths(fileids)
]
get_words = lambda fileid: self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
def tagged_sents(
self,
fileids=None,
speaker="ALL",
stem=False,
relation=None,
strip_space=True,
replace=False,
):
"""
:return: the given file(s) as a list of
sentences, each encoded as a list of ``(word,tag)`` tuples.
:rtype: list(list(tuple(str,str)))
:param speaker: If specified, select specific speaker(s) defined
in the corpus. Default is 'ALL' (all participants). Common choices
are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
researchers)
:param stem: If true, then use word stems instead of word strings.
:param relation: If true, then return tuples of ``(str,pos,relation_list)``.
If there is manually-annotated relation info, it will return
tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
:param strip_space: If true, then strip trailing spaces from word
tokens. Otherwise, leave the spaces on the tokens.
:param replace: If true, then use the replaced (intended) word instead
of the original word (e.g., 'wat' will be replaced with 'watch')
"""
sent = True
pos = True
if not self._lazy:
return [
self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
for fileid in self.abspaths(fileids)
]
get_words = lambda fileid: self._get_words(
fileid, speaker, sent, stem, relation, pos, strip_space, replace
)
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
def corpus(self, fileids=None):
"""
:return: the given file(s) as a dict of ``(corpus_property_key, value)``
:rtype: list(dict)
"""
if not self._lazy:
return [self._get_corpus(fileid) for fileid in self.abspaths(fileids)]
return LazyMap(self._get_corpus, self.abspaths(fileids))
def _get_corpus(self, fileid):
results = dict()
xmldoc = ElementTree.parse(fileid).getroot()
for key, value in xmldoc.items():
results[key] = value
return results
def participants(self, fileids=None):
"""
:return: the given file(s) as a dict of
``(participant_property_key, value)``
:rtype: list(dict)
"""
if not self._lazy:
return [self._get_participants(fileid) for fileid in self.abspaths(fileids)]
return LazyMap(self._get_participants, self.abspaths(fileids))
def _get_participants(self, fileid):
# multidimensional dicts
def dictOfDicts():
return defaultdict(dictOfDicts)
xmldoc = ElementTree.parse(fileid).getroot()
# getting participants' data
pat = dictOfDicts()
for participant in xmldoc.findall(
f".//{{{NS}}}Participants/{{{NS}}}participant"
):
for key, value in participant.items():
pat[participant.get("id")][key] = value
return pat
def age(self, fileids=None, speaker="CHI", month=False):
"""
:return: the given file(s) as string or int
:rtype: list or int
:param month: If true, return months instead of year-month-date
"""
if not self._lazy:
return [
self._get_age(fileid, speaker, month)
for fileid in self.abspaths(fileids)
]
get_age = lambda fileid: self._get_age(fileid, speaker, month)
return LazyMap(get_age, self.abspaths(fileids))
    def _get_age(self, fileid, speaker, month):
        """Extract *speaker*'s recorded age from one file's participant
        metadata, optionally converted to months.  Returns None (implicitly)
        when the speaker is not found."""
        xmldoc = ElementTree.parse(fileid).getroot()
        for pat in xmldoc.findall(f".//{{{NS}}}Participants/{{{NS}}}participant"):
            try:
                if pat.get("id") == speaker:
                    age = pat.get("age")
                    if month:
                        age = self.convert_age(age)
                    return age
            # some files don't have age data
            # NOTE(review): an error here aborts the whole scan and returns
            # None immediately, rather than trying the next participant.
            except (TypeError, AttributeError) as e:
                return None
    def convert_age(self, age_year):
        """Calculate age in months from a CHILDES duration string, e.g.
        ``"P2Y6M14D"`` -> 30 (rounded up to the next month when more than
        15 days are recorded)."""
        m = re.match(r"P(\d+)Y(\d+)M?(\d?\d?)D?", age_year)
        age_month = int(m.group(1)) * 12 + int(m.group(2))
        try:
            # Round up when the day component exceeds half a month.
            if int(m.group(3)) > 15:
                age_month += 1
        # some corpora don't have age information?
        # (group(3) may be the empty string, making int() raise ValueError)
        except ValueError as e:
            pass
        return age_month
def MLU(self, fileids=None, speaker="CHI"):
"""
:return: the given file(s) as a floating number
:rtype: list(float)
"""
if not self._lazy:
return [
self._getMLU(fileid, speaker=speaker)
for fileid in self.abspaths(fileids)
]
get_MLU = lambda fileid: self._getMLU(fileid, speaker=speaker)
return LazyMap(get_MLU, self.abspaths(fileids))
    def _getMLU(self, fileid, speaker):
        """Compute the mean length of utterance (in morphemes) for *speaker*
        in one file, skipping unintelligible, empty and repeated sentences,
        and discounting filler tokens."""
        sents = self._get_words(
            fileid,
            speaker=speaker,
            sent=True,
            stem=True,
            relation=False,
            pos=True,
            strip_space=True,
            replace=True,
        )
        results = []
        lastSent = []
        numFillers = 0
        sentDiscount = 0
        for sent in sents:
            posList = [pos for (word, pos) in sent]
            # if any part of the sentence is unintelligible (tagged "unk"),
            # skip the whole sentence
            if any(pos == "unk" for pos in posList):
                continue
            # if the sentence is null
            elif sent == []:
                continue
            # if the sentence is the same as the last sent
            elif sent == lastSent:
                continue
            else:
                results.append([word for (word, pos) in sent])
                # count number of fillers ("co" tags and untagged tokens);
                # a sentence containing any is discounted from the total.
                if len({"co", None}.intersection(posList)) > 0:
                    numFillers += posList.count("co")
                    numFillers += posList.count(None)
                    sentDiscount += 1
            lastSent = sent
        try:
            thisWordList = flatten(results)
            # count number of morphemes
            # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
            numWords = (
                len(flatten([word.split("-") for word in thisWordList])) - numFillers
            )
            numSents = len(results) - sentDiscount
            mlu = numWords / numSents
        except ZeroDivisionError:
            mlu = 0
        # return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents}
        return mlu
    def _get_words(
        self, fileid, speaker, sent, stem, relation, pos, strip_space, replace
    ):
        """Core extraction helper behind words/sents/tagged_words/tagged_sents.

        Walks every utterance (<u>) element of one file, converting each word
        (<w>) element according to the flags:

        :param speaker: speaker id(s) to keep, or "ALL".
        :param sent: if truthy, group words per utterance.
        :param stem: use the morphological stem (<stem>) plus any
            inflection/suffix markers instead of the surface form.
        :param relation: additionally attach grammatical-relation info
            (index|head|relation strings from <gra> elements).
        :param pos: attach the part-of-speech tag, producing (word, tag).
        :param strip_space: strip surrounding whitespace from tokens.
        :param replace: substitute the replacement (intended) word if present.
        """
        if (
            isinstance(speaker, str) and speaker != "ALL"
        ):  # ensure we have a list of speakers
            speaker = [speaker]
        xmldoc = ElementTree.parse(fileid).getroot()
        # processing each xml doc
        results = []
        for xmlsent in xmldoc.findall(".//{%s}u" % NS):
            sents = []
            # select speakers
            if speaker == "ALL" or xmlsent.get("who") in speaker:
                for xmlword in xmlsent.findall(".//{%s}w" % NS):
                    infl = None
                    suffixStem = None
                    suffixTag = None
                    # getting replaced words
                    # NOTE(review): the replacement lookup searches the whole
                    # utterance, not the current word element — every word in
                    # an utterance containing a replacement appears to get the
                    # same substitute; verify against upstream intent.
                    if replace and xmlsent.find(f".//{{{NS}}}w/{{{NS}}}replacement"):
                        xmlword = xmlsent.find(
                            f".//{{{NS}}}w/{{{NS}}}replacement/{{{NS}}}w"
                        )
                    elif replace and xmlsent.find(f".//{{{NS}}}w/{{{NS}}}wk"):
                        xmlword = xmlsent.find(f".//{{{NS}}}w/{{{NS}}}wk")
                    # get text
                    if xmlword.text:
                        word = xmlword.text
                    else:
                        word = ""
                    # strip tailing space
                    if strip_space:
                        word = word.strip()
                    # stem
                    if relation or stem:
                        try:
                            xmlstem = xmlword.find(".//{%s}stem" % NS)
                            word = xmlstem.text
                        except AttributeError as e:
                            # no <stem> element: keep the surface form
                            pass
                        # if there is an inflection
                        try:
                            xmlinfl = xmlword.find(
                                f".//{{{NS}}}mor/{{{NS}}}mw/{{{NS}}}mk"
                            )
                            word += "-" + xmlinfl.text
                        # NOTE(review): bare except silently swallows all
                        # errors, not just a missing element.
                        except:
                            pass
                        # if there is a suffix
                        try:
                            xmlsuffix = xmlword.find(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem"
                                % (NS, NS, NS, NS)
                            )
                            suffixStem = xmlsuffix.text
                        except AttributeError:
                            suffixStem = ""
                        if suffixStem:
                            word += "~" + suffixStem
                    # pos
                    if relation or pos:
                        try:
                            xmlpos = xmlword.findall(".//{%s}c" % NS)
                            xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                            if xmlpos2 != []:
                                # category plus subcategory, e.g. "n:prop"
                                tag = xmlpos[0].text + ":" + xmlpos2[0].text
                            else:
                                tag = xmlpos[0].text
                        except (AttributeError, IndexError) as e:
                            tag = ""
                        try:
                            xmlsuffixpos = xmlword.findall(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c"
                                % (NS, NS, NS, NS, NS)
                            )
                            xmlsuffixpos2 = xmlword.findall(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s"
                                % (NS, NS, NS, NS, NS)
                            )
                            if xmlsuffixpos2:
                                suffixTag = (
                                    xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text
                                )
                            else:
                                suffixTag = xmlsuffixpos[0].text
                        # NOTE(review): bare except — see above.
                        except:
                            pass
                        if suffixTag:
                            tag += "~" + suffixTag
                        word = (word, tag)
                    # relational
                    # the gold standard is stored in
                    # <mor></mor><mor type="trn"><gra type="grt">
                    if relation == True:
                        for xmlstem_rel in xmlword.findall(
                            f".//{{{NS}}}mor/{{{NS}}}gra"
                        ):
                            if not xmlstem_rel.get("type") == "grt":
                                # test relation: (word, tag, "idx|head|rel")
                                word = (
                                    word[0],
                                    word[1],
                                    xmlstem_rel.get("index")
                                    + "|"
                                    + xmlstem_rel.get("head")
                                    + "|"
                                    + xmlstem_rel.get("relation"),
                                )
                            else:
                                # gold relation appended:
                                # (word, tag, test_rel, word, tag, gold_rel)
                                word = (
                                    word[0],
                                    word[1],
                                    word[2],
                                    word[0],
                                    word[1],
                                    xmlstem_rel.get("index")
                                    + "|"
                                    + xmlstem_rel.get("head")
                                    + "|"
                                    + xmlstem_rel.get("relation"),
                                )
                        try:
                            for xmlpost_rel in xmlword.findall(
                                f".//{{{NS}}}mor/{{{NS}}}mor-post/{{{NS}}}gra"
                            ):
                                if not xmlpost_rel.get("type") == "grt":
                                    suffixStem = (
                                        suffixStem[0],
                                        suffixStem[1],
                                        xmlpost_rel.get("index")
                                        + "|"
                                        + xmlpost_rel.get("head")
                                        + "|"
                                        + xmlpost_rel.get("relation"),
                                    )
                                else:
                                    suffixStem = (
                                        suffixStem[0],
                                        suffixStem[1],
                                        suffixStem[2],
                                        suffixStem[0],
                                        suffixStem[1],
                                        xmlpost_rel.get("index")
                                        + "|"
                                        + xmlpost_rel.get("head")
                                        + "|"
                                        + xmlpost_rel.get("relation"),
                                    )
                        # NOTE(review): bare except — see above.
                        except:
                            pass
                    sents.append(word)
            if sent or relation:
                results.append(sents)
            else:
                results.extend(sents)
        return LazyMap(lambda x: x, results)
# Ready-to-use browser opener
"""
The base URL for viewing files on the childes website. This
shouldn't need to be changed, unless CHILDES changes the configuration
of their server or unless the user sets up their own corpus webserver.
"""
childes_url_base = r"https://childes.talkbank.org/browser/index.php?url="
def webview_file(self, fileid, urlbase=None):
    """Map a corpus file to its web version on the CHILDES website,
    and open it in a web browser.

    The complete URL to be used is:
        childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha')

    If no urlbase is passed, we try to calculate it.  This
    requires that the childes corpus was set up to mirror the
    folder hierarchy under childes.psy.cmu.edu/data-xml/, e.g.:
    nltk_data/corpora/childes/Eng-USA/Cornell/??? or
    nltk_data/corpora/childes/Romance/Spanish/Aguirre/???

    The function first looks (as a special case) if "Eng-USA" is
    on the path consisting of <corpus root>+fileid; then if
    "childes", possibly followed by "data-xml", appears.  If neither
    one is found, we use the unmodified fileid and hope for the best.
    If this is not right, specify urlbase explicitly, e.g., if the
    corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'.
    """
    import webbrowser

    if urlbase:
        path = urlbase + "/" + fileid
    else:
        full = self.root + "/" + fileid
        full = re.sub(r"\\", "/", full)  # normalize Windows path separators
        if "/childes/" in full.lower():
            # Discard /data-xml/ if present
            path = re.findall(r"(?i)/childes(?:/data-xml)?/(.*)\.xml", full)[0]
        elif "eng-usa" in full.lower():
            # BUGFIX: the global inline flag (?i) must come at the very start
            # of the pattern; a mid-pattern placement ("/(?i)Eng-USA/...")
            # raises re.error on Python 3.11+.
            path = "Eng-USA/" + re.findall(r"(?i)/Eng-USA/(.*)\.xml", full)[0]
        else:
            path = fileid

    # Strip ".xml" and add ".cha", as necessary:
    if path.endswith(".xml"):
        path = path[:-4]
    if not path.endswith(".cha"):
        path = path + ".cha"

    url = self.childes_url_base + path

    webbrowser.open_new_tab(url)
    print("Opening in browser:", url)
    # Pausing is a good idea, but it's up to the user...
    # raw_input("Hit Return to continue")
def demo(corpus_root=None):
    """
    Demonstrate the CHILDES corpus reader on a few files.

    The CHILDES corpus should be manually downloaded and saved
    to ``[NLTK_Data_Dir]/corpora/childes/``
    """
    if not corpus_root:
        from nltk.data import find

        corpus_root = find("corpora/childes/data-xml/Eng-USA/")

    try:
        childes = CHILDESCorpusReader(corpus_root, ".*.xml")
        # describe all corpus
        for file in childes.fileids()[:5]:
            corpus = ""
            corpus_id = ""
            for key, value in childes.corpus(file)[0].items():
                if key == "Corpus":
                    corpus = value
                if key == "Id":
                    corpus_id = value
            print("Reading", corpus, corpus_id, " .....")
            print("words:", childes.words(file)[:7], "...")
            print(
                "words with replaced words:",
                childes.words(file, replace=True)[:7],
                " ...",
            )
            print("words with pos tags:", childes.tagged_words(file)[:7], " ...")
            print("words (only MOT):", childes.words(file, speaker="MOT")[:7], "...")
            print("words (only CHI):", childes.words(file, speaker="CHI")[:7], "...")
            print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
            print(
                "words with relations and pos-tag:",
                childes.words(file, relation=True)[:5],
                " ...",
            )
            print("sentence:", childes.sents(file)[:2], " ...")
            for participant, values in childes.participants(file)[0].items():
                for key, value in values.items():
                    print("\tparticipant", participant, key, ":", value)
            print("num of sent:", len(childes.sents(file)))
            print("num of morphemes:", len(childes.words(file, stem=True)))
            print("age:", childes.age(file))
            print("age in month:", childes.age(file, month=True))
            print("MLU:", childes.MLU(file))
            print()

    # The exception object is not used, so it is no longer bound.
    except LookupError:
        # NB: the quote mismatch in the example call below was fixed
        # (previously: demo('/path/to/childes/data-xml/Eng-USA/")).
        print(
            """The CHILDES corpus, or the parts you need, should be manually
        downloaded from https://childes.talkbank.org/data-xml/ and saved at
        [NLTK_Data_Dir]/corpora/childes/
            Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
        demo('/path/to/childes/data-xml/Eng-USA/')
        """
        )
        # corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
        # corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
        ##this fails
        # childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())
# Run the demo when this module is executed as a script.
if __name__ == "__main__":
    demo()

View File

@@ -0,0 +1,273 @@
# Natural Language Toolkit: Chunked Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A reader for corpora that contain chunked (and optionally tagged)
documents.
"""
import codecs
import os.path
import nltk
from nltk.chunk import tagstr2tree
from nltk.corpus.reader.api import *
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
from nltk.corpus.reader.util import *
from nltk.tokenize import *
from nltk.tree import Tree
class ChunkedCorpusReader(CorpusReader):
    """
    Reader for chunked (and optionally tagged) corpora.  Paragraphs
    are split using a block reader.  They are then tokenized into
    sentences using a sentence tokenizer.  Finally, these sentences
    are parsed into chunk trees using a string-to-chunktree conversion
    function.  Each of these steps can be performed using a default
    function or a custom function.  By default, paragraphs are split
    on blank lines; sentences are listed one per line; and sentences
    are parsed into chunk trees using ``nltk.chunk.tagstr2tree``.
    """

    def __init__(
        self,
        root,
        fileids,
        extension="",
        str2chunktree=tagstr2tree,
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
        encoding="utf8",
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param extension: Not used by this reader; kept for backward
            compatibility of the signature.
        :param str2chunktree: Function mapping one sentence string to a chunk
            tree (default: ``nltk.chunk.tagstr2tree``).
        :param sent_tokenizer: Tokenizer splitting a paragraph block into
            sentence strings (default: one sentence per line).
        :param para_block_reader: Function reading one paragraph block from a
            stream (default: blank-line-separated blocks).
        :param encoding: The encoding used to read the corpus files.
        :param tagset: The tagset the corpus tags are declared in; forwarded
            to the corpus views as the source tagset.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader, tagset)
        """Arguments for corpus views generated by this corpus: a tuple
        (str2chunktree, sent_tokenizer, para_block_tokenizer)"""

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        # The four positional flags passed to ChunkedCorpusView are
        # (tagged, group_by_sent, group_by_para, chunked); see its __init__.
        return concat(
            [
                ChunkedCorpusView(f, enc, 0, 0, 0, 0, *self._cv_args)
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return concat(
            [
                ChunkedCorpusView(f, enc, 0, 1, 0, 0, *self._cv_args)
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        return concat(
            [
                ChunkedCorpusView(f, enc, 0, 1, 1, 0, *self._cv_args)
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def tagged_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))
        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 0, 0, 0, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def tagged_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))
        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 1, 0, 0, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def tagged_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of ``(word,tag)`` tuples.
        :rtype: list(list(list(tuple(str,str))))
        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 1, 1, 0, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def chunked_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and chunks.  Words are encoded as ``(word, tag)``
            tuples (if the corpus has tags) or word strings (if the
            corpus has no tags).  Chunks are encoded as depth-one
            trees over ``(word,tag)`` tuples or word strings.
        :rtype: list(tuple(str,str) and Tree)
        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 0, 0, 1, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def chunked_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a shallow Tree.  The leaves
            of these trees are encoded as ``(word, tag)`` tuples (if
            the corpus has tags) or word strings (if the corpus has no
            tags).
        :rtype: list(Tree)
        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 1, 0, 1, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def chunked_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as a shallow Tree.  The leaves of these
            trees are encoded as ``(word, tag)`` tuples (if the corpus
            has tags) or word strings (if the corpus has no tags).
        :rtype: list(list(Tree))
        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 1, 1, 1, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def _read_block(self, stream):
        # Default block reader: one chunk tree per blank-line-separated block.
        return [tagstr2tree(t) for t in read_blankline_block(stream)]
class ChunkedCorpusView(StreamBackedCorpusView):
    """
    A stream-backed corpus view for chunked corpora.  The four boolean
    flags control how each paragraph block is turned into tokens:
    ``tagged`` keeps ``(word, tag)`` leaves (otherwise tags are stripped),
    ``chunked`` keeps the chunk trees (otherwise only leaves are kept),
    ``group_by_sent`` groups tokens per sentence, and ``group_by_para``
    groups sentences per paragraph.
    """

    def __init__(
        self,
        fileid,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        chunked,
        str2chunktree,
        sent_tokenizer,
        para_block_reader,
        source_tagset=None,
        target_tagset=None,
    ):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
        self._tagged = tagged  # keep (word, tag) leaves if true
        self._group_by_sent = group_by_sent  # one item per sentence if true
        self._group_by_para = group_by_para  # one item per paragraph if true
        self._chunked = chunked  # keep chunk trees if true
        self._str2chunktree = str2chunktree  # sentence string -> chunk tree
        self._sent_tokenizer = sent_tokenizer  # paragraph -> sentence strings
        self._para_block_reader = para_block_reader  # stream -> paragraph block
        self._source_tagset = source_tagset
        self._target_tagset = target_tagset

    def read_block(self, stream):
        """Read one paragraph block and convert it per the view's flags."""
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                sent = self._str2chunktree(
                    sent_str,
                    source_tagset=self._source_tagset,
                    target_tagset=self._target_tagset,
                )

                # If requested, throw away the tags.
                if not self._tagged:
                    sent = self._untag(sent)

                # If requested, throw away the chunks.
                if not self._chunked:
                    sent = sent.leaves()

                # Add the sentence to `para`.
                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)

            # Add the paragraph to `block`.
            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)

        # Return the block
        return block

    def _untag(self, tree):
        """Strip tags in place: replace each ``(word, tag)`` leaf with the
        bare word.  Returns the (mutated) tree for convenience."""
        for i, child in enumerate(tree):
            if isinstance(child, Tree):
                self._untag(child)
            elif isinstance(child, tuple):
                tree[i] = child[0]
            else:
                raise ValueError("expected child to be Tree or tuple")
        return tree

View File

@@ -0,0 +1,88 @@
# Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6]
ftp://ftp.cs.cmu.edu/project/speech/dict/
Copyright 1998 Carnegie Mellon University
File Format: Each line consists of an uppercased word, a counter
(for alternative pronunciations), and a transcription. Vowels are
marked for stress (1=primary, 2=secondary, 0=no stress). E.g.:
NATURAL 1 N AE1 CH ER0 AH0 L
The dictionary contains 127069 entries. Of these, 119400 words are assigned
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
three or more pronunciations. Many of these are fast-speech variants.
Phonemes: There are 39 phonemes, as shown below:
Phoneme Example Translation Phoneme Example Translation
------- ------- ----------- ------- ------- -----------
AA odd AA D AE at AE T
AH hut HH AH T AO ought AO T
AW cow K AW AY hide HH AY D
B be B IY CH cheese CH IY Z
D dee D IY DH thee DH IY
EH Ed EH D ER hurt HH ER T
EY ate EY T F fee F IY
G green G R IY N HH he HH IY
IH it IH T IY eat IY T
JH gee JH IY K key K IY
L lee L IY M me M IY
N knee N IY NG ping P IH NG
OW oat OW T OY toy T OY
P pee P IY R read R IY D
S sea S IY SH she SH IY
T tea T IY TH theta TH EY T AH
UH hood HH UH D UW two T UW
V vee V IY W we W IY
Y yield Y IY L D Z zee Z IY
ZH seizure S IY ZH ER
"""
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.util import Index
class CMUDictCorpusReader(CorpusReader):
    """Corpus reader for the CMU Pronouncing Dictionary."""

    def entries(self):
        """
        :return: the cmudict lexicon as a list of entries
            containing (word, transcriptions) tuples.
        """
        views = [
            StreamBackedCorpusView(fileid, read_cmudict_block, encoding=enc)
            for fileid, enc in self.abspaths(None, True)
        ]
        return concat(views)

    def words(self):
        """
        :return: a list of all words defined in the cmudict lexicon.
        """
        return [entry[0].lower() for entry in self.entries()]

    def dict(self):
        """
        :return: the cmudict lexicon as a dictionary, whose keys are
            lowercase words and whose values are lists of pronunciations.
        """
        # Index groups the (word, pronunciation) pairs by word.
        index = Index(self.entries())
        return dict(index)
def read_cmudict_block(stream):
    """Read the next batch of cmudict entries (at most 100) from *stream*.

    Each input line has the form ``WORD COUNTER PHONEME...``, where the
    counter distinguishes alternative pronunciations.  Returns a list of
    ``(word, phonemes)`` pairs with the word lowercased; the list is
    shorter than 100 only at end of file.
    """
    batch = []
    for _ in range(100):  # read at most 100 entries per block
        line = stream.readline()
        if not line:  # end of file
            break
        fields = line.split()
        batch.append((fields[0].lower(), fields[2:]))
    return batch

View File

@@ -0,0 +1,309 @@
# Natural Language Toolkit: Comparative Sentence Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for the Comparative Sentence Dataset.
- Comparative Sentence Dataset information -
Annotated by: Nitin Jindal and Bing Liu, 2006.
Department of Computer Sicence
University of Illinois at Chicago
Contact: Nitin Jindal, njindal@cs.uic.edu
Bing Liu, liub@cs.uic.edu (https://www.cs.uic.edu/~liub)
Distributed with permission.
Related papers:
- Nitin Jindal and Bing Liu. "Identifying Comparative Sentences in Text Documents".
Proceedings of the ACM SIGIR International Conference on Information Retrieval
(SIGIR-06), 2006.
- Nitin Jindal and Bing Liu. "Mining Comprative Sentences and Relations".
Proceedings of Twenty First National Conference on Artificial Intelligence
(AAAI-2006), 2006.
- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
Proceedings of the 22nd International Conference on Computational Linguistics
(Coling-2008), Manchester, 18-22 August, 2008.
"""
import re
from nltk.corpus.reader.api import *
from nltk.tokenize import *
# Regular expressions for dataset components
STARS = re.compile(r"^\*+$")  # a line consisting only of asterisks (section delimiter)
COMPARISON = re.compile(r"<cs-[1234]>")  # any opening comparison tag
CLOSE_COMPARISON = re.compile(r"</cs-[1234]>")  # any closing comparison tag
GRAD_COMPARISON = re.compile(r"<cs-[123]>")  # gradable comparisons (types 1-3)
NON_GRAD_COMPARISON = re.compile(r"<cs-4>")  # non-gradable comparisons (type 4)
ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)")  # "<code>_<text>" relation pairs
KEYWORD = re.compile(r"\(([^\(]*)\)$")  # trailing "(keyword)" annotation on a relation line
class Comparison:
    """
    A Comparison represents a comparative sentence and its constituents.
    """

    def __init__(
        self,
        text=None,
        comp_type=None,
        entity_1=None,
        entity_2=None,
        feature=None,
        keyword=None,
    ):
        """
        :param text: a string (optionally tokenized) containing a comparison.
        :param comp_type: an integer defining the type of comparison expressed.
            Values can be: 1 (Non-equal gradable), 2 (Equative), 3 (Superlative),
            4 (Non-gradable).
        :param entity_1: the first entity considered in the comparison relation.
        :param entity_2: the second entity considered in the comparison relation.
        :param feature: the feature considered in the comparison relation.
        :param keyword: the word or phrase which is used for that comparative relation.
        """
        self.text = text
        self.comp_type = comp_type
        self.entity_1 = entity_1
        self.entity_2 = entity_2
        self.feature = feature
        self.keyword = keyword

    def __repr__(self):
        # Mirror the constructor signature so the repr is self-describing.
        return (
            f'Comparison(text="{self.text}", comp_type={self.comp_type}, '
            f'entity_1="{self.entity_1}", entity_2="{self.entity_2}", '
            f'feature="{self.feature}", keyword="{self.keyword}")'
        )
class ComparativeSentencesCorpusReader(CorpusReader):
    """
    Reader for the Comparative Sentence Dataset by Jindal and Liu (2006).

        >>> from nltk.corpus import comparative_sentences
        >>> comparison = comparative_sentences.comparisons()[0]
        >>> comparison.text # doctest: +NORMALIZE_WHITESPACE
        ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly',
        'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve",
        'had', '.']
        >>> comparison.entity_2
        'models'
        >>> (comparison.feature, comparison.keyword)
        ('rewind', 'more')
        >>> len(comparative_sentences.comparisons())
        853
    """

    # Corpus view class used by the data-access methods below.
    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: a list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param sent_tokenizer: tokenizer for breaking paragraphs into sentences.
        :param encoding: the encoding that should be used to read the corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._readme = "README.txt"

    def comparisons(self, fileids=None):
        """
        Return all comparisons in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            comparisons have to be returned.
        :return: the given file(s) as a list of Comparison objects.
        :rtype: list(Comparison)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_comparison_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def keywords(self, fileids=None):
        """
        Return a set of all keywords used in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            keywords have to be returned.
        :return: the set of keywords and comparative phrases used in the corpus.
        :rtype: set(str)
        """
        all_keywords = concat(
            [
                self.CorpusView(path, self._read_keyword_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

        keywords_set = {keyword.lower() for keyword in all_keywords if keyword}
        return keywords_set

    def keywords_readme(self):
        """
        Return the list of words and constituents considered as clues of a
        comparison (from listOfkeywords.txt).
        """
        keywords = []
        with self.open("listOfkeywords.txt") as fp:
            raw_text = fp.read()
        for line in raw_text.split("\n"):
            # Skip blank lines and "//" comment lines in the keyword file.
            if not line or line.startswith("//"):
                continue
            keywords.append(line.strip())
        return keywords

    def sents(self, fileids=None):
        """
        Return all sentences in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: all sentences of the corpus as lists of tokens (or as plain
            strings, if no word tokenizer is specified).
        :rtype: list(list(str)) or list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None):
        """
        Return all words and punctuation symbols in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_comparison_block(self, stream):
        """Scan *stream* for the next line carrying comparison tags and
        parse it (plus its following relation lines) into a list of
        Comparison objects; return [] at end of file."""
        while True:
            line = stream.readline()
            if not line:
                return []  # end of file.
            comparison_tags = re.findall(COMPARISON, line)
            if comparison_tags:
                grad_comparisons = re.findall(GRAD_COMPARISON, line)
                non_grad_comparisons = re.findall(NON_GRAD_COMPARISON, line)
                # Advance to the next line (it contains the comparative sentence)
                comparison_text = stream.readline().strip()
                if self._word_tokenizer:
                    comparison_text = self._word_tokenizer.tokenize(comparison_text)
                # Skip the next line (it contains closing comparison tags)
                stream.readline()
                # If gradable comparisons are found, create Comparison instances
                # and populate their fields
                comparison_bundle = []
                if grad_comparisons:
                    # Each comparison tag has its own relations on a separate line
                    for comp in grad_comparisons:
                        comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
                        comparison = Comparison(
                            text=comparison_text, comp_type=comp_type
                        )
                        line = stream.readline()
                        entities_feats = ENTITIES_FEATS.findall(line)
                        if entities_feats:
                            # Codes: 1 -> entity_1, 2 -> entity_2, 3 -> feature.
                            for code, entity_feat in entities_feats:
                                if code == "1":
                                    comparison.entity_1 = entity_feat.strip()
                                elif code == "2":
                                    comparison.entity_2 = entity_feat.strip()
                                elif code == "3":
                                    comparison.feature = entity_feat.strip()
                        keyword = KEYWORD.findall(line)
                        if keyword:
                            comparison.keyword = keyword[0]
                        comparison_bundle.append(comparison)
                # If non-gradable comparisons are found, create a simple Comparison
                # instance for each one
                if non_grad_comparisons:
                    for comp in non_grad_comparisons:
                        # comp_type in this case should always be 4.
                        comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
                        comparison = Comparison(
                            text=comparison_text, comp_type=comp_type
                        )
                        comparison_bundle.append(comparison)
                # Flatten the list of comparisons before returning them
                # return concat([comparison_bundle])
                return comparison_bundle

    def _read_keyword_block(self, stream):
        """Return the keywords of the next comparison block."""
        keywords = []
        for comparison in self._read_comparison_block(stream):
            keywords.append(comparison.keyword)
        return keywords

    def _read_sent_block(self, stream):
        """Return the next plain sentence(s) as token lists, skipping
        annotation lines and sections delimited by lines of asterisks."""
        while True:
            line = stream.readline()
            if re.match(STARS, line):
                # Skip everything up to (and including) the closing
                # asterisk line.
                while True:
                    line = stream.readline()
                    if re.match(STARS, line):
                        break
                continue
            # A line is a plain sentence only if it carries no comparison
            # tags, no relation pairs, and no closing tags.
            if (
                not re.findall(COMPARISON, line)
                and not ENTITIES_FEATS.findall(line)
                and not re.findall(CLOSE_COMPARISON, line)
            ):
                if self._sent_tokenizer:
                    return [
                        self._word_tokenizer.tokenize(sent)
                        for sent in self._sent_tokenizer.tokenize(line)
                    ]
                else:
                    return [self._word_tokenizer.tokenize(line)]

    def _read_word_block(self, stream):
        """Return the next sentence block flattened into a word list."""
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words

View File

@@ -0,0 +1,579 @@
# Natural Language Toolkit: CONLL Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Read CoNLL-style chunk fileids.
"""
import textwrap
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tag import map_tag
from nltk.tree import Tree
from nltk.util import LazyConcatenation, LazyMap
class ConllCorpusReader(CorpusReader):
"""
A corpus reader for CoNLL-style files. These files consist of a
series of sentences, separated by blank lines. Each sentence is
encoded using a table (or "grid") of values, where each line
corresponds to a single word, and each column corresponds to an
annotation type. The set of columns used by CoNLL-style files can
vary from corpus to corpus; the ``ConllCorpusReader`` constructor
therefore takes an argument, ``columntypes``, which is used to
specify the columns that are used by a given corpus. By default
columns are split by consecutive whitespaces, with the
``separator`` argument you can set a string to split by (e.g.
``\'\t\'``).
@todo: Add support for reading from corpora where different
parallel files contain different columns.
@todo: Possibly add caching of the grid corpus view? This would
allow the same grid view to be used by different data access
methods (eg words() and parsed_sents() could both share the
same grid corpus view object).
@todo: Better support for -DOCSTART-. Currently, we just ignore
it, but it could be used to define methods that retrieve a
document at a time (eg parsed_documents()).
"""
# /////////////////////////////////////////////////////////////////
# Column Types
# /////////////////////////////////////////////////////////////////
WORDS = "words" #: column type for words
POS = "pos" #: column type for part-of-speech tags
TREE = "tree" #: column type for parse trees
CHUNK = "chunk" #: column type for chunk structures
NE = "ne" #: column type for named entities
SRL = "srl" #: column type for semantic role labels
IGNORE = "ignore" #: column type for column that should be ignored
#: A list of all column types supported by the conll corpus reader.
COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE)
# /////////////////////////////////////////////////////////////////
# Constructor
# /////////////////////////////////////////////////////////////////
def __init__(
    self,
    root,
    fileids,
    columntypes,
    chunk_types=None,
    root_label="S",
    pos_in_tree=False,
    srl_includes_roleset=True,
    encoding="utf8",
    tree_class=Tree,
    tagset=None,
    separator=None,
):
    """
    :param root: the root directory of the corpus.
    :param fileids: a list or regexp specifying the fileids in this corpus.
    :param columntypes: the annotation type of each column, as a sequence
        of values drawn from ``COLUMN_TYPES``.
    :param chunk_types: the chunk type(s) to extract; a bare string is
        wrapped in a one-element list.
    :param root_label: node label used for the root of chunk structures.
    :param pos_in_tree: default for whether parsed sentences keep POS tags
        as preterminal nodes (see ``parsed_sents``).
    :param srl_includes_roleset: if true, the SRL column is followed by a
        roleset-id column, shifting the SRL argument columns right by one.
    :param encoding: the encoding used to read the corpus files.
    :param tree_class: the class used to build parse trees.
    :param tagset: the tagset the corpus POS tags are declared in.
    :param separator: string to split columns on; ``None`` splits on
        consecutive whitespace.
    :raises ValueError: if any column type is not in ``COLUMN_TYPES``.
    """
    for columntype in columntypes:
        if columntype not in self.COLUMN_TYPES:
            raise ValueError("Bad column type %r" % columntype)
    if isinstance(chunk_types, str):
        chunk_types = [chunk_types]
    self._chunk_types = chunk_types
    # Map each column type to its column index.
    self._colmap = {c: i for (i, c) in enumerate(columntypes)}
    self._pos_in_tree = pos_in_tree
    self._root_label = root_label  # for chunks
    self._srl_includes_roleset = srl_includes_roleset
    self._tree_class = tree_class
    CorpusReader.__init__(self, root, fileids, encoding)
    self._tagset = tagset
    self.sep = separator
# /////////////////////////////////////////////////////////////////
# Data Access Methods
# /////////////////////////////////////////////////////////////////

def words(self, fileids=None):
    """Return the words of the given file(s) as one flat list of strings."""
    self._require(self.WORDS)
    return LazyConcatenation(LazyMap(self._get_words, self._grids(fileids)))

def sents(self, fileids=None):
    """Return the given file(s) as a list of sentences (word-string lists)."""
    self._require(self.WORDS)
    return LazyMap(self._get_words, self._grids(fileids))

def tagged_words(self, fileids=None, tagset=None):
    """Return one flat list of ``(word, tag)`` tuples, with tags mapped
    to ``tagset`` if given."""
    self._require(self.WORDS, self.POS)

    def get_tagged_words(grid):
        return self._get_tagged_words(grid, tagset)

    return LazyConcatenation(LazyMap(get_tagged_words, self._grids(fileids)))

def tagged_sents(self, fileids=None, tagset=None):
    """Return a list of sentences of ``(word, tag)`` tuples."""
    self._require(self.WORDS, self.POS)

    def get_tagged_words(grid):
        return self._get_tagged_words(grid, tagset)

    return LazyMap(get_tagged_words, self._grids(fileids))

def chunked_words(self, fileids=None, chunk_types=None, tagset=None):
    """Return a flat list of tagged words and chunk trees."""
    self._require(self.WORDS, self.POS, self.CHUNK)
    if chunk_types is None:
        chunk_types = self._chunk_types

    def get_chunked_words(grid):  # capture chunk_types as local var
        return self._get_chunked_words(grid, chunk_types, tagset)

    return LazyConcatenation(LazyMap(get_chunked_words, self._grids(fileids)))

def chunked_sents(self, fileids=None, chunk_types=None, tagset=None):
    """Return one shallow chunk tree per sentence."""
    self._require(self.WORDS, self.POS, self.CHUNK)
    if chunk_types is None:
        chunk_types = self._chunk_types

    def get_chunked_words(grid):  # capture chunk_types as local var
        return self._get_chunked_words(grid, chunk_types, tagset)

    return LazyMap(get_chunked_words, self._grids(fileids))

def parsed_sents(self, fileids=None, pos_in_tree=None, tagset=None):
    """Return one parse tree per sentence; ``pos_in_tree`` defaults to
    the reader-level setting."""
    self._require(self.WORDS, self.POS, self.TREE)
    if pos_in_tree is None:
        pos_in_tree = self._pos_in_tree

    def get_parsed_sent(grid):  # capture pos_in_tree as local var
        return self._get_parsed_sent(grid, pos_in_tree, tagset)

    return LazyMap(get_parsed_sent, self._grids(fileids))

def srl_spans(self, fileids=None):
    """Return the semantic-role span lists, one per sentence."""
    self._require(self.SRL)
    return LazyMap(self._get_srl_spans, self._grids(fileids))

def srl_instances(self, fileids=None, pos_in_tree=None, flatten=True):
    """Return SRL instances: one flat list if ``flatten`` is true,
    otherwise one instance list per sentence."""
    self._require(self.WORDS, self.POS, self.TREE, self.SRL)
    if pos_in_tree is None:
        pos_in_tree = self._pos_in_tree

    def get_srl_instances(grid):  # capture pos_in_tree as local var
        return self._get_srl_instances(grid, pos_in_tree)

    result = LazyMap(get_srl_instances, self._grids(fileids))
    if flatten:
        result = LazyConcatenation(result)
    return result

def iob_words(self, fileids=None, tagset=None):
    """
    :return: a list of word/tag/IOB tuples
    :rtype: list(tuple)
    :param fileids: the list of fileids that make up this corpus
    :type fileids: None or str or list
    """
    self._require(self.WORDS, self.POS, self.CHUNK)

    def get_iob_words(grid):
        return self._get_iob_words(grid, tagset)

    return LazyConcatenation(LazyMap(get_iob_words, self._grids(fileids)))

def iob_sents(self, fileids=None, tagset=None):
    """
    :return: a list of lists of word/tag/IOB tuples
    :rtype: list(list)
    :param fileids: the list of fileids that make up this corpus
    :type fileids: None or str or list
    """
    self._require(self.WORDS, self.POS, self.CHUNK)

    def get_iob_words(grid):
        return self._get_iob_words(grid, tagset)

    return LazyMap(get_iob_words, self._grids(fileids))
# /////////////////////////////////////////////////////////////////
# Grid Reading
# /////////////////////////////////////////////////////////////////

def _grids(self, fileids=None):
    """Return a corpus view over the grids of the given file(s)."""
    # n.b.: we could cache the object returned here (keyed on
    # fileids), which would let us reuse the same corpus view for
    # different things (eg srl and parse trees).
    return concat(
        [
            StreamBackedCorpusView(fileid, self._read_grid_block, encoding=enc)
            for (fileid, enc) in self.abspaths(fileids, True)
        ]
    )

def _read_grid_block(self, stream):
    """Read one blank-line-separated block of grids from *stream*,
    dropping ``-DOCSTART-`` rows and checking that every row of a grid
    has the same number of columns."""
    grids = []
    for block in read_blankline_block(stream):
        block = block.strip()
        if not block:
            continue

        grid = [line.split(self.sep) for line in block.split("\n")]

        # If there's a docstart row, then discard. ([xx] eventually it
        # would be good to actually use it)
        if grid[0][self._colmap.get("words", 0)] == "-DOCSTART-":
            del grid[0]

        # Check that the grid is consistent.
        for row in grid:
            if len(row) != len(grid[0]):
                raise ValueError("Inconsistent number of columns:\n%s" % block)
        grids.append(grid)
    return grids
# /////////////////////////////////////////////////////////////////
# Transforms
# /////////////////////////////////////////////////////////////////
# given a grid, transform it into some representation (e.g.,
# a list of words or a parse tree).

def _get_words(self, grid):
    """Return the word column of *grid* as a list of strings."""
    return self._get_column(grid, self._colmap["words"])

def _get_tagged_words(self, grid, tagset=None):
    """Return ``(word, tag)`` tuples for *grid*, mapping POS tags from
    the reader's tagset to *tagset* when the two differ."""
    pos_tags = self._get_column(grid, self._colmap["pos"])
    if tagset and tagset != self._tagset:
        pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
    return list(zip(self._get_column(grid, self._colmap["words"]), pos_tags))

def _get_iob_words(self, grid, tagset=None):
    """Return ``(word, tag, iob-chunk-tag)`` triples for *grid*."""
    pos_tags = self._get_column(grid, self._colmap["pos"])
    if tagset and tagset != self._tagset:
        pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
    return list(
        zip(
            self._get_column(grid, self._colmap["words"]),
            pos_tags,
            self._get_column(grid, self._colmap["chunk"]),
        )
    )
def _get_chunked_words(self, grid, chunk_types, tagset=None):
    """Convert *grid* into a shallow chunk tree rooted at
    ``self._root_label``, keeping only chunks whose type is in
    *chunk_types* (all types if ``None``)."""
    # n.b.: this method is very similar to conllstr2tree.
    words = self._get_column(grid, self._colmap["words"])
    pos_tags = self._get_column(grid, self._colmap["pos"])
    if tagset and tagset != self._tagset:
        pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
    chunk_tags = self._get_column(grid, self._colmap["chunk"])

    # stack[0] is the root; stack[1] (when present) is the open chunk.
    stack = [Tree(self._root_label, [])]

    for word, pos_tag, chunk_tag in zip(words, pos_tags, chunk_tags):
        if chunk_tag == "O":
            state, chunk_type = "O", ""
        else:
            (state, chunk_type) = chunk_tag.split("-")

        # If it's a chunk we don't care about, treat it as O.
        if chunk_types is not None and chunk_type not in chunk_types:
            state = "O"

        # Treat a mismatching I like a B.
        if state == "I" and chunk_type != stack[-1].label():
            state = "B"

        # For B or I: close any open chunks
        if state in "BO" and len(stack) == 2:
            stack.pop()

        # For B: start a new chunk.
        if state == "B":
            new_chunk = Tree(chunk_type, [])
            stack[-1].append(new_chunk)
            stack.append(new_chunk)

        # Add the word token.
        stack[-1].append((word, pos_tag))

    return stack[0]
def _get_parsed_sent(self, grid, pos_in_tree, tagset=None):
    """Reassemble a parse tree from the per-word fragments in the tree
    column.  Parentheses in words/tags are escaped as ``-LRB-``/``-RRB-``
    so they cannot break the bracketed tree string; if the fragments do
    not form a complete tree, they are wrapped in ``self._root_label``.
    When *pos_in_tree* is false, POS preterminals are collapsed into
    ``(word, tag)`` leaves."""
    words = self._get_column(grid, self._colmap["words"])
    pos_tags = self._get_column(grid, self._colmap["pos"])
    if tagset and tagset != self._tagset:
        pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
    parse_tags = self._get_column(grid, self._colmap["tree"])

    treestr = ""
    for word, pos_tag, parse_tag in zip(words, pos_tags, parse_tags):
        if word == "(":
            word = "-LRB-"
        if word == ")":
            word = "-RRB-"
        if pos_tag == "(":
            pos_tag = "-LRB-"
        if pos_tag == ")":
            pos_tag = "-RRB-"
        # The "*" in each fragment marks where this word's preterminal
        # is inserted between the opening and closing brackets.
        (left, right) = parse_tag.split("*")
        right = right.count(")") * ")"  # only keep ')'.
        treestr += f"{left} ({pos_tag} {word}) {right}"
    try:
        tree = self._tree_class.fromstring(treestr)
    except (ValueError, IndexError):
        tree = self._tree_class.fromstring(f"({self._root_label} {treestr})")

    if not pos_in_tree:
        for subtree in tree.subtrees():
            for i, child in enumerate(subtree):
                if (
                    isinstance(child, Tree)
                    and len(child) == 1
                    and isinstance(child[0], str)
                ):
                    subtree[i] = (child[0], child.label())

    return tree
def _get_srl_spans(self, grid):
"""
list of list of (start, end), tag) tuples
"""
if self._srl_includes_roleset:
predicates = self._get_column(grid, self._colmap["srl"] + 1)
start_col = self._colmap["srl"] + 2
else:
predicates = self._get_column(grid, self._colmap["srl"])
start_col = self._colmap["srl"] + 1
# Count how many predicates there are. This tells us how many
# columns to expect for SRL data.
num_preds = len([p for p in predicates if p != "-"])
spanlists = []
for i in range(num_preds):
col = self._get_column(grid, start_col + i)
spanlist = []
stack = []
for wordnum, srl_tag in enumerate(col):
(left, right) = srl_tag.split("*")
for tag in left.split("("):
if tag:
stack.append((tag, wordnum))
for i in range(right.count(")")):
(tag, start) = stack.pop()
spanlist.append(((start, wordnum + 1), tag))
spanlists.append(spanlist)
return spanlists
    def _get_srl_instances(self, grid, pos_in_tree):
        """
        Build a ``ConllSRLInstanceList`` for one sentence, pairing each
        predicate in the grid with the spanlist describing its arguments.
        """
        tree = self._get_parsed_sent(grid, pos_in_tree)
        spanlists = self._get_srl_spans(grid)
        if self._srl_includes_roleset:
            predicates = self._get_column(grid, self._colmap["srl"] + 1)
            rolesets = self._get_column(grid, self._colmap["srl"])
        else:
            predicates = self._get_column(grid, self._colmap["srl"])
            rolesets = [None] * len(predicates)

        instances = ConllSRLInstanceList(tree)
        for wordnum, predicate in enumerate(predicates):
            if predicate == "-":
                continue
            # Decide which spanlist to use.  Don't assume that they're
            # sorted in the same order as the predicates (even though
            # they usually are).
            # A spanlist matches this predicate iff one of its verb spans
            # ("V" or "C-V") covers the predicate's word index.  The inner
            # for/else falls through to the next spanlist when no verb span
            # matches; the outer for/else raises when no spanlist matches.
            for spanlist in spanlists:
                for (start, end), tag in spanlist:
                    if wordnum in range(start, end) and tag in ("V", "C-V"):
                        break
                else:
                    continue
                break
            else:
                raise ValueError("No srl column found for %r" % predicate)
            instances.append(
                ConllSRLInstance(tree, wordnum, predicate, rolesets[wordnum], spanlist)
            )

        return instances
# /////////////////////////////////////////////////////////////////
# Helper Methods
# /////////////////////////////////////////////////////////////////
def _require(self, *columntypes):
for columntype in columntypes:
if columntype not in self._colmap:
raise ValueError(
"This corpus does not contain a %s " "column." % columntype
)
@staticmethod
def _get_column(grid, column_index):
return [grid[i][column_index] for i in range(len(grid))]
class ConllSRLInstance:
    """
    An SRL instance from a CoNLL corpus, which identifies and provides
    labels for the arguments of a single verb.
    """

    # [xx] add inst.core_arguments, inst.argm_arguments?

    def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans):
        # Word indices making up the verb; several indices for multi-word
        # verbs (e.g. 'turn on').
        self.verb = []
        # Index of the verb's head word (e.g. 'turn' in 'turn on').
        self.verb_head = verb_head
        self.verb_stem = verb_stem
        self.roleset = roleset
        # ``((start, end), argid)`` pairs: each argument occupies
        # ``words[start:end]``.
        self.arguments = []
        # Every labeled span: the arguments plus the verb pieces.
        self.tagged_spans = tagged_spans
        # Parse tree and flat word list of the containing sentence.
        self.tree = tree
        self.words = tree.leaves()

        # Partition the tagged spans into verb pieces and arguments.
        for (start, end), tag in tagged_spans:
            if tag in ("V", "C-V"):
                self.verb += list(range(start, end))
            else:
                self.arguments.append(((start, end), tag))

    def __repr__(self):
        plural = "s" if len(self.arguments) != 1 else ""
        return "<ConllSRLInstance for %r with %d argument%s>" % (
            (self.verb_stem, len(self.arguments), plural)
        )

    def pprint(self):
        """Render the sentence with bracketed arguments and a highlighted verb."""
        verbstr = " ".join(self.words[i][0] for i in self.verb)
        hdr = f"SRL for {verbstr!r} (stem={self.verb_stem!r}):\n"
        pieces = []
        for idx, token in enumerate(self.words):
            if isinstance(token, tuple):
                token = token[0]
            for (start, end), argid in self.arguments:
                if idx == start:
                    pieces.append("[%s " % argid)
                if idx == end:
                    pieces.append("] ")
            if idx in self.verb:
                token = "<<%s>>" % token
            pieces.append(token + " ")
        body = "".join(pieces)
        return hdr + textwrap.fill(
            body.replace(" ]", "]"), initial_indent=" ", subsequent_indent=" "
        )
class ConllSRLInstanceList(list):
    """
    Set of ``ConllSRLInstance`` objects for a single sentence, rendered in
    CoNLL column format by ``pprint()``.
    """

    def __init__(self, tree, instances=()):
        self.tree = tree
        list.__init__(self, instances)

    def __str__(self):
        return self.pprint()

    def pprint(self, include_tree=False):
        """
        Render one column block: optionally word/POS/parse columns, then a
        verb-stem column, then one argument column per instance.
        """
        # Sanity check: trees should be the same
        for inst in self:
            if inst.tree != self.tree:
                raise ValueError("Tree mismatch!")

        # Bug fix: bind ``words`` unconditionally.  The original bound it
        # only inside ``if include_tree:``, so pprint(include_tree=False)
        # raised NameError at the loop below.
        words = self.tree.leaves()

        # If desired, add trees:
        if include_tree:
            pos = [None] * len(words)
            synt = ["*"] * len(words)
            self._tree2conll(self.tree, 0, words, pos, synt)

        s = ""
        for i in range(len(words)):
            # optional tree columns
            if include_tree:
                s += "%-20s " % words[i]
                s += "%-8s " % pos[i]
                s += "%15s*%-8s " % tuple(synt[i].split("*"))

            # verb head column
            for inst in self:
                if i == inst.verb_head:
                    s += "%-20s " % inst.verb_stem
                    break
            else:
                s += "%-20s " % "-"
            # Remaining columns: one bracket column per instance.
            for inst in self:
                argstr = "*"
                for (start, end), argid in inst.tagged_spans:
                    if i == start:
                        argstr = f"({argid}{argstr}"
                    if i == (end - 1):
                        argstr += ")"
                s += "%-12s " % argstr
            s += "\n"
        return s

    def _tree2conll(self, tree, wordnum, words, pos, synt):
        """Fill ``words``/``pos``/``synt`` columns from ``tree``; return the
        next word index."""
        assert isinstance(tree, Tree)
        if len(tree) == 1 and isinstance(tree[0], str):
            # Preterminal over a bare word: record the POS label.
            pos[wordnum] = tree.label()
            assert words[wordnum] == tree[0]
            return wordnum + 1
        elif len(tree) == 1 and isinstance(tree[0], tuple):
            assert len(tree[0]) == 2
            # Bug fix: the original wrote ``pos[wordnum], pos[wordnum]``,
            # leaving words[wordnum] as the raw (word, tag) tuple.
            words[wordnum], pos[wordnum] = tree[0]
            return wordnum + 1
        else:
            synt[wordnum] = f"({tree.label()}{synt[wordnum]}"
            for child in tree:
                wordnum = self._tree2conll(child, wordnum, words, pos, synt)
            synt[wordnum - 1] += ")"
            return wordnum
class ConllChunkCorpusReader(ConllCorpusReader):
    """
    A ConllCorpusReader whose data file contains three columns: words,
    pos, and chunk.
    """

    def __init__(
        self, root, fileids, chunk_types, encoding="utf8", tagset=None, separator=None
    ):
        # Delegate to ConllCorpusReader with a fixed three-column layout.
        super().__init__(
            root,
            fileids,
            ("words", "pos", "chunk"),
            chunk_types=chunk_types,
            encoding=encoding,
            tagset=tagset,
            separator=separator,
        )

View File

@@ -0,0 +1,106 @@
# Natural Language Toolkit: An Crubadan N-grams Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
An NLTK interface for the n-gram statistics gathered from
the corpora for each language using An Crubadan.
There are multiple potential applications for the data but
this reader was created with the goal of using it in the
context of language identification.
For details about An Crubadan, this data, and its potential uses, see:
http://borel.slu.edu/crubadan/index.html
"""
import re
from os import path
from nltk.corpus.reader import CorpusReader
from nltk.data import ZipFilePathPointer
from nltk.probability import FreqDist
class CrubadanCorpusReader(CorpusReader):
    """
    A corpus reader used to access language An Crubadan n-gram files.
    """

    _LANG_MAPPER_FILE = "table.txt"

    def __init__(self, root, fileids, encoding="utf8", tagset=None):
        # Fix: forward the caller-supplied encoding.  The original passed a
        # hard-coded "utf8", silently ignoring the parameter.
        super().__init__(root, fileids, encoding=encoding)
        # Per-instance caches.  The original declared ``_all_lang_freq`` as
        # a class-level dict, so the n-gram cache was shared (and mutated)
        # across every reader instance.
        self._all_lang_freq = {}
        self._lang_mapping_data = []
        self._load_lang_mapping_data()

    def lang_freq(self, lang):
        """Return n-gram FreqDist for a specific language
        given ISO 639-3 language code"""
        # Lazily load and cache each language's n-gram counts.
        if lang not in self._all_lang_freq:
            self._all_lang_freq[lang] = self._load_lang_ngrams(lang)
        return self._all_lang_freq[lang]

    def langs(self):
        """Return a list of supported languages as ISO 639-3 codes"""
        return [row[1] for row in self._lang_mapping_data]

    def iso_to_crubadan(self, lang):
        """Return internal Crubadan code based on ISO 639-3 code
        (None if the language is unknown)."""
        for row in self._lang_mapping_data:
            if row[1].lower() == lang.lower():
                return row[0]

    def crubadan_to_iso(self, lang):
        """Return ISO 639-3 code given internal Crubadan code
        (None if the code is unknown)."""
        for row in self._lang_mapping_data:
            if row[0].lower() == lang.lower():
                return row[1]

    def _load_lang_mapping_data(self):
        """Load language mappings between codes and description from table.txt"""
        if isinstance(self.root, ZipFilePathPointer):
            raise RuntimeError(
                "Please install the 'crubadan' corpus first, use nltk.download()"
            )

        mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
        if self._LANG_MAPPER_FILE not in self.fileids():
            raise RuntimeError("Could not find language mapper file: " + mapper_file)

        with open(mapper_file, encoding="utf-8") as raw:
            strip_raw = raw.read().strip()

        # One language per line, tab-separated fields.
        self._lang_mapping_data = [row.split("\t") for row in strip_raw.split("\n")]

    def _load_lang_ngrams(self, lang):
        """Load single n-gram language file given the ISO 639-3 language code
        and return its FreqDist"""
        if lang not in self.langs():
            raise RuntimeError("Unsupported language.")

        crubadan_code = self.iso_to_crubadan(lang)
        ngram_file = path.join(self.root, crubadan_code + "-3grams.txt")

        if not path.isfile(ngram_file):
            raise RuntimeError("No N-gram file found for requested language.")

        counts = FreqDist()
        with open(ngram_file, encoding="utf-8") as f:
            for line in f:
                # Each line: "<count> <ngram>".
                data = line.split(" ")
                ngram = data[1].strip("\n")
                freq = int(data[0])
                counts[ngram] = freq
        return counts

View File

@@ -0,0 +1,115 @@
# Natural Language Toolkit: Dependency Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Kepa Sarasola <kepa.sarasola@ehu.es>
# Iker Manterola <returntothehangar@hotmail.com>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.parse import DependencyGraph
from nltk.tokenize import *
class DependencyCorpusReader(SyntaxCorpusReader):
    """Reader for corpora of dependency-annotated sentences."""

    def __init__(
        self,
        root,
        fileids,
        encoding="utf8",
        word_tokenizer=TabTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
    ):
        # NOTE(review): the tokenizer/block-reader arguments are accepted
        # for API compatibility but are not used by this reader.
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)

    #########################################################

    def _concat_views(self, fileids, tagged, group_by_sent, dependencies):
        """One DependencyCorpusView per file, concatenated lazily."""
        return concat(
            [
                DependencyCorpusView(
                    fileid, tagged, group_by_sent, dependencies, encoding=enc
                )
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def words(self, fileids=None):
        return self._concat_views(fileids, False, False, False)

    def tagged_words(self, fileids=None):
        return self._concat_views(fileids, True, False, False)

    def sents(self, fileids=None):
        return self._concat_views(fileids, False, True, False)

    def tagged_sents(self, fileids=None):
        return self._concat_views(fileids, True, True, False)

    def parsed_sents(self, fileids=None):
        sents = self._concat_views(fileids, False, True, True)
        return [DependencyGraph(sent) for sent in sents]
class DependencyCorpusView(StreamBackedCorpusView):
    # Marker that opens a document (Basque comment in the original said
    # "the document start is defined here").
    _DOCSTART = "-DOCSTART- -DOCSTART- O\n"

    def __init__(
        self,
        corpus_file,
        tagged,
        group_by_sent,
        dependencies,
        chunk_types=None,
        encoding="utf8",
    ):
        self._tagged = tagged
        self._dependencies = dependencies
        self._group_by_sent = group_by_sent
        self._chunk_types = chunk_types
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Read one blank-line-delimited sentence from *stream*."""
        sent = read_blankline_block(stream)[0].strip()

        # Drop the docstart marker if this block begins a document.
        if sent.startswith(self._DOCSTART):
            sent = sent[len(self._DOCSTART) :].lstrip()

        if not self._dependencies:
            # Extract (word, tag) pairs from either supported column layout.
            rows = [row.split("\t") for row in sent.split("\n")]
            width = len(rows[0])
            if width in (3, 4):
                sent = [(row[0], row[1]) for row in rows]
            elif width == 10:
                # CoNLL-style 10-column format: FORM is col 1, POS is col 4.
                sent = [(row[1], row[4]) for row in rows]
            else:
                raise ValueError("Unexpected number of fields in dependency tree file")

            # Drop the tags when the caller asked for plain words.
            if not self._tagged:
                sent = [word for (word, tag) in sent]

        return [sent] if self._group_by_sent else list(sent)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,116 @@
# Natural Language Toolkit: IEER Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for the Information Extraction and Entity Recognition Corpus.
NIST 1999 Information Extraction: Entity Recognition Evaluation
https://www.itl.nist.gov/iad/894.01/tests/ie-er/er_99/er_99.htm
This corpus contains the NEWSWIRE development test data for the
NIST 1999 IE-ER Evaluation. The files were taken from the
subdirectory: ``/ie_er_99/english/devtest/newswire/*.ref.nwt``
and filenames were shortened.
The corpus contains the following files: APW_19980314, APW_19980424,
APW_19980429, NYT_19980315, NYT_19980403, and NYT_19980407.
"""
import nltk
from nltk.corpus.reader.api import *
#: A dictionary whose keys are the names of documents in this corpus;
#: and whose values are descriptions of those documents' contents.
titles = {
    "APW_19980314": "Associated Press Weekly, 14 March 1998",
    "APW_19980424": "Associated Press Weekly, 24 April 1998",
    "APW_19980429": "Associated Press Weekly, 29 April 1998",
    "NYT_19980315": "New York Times, 15 March 1998",
    "NYT_19980403": "New York Times, 3 April 1998",
    "NYT_19980407": "New York Times, 7 April 1998",
}

#: A list of all documents in this corpus, sorted by document name.
documents = sorted(titles)
class IEERDocument:
    """A single IE-ER document: its text plus optional header metadata."""

    def __init__(self, text, docno=None, doctype=None, date_time=None, headline=""):
        self.text = text
        self.docno = docno
        self.doctype = doctype
        self.date_time = date_time
        self.headline = headline

    def __repr__(self):
        if self.headline:
            summary = " ".join(self.headline.leaves())
        else:
            # No headline: summarize with the first dozen non-tag words.
            leading = [w for w in self.text.leaves() if w[:1] != "<"][:12]
            summary = " ".join(leading) + "..."
        if self.docno is not None:
            return f"<IEERDocument {self.docno}: {summary!r}>"
        return "<IEERDocument: %r>" % summary
class IEERCorpusReader(CorpusReader):
    """Reader for the NIST 1999 IE-ER newswire development-test documents."""

    def docs(self, fileids=None):
        """Return each document as a single raw string."""
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def parsed_docs(self, fileids=None):
        """Return each document parsed into an IEERDocument."""
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_parsed_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def _read_parsed_block(self, stream):
        # TODO: figure out why empty documents are being returned
        # Parse each raw block exactly once; the original called
        # self._parse(doc) twice per document (once in the filter and once
        # for the returned value).
        parsed = [self._parse(doc) for doc in self._read_block(stream)]
        return [doc for doc in parsed if doc.docno is not None]

    def _parse(self, doc):
        """Chunk-parse one raw document string into an IEERDocument."""
        val = nltk.chunk.ieerstr2tree(doc, root_label="DOCUMENT")
        if isinstance(val, dict):
            return IEERDocument(**val)
        else:
            return IEERDocument(val)

    def _read_block(self, stream):
        """Read one <DOC>...</DOC> block (plus any preamble) from *stream*."""
        out = []
        # Skip any preamble.
        while True:
            line = stream.readline()
            if not line:
                break
            if line.strip() == "<DOC>":
                break
            out.append(line)
        # Read the document
        while True:
            line = stream.readline()
            if not line:
                break
            out.append(line)
            if line.strip() == "</DOC>":
                break
        # Return the document.
        # NOTE(review): readline() keeps trailing newlines, so joining with
        # "\n" doubles them; preserved as-is since downstream parsing
        # tolerates it.
        return ["\n".join(out)]

View File

@@ -0,0 +1,93 @@
# Natural Language Toolkit: Indian Language POS-Tagged Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Indian Language POS-Tagged Corpus
Collected by A Kumaran, Microsoft Research, India
Distributed with permission
Contents:
- Bangla: IIT Kharagpur
- Hindi: Microsoft Research India
- Marathi: IIT Bombay
- Telugu: IIIT Hyderabad
"""
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tag import map_tag, str2tuple
class IndianCorpusReader(CorpusReader):
    """
    List of words, one per line.  Blank lines are ignored.
    """

    def _tag_mapper(self, tagset):
        """Return a function converting corpus tags into ``tagset``, or None
        when no conversion is needed.  Consolidates the lambda-assignment
        duplicated in tagged_words()/tagged_sents()."""
        if tagset and tagset != self._tagset:
            return lambda t: map_tag(self._tagset, tagset, t)
        return None

    def words(self, fileids=None):
        return concat(
            [
                IndianCorpusView(fileid, enc, False, False)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def tagged_words(self, fileids=None, tagset=None):
        tag_mapping_function = self._tag_mapper(tagset)
        return concat(
            [
                IndianCorpusView(fileid, enc, True, False, tag_mapping_function)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def sents(self, fileids=None):
        return concat(
            [
                IndianCorpusView(fileid, enc, False, True)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def tagged_sents(self, fileids=None, tagset=None):
        tag_mapping_function = self._tag_mapper(tagset)
        return concat(
            [
                IndianCorpusView(fileid, enc, True, True, tag_mapping_function)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )
class IndianCorpusView(StreamBackedCorpusView):
    """Corpus view yielding one sentence per line of '_'-joined word/tag pairs."""

    def __init__(
        self, corpus_file, encoding, tagged, group_by_sent, tag_mapping_function=None
    ):
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._tag_mapping_function = tag_mapping_function
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        line = stream.readline()
        # Metadata/header lines begin with '<' and yield no tokens.
        if line.startswith("<"):
            return []
        tokens = [str2tuple(tok, sep="_") for tok in line.split()]
        if self._tag_mapping_function:
            tokens = [(w, self._tag_mapping_function(t)) for (w, t) in tokens]
        if not self._tagged:
            tokens = [w for (w, t) in tokens]
        return [tokens] if self._group_by_sent else tokens

View File

@@ -0,0 +1,354 @@
# Natural Language Toolkit: IPI PAN Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Konrad Goluchowski <kodie@mimuw.edu.pl>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import functools
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import StreamBackedCorpusView, concat
def _parse_args(fun):
@functools.wraps(fun)
def decorator(self, fileids=None, **kwargs):
kwargs.pop("tags", None)
if not fileids:
fileids = self.fileids()
return fun(self, fileids, **kwargs)
return decorator
class IPIPANCorpusReader(CorpusReader):
    """
    Corpus reader designed to work with corpus created by IPI PAN.
    See http://korpus.pl/en/ for more details about IPI PAN corpus.

    The corpus includes information about text domain, channel and categories.
    You can access possible values using ``domains()``, ``channels()`` and
    ``categories()``. You can use also this metadata to filter files, e.g.:
    ``fileids(channel='prasa')``, ``fileids(categories='publicystyczny')``.

    The reader supports methods: words, sents, paras and their tagged versions.
    You can get part of speech instead of full tag by giving "simplify_tags=True"
    parameter, e.g.: ``tagged_sents(simplify_tags=True)``.

    Also you can get all tags disambiguated tags specifying parameter
    "one_tag=False", e.g.: ``tagged_paras(one_tag=False)``.

    You can get all tags that were assigned by a morphological analyzer specifying
    parameter "disamb_only=False", e.g. ``tagged_words(disamb_only=False)``.

    The IPIPAN Corpus contains tags indicating if there is a space between two
    tokens. To add special "no space" markers, you should specify parameter
    "append_no_space=True", e.g. ``tagged_words(append_no_space=True)``.
    As a result in place where there should be no space between two tokens new
    pair ('', 'no-space') will be inserted (for tagged data) and just '' for
    methods without tags.

    The corpus reader can also try to append spaces between words. To enable this
    option, specify parameter "append_space=True", e.g. ``words(append_space=True)``.
    As a result either ' ' or (' ', 'space') will be inserted between tokens.

    By default, xml entities like &quot; and &amp; are replaced by corresponding
    characters. You can turn off this feature, specifying parameter
    "replace_xmlentities=False", e.g. ``words(replace_xmlentities=False)``.
    """

    def __init__(self, root, fileids):
        CorpusReader.__init__(self, root, fileids, None, None)

    def channels(self, fileids=None):
        """Return the channels annotated in the given files' headers."""
        if not fileids:
            fileids = self.fileids()
        return self._parse_header(fileids, "channel")

    def domains(self, fileids=None):
        """Return the domains annotated in the given files' headers."""
        if not fileids:
            fileids = self.fileids()
        return self._parse_header(fileids, "domain")

    def categories(self, fileids=None):
        """Return the categories (keyTerms) annotated in the given files' headers."""
        if not fileids:
            fileids = self.fileids()
        return [
            self._map_category(cat) for cat in self._parse_header(fileids, "keyTerm")
        ]

    def fileids(self, channels=None, domains=None, categories=None):
        """Return corpus fileids, optionally filtered by exactly one of
        ``channels``, ``domains`` or ``categories``."""
        # Bug fix: the original condition raised only when ALL THREE filters
        # were supplied, so passing exactly two silently ignored one of them
        # -- contradicting the error message below.
        given = [f for f in (channels, domains, categories) if f is not None]
        if len(given) > 1:
            raise ValueError(
                "You can specify only one of channels, domains "
                "and categories parameter at once"
            )
        if not given:
            return CorpusReader.fileids(self)
        if isinstance(channels, str):
            channels = [channels]
        if isinstance(domains, str):
            domains = [domains]
        if isinstance(categories, str):
            categories = [categories]
        if channels:
            return self._list_morph_files_by("channel", channels)
        elif domains:
            return self._list_morph_files_by("domain", domains)
        else:
            return self._list_morph_files_by(
                "keyTerm", categories, map=self._map_category
            )

    @_parse_args
    def sents(self, fileids=None, **kwargs):
        return concat(
            [
                self._view(
                    fileid, mode=IPIPANCorpusView.SENTS_MODE, tags=False, **kwargs
                )
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def paras(self, fileids=None, **kwargs):
        return concat(
            [
                self._view(
                    fileid, mode=IPIPANCorpusView.PARAS_MODE, tags=False, **kwargs
                )
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def words(self, fileids=None, **kwargs):
        return concat(
            [
                self._view(fileid, tags=False, **kwargs)
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def tagged_sents(self, fileids=None, **kwargs):
        return concat(
            [
                self._view(fileid, mode=IPIPANCorpusView.SENTS_MODE, **kwargs)
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def tagged_paras(self, fileids=None, **kwargs):
        return concat(
            [
                self._view(fileid, mode=IPIPANCorpusView.PARAS_MODE, **kwargs)
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def tagged_words(self, fileids=None, **kwargs):
        return concat(
            [self._view(fileid, **kwargs) for fileid in self._list_morph_files(fileids)]
        )

    def _list_morph_files(self, fileids):
        return [f for f in self.abspaths(fileids)]

    def _list_header_files(self, fileids):
        # Each morph.xml has a sibling header.xml carrying the metadata.
        return [
            f.replace("morph.xml", "header.xml")
            for f in self._list_morph_files(fileids)
        ]

    def _parse_header(self, fileids, tag):
        """Collect the distinct values of ``tag`` across the files' headers."""
        values = set()
        for f in self._list_header_files(fileids):
            values_list = self._get_tag(f, tag)
            for v in values_list:
                values.add(v)
        return list(values)

    def _list_morph_files_by(self, tag, values, map=None):
        """Return fileids whose header contains a ``tag`` value in ``values``."""
        fileids = self.fileids()
        ret_fileids = set()
        for f in fileids:
            fp = self.abspath(f).replace("morph.xml", "header.xml")
            values_list = self._get_tag(fp, tag)
            for value in values_list:
                if map is not None:
                    value = map(value)
                if value in values:
                    ret_fileids.add(f)
        return list(ret_fileids)

    def _get_tag(self, f, tag):
        """Extract every ``<tag>...</tag>`` text from header file ``f``."""
        tags = []
        with open(f) as infile:
            header = infile.read()
        tag_end = 0
        while True:
            tag_pos = header.find("<" + tag, tag_end)
            if tag_pos < 0:
                return tags
            tag_end = header.find("</" + tag + ">", tag_pos)
            tags.append(header[tag_pos + len(tag) + 2 : tag_end])

    def _map_category(self, cat):
        # Strip the leading "<...>" qualifier from a keyTerm, if any.
        pos = cat.find(">")
        if pos == -1:
            return cat
        else:
            return cat[pos + 1 :]

    def _view(self, filename, **kwargs):
        """Validate keyword options and build an IPIPANCorpusView."""
        tags = kwargs.pop("tags", True)
        mode = kwargs.pop("mode", 0)
        simplify_tags = kwargs.pop("simplify_tags", False)
        one_tag = kwargs.pop("one_tag", True)
        disamb_only = kwargs.pop("disamb_only", True)
        append_no_space = kwargs.pop("append_no_space", False)
        append_space = kwargs.pop("append_space", False)
        replace_xmlentities = kwargs.pop("replace_xmlentities", True)

        if len(kwargs) > 0:
            raise ValueError("Unexpected arguments: %s" % kwargs.keys())
        if not one_tag and not disamb_only:
            raise ValueError(
                "You cannot specify both one_tag=False and " "disamb_only=False"
            )
        if not tags and (simplify_tags or not one_tag or not disamb_only):
            raise ValueError(
                "You cannot specify simplify_tags, one_tag or "
                "disamb_only with functions other than tagged_*"
            )

        return IPIPANCorpusView(
            filename,
            tags=tags,
            mode=mode,
            simplify_tags=simplify_tags,
            one_tag=one_tag,
            disamb_only=disamb_only,
            append_no_space=append_no_space,
            append_space=append_space,
            replace_xmlentities=replace_xmlentities,
        )
class IPIPANCorpusView(StreamBackedCorpusView):
    # Block granularities for read_block().
    WORDS_MODE = 0
    SENTS_MODE = 1
    PARAS_MODE = 2

    def __init__(self, filename, startpos=0, **kwargs):
        StreamBackedCorpusView.__init__(self, filename, None, startpos, None)
        self.in_sentence = False
        self.position = 0

        self.show_tags = kwargs.pop("tags", True)
        self.disamb_only = kwargs.pop("disamb_only", True)
        self.mode = kwargs.pop("mode", IPIPANCorpusView.WORDS_MODE)
        self.simplify_tags = kwargs.pop("simplify_tags", False)
        self.one_tag = kwargs.pop("one_tag", True)
        self.append_no_space = kwargs.pop("append_no_space", False)
        self.append_space = kwargs.pop("append_space", False)
        self.replace_xmlentities = kwargs.pop("replace_xmlentities", True)

    def read_block(self, stream):
        """Read the next block (word, sentence, or paragraph, depending on
        ``self.mode``) from the XML-like token stream."""
        sentence = []
        sentences = []
        space = False
        no_space = False

        tags = set()

        lines = self._read_data(stream)

        while True:

            # we may have only part of last line
            if len(lines) <= 1:
                self._seek(stream)
                lines = self._read_data(stream)

            if lines == [""]:
                assert not sentences
                return []

            line = lines.pop()
            self.position += len(line) + 1

            if line.startswith('<chunk type="s"'):
                self.in_sentence = True
            elif line.startswith('<chunk type="p"'):
                pass
            elif line.startswith("<tok"):
                # New token: optionally emit a separating space first.
                if self.append_space and space and not no_space:
                    self._append_space(sentence)
                space = True
                no_space = False
                orth = ""
                tags = set()
            elif line.startswith("</chunk"):
                if self.in_sentence:
                    self.in_sentence = False
                    self._seek(stream)
                    if self.mode == self.SENTS_MODE:
                        return [sentence]
                    elif self.mode == self.WORDS_MODE:
                        if self.append_space:
                            self._append_space(sentence)
                        return sentence
                    else:
                        sentences.append(sentence)
                        # Bug fix: start a fresh sentence.  Without this,
                        # PARAS_MODE kept appending the same (still-growing)
                        # list object for every sentence in the paragraph.
                        sentence = []
                elif self.mode == self.PARAS_MODE:
                    self._seek(stream)
                    return [sentences]
            elif line.startswith("<orth"):
                # Surface form: strip the "<orth>"/"</orth>" markup.
                orth = line[6:-7]
                if self.replace_xmlentities:
                    orth = orth.replace("&quot;", '"').replace("&amp;", "&")
            elif line.startswith("<lex"):
                # Morphological analysis; keep it unless we want only
                # disambiguated readings and this one is not.
                if not self.disamb_only or line.find("disamb=") != -1:
                    tag = line[line.index("<ctag") + 6 : line.index("</ctag")]
                    tags.add(tag)
            elif line.startswith("</tok"):
                # Token finished: emit it in the requested shape.
                if self.show_tags:
                    if self.simplify_tags:
                        tags = [t.split(":")[0] for t in tags]
                    if not self.one_tag or not self.disamb_only:
                        sentence.append((orth, tuple(tags)))
                    else:
                        sentence.append((orth, tags.pop()))
                else:
                    sentence.append(orth)
            elif line.startswith("<ns/>"):
                # "No space" marker between this token and the next.
                if self.append_space:
                    no_space = True
                if self.append_no_space:
                    if self.show_tags:
                        sentence.append(("", "no-space"))
                    else:
                        sentence.append("")
            elif line.startswith("</cesAna"):
                pass

    def _read_data(self, stream):
        """Read a 4 KiB chunk and return its lines reversed (pop() order)."""
        self.position = stream.tell()
        buff = stream.read(4096)
        lines = buff.split("\n")
        lines.reverse()
        return lines

    def _seek(self, stream):
        # Rewind to the start of the first unconsumed line.
        stream.seek(self.position)

    def _append_space(self, sentence):
        if self.show_tags:
            sentence.append((" ", "space"))
        else:
            sentence.append(" ")

View File

@@ -0,0 +1,186 @@
#! /usr/bin/env python
# KNB Corpus reader
# Copyright (C) 2001-2025 NLTK Project
# Author: Masato Hagiwara <hagisan@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
import re
from nltk.corpus.reader.api import CorpusReader, SyntaxCorpusReader
from nltk.corpus.reader.util import (
FileSystemPathPointer,
find_corpus_fileids,
read_blankline_block,
)
from nltk.parse import DependencyGraph
# default function to convert morphlist to str for tree representation
_morphs2str_default = lambda morphs: "/".join(m[0] for m in morphs if m[0] != "EOS")
class KNBCorpusReader(SyntaxCorpusReader):
    """
    This class implements:
      - ``__init__``, which specifies the location of the corpus
        and a method for detecting the sentence blocks in corpus files.
      - ``_read_block``, which reads a block from the input stream.
      - ``_word``, which takes a block and returns a list of list of words.
      - ``_tag``, which takes a block and returns a list of list of tagged
        words.
      - ``_parse``, which takes a block and returns a list of parsed
        sentences.

    The structure of tagged words:
        tagged_word = (word(str), tags(tuple))
        tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...)

    Usage example

        >>> from nltk.corpus.util import LazyCorpusLoader
        >>> knbc = LazyCorpusLoader(
        ...     'knbc/corpus1',
        ...     KNBCorpusReader,
        ...     r'.*/KN.*',
        ...     encoding='euc-jp',
        ... )

        >>> len(knbc.sents()[0])
        9
    """

    def __init__(self, root, fileids, encoding="utf8", morphs2str=_morphs2str_default):
        """
        Initialize KNBCorpusReader
        morphs2str is a function to convert morphlist to str for tree representation
        for _parse()
        """
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)
        self.morphs2str = morphs2str

    def _read_block(self, stream):
        # blocks are split by blankline (or EOF) - default
        return read_blankline_block(stream)

    def _word(self, t):
        """Return the surface forms (first cell of each morph line) in block *t*."""
        res = []
        for line in t.splitlines():
            # ignore the Bunsets headers
            if not re.match(r"EOS|\*|\#|\+", line):
                cells = line.strip().split(" ")
                res.append(cells[0])

        return res

    # ignores tagset argument
    def _tag(self, t, tagset=None):
        """Return ``(word, tag-string)`` pairs for block *t*; the tag string
        joins the remaining space-separated cells of the morph line."""
        res = []
        for line in t.splitlines():
            # ignore the Bunsets headers
            if not re.match(r"EOS|\*|\#|\+", line):
                cells = line.strip().split(" ")
                # convert cells to morph tuples
                res.append((cells[0], " ".join(cells[1:])))

        return res

    def _parse(self, t):
        """Parse block *t* into a dependency tree.

        Lines starting with '*' or '+' open a new bunsetsu/tag node whose
        parent and relation are encoded like ``-1D`` / ``3P``; subsequent
        morph lines attach to the most recently opened node.
        """
        dg = DependencyGraph()
        i = 0
        for line in t.splitlines():
            if line[0] in "*+":
                # start of bunsetsu or tag
                cells = line.strip().split(" ", 3)
                m = re.match(r"([\-0-9]*)([ADIP])", cells[1])
                assert m is not None
                node = dg.nodes[i]
                node.update({"address": i, "rel": m.group(2), "word": []})

                dep_parent = int(m.group(1))

                # A parent of -1 marks the root node.
                if dep_parent == -1:
                    dg.root = node
                else:
                    dg.nodes[dep_parent]["deps"].append(i)

                i += 1
            elif line[0] != "#":
                # normal morph
                cells = line.strip().split(" ")
                # convert cells to morph tuples
                morph = cells[0], " ".join(cells[1:])
                # Attach to the node opened by the latest '*'/'+' line.
                dg.nodes[i - 1]["word"].append(morph)

        # Optionally collapse each node's morph list into a display string.
        if self.morphs2str:
            for node in dg.nodes.values():
                node["word"] = self.morphs2str(node["word"])

        return dg.tree()
######################################################################
# Demo
######################################################################
def demo():
    """Print a small sample of the KNBC corpus in several representations."""
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find("corpora/knbc/corpus1")

    # Keep only fileids that look like numbered KNBC documents.
    fileids = [
        name
        for name in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", name)
    ]

    def _knbc_fileids_sort(x):
        # Sort numerically on the three numeric components after the prefix.
        parts = x.split("-")
        return (parts[0], int(parts[1]), int(parts[2]), int(parts[3]))

    knbc = LazyCorpusLoader(
        "knbc/corpus1",
        KNBCorpusReader,
        sorted(fileids, key=_knbc_fileids_sort),
        encoding="euc-jp",
    )

    print(knbc.fileids()[:10])
    print("".join(knbc.words()[:100]))

    print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2]))

    # Swap in a morph formatter that also shows the lemma field.
    knbc.morphs2str = lambda morphs: "/".join(
        "{}({})".format(m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
    ).encode("utf-8")

    print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))

    print(
        "\n".join(
            " ".join("{}/{}".format(w[0], w[1].split(" ")[2]) for w in sent)
            for sent in knbc.tagged_sents()[0:2]
        )
    )
def test():
    """Smoke-test the reader against the installed KNBC corpus."""
    from nltk.corpus.util import LazyCorpusLoader

    reader = LazyCorpusLoader(
        "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
    )

    assert isinstance(reader.words()[0], str)
    assert isinstance(reader.sents()[0][0], str)
    assert isinstance(reader.tagged_words()[0], tuple)
    assert isinstance(reader.tagged_sents()[0][0], tuple)
if __name__ == "__main__":
    # Run the interactive demo when executed as a script.
    demo()

View File

@@ -0,0 +1,183 @@
# Natural Language Toolkit: Lin's Thesaurus
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Dan Blanchard <dblanchard@ets.org>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.txt
import re
from collections import defaultdict
from functools import reduce
from nltk.corpus.reader import CorpusReader
class LinThesaurusCorpusReader(CorpusReader):
    """Wrapper for the LISP-formatted thesauruses distributed by Dekang Lin."""

    # Compiled regular expression for extracting the key from the first line of
    # each thesaurus entry.
    _key_re = re.compile(r'\("?([^"]+)"? \(desc [0-9.]+\).+')

    @staticmethod
    def __defaultdict_factory():
        """Factory for creating defaultdict of defaultdict(dict)s"""
        return defaultdict(dict)

    def __init__(self, root, badscore=0.0):
        """
        Initialize the thesaurus.

        :param root: root directory containing thesaurus LISP files
        :type root: C{string}
        :param badscore: the score to give to words which do not appear in
            each other's sets of synonyms
        :type badscore: C{float}
        """
        super().__init__(root, r"sim[A-Z]\.lsp")
        self._thesaurus = defaultdict(LinThesaurusCorpusReader.__defaultdict_factory)
        self._badscore = badscore
        for path, encoding, fileid in self.abspaths(
            include_encoding=True, include_fileid=True
        ):
            # BUGFIX: honour the per-file encoding reported by abspaths()
            # instead of silently opening with the platform default.
            # (encoding may be None, in which case open() falls back to the
            # default, matching the previous behaviour.)
            with open(path, encoding=encoding) as lin_file:
                first = True
                for line in lin_file:
                    line = line.strip()
                    # Start of entry
                    if first:
                        key = LinThesaurusCorpusReader._key_re.sub(r"\1", line)
                        first = False
                    # End of entry
                    elif line == "))":
                        first = True
                    # Lines with pairs of ngrams and scores
                    else:
                        split_line = line.split("\t")
                        if len(split_line) == 2:
                            ngram, score = split_line
                            self._thesaurus[fileid][key][ngram.strip('"')] = float(
                                score
                            )

    def similarity(self, ngram1, ngram2, fileid=None):
        """
        Returns the similarity score for two ngrams.

        :param ngram1: first ngram to compare
        :type ngram1: C{string}
        :param ngram2: second ngram to compare
        :type ngram2: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, just the score for the two ngrams; otherwise,
                 list of tuples of fileids and scores.
        """
        # Entries don't contain themselves, so make sure similarity between
        # an item and itself is 1.0.
        if ngram1 == ngram2:
            if fileid:
                return 1.0
            else:
                return [(fid, 1.0) for fid in self._fileids]
        else:
            if fileid:
                return (
                    self._thesaurus[fileid][ngram1][ngram2]
                    if ngram2 in self._thesaurus[fileid][ngram1]
                    else self._badscore
                )
            else:
                return [
                    (
                        fid,
                        (
                            self._thesaurus[fid][ngram1][ngram2]
                            if ngram2 in self._thesaurus[fid][ngram1]
                            else self._badscore
                        ),
                    )
                    for fid in self._fileids
                ]

    def scored_synonyms(self, ngram, fileid=None):
        """
        Returns a list of scored synonyms (tuples of synonyms and scores) for the current ngram

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, list of tuples of scores and synonyms; otherwise,
                 list of tuples of fileids and lists, where inner lists consist of tuples of
                 scores and synonyms.
        """
        if fileid:
            return self._thesaurus[fileid][ngram].items()
        else:
            return [
                (fileid, self._thesaurus[fileid][ngram].items())
                for fileid in self._fileids
            ]

    def synonyms(self, ngram, fileid=None):
        """
        Returns a list of synonyms for the current ngram.

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, list of synonyms; otherwise, list of tuples of
                 fileids and lists, where inner lists contain synonyms.
        """
        if fileid:
            return self._thesaurus[fileid][ngram].keys()
        else:
            return [
                (fileid, self._thesaurus[fileid][ngram].keys())
                for fileid in self._fileids
            ]

    def __contains__(self, ngram):
        """
        Determines whether or not the given ngram is in the thesaurus.

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :return: whether the given ngram is in the thesaurus.
        """
        # any() is equivalent to the original reduce(or, ...) but short-circuits.
        return any(ngram in self._thesaurus[fileid] for fileid in self._fileids)
######################################################################
# Demo
######################################################################
def demo():
    """Print a short demonstration of the Lin thesaurus corpus reader."""
    from nltk.corpus import lin_thesaurus as thes

    word1 = "business"
    word2 = "enterprise"

    # (banner, deferred query) pairs, printed in order.
    queries = [
        ("Getting synonyms for " + word1, lambda: thes.synonyms(word1)),
        ("Getting scored synonyms for " + word1, lambda: thes.scored_synonyms(word1)),
        (
            "Getting synonyms from simN.lsp (noun subsection) for " + word1,
            lambda: thes.synonyms(word1, fileid="simN.lsp"),
        ),
        (
            "Getting synonyms from simN.lsp (noun subsection) for " + word1,
            lambda: thes.synonyms(word1, fileid="simN.lsp"),
        ),
        (
            f"Similarity score for {word1} and {word2}:",
            lambda: thes.similarity(word1, word2),
        ),
    ]
    for banner, query in queries:
        print(banner)
        print(query())
# Run the demo when this module is executed as a script.
if __name__ == "__main__":
    demo()

View File

@@ -0,0 +1,344 @@
from collections import namedtuple
from functools import partial, wraps
from nltk.corpus.reader.api import CategorizedCorpusReader
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus.reader.util import concat, read_blankline_block
from nltk.tokenize import blankline_tokenize, sent_tokenize, word_tokenize
def comma_separated_string_args(func):
    """
    A decorator that allows a function to be called with
    a single string of comma-separated values which become
    individual function arguments.

    Strings are split on commas (whitespace-stripped) into a set; lists are
    converted to sets; all other values pass through unchanged.  The same
    normalization is applied to positional and keyword arguments alike
    (previously keyword lists were left untouched, inconsistently with
    positional lists).
    """

    def _normalize(value):
        # Single conversion rule shared by positional and keyword arguments.
        if isinstance(value, str):
            return {part.strip() for part in value.split(",")}
        if isinstance(value, list):
            return set(value)
        return value

    @wraps(func)
    def wrapper(*args, **kwargs):
        args = [_normalize(arg) for arg in args]
        kwargs = {name: _normalize(value) for name, value in kwargs.items()}
        return func(*args, **kwargs)

    return wrapper
def read_parse_blankline_block(stream, parser):
    """Read one blankline-delimited block from *stream* and render it with
    *parser*; returns a one-element list, or the empty list at end of stream."""
    blocks = read_blankline_block(stream)
    if not blocks:
        return blocks
    return [parser.render(blocks[0])]
class MarkdownBlock:
    """A chunk of markdown content exposing tokenized views of its text."""

    def __init__(self, content):
        self.content = content
        # Number of characters shown by str() before eliding with "...".
        self.truncate_at = 16

    def __repr__(self):
        return f"{self.__class__.__name__}(content={repr(str(self))})"

    def __str__(self):
        preview = self.content[: self.truncate_at]
        if len(self.content) > self.truncate_at:
            preview += "..."
        return preview

    @property
    def raw(self):
        """The untokenized markdown text."""
        return self.content

    @property
    def words(self):
        """The content as a flat list of word tokens."""
        return word_tokenize(self.content)

    @property
    def sents(self):
        """The content as a list of sentences, each a list of word tokens."""
        return [word_tokenize(s) for s in sent_tokenize(self.content)]

    @property
    def paras(self):
        """The content as paragraphs of sentences of word tokens."""
        return [
            [word_tokenize(s) for s in sent_tokenize(p)]
            for p in blankline_tokenize(self.content)
        ]
class CodeBlock(MarkdownBlock):
    """A fenced/indented code block; tokenizes line-wise rather than by sentence."""

    def __init__(self, language, *args):
        # The info string of the fence, e.g. "python"; may be empty.
        self.language = language
        super().__init__(*args)

    @property
    def sents(self):
        """Each source line as a list of word tokens."""
        return [word_tokenize(ln) for ln in self.content.splitlines()]

    @property
    def lines(self):
        """The raw source lines."""
        return self.content.splitlines()

    @property
    def paras(self):
        """Blankline-separated chunks, each a list of tokenized lines."""
        return [
            [word_tokenize(ln) for ln in chunk.splitlines()]
            for chunk in blankline_tokenize(self.content)
        ]
class MarkdownSection(MarkdownBlock):
    """A document section: a heading plus the content beneath it."""

    def __init__(self, heading, level, *args):
        self.heading = heading  # heading text
        self.level = level  # heading depth (number of '#' markers)
        super().__init__(*args)
# Lightweight records for inline markdown elements extracted by the readers below.
Image = namedtuple("Image", "label, src, title")
Link = namedtuple("Link", "label, href, title")
List = namedtuple("List", "is_ordered, items")
class MarkdownCorpusReader(PlaintextCorpusReader):
    """Plaintext-style corpus reader that strips markdown markup by rendering
    each paragraph block through a markdown parser."""

    def __init__(self, *args, parser=None, **kwargs):
        from markdown_it import MarkdownIt
        from mdit_plain.renderer import RendererPlain
        from mdit_py_plugins.front_matter import front_matter_plugin

        if parser is None:
            # Default: CommonMark rendered to plain text, with YAML
            # front-matter recognized (and thus excluded from the text).
            parser = MarkdownIt("commonmark", renderer_cls=RendererPlain)
            parser.use(front_matter_plugin)
        self.parser = parser
        kwargs.setdefault(
            "para_block_reader", partial(read_parse_blankline_block, parser=self.parser)
        )
        super().__init__(*args, **kwargs)

    # This override takes care of removing markup.
    def _read_word_block(self, stream):
        tokens = []
        for paragraph in self._para_block_reader(stream):
            tokens.extend(self._word_tokenizer.tokenize(paragraph))
        return tokens
class CategorizedMarkdownCorpusReader(CategorizedCorpusReader, MarkdownCorpusReader):
    """
    A reader for markdown corpora whose documents are divided into
    categories based on their file identifiers.

    Based on nltk.corpus.reader.plaintext.CategorizedPlaintextCorpusReader:
    https://www.nltk.org/_modules/nltk/corpus/reader/api.html#CategorizedCorpusReader
    """

    def __init__(self, *args, cat_field="tags", **kwargs):
        """
        Initialize the corpus reader. Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor. The remaining arguments
        are passed to the ``MarkdownCorpusReader`` constructor.

        :param cat_field: front-matter metadata field used to derive
            categories when no explicit categorization argument is given.
        """
        cat_args = ["cat_pattern", "cat_map", "cat_file"]
        if not any(arg in kwargs for arg in cat_args):
            # Initialize with a blank map now,
            # and try to build categories from document metadata later.
            kwargs["cat_map"] = dict()
        CategorizedCorpusReader.__init__(self, kwargs)
        MarkdownCorpusReader.__init__(self, *args, **kwargs)

        # Map file IDs to categories if self._map exists but is still empty:
        if self._map is not None and not self._map:
            for file_id in self._fileids:
                metadata = self.metadata(file_id)
                if metadata:
                    # First front-matter document supplies the categories.
                    self._map[file_id] = metadata[0].get(cat_field, [])

    ### Begin CategorizedCorpusReader Overrides
    @comma_separated_string_args
    def categories(self, fileids=None):
        # Decorator lets callers pass "cat1, cat2" strings as well as lists.
        return super().categories(fileids)

    @comma_separated_string_args
    def fileids(self, categories=None):
        if categories is None:
            return self._fileids
        return super().fileids(categories)

    ### End CategorizedCorpusReader Overrides

    ### Begin MarkdownCorpusReader Overrides
    @comma_separated_string_args
    def raw(self, fileids=None, categories=None):
        # _resolve (from CategorizedCorpusReader) maps categories -> fileids.
        return super().raw(self._resolve(fileids, categories))

    @comma_separated_string_args
    def words(self, fileids=None, categories=None):
        return super().words(self._resolve(fileids, categories))

    @comma_separated_string_args
    def sents(self, fileids=None, categories=None):
        return super().sents(self._resolve(fileids, categories))

    @comma_separated_string_args
    def paras(self, fileids=None, categories=None):
        return super().paras(self._resolve(fileids, categories))

    ### End MarkdownCorpusReader Overrides

    def concatenated_view(self, reader, fileids, categories):
        """Concatenate a corpus view built with *reader* over the selected files."""
        return concat(
            [
                self.CorpusView(path, reader, encoding=enc)
                for (path, enc) in self.abspaths(
                    self._resolve(fileids, categories), include_encoding=True
                )
            ]
        )

    def metadata_reader(self, stream):
        """Parse YAML front-matter blocks from *stream* into dicts."""
        from yaml import safe_load

        return [
            safe_load(t.content)
            for t in self.parser.parse(stream.read())
            if t.type == "front_matter"
        ]

    @comma_separated_string_args
    def metadata(self, fileids=None, categories=None):
        return self.concatenated_view(self.metadata_reader, fileids, categories)

    def blockquote_reader(self, stream):
        """Extract top-level blockquotes as MarkdownBlock objects."""
        tokens = self.parser.parse(stream.read())
        opening_tokens = filter(
            lambda t: t.level == 0 and t.type == "blockquote_open", tokens
        )
        closing_tokens = filter(
            lambda t: t.level == 0 and t.type == "blockquote_close", tokens
        )
        blockquotes = list()
        for o, c in zip(opening_tokens, closing_tokens):
            opening_index = tokens.index(o)
            # Search for the close only at/after the matching open.
            closing_index = tokens.index(c, opening_index)
            blockquotes.append(tokens[opening_index : closing_index + 1])
        return [
            MarkdownBlock(
                self.parser.renderer.render(block, self.parser.options, env=None)
            )
            for block in blockquotes
        ]

    @comma_separated_string_args
    def blockquotes(self, fileids=None, categories=None):
        return self.concatenated_view(self.blockquote_reader, fileids, categories)

    def code_block_reader(self, stream):
        """Extract top-level fenced/indented code blocks as CodeBlock objects."""
        return [
            CodeBlock(
                t.info,
                t.content,
            )
            for t in self.parser.parse(stream.read())
            if t.level == 0 and t.type in ("fence", "code_block")
        ]

    @comma_separated_string_args
    def code_blocks(self, fileids=None, categories=None):
        return self.concatenated_view(self.code_block_reader, fileids, categories)

    def image_reader(self, stream):
        """Extract all images as Image(label, src, title) tuples."""
        return [
            Image(
                child_token.content,
                child_token.attrGet("src"),
                child_token.attrGet("title"),
            )
            for inline_token in filter(
                lambda t: t.type == "inline", self.parser.parse(stream.read())
            )
            for child_token in inline_token.children
            if child_token.type == "image"
        ]

    @comma_separated_string_args
    def images(self, fileids=None, categories=None):
        return self.concatenated_view(self.image_reader, fileids, categories)

    def link_reader(self, stream):
        """Extract all hyperlinks as Link(label, href, title) tuples."""
        return [
            Link(
                # assumes the token following link_open holds the label text
                # -- TODO confirm against markdown-it's inline token layout
                inline_token.children[i + 1].content,
                child_token.attrGet("href"),
                child_token.attrGet("title"),
            )
            for inline_token in filter(
                lambda t: t.type == "inline", self.parser.parse(stream.read())
            )
            for i, child_token in enumerate(inline_token.children)
            if child_token.type == "link_open"
        ]

    @comma_separated_string_args
    def links(self, fileids=None, categories=None):
        return self.concatenated_view(self.link_reader, fileids, categories)

    def list_reader(self, stream):
        """Extract top-level bullet/ordered lists as List(is_ordered, items)."""
        tokens = self.parser.parse(stream.read())
        opening_types = ("bullet_list_open", "ordered_list_open")
        opening_tokens = filter(
            lambda t: t.level == 0 and t.type in opening_types, tokens
        )
        closing_types = ("bullet_list_close", "ordered_list_close")
        closing_tokens = filter(
            lambda t: t.level == 0 and t.type in closing_types, tokens
        )
        list_blocks = list()
        for o, c in zip(opening_tokens, closing_tokens):
            opening_index = tokens.index(o)
            closing_index = tokens.index(c, opening_index)
            list_blocks.append(tokens[opening_index : closing_index + 1])
        return [
            # NOTE: the comprehension variable shadows the outer ``tokens``;
            # inside this expression ``tokens`` is one list block.
            List(
                tokens[0].type == "ordered_list_open",
                [t.content for t in tokens if t.content],
            )
            for tokens in list_blocks
        ]

    @comma_separated_string_args
    def lists(self, fileids=None, categories=None):
        return self.concatenated_view(self.list_reader, fileids, categories)

    def section_reader(self, stream):
        """Split the document into heading-delimited MarkdownSection objects."""
        section_blocks, block = list(), list()
        for t in self.parser.parse(stream.read()):
            if t.level == 0 and t.type == "heading_open":
                if not block:
                    block.append(t)
                else:
                    # A new top-level heading closes the previous section.
                    section_blocks.append(block)
                    block = [t]
            elif block:
                block.append(t)
        if block:
            section_blocks.append(block)
        return [
            MarkdownSection(
                # block[1] is the heading's inline token; block[0].markup is
                # the "#" run, whose length gives the heading level.
                block[1].content,
                block[0].markup.count("#"),
                self.parser.renderer.render(block, self.parser.options, env=None),
            )
            for block in section_blocks
        ]

    @comma_separated_string_args
    def sections(self, fileids=None, categories=None):
        return self.concatenated_view(self.section_reader, fileids, categories)

View File

@@ -0,0 +1,398 @@
"""
A reader for corpora whose documents are in MTE format.
"""
import os
import re
from functools import reduce
from nltk.corpus.reader import TaggedCorpusReader, concat
from nltk.corpus.reader.xmldocs import XMLCorpusView
def xpath(root, path, ns):
    """Return all elements under *root* matching the ElementPath expression
    *path*, with namespace prefixes resolved through the mapping *ns*."""
    return list(root.iterfind(path, ns))
class MTECorpusView(XMLCorpusView):
    """
    Class for lazy viewing the MTE Corpus.
    """

    def __init__(self, fileid, tagspec, elt_handler=None):
        XMLCorpusView.__init__(self, fileid, tagspec, elt_handler)

    def read_block(self, stream, tagspec=None, elt_handler=None):
        # Drop entries the element handler rejected (i.e. returned None).
        raw = XMLCorpusView.read_block(self, stream, tagspec, elt_handler)
        return [entry for entry in raw if entry is not None]
class MTEFileReader:
    """
    Class for loading the content of the multext-east corpus. It
    parses the xml files and does some tag-filtering depending on the
    given method parameters.
    """

    # BUGFIX: XML namespace URIs are opaque identifiers that must match the
    # documents' declarations exactly.  TEI and the xml namespace are declared
    # with "http://" (not "https://"), so the https:// variants never matched
    # any element and every view came back empty.
    ns = {
        "tei": "http://www.tei-c.org/ns/1.0",
        "xml": "http://www.w3.org/XML/1998/namespace",
    }
    tag_ns = "{http://www.tei-c.org/ns/1.0}"
    xml_ns = "{http://www.w3.org/XML/1998/namespace}"
    # Element paths into the TEI document for words, sentences, paragraphs.
    word_path = "TEI/text/body/div/div/p/s/(w|c)"
    sent_path = "TEI/text/body/div/div/p/s"
    para_path = "TEI/text/body/div/div/p"

    def __init__(self, file_path):
        self.__file_path = file_path

    @classmethod
    def _word_elt(cls, elt, context):
        # A word/punctuation element: its text content is the token.
        return elt.text

    @classmethod
    def _sent_elt(cls, elt, context):
        return [cls._word_elt(w, None) for w in xpath(elt, "*", cls.ns)]

    @classmethod
    def _para_elt(cls, elt, context):
        return [cls._sent_elt(s, None) for s in xpath(elt, "*", cls.ns)]

    @classmethod
    def _tagged_word_elt(cls, elt, context):
        # NOTE: the filter state (__tags/__tagset) is stored on the class by
        # tagged_words/tagged_sents/tagged_paras just before a view is built;
        # not thread-safe, but preserved from the original design.
        if "ana" not in elt.attrib:
            return (elt.text, "")

        if cls.__tags == "" and cls.__tagset == "msd":
            return (elt.text, elt.attrib["ana"])
        elif cls.__tags == "" and cls.__tagset == "universal":
            return (elt.text, MTETagConverter.msd_to_universal(elt.attrib["ana"]))
        else:
            # Keep only words whose MSD tag refines the requested prefix;
            # "-" in the requested tag acts as a single-position wildcard.
            tags = re.compile("^" + re.sub("-", ".", cls.__tags) + ".*$")
            if tags.match(elt.attrib["ana"]):
                if cls.__tagset == "msd":
                    return (elt.text, elt.attrib["ana"])
                else:
                    return (
                        elt.text,
                        MTETagConverter.msd_to_universal(elt.attrib["ana"]),
                    )
            else:
                return None

    @classmethod
    def _tagged_sent_elt(cls, elt, context):
        return list(
            filter(
                lambda x: x is not None,
                [cls._tagged_word_elt(w, None) for w in xpath(elt, "*", cls.ns)],
            )
        )

    @classmethod
    def _tagged_para_elt(cls, elt, context):
        return list(
            filter(
                lambda x: x is not None,
                [cls._tagged_sent_elt(s, None) for s in xpath(elt, "*", cls.ns)],
            )
        )

    @classmethod
    def _lemma_word_elt(cls, elt, context):
        if "lemma" not in elt.attrib:
            return (elt.text, "")
        else:
            return (elt.text, elt.attrib["lemma"])

    @classmethod
    def _lemma_sent_elt(cls, elt, context):
        return [cls._lemma_word_elt(w, None) for w in xpath(elt, "*", cls.ns)]

    @classmethod
    def _lemma_para_elt(cls, elt, context):
        return [cls._lemma_sent_elt(s, None) for s in xpath(elt, "*", cls.ns)]

    def words(self):
        """Lazy view of all word/punctuation tokens."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.word_path, MTEFileReader._word_elt
        )

    def sents(self):
        """Lazy view of sentences, each a list of tokens."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.sent_path, MTEFileReader._sent_elt
        )

    def paras(self):
        """Lazy view of paragraphs of sentences of tokens."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.para_path, MTEFileReader._para_elt
        )

    def lemma_words(self):
        """Lazy view of (word, lemma) tuples."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.word_path, MTEFileReader._lemma_word_elt
        )

    def tagged_words(self, tagset, tags):
        """Lazy view of (word, tag) tuples filtered by *tagset*/*tags*."""
        MTEFileReader.__tagset = tagset
        MTEFileReader.__tags = tags
        return MTECorpusView(
            self.__file_path, MTEFileReader.word_path, MTEFileReader._tagged_word_elt
        )

    def lemma_sents(self):
        """Lazy view of sentences of (word, lemma) tuples."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.sent_path, MTEFileReader._lemma_sent_elt
        )

    def tagged_sents(self, tagset, tags):
        """Lazy view of sentences of (word, tag) tuples."""
        MTEFileReader.__tagset = tagset
        MTEFileReader.__tags = tags
        return MTECorpusView(
            self.__file_path, MTEFileReader.sent_path, MTEFileReader._tagged_sent_elt
        )

    def lemma_paras(self):
        """Lazy view of paragraphs of sentences of (word, lemma) tuples."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.para_path, MTEFileReader._lemma_para_elt
        )

    def tagged_paras(self, tagset, tags):
        """Lazy view of paragraphs of sentences of (word, tag) tuples."""
        MTEFileReader.__tagset = tagset
        MTEFileReader.__tags = tags
        return MTECorpusView(
            self.__file_path, MTEFileReader.para_path, MTEFileReader._tagged_para_elt
        )
class MTETagConverter:
    """
    Class for converting msd tags to universal tags, more conversion
    options are currently not implemented.
    """

    # MSD category letter -> universal POS tag.
    mapping_msd_universal = {
        "A": "ADJ",
        "S": "ADP",
        "R": "ADV",
        "C": "CONJ",
        "D": "DET",
        "N": "NOUN",
        "M": "NUM",
        "Q": "PRT",
        "P": "PRON",
        "V": "VERB",
        ".": ".",
        "-": "X",
    }

    @staticmethod
    def msd_to_universal(tag):
        """
        Convert a MULTEXT-East MSD annotation to the universal tagset
        (as described in Chapter 5 of the NLTK book).  The category is the
        first character of the tag (skipping a leading "#" if present);
        unknown categories map to ``X``.
        """
        indicator = tag[1] if tag[0] == "#" else tag[0]
        return MTETagConverter.mapping_msd_universal.get(indicator, "X")
class MTECorpusReader(TaggedCorpusReader):
    """
    Reader for corpora following the TEI-p5 xml scheme, such as MULTEXT-East.
    MULTEXT-East contains part-of-speech-tagged words with a quite precise tagging
    scheme. These tags can be converted to the Universal tagset
    """

    def __init__(self, root=None, fileids=None, encoding="utf8"):
        """
        Construct a new MTECorpusreader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP

        :param root: The root directory for this corpus. (default points to location in multext config file)
        :param fileids: A list or regexp specifying the fileids in this corpus. (default is oana-en.xml)
        :param encoding: The encoding of the given files (default is utf8)
        """
        TaggedCorpusReader.__init__(self, root, fileids, encoding)
        self._readme = "00README.txt"

    def __fileids(self, fileids):
        """Normalize and validate *fileids*; returns a list of usable fileids."""
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        # filter wrong userinput
        # BUGFIX: materialize as lists -- the previous lazy ``filter`` objects
        # are always truthy, so the "no valid file" warning below could never
        # fire, and the result could only be consumed once.
        fileids = [f for f in fileids if f in self._fileids]
        # filter multext-east sourcefiles that are not compatible to the teip5 specification
        fileids = [f for f in fileids if f not in ("oana-bg.xml", "oana-mk.xml")]
        if not fileids:
            print("No valid multext-east file specified")
        return fileids

    def words(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).words()
                for f in self.__fileids(fileids)
            ]
        )

    def sents(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of sentences or utterances,
                 each encoded as a list of word strings
        :rtype: list(list(str))
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).sents()
                for f in self.__fileids(fileids)
            ]
        )

    def paras(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of paragraphs, each encoded as a list
                 of sentences, which are in turn encoded as lists of word string
        :rtype: list(list(list(str)))
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).paras()
                for f in self.__fileids(fileids)
            ]
        )

    def lemma_words(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of words, the corresponding lemmas
                 and punctuation symbols, encoded as tuples (word, lemma)
        :rtype: list(tuple(str,str))
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).lemma_words()
                for f in self.__fileids(fileids)
            ]
        )

    def tagged_words(self, fileids=None, tagset="msd", tags=""):
        """
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of tagged words and punctuation symbols
                 encoded as tuples (word, tag)
        :rtype: list(tuple(str, str))
        """
        if tagset == "universal" or tagset == "msd":
            return concat(
                [
                    MTEFileReader(os.path.join(self._root, f)).tagged_words(
                        tagset, tags
                    )
                    for f in self.__fileids(fileids)
                ]
            )
        else:
            print("Unknown tagset specified.")

    def lemma_sents(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of sentences or utterances, each
                 encoded as a list of tuples of the word and the corresponding
                 lemma (word, lemma)
        :rtype: list(list(tuple(str, str)))
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).lemma_sents()
                for f in self.__fileids(fileids)
            ]
        )

    def tagged_sents(self, fileids=None, tagset="msd", tags=""):
        """
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of sentences or utterances, each
                 encoded as a list of (word,tag) tuples
        :rtype: list(list(tuple(str, str)))
        """
        if tagset == "universal" or tagset == "msd":
            return concat(
                [
                    MTEFileReader(os.path.join(self._root, f)).tagged_sents(
                        tagset, tags
                    )
                    for f in self.__fileids(fileids)
                ]
            )
        else:
            print("Unknown tagset specified.")

    def lemma_paras(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of paragraphs, each encoded as a
                 list of sentences, which are in turn encoded as a list of
                 tuples of the word and the corresponding lemma (word, lemma)
        :rtype: list(List(List(tuple(str, str))))
        """
        return concat(
            [
                MTEFileReader(os.path.join(self._root, f)).lemma_paras()
                for f in self.__fileids(fileids)
            ]
        )

    def tagged_paras(self, fileids=None, tagset="msd", tags=""):
        """
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of paragraphs, each encoded as a
                 list of sentences, which are in turn encoded as a list
                 of (word,tag) tuples
        :rtype: list(list(list(tuple(str, str))))
        """
        if tagset == "universal" or tagset == "msd":
            return concat(
                [
                    MTEFileReader(os.path.join(self._root, f)).tagged_paras(
                        tagset, tags
                    )
                    for f in self.__fileids(fileids)
                ]
            )
        else:
            print("Unknown tagset specified.")

View File

@@ -0,0 +1,486 @@
# Natural Language Toolkit: NKJP Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Gabriela Kaczka
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import functools
import os
import re
import tempfile
from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
def _parse_args(fun):
"""
Wraps function arguments:
if fileids not specified then function set NKJPCorpusReader paths.
"""
@functools.wraps(fun)
def decorator(self, fileids=None, **kwargs):
if not fileids:
fileids = self._paths
return fun(self, fileids, **kwargs)
return decorator
class NKJPCorpusReader(XMLCorpusReader):
    """Corpus reader for the National Corpus of Polish (NKJP)."""

    # Modes selecting which specialized view _view() constructs.
    WORDS_MODE = 0
    SENTS_MODE = 1
    HEADER_MODE = 2
    RAW_MODE = 3

    def __init__(self, root, fileids=".*"):
        """
        Corpus reader designed to work with National Corpus of Polish.
        See http://nkjp.pl/ for more details about NKJP.
        use example:
        import nltk
        import nkjp
        from nkjp import NKJPCorpusReader
        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='') # obtain the whole corpus
        x.header()
        x.raw()
        x.words()
        x.tagged_words(tags=['subst', 'comp']) #Link to find more tags: nkjp.pl/poliqarp/help/ense2.html
        x.sents()
        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='Wilk*') # obtain particular file(s)
        x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'])
        x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp'])
        """
        # Each NKJP document is a directory containing header.xml,
        # text.xml and ann_*.xml files; the underlying fileids point at
        # the header.xml of each document.
        if isinstance(fileids, str):
            XMLCorpusReader.__init__(self, root, fileids + ".*/header.xml")
        else:
            XMLCorpusReader.__init__(
                self, root, [fileid + "/header.xml" for fileid in fileids]
            )
        self._paths = self.get_paths()

    def get_paths(self):
        # Directory prefix of each document (strip the trailing "header.xml").
        return [
            os.path.join(str(self._root), f.split("header.xml")[0])
            for f in self._fileids
        ]

    def fileids(self):
        """
        Returns a list of file identifiers for the fileids that make up
        this corpus.
        """
        return [f.split("header.xml")[0] for f in self._fileids]

    def _view(self, filename, tags=None, **kwargs):
        """
        Returns a view specialised for use with particular corpus file.

        :param filename: document directory path (with root prepended).
        :param tags: optional list of morphological tags used by WORDS_MODE.
        :raises NameError: if an unknown mode is requested.
        """
        mode = kwargs.pop("mode", NKJPCorpusReader.WORDS_MODE)
        if mode is NKJPCorpusReader.WORDS_MODE:
            return NKJPCorpus_Morph_View(filename, tags=tags)
        elif mode is NKJPCorpusReader.SENTS_MODE:
            return NKJPCorpus_Segmentation_View(filename, tags=tags)
        elif mode is NKJPCorpusReader.HEADER_MODE:
            return NKJPCorpus_Header_View(filename, tags=tags)
        elif mode is NKJPCorpusReader.RAW_MODE:
            return NKJPCorpus_Text_View(
                filename, tags=tags, mode=NKJPCorpus_Text_View.RAW_MODE
            )
        else:
            raise NameError("No such mode!")

    def add_root(self, fileid):
        """
        Add root if necessary to specified fileid.
        """
        # NOTE(review): relies on "+" / "in" between self.root and the fileid
        # string -- presumably self.root stringifies to a path prefix here;
        # verify against the PathPointer type actually used.
        if self.root in fileid:
            return fileid
        return self.root + fileid

    @_parse_args
    def header(self, fileids=None, **kwargs):
        """
        Returns header(s) of specified fileids.
        """
        return concat(
            [
                self._view(
                    self.add_root(fileid), mode=NKJPCorpusReader.HEADER_MODE, **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )

    @_parse_args
    def sents(self, fileids=None, **kwargs):
        """
        Returns sentences in specified fileids.
        """
        return concat(
            [
                self._view(
                    self.add_root(fileid), mode=NKJPCorpusReader.SENTS_MODE, **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )

    @_parse_args
    def words(self, fileids=None, **kwargs):
        """
        Returns words in specified fileids.
        """
        return concat(
            [
                self._view(
                    self.add_root(fileid), mode=NKJPCorpusReader.WORDS_MODE, **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )

    @_parse_args
    def tagged_words(self, fileids=None, **kwargs):
        """
        Call with specified tags as a list, e.g. tags=['subst', 'comp'].
        Returns tagged words in specified fileids.
        """
        tags = kwargs.pop("tags", [])
        return concat(
            [
                self._view(
                    self.add_root(fileid),
                    mode=NKJPCorpusReader.WORDS_MODE,
                    tags=tags,
                    **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )

    @_parse_args
    def raw(self, fileids=None, **kwargs):
        """
        Returns raw text of the specified fileids.
        """
        return concat(
            [
                self._view(
                    self.add_root(fileid), mode=NKJPCorpusReader.RAW_MODE, **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )
class NKJPCorpus_Header_View(XMLCorpusView):
    def __init__(self, filename, **kwargs):
        """
        HEADER_MODE
        A stream backed corpus view specialized for use with
        header.xml files in NKJP corpus.
        """
        self.tagspec = ".*/sourceDesc$"
        XMLCorpusView.__init__(self, filename + "header.xml", self.tagspec)

    def handle_query(self):
        """Read the whole file and return the accumulated header entries."""
        self._open()
        header = []
        while True:
            segm = XMLCorpusView.read_block(self, self._stream)
            if len(segm) == 0:
                break
            header.extend(segm)
        self.close()
        return header

    @staticmethod
    def _bibl_field(elt, name):
        """Join the stripped text of all ``bibl/<name>`` children with
        newlines; return an empty list when no such child exists (the
        historical return value, kept for compatibility)."""
        found = elt.findall("bibl/" + name)
        if found:
            return "\n".join(item.text.strip() for item in found)
        return []

    def handle_elt(self, elt, context):
        """Extract the bibliographic fields of one ``sourceDesc`` element.

        Refactored: the six identical copy-pasted extraction blocks are now
        a single helper, ``_bibl_field``.
        """
        return {
            "title": self._bibl_field(elt, "title"),
            "author": self._bibl_field(elt, "author"),
            "date": self._bibl_field(elt, "date"),
            "publisher": self._bibl_field(elt, "publisher"),
            "idno": self._bibl_field(elt, "idno"),
            "note": self._bibl_field(elt, "note"),
        }
class XML_Tool:
    """
    Helper class creating xml file to one without references to nkjp: namespace.
    That's needed because the XMLCorpusView assumes that one can find short substrings
    of XML that are valid XML, which is not true if a namespace is declared at top level
    """

    def __init__(self, root, filename):
        self.read_file = os.path.join(root, filename)
        # BUGFIX: the temporary file must be opened in text mode -- the
        # default mode is "w+b", which made every fw.write(str) below raise
        # TypeError.  Encoding is left at the platform default, matching the
        # read side -- TODO confirm whether utf-8 should be forced for NKJP.
        self.write_file = tempfile.NamedTemporaryFile(mode="w", delete=False)

    def build_preprocessed_file(self):
        """Copy ``read_file`` into the temporary file while stripping all
        ``nkjp:`` namespace constructs; return the temp file's path.

        On any failure the temp file is removed and the error re-raised.
        """
        try:
            with open(self.read_file) as fr, self.write_file as fw:
                for line in fr:
                    # Drop nkjp:-prefixed attributes (in all files).
                    ret = " ".join(re.split(r"nkjp:[^ ]* ", line))
                    # Drop paren/choice wrapper tags (in ann_segmentation.xml).
                    for marker in (
                        "<nkjp:paren>",
                        "</nkjp:paren>",
                        "<choice>",
                        "</choice>",
                    ):
                        ret = " ".join(ret.split(marker))
                    fw.write(ret)
            return self.write_file.name
        except Exception as e:
            self.remove_preprocessed_file()
            raise Exception from e

    def remove_preprocessed_file(self):
        """Delete the temporary preprocessed file from disk."""
        os.remove(self.write_file.name)
class NKJPCorpus_Segmentation_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    ann_segmentation.xml files in NKJP corpus.
    """

    def __init__(self, filename, **kwargs):
        self.tagspec = ".*p/.*s"
        # intersperse NKJPCorpus_Text_View: the segmentation file only holds
        # offsets into the text segments, so load the text first.
        self.text_view = NKJPCorpus_Text_View(
            filename, mode=NKJPCorpus_Text_View.SENTS_MODE
        )
        self.text_view.handle_query()
        # xml preprocessing (strip nkjp: namespace constructs)
        self.xml_tool = XML_Tool(filename, "ann_segmentation.xml")
        # base class init
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )

    def get_segm_id(self, example_word):
        # Extract the text-segment id: the first comma-separated field inside
        # the parentheses of the segment's "corresp" pointer.
        # (presumably of the form "...(<segm_id>,<start>,<len>)" -- verify
        # against the corpus data)
        return example_word.split("(")[1].split(",")[0]

    def get_sent_beg(self, beg_word):
        # returns index of beginning letter in sentence
        return int(beg_word.split(",")[1])

    def get_sent_end(self, end_word):
        # returns index of end letter in sentence
        # (start offset + length of the final word)
        splitted = end_word.split(")")[0].split(",")
        return int(splitted[1]) + int(splitted[2])

    def get_sentences(self, sent_segm):
        # returns one sentence: slice of the text segment spanning from the
        # first word's start offset to the last word's end offset
        id = self.get_segm_id(sent_segm[0])
        segm = self.text_view.segm_dict[id]  # text segment
        beg = self.get_sent_beg(sent_segm[0])
        end = self.get_sent_end(sent_segm[len(sent_segm) - 1])
        return segm[beg:end]

    def remove_choice(self, segm):
        # Where the annotation offers alternative segmentations ("choice"),
        # keep only the first alternative.
        ret = []
        prev_txt_end = -1
        prev_txt_nr = -1
        for word in segm:
            txt_nr = self.get_segm_id(word)
            # get increasing sequence of ids: in case of choice get first possibility
            if self.get_sent_beg(word) > prev_txt_end - 1 or prev_txt_nr != txt_nr:
                ret.append(word)
                prev_txt_end = self.get_sent_end(word)
                prev_txt_nr = txt_nr
        return ret

    def handle_query(self):
        """Return the list of sentences (text slices); the preprocessed temp
        file is removed whether or not an error occurs."""
        try:
            self._open()
            sentences = []
            while True:
                sent_segm = XMLCorpusView.read_block(self, self._stream)
                if len(sent_segm) == 0:
                    break
                for segm in sent_segm:
                    segm = self.remove_choice(segm)
                    sentences.append(self.get_sentences(segm))
            self.close()
            self.xml_tool.remove_preprocessed_file()
            return sentences
        except Exception as e:
            self.xml_tool.remove_preprocessed_file()
            raise Exception from e

    def handle_elt(self, elt, context):
        # One sentence element: collect the "corresp" pointer of each segment.
        ret = []
        for seg in elt:
            ret.append(seg.get("corresp"))
        return ret
class NKJPCorpus_Text_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    text.xml files in NKJP corpus.
    """

    # Reading modes; SENTS_MODE additionally fills ``segm_dict`` so the
    # segmentation view can map segment ids back to raw text.
    SENTS_MODE = 0
    RAW_MODE = 1

    def __init__(self, filename, **kwargs):
        self.mode = kwargs.pop("mode", 0)
        # Match every anonymous block (<ab>) element inside a <div>.
        self.tagspec = ".*/div/ab"
        # segment id -> segment text (filled only in SENTS_MODE)
        self.segm_dict = dict()
        # xml preprocessing: strip the nkjp: namespace
        self.xml_tool = XML_Tool(filename, "text.xml")
        # base class init
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )

    def handle_query(self):
        """Read the whole file and return its text (see ``read_block``)."""
        try:
            self._open()
            x = self.read_block(self._stream)
            self.close()
            self.xml_tool.remove_preprocessed_file()
            return x
        except Exception as e:
            # Remove the temp file even on failure.
            self.xml_tool.remove_preprocessed_file()
            raise Exception from e

    def read_block(self, stream, tagspec=None, elt_handler=None):
        """
        Returns text as a list of sentences.
        """
        txt = []
        while True:
            segm = XMLCorpusView.read_block(self, stream)
            if len(segm) == 0:
                break
            for part in segm:
                txt.append(part)

        # The whole file is returned as a single space-joined string.
        return [" ".join([segm for segm in txt])]

    def get_segm_id(self, elt):
        # The id attribute is namespaced (e.g. 'xml:id'); match by suffix.
        for attr in elt.attrib:
            if attr.endswith("id"):
                return elt.get(attr)

    def handle_elt(self, elt, context):
        # fill dictionary to use later in sents mode
        if self.mode is NKJPCorpus_Text_View.SENTS_MODE:
            self.segm_dict[self.get_segm_id(elt)] = elt.text
        return elt.text
class NKJPCorpus_Morph_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    ann_morphosyntax.xml files in NKJP corpus.
    """

    def __init__(self, filename, **kwargs):
        # Optional collection of morphosyntactic tag values (ctag) to keep;
        # None means every non-punctuation word is returned.
        self.tags = kwargs.pop("tags", None)
        self.tagspec = ".*/seg/fs"
        # xml preprocessing: strip the nkjp: namespace
        self.xml_tool = XML_Tool(filename, "ann_morphosyntax.xml")
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )

    def handle_query(self):
        """Read the whole view and return the list of matching words."""
        try:
            self._open()
            words = []
            while True:
                segm = XMLCorpusView.read_block(self, self._stream)
                if len(segm) == 0:
                    break
                for part in segm:
                    # handle_elt returns None for filtered-out segments.
                    if part is not None:
                        words.append(part)
            self.close()
            self.xml_tool.remove_preprocessed_file()
            return words
        except Exception as e:
            # Remove the temp file even on failure.
            self.xml_tool.remove_preprocessed_file()
            raise Exception from e

    def handle_elt(self, elt, context):
        """
        Extract the orthographic form of one <fs> segment; return it only
        if it passes the tag filter and is not punctuation ('interp').
        """
        word = ""
        flag = False
        is_not_interp = True
        # if tags not specified, then always return word
        if self.tags is None:
            flag = True

        for child in elt:
            # get word from the 'orth' feature
            if "name" in child.keys() and child.attrib["name"] == "orth":
                for symbol in child:
                    if symbol.tag == "string":
                        word = symbol.text
            # inspect the interpretations to read the 'ctag' values
            elif "name" in child.keys() and child.attrib["name"] == "interps":
                for symbol in child:
                    if "type" in symbol.keys() and symbol.attrib["type"] == "lex":
                        for symbol2 in symbol:
                            if (
                                "name" in symbol2.keys()
                                and symbol2.attrib["name"] == "ctag"
                            ):
                                for symbol3 in symbol2:
                                    # keep the word if its ctag is requested
                                    if (
                                        "value" in symbol3.keys()
                                        and self.tags is not None
                                        and symbol3.attrib["value"] in self.tags
                                    ):
                                        flag = True
                                    # drop punctuation ('interp') segments
                                    elif (
                                        "value" in symbol3.keys()
                                        and symbol3.attrib["value"] == "interp"
                                    ):
                                        is_not_interp = False
        if flag and is_not_interp:
            return word

View File

@@ -0,0 +1,465 @@
# Natural Language Toolkit: NomBank Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Paul Bedaride <paul.bedaride@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from functools import total_ordering
from xml.etree import ElementTree
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.internals import raise_unorderable_types
from nltk.tree import Tree
class NombankCorpusReader(CorpusReader):
    """
    Corpus reader for the nombank corpus, which augments the Penn
    Treebank with information about the predicate argument structure
    of every noun instance.  The corpus consists of two parts: the
    predicate-argument annotations themselves, and a set of "frameset
    files" which define the argument labels used by the annotations,
    on a per-noun basis.  Each "frameset file" contains one or more
    predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
    divided into coarse-grained word senses called "rolesets".  For
    each "roleset", the frameset file provides descriptions of the
    argument roles, along with examples.
    """

    def __init__(
        self,
        root,
        nomfile,
        framefiles="",
        nounsfile=None,
        parse_fileid_xform=None,
        parse_corpus=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param nomfile: The name of the file containing the predicate-
            argument annotations (relative to ``root``).
        :param framefiles: A list or regexp specifying the frameset
            fileids for this corpus.
        :param parse_fileid_xform: A transform that should be applied
            to the fileids in this corpus.  This should be a function
            of one argument (a fileid) that returns a string (the new
            fileid).
        :param parse_corpus: The corpus containing the parse trees
            corresponding to this corpus.  These parse trees are
            necessary to resolve the tree pointers used by nombank.
        """
        # If framefiles is specified as a regexp, expand it.
        # (An earlier revision assigned the expanded list to self._fileids
        # and then clobbered it with ``list(framefiles)``, turning a regexp
        # string into a list of its characters; CorpusReader.__init__ is
        # the single owner of self._fileids, so only normalize locally.)
        if isinstance(framefiles, str):
            framefiles = find_corpus_fileids(root, framefiles)
        framefiles = list(framefiles)

        # Initialize the corpus reader.
        CorpusReader.__init__(self, root, framefiles, encoding)

        # Record our nom file & nouns file.
        self._nomfile = nomfile
        self._nounsfile = nounsfile
        self._parse_fileid_xform = parse_fileid_xform
        self._parse_corpus = parse_corpus

    def instances(self, baseform=None):
        """
        :return: a corpus view that acts as a list of
            ``NombankInstance`` objects, one for each noun in the corpus.
        """
        kwargs = {}
        if baseform is not None:
            # Only yield instances whose predicate has this base form.
            kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
        return StreamBackedCorpusView(
            self.abspath(self._nomfile),
            lambda stream: self._read_instance_block(stream, **kwargs),
            encoding=self.encoding(self._nomfile),
        )

    def lines(self):
        """
        :return: a corpus view that acts as a list of strings, one for
            each line in the predicate-argument annotation file.
        """
        return StreamBackedCorpusView(
            self.abspath(self._nomfile),
            read_line_block,
            encoding=self.encoding(self._nomfile),
        )

    def roleset(self, roleset_id):
        """
        :return: the xml description for the given roleset.
        :raises ValueError: if no frameset file or roleset matches.
        """
        baseform = roleset_id.split(".")[0]
        # Frameset file names encode characters that are not fileid-safe;
        # net effect: 'perc-sign' -> '%', 'oneslashonezero' -> '1-slash-10'.
        baseform = baseform.replace("perc-sign", "%")
        baseform = baseform.replace("oneslashonezero", "1/10").replace(
            "1/10", "1-slash-10"
        )
        framefile = "frames/%s.xml" % baseform
        if framefile not in self.fileids():
            raise ValueError("Frameset file for %s not found" % roleset_id)

        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        with self.abspath(framefile).open() as fp:
            etree = ElementTree.parse(fp).getroot()

        for roleset in etree.findall("predicate/roleset"):
            if roleset.attrib["id"] == roleset_id:
                return roleset
        raise ValueError(f"Roleset {roleset_id} not found in {framefile}")

    def rolesets(self, baseform=None):
        """
        :return: list of xml descriptions for rolesets.
        :raises ValueError: if ``baseform`` has no frameset file.
        """
        if baseform is not None:
            framefile = "frames/%s.xml" % baseform
            if framefile not in self.fileids():
                raise ValueError("Frameset file for %s not found" % baseform)
            framefiles = [framefile]
        else:
            framefiles = self.fileids()

        rsets = []
        for framefile in framefiles:
            # n.b.: The encoding for XML fileids is specified by the file
            # itself; so we ignore self._encoding here.
            with self.abspath(framefile).open() as fp:
                etree = ElementTree.parse(fp).getroot()
            rsets.append(etree.findall("predicate/roleset"))
        return LazyConcatenation(rsets)

    def nouns(self):
        """
        :return: a corpus view that acts as a list of all noun lemmas
            in this corpus (from the nombank.1.0.words file).
        """
        return StreamBackedCorpusView(
            self.abspath(self._nounsfile),
            read_line_block,
            encoding=self.encoding(self._nounsfile),
        )

    def _read_instance_block(self, stream, instance_filter=lambda inst: True):
        """Read up to 100 annotation lines, parse and filter them."""
        block = []

        # Read 100 at a time.
        for i in range(100):
            line = stream.readline().strip()
            if line:
                inst = NombankInstance.parse(
                    line, self._parse_fileid_xform, self._parse_corpus
                )
                if instance_filter(inst):
                    block.append(inst)

        return block
######################################################################
# { Nombank Instance & related datatypes
######################################################################
class NombankInstance:
    """
    A single nombank annotation: one noun predicate in one treebank
    sentence, together with its labeled arguments.
    """

    def __init__(
        self,
        fileid,
        sentnum,
        wordnum,
        baseform,
        sensenumber,
        predicate,
        predid,
        arguments,
        parse_corpus=None,
    ):
        self.fileid = fileid
        """The name of the file containing the parse tree for this
        instance's sentence."""

        self.sentnum = sentnum
        """The sentence number of this sentence within ``fileid``.
        Indexing starts from zero."""

        self.wordnum = wordnum
        """The word number of this instance's predicate within its
        containing sentence.  Word numbers are indexed starting from
        zero, and include traces and other empty parse elements."""

        self.baseform = baseform
        """The baseform of the predicate."""

        self.sensenumber = sensenumber
        """The sense number of the predicate."""

        self.predicate = predicate
        """A ``NombankTreePointer`` indicating the position of this
        instance's predicate within its containing sentence."""

        self.predid = predid
        """Identifier of the predicate."""

        self.arguments = tuple(arguments)
        """A list of tuples (argloc, argid), specifying the location
        and identifier for each of the predicate's argument in the
        containing sentence.  Argument identifiers are strings such as
        ``'ARG0'`` or ``'ARGM-TMP'``.  This list does *not* contain
        the predicate."""

        self.parse_corpus = parse_corpus
        """A corpus reader for the parse trees corresponding to the
        instances in this nombank corpus."""

    @property
    def roleset(self):
        """The name of the roleset used by this instance's predicate.
        Use ``nombank.roleset() <NombankCorpusReader.roleset>`` to
        look up information about the roleset."""
        # Encode fileid-unsafe characters; net effect:
        # '%' -> 'perc-sign', '1/10' -> 'oneslashonezero'.
        r = self.baseform.replace("%", "perc-sign")
        r = r.replace("1/10", "1-slash-10").replace("1-slash-10", "oneslashonezero")
        return f"{r}.{self.sensenumber}"

    def __repr__(self):
        return "<NombankInstance: {}, sent {}, word {}>".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
        )

    def __str__(self):
        # Reproduce the nombank file format: location fields followed by
        # the sorted argument (and predicate) pointers.
        s = "{} {} {} {} {}".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
            self.baseform,
            self.sensenumber,
        )
        items = self.arguments + ((self.predicate, "rel"),)
        for argloc, argid in sorted(items):
            s += f" {argloc}-{argid}"
        return s

    def _get_tree(self):
        # Resolve the parse tree lazily; None when no parse corpus is
        # attached or the file is missing from it.
        if self.parse_corpus is None:
            return None
        if self.fileid not in self.parse_corpus.fileids():
            return None
        return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]

    tree = property(
        _get_tree,
        doc="""
        The parse tree corresponding to this instance, or None if
        the corresponding tree is not available.""",
    )

    @staticmethod
    def parse(s, parse_fileid_xform=None, parse_corpus=None):
        """
        Parse one line of a nombank annotation file and return the
        corresponding ``NombankInstance``.

        :raises ValueError: if the line has fewer than six fields or
            does not contain exactly one ``rel`` argument.
        """
        pieces = s.split()
        if len(pieces) < 6:
            raise ValueError("Badly formatted nombank line: %r" % s)

        # Divide the line into its basic pieces.
        (fileid, sentnum, wordnum, baseform, sensenumber) = pieces[:5]
        args = pieces[5:]

        # Separate the predicate ('rel') from the other arguments.
        # (An earlier revision popped from ``args`` while enumerating it,
        # which skips the element after each match and could let a
        # malformed line with several rels slip through undetected.)
        rel = [p for p in args if "-rel" in p]
        args = [p for p in args if "-rel" not in p]
        if len(rel) != 1:
            raise ValueError("Badly formatted nombank line: %r" % s)

        # Apply the fileid selector, if any.
        if parse_fileid_xform is not None:
            fileid = parse_fileid_xform(fileid)

        # Convert sentence & word numbers to ints.
        sentnum = int(sentnum)
        wordnum = int(wordnum)

        # Parse the predicate location.
        predloc, predid = rel[0].split("-", 1)
        predicate = NombankTreePointer.parse(predloc)

        # Parse the arguments.
        arguments = []
        for arg in args:
            argloc, argid = arg.split("-", 1)
            arguments.append((NombankTreePointer.parse(argloc), argid))

        # Put it all together.
        return NombankInstance(
            fileid,
            sentnum,
            wordnum,
            baseform,
            sensenumber,
            predicate,
            predid,
            arguments,
            parse_corpus,
        )
class NombankPointer:
    """
    A pointer used by nombank to identify one or more constituents in
    a parse tree.  This is an abstract base class; the three concrete
    subclasses are:

    - ``NombankTreePointer``: a single constituent.
    - ``NombankSplitTreePointer``: a 'split' constituent, i.e. a
      sequence of two or more ``NombankTreePointer`` pointers.
    - ``NombankChainTreePointer``: an entire trace chain, whose pieces
      may be ``NombankTreePointer`` or ``NombankSplitTreePointer``
      pointers.
    """

    def __init__(self):
        # Only subclasses may be instantiated; the abstract base refuses.
        if type(self) is NombankPointer:
            raise NotImplementedError()
class NombankChainTreePointer(NombankPointer):
    """Pointer to an entire trace chain: a sequence of constituent pointers."""

    def __init__(self, pieces):
        # Each piece is a NombankSplitTreePointer or a NombankTreePointer.
        self.pieces = pieces

    def __str__(self):
        return "*".join(str(piece) for piece in self.pieces)

    def __repr__(self):
        return "<NombankChainTreePointer: %s>" % self

    def select(self, tree):
        """Return a ``*CHAIN*`` tree whose children are the selected pieces."""
        if tree is None:
            raise ValueError("Parse tree not available")
        selected = [piece.select(tree) for piece in self.pieces]
        return Tree("*CHAIN*", selected)
class NombankSplitTreePointer(NombankPointer):
    """Pointer to a split constituent: a sequence of tree pointers."""

    def __init__(self, pieces):
        # Each piece is a NombankTreePointer.
        self.pieces = pieces

    def __str__(self):
        return ",".join(str(piece) for piece in self.pieces)

    def __repr__(self):
        return "<NombankSplitTreePointer: %s>" % self

    def select(self, tree):
        """Return a ``*SPLIT*`` tree whose children are the selected pieces."""
        if tree is None:
            raise ValueError("Parse tree not available")
        selected = [piece.select(tree) for piece in self.pieces]
        return Tree("*SPLIT*", selected)
@total_ordering
class NombankTreePointer(NombankPointer):
    """
    Pointer to a single constituent, written ``wordnum:height`` in
    nombank files.  Chains are written ``wordnum:height*wordnum:height*...``
    and split constituents ``wordnum:height,wordnum:height,...``.
    """

    def __init__(self, wordnum, height):
        # Index of the constituent's first word (counting traces), and
        # the number of tree levels above that word.
        self.wordnum = wordnum
        self.height = height

    @staticmethod
    def parse(s):
        """Parse a pointer string; may return a chain or split pointer."""
        # Deal with chains (xx*yy*zz)
        pieces = s.split("*")
        if len(pieces) > 1:
            return NombankChainTreePointer(
                [NombankTreePointer.parse(elt) for elt in pieces]
            )

        # Deal with split args (xx,yy,zz)
        pieces = s.split(",")
        if len(pieces) > 1:
            return NombankSplitTreePointer(
                [NombankTreePointer.parse(elt) for elt in pieces]
            )

        # Deal with normal pointers.
        pieces = s.split(":")
        if len(pieces) != 2:
            raise ValueError("bad nombank pointer %r" % s)
        return NombankTreePointer(int(pieces[0]), int(pieces[1]))

    def __str__(self):
        return f"{self.wordnum}:{self.height}"

    def __repr__(self):
        return "NombankTreePointer(%d, %d)" % (self.wordnum, self.height)

    def __eq__(self, other):
        # Chain/split pointers compare via their first piece.
        while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)):
            other = other.pieces[0]

        if not isinstance(other, NombankTreePointer):
            return self is other

        return self.wordnum == other.wordnum and self.height == other.height

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        # Order by word number, then by *decreasing* height, so larger
        # constituents sort before the words they contain.
        while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)):
            other = other.pieces[0]

        if not isinstance(other, NombankTreePointer):
            return id(self) < id(other)

        return (self.wordnum, -self.height) < (other.wordnum, -other.height)

    def select(self, tree):
        """Return the constituent of ``tree`` that this pointer picks out."""
        if tree is None:
            raise ValueError("Parse tree not available")
        return tree[self.treepos(tree)]

    def treepos(self, tree):
        """
        Convert this pointer to a standard 'tree position' pointer,
        given that it points to the given tree.
        """
        if tree is None:
            raise ValueError("Parse tree not available")
        stack = [tree]
        treepos = []

        wordnum = 0
        # Depth-first walk over the leaves; stop at the self.wordnum-th
        # leaf and chop ``height+1`` levels off the position.
        while True:
            # tree node:
            if isinstance(stack[-1], Tree):
                # Select the next child.
                if len(treepos) < len(stack):
                    treepos.append(0)
                else:
                    treepos[-1] += 1
                # Update the stack.
                if treepos[-1] < len(stack[-1]):
                    stack.append(stack[-1][treepos[-1]])
                else:
                    # End of node's child list: pop up a level.
                    stack.pop()
                    treepos.pop()
            # word node:
            else:
                if wordnum == self.wordnum:
                    return tuple(treepos[: len(treepos) - self.height - 1])
                else:
                    wordnum += 1
                    stack.pop()

View File

@@ -0,0 +1,90 @@
# Natural Language Toolkit: NPS Chat Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re
import textwrap
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.xmldocs import *
from nltk.internals import ElementWrapper
from nltk.tag import map_tag
from nltk.util import LazyConcatenation
class NPSChatCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the NPS Chat Corpus: XML files whose <Post>
    elements carry chat posts with POS-tagged terminal tokens.
    """

    def __init__(self, root, fileids, wrap_etree=False, tagset=None):
        XMLCorpusReader.__init__(self, root, fileids, wrap_etree)
        # Tagset the corpus files are annotated with; used as the source
        # tagset when callers request mapped tags.
        self._tagset = tagset

    def xml_posts(self, fileids=None):
        """Return the posts as (optionally wrapped) XML elements."""
        if self._wrap_etree:
            return concat(
                [
                    XMLCorpusView(fileid, "Session/Posts/Post", self._wrap_elt)
                    for fileid in self.abspaths(fileids)
                ]
            )
        else:
            return concat(
                [
                    XMLCorpusView(fileid, "Session/Posts/Post")
                    for fileid in self.abspaths(fileids)
                ]
            )

    def posts(self, fileids=None):
        """Return a list of posts, each a list of words."""
        return concat(
            [
                XMLCorpusView(
                    fileid, "Session/Posts/Post/terminals", self._elt_to_words
                )
                for fileid in self.abspaths(fileids)
            ]
        )

    def tagged_posts(self, fileids=None, tagset=None):
        """Return a list of posts, each a list of (word, tag) tuples."""

        def reader(elt, handler):
            return self._elt_to_tagged_words(elt, handler, tagset)

        return concat(
            [
                XMLCorpusView(fileid, "Session/Posts/Post/terminals", reader)
                for fileid in self.abspaths(fileids)
            ]
        )

    def words(self, fileids=None):
        """Return all words, with post boundaries flattened away."""
        return LazyConcatenation(self.posts(fileids))

    def tagged_words(self, fileids=None, tagset=None):
        """Return all (word, tag) tuples, with post boundaries flattened away."""
        return LazyConcatenation(self.tagged_posts(fileids, tagset))

    def _wrap_elt(self, elt, handler):
        return ElementWrapper(elt)

    def _elt_to_words(self, elt, handler):
        # 't' children are the terminal tokens; usernames are anonymized.
        return [self._simplify_username(t.attrib["word"]) for t in elt.findall("t")]

    def _elt_to_tagged_words(self, elt, handler, tagset=None):
        tagged_post = [
            (self._simplify_username(t.attrib["word"]), t.attrib["pos"])
            for t in elt.findall("t")
        ]
        if tagset and tagset != self._tagset:
            # Map from the corpus tagset to the requested one.
            tagged_post = [
                (w, map_tag(self._tagset, tagset, t)) for (w, t) in tagged_post
            ]
        return tagged_post

    @staticmethod
    def _simplify_username(word):
        """Anonymize a username token to 'U' plus its numeric suffix."""
        # Decode bytes first: the previous order tested ``"User" in word``
        # before the bytes check, which raises TypeError for bytes input
        # and made the bytes branch unreachable.
        if isinstance(word, bytes):
            word = word.decode("ascii")
        if "User" in word:
            word = "U" + word.split("User", 1)[1]
        return word

View File

@@ -0,0 +1,125 @@
# Natural Language Toolkit: Opinion Lexicon Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for the Opinion Lexicon.
Opinion Lexicon information
===========================
Authors: Minqing Hu and Bing Liu, 2004.
Department of Computer Science
University of Illinois at Chicago
Contact: Bing Liu, liub@cs.uic.edu
https://www.cs.uic.edu/~liub
Distributed with permission.
Related papers:
- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery
& Data Mining (KDD-04), Aug 22-25, 2004, Seattle, Washington, USA.
- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and
Comparing Opinions on the Web". Proceedings of the 14th International World
Wide Web conference (WWW-2005), May 10-14, 2005, Chiba, Japan.
"""
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus.reader.api import *
class IgnoreReadmeCorpusView(StreamBackedCorpusView):
    """
    This CorpusView is used to skip the initial readme block of the corpus.
    """

    def __init__(self, *args, **kwargs):
        StreamBackedCorpusView.__init__(self, *args, **kwargs)
        # open self._stream
        self._open()
        # skip the readme block (everything up to the first blank line)
        read_blankline_block(self._stream)
        # Set the initial position to the current stream position, so the
        # view never revisits the readme.
        self._filepos = [self._stream.tell()]
class OpinionLexiconCorpusReader(WordListCorpusReader):
    """
    Reader for Liu and Hu opinion lexicon.  Blank lines and readme are ignored.

        >>> from nltk.corpus import opinion_lexicon
        >>> opinion_lexicon.words()
        ['2-faced', '2-faces', 'abnormal', 'abolish', ...]

    The OpinionLexiconCorpusReader provides shortcuts to retrieve positive/negative
    words:

        >>> opinion_lexicon.negative()
        ['2-faced', '2-faces', 'abnormal', 'abolish', ...]

    Note that words from `words()` method are sorted by file id, not alphabetically:

        >>> opinion_lexicon.words()[0:10] # doctest: +NORMALIZE_WHITESPACE
        ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably',
        'abominate', 'abomination', 'abort', 'aborted']
        >>> sorted(opinion_lexicon.words())[0:10] # doctest: +NORMALIZE_WHITESPACE
        ['2-faced', '2-faces', 'a+', 'abnormal', 'abolish', 'abominable', 'abominably',
        'abominate', 'abomination', 'abort']
    """

    # View class that skips the readme block at the top of each file.
    CorpusView = IgnoreReadmeCorpusView

    def words(self, fileids=None):
        """
        Return all words in the opinion lexicon. Note that these words are not
        sorted in alphabetical order.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def positive(self):
        """
        Return all positive words in alphabetical order.

        :return: a list of positive words.
        :rtype: list(str)
        """
        return self.words("positive-words.txt")

    def negative(self):
        """
        Return all negative words in alphabetical order.

        :return: a list of negative words.
        :rtype: list(str)
        """
        return self.words("negative-words.txt")

    def _read_word_block(self, stream):
        """Read the next chunk of (up to 20) lexicon entries."""
        words = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # EOF: stop immediately.  The previous ``continue`` just
                # spun through the remaining iterations doing nothing,
                # since readline keeps returning '' at EOF.
                break
            words.append(line.strip())
        return words

View File

@@ -0,0 +1,174 @@
# Natural Language Toolkit: PanLex Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: David Kamholz <kamholz@panlex.org>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for PanLex Lite, a stripped down version of PanLex distributed
as an SQLite database. See the README.txt in the panlex_lite corpus directory
for more information on PanLex Lite.
"""
import os
import sqlite3
from nltk.corpus.reader.api import CorpusReader
class PanLexLiteCorpusReader(CorpusReader):
    """
    Reader for the PanLex Lite corpus: a single SQLite database
    (``db.sqlite``) holding expressions (``ex``), denotations (``dnx``)
    and language varieties (``lv``).

    NOTE(review): ``CorpusReader.__init__`` is never invoked; this reader
    only keeps a cursor on the database -- confirm this is intentional.
    """

    # All meanings of an expression (given as text + language variety),
    # joined back to every other expression participating in each meaning.
    MEANING_Q = """
        SELECT dnx2.mn, dnx2.uq, dnx2.ap, dnx2.ui, ex2.tt, ex2.lv
        FROM dnx
        JOIN ex ON (ex.ex = dnx.ex)
        JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
        JOIN ex ex2 ON (ex2.ex = dnx2.ex)
        WHERE dnx.ex != dnx2.ex AND ex.tt = ? AND ex.lv = ?
        ORDER BY dnx2.uq DESC
    """

    # Translations of an expression into a target variety, scored by the
    # quality summed over the shared meanings.
    TRANSLATION_Q = """
        SELECT s.tt, sum(s.uq) AS trq FROM (
            SELECT ex2.tt, max(dnx.uq) AS uq
            FROM dnx
            JOIN ex ON (ex.ex = dnx.ex)
            JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
            JOIN ex ex2 ON (ex2.ex = dnx2.ex)
            WHERE dnx.ex != dnx2.ex AND ex.lv = ? AND ex.tt = ? AND ex2.lv = ?
            GROUP BY ex2.tt, dnx.ui
        ) s
        GROUP BY s.tt
        ORDER BY trq DESC, s.tt
    """

    def __init__(self, root):
        self._c = sqlite3.connect(os.path.join(root, "db.sqlite")).cursor()

        # Two-way mapping between uniform identifiers (uid) and internal
        # language-variety ids (lv), preloaded from the lv table.
        self._uid_lv = {}
        self._lv_uid = {}

        for row in self._c.execute("SELECT uid, lv FROM lv"):
            self._uid_lv[row[0]] = row[1]
            self._lv_uid[row[1]] = row[0]

    def language_varieties(self, lc=None):
        """
        Return a list of PanLex language varieties.

        :param lc: ISO 639 alpha-3 code. If specified, filters returned varieties
            by this code. If unspecified, all varieties are returned.
        :return: the specified language varieties as a list of tuples. The first
            element is the language variety's seven-character uniform identifier,
            and the second element is its default name.
        :rtype: list(tuple)
        """
        if lc is None:
            return self._c.execute("SELECT uid, tt FROM lv ORDER BY uid").fetchall()
        else:
            return self._c.execute(
                "SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid", (lc,)
            ).fetchall()

    def meanings(self, expr_uid, expr_tt):
        """
        Return a list of meanings for an expression.

        :param expr_uid: the expression's language variety, as a seven-character
            uniform identifier.
        :param expr_tt: the expression's text.
        :return: a list of Meaning objects.
        :rtype: list(Meaning)
        """
        expr_lv = self._uid_lv[expr_uid]

        mn_info = {}

        # Group the rows by meaning id, accumulating each meaning's
        # expressions by the uid of their language variety.
        for i in self._c.execute(self.MEANING_Q, (expr_tt, expr_lv)):
            mn = i[0]
            uid = self._lv_uid[i[5]]

            if not mn in mn_info:
                mn_info[mn] = {
                    "uq": i[1],
                    "ap": i[2],
                    "ui": i[3],
                    "ex": {expr_uid: [expr_tt]},
                }

            if not uid in mn_info[mn]["ex"]:
                mn_info[mn]["ex"][uid] = []

            mn_info[mn]["ex"][uid].append(i[4])

        return [Meaning(mn, mn_info[mn]) for mn in mn_info]

    def translations(self, from_uid, from_tt, to_uid):
        """
        Return a list of translations for an expression into a single language
        variety.

        :param from_uid: the source expression's language variety, as a
            seven-character uniform identifier.
        :param from_tt: the source expression's text.
        :param to_uid: the target language variety, as a seven-character
            uniform identifier.
        :return: a list of translation tuples. The first element is the expression
            text and the second element is the translation quality.
        :rtype: list(tuple)
        """
        from_lv = self._uid_lv[from_uid]
        to_lv = self._uid_lv[to_uid]
        return self._c.execute(self.TRANSLATION_Q, (from_lv, from_tt, to_lv)).fetchall()
class Meaning(dict):
    """
    A single PanLex meaning: a translation set derived from one source.

    The underlying dict holds the meaning's attributes, plus the meaning
    id itself under the key ``"mn"``.
    """

    def __init__(self, mn, attr):
        # Seed the mapping with the source attributes, then record the id.
        super().__init__(**attr)
        self["mn"] = mn

    def id(self):
        """The meaning's id (int)."""
        return self["mn"]

    def quality(self):
        """The quality of the meaning's source: 0 (worst) to 9 (best)."""
        return self["uq"]

    def source(self):
        """The id (int) of the meaning's source."""
        return self["ap"]

    def source_group(self):
        """The id (int) of the meaning's source group."""
        return self["ui"]

    def expressions(self):
        """
        The meaning's expressions: a dict mapping language-variety uniform
        identifiers to lists of expression texts.
        """
        return self["ex"]

View File

@@ -0,0 +1,95 @@
# Natural Language Toolkit: Word List Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re
from collections import defaultdict, namedtuple
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.wordlist import WordListCorpusReader
from nltk.tokenize import line_tokenize
# One row of the PanLex language-variety table (``langs<size>.txt``);
# fields mirror the tab-separated columns described in the corpus README.
PanlexLanguage = namedtuple(
    "PanlexLanguage",
    [
        "panlex_uid",  # (1) PanLex UID
        "iso639",  # (2) ISO 639 language code
        "iso639_type",  # (3) ISO 639 language type, see README
        "script",  # (4) normal scripts of expressions
        "name",  # (5) PanLex default name
        "langvar_uid",  # (6) UID of the language variety in which the default name is an expression
    ],
)
class PanlexSwadeshCorpusReader(WordListCorpusReader):
    """
    This is a class to read the PanLex Swadesh list from

    David Kamholz, Jonathan Pool, and Susan M. Colowick (2014).
    PanLex: Building a Resource for Panlingual Lexical Translation.
    In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf

    License: CC0 1.0 Universal
    https://creativecommons.org/publicdomain/zero/1.0/legalcode
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Find the swadesh size using the fileids' path.
        self.swadesh_size = re.match(r"swadesh([0-9].*)\/", self.fileids()[0]).group(1)
        # PanLex UID -> PanlexLanguage record.
        self._languages = {lang.panlex_uid: lang for lang in self.get_languages()}
        # ISO 639 code -> list of PanLex UIDs.  (Renamed from the old,
        # misspelled ``_macro_langauges``.)
        self._macro_languages = self.get_macrolanguages()

    def license(self):
        """Return the corpus license string."""
        return "CC0 1.0 Universal"

    def language_codes(self):
        """Return the PanLex UIDs available in this corpus."""
        return self._languages.keys()

    def get_languages(self):
        """Yield a ``PanlexLanguage`` for each row of the langs table."""
        for line in self.raw(f"langs{self.swadesh_size}.txt").split("\n"):
            if not line.strip():  # Skip empty lines.
                continue
            yield PanlexLanguage(*line.strip().split("\t"))

    def get_macrolanguages(self):
        """Group the varieties' PanLex UIDs by their ISO 639 code."""
        macro_languages = defaultdict(list)
        for lang in self._languages.values():
            macro_languages[lang.iso639].append(lang.panlex_uid)
        return macro_languages

    def words_by_lang(self, lang_code):
        """
        :param lang_code: a PanLex UID, e.g. ``'eng-000'``.
        :return: a list of list(str)
        """
        fileid = f"swadesh{self.swadesh_size}/{lang_code}.txt"
        return [concept.split("\t") for concept in self.words(fileid)]

    def words_by_iso639(self, iso63_code):
        """
        :param iso63_code: an ISO 639 code covering one or more varieties.
        :return: a list of list(str)
        """
        fileids = [
            f"swadesh{self.swadesh_size}/{lang_code}.txt"
            for lang_code in self._macro_languages[iso63_code]
        ]
        return [
            concept.split("\t") for fileid in fileids for concept in self.words(fileid)
        ]

    def entries(self, fileids=None):
        """
        :return: a tuple of words for the specified fileids.
        """
        if not fileids:
            fileids = self.fileids()
        wordlists = [self.words(f) for f in fileids]
        return list(zip(*wordlists))

View File

@@ -0,0 +1,373 @@
# Natural Language Toolkit:
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Piotr Kasprzyk <p.j.kasprzyk@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import XMLCorpusReader
# Regexps carving the TEI-like markup of the pl196x corpus into
# paragraphs, sentences and (tagged) words, and reading tag attributes.
PARA = re.compile(r"<p(?: [^>]*){0,1}>(.*?)</p>")
SENT = re.compile(r"<s(?: [^>]*){0,1}>(.*?)</s>")

# <w> elements are words, <c> elements are punctuation characters.
TAGGEDWORD = re.compile(r"<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>")
WORD = re.compile(r"<[wc](?: [^>]*){0,1}>(.*?)</[wc]>")

# Attribute extractors: tag type, morphosyntactic analysis, text id.
TYPE = re.compile(r'type="(.*?)"')
ANA = re.compile(r'ana="(.*?)"')
TEXTID = re.compile(r'text id="(.*?)"')
class TEICorpusView(StreamBackedCorpusView):
    """
    Stream-backed view over one pl196x corpus file.

    Depending on the ``tagged`` / ``group_by_sent`` / ``group_by_para``
    flags, ``read_block`` yields words or (word, tag) tuples, optionally
    grouped into sentences and paragraphs.
    """

    def __init__(
        self,
        corpus_file,
        tagged,
        group_by_sent,
        group_by_para,
        tagset=None,  # NOTE(review): currently unused -- confirm intent
        head_len=0,
        textids=None,
    ):
        # Whether to return (word, tag) tuples instead of bare words.
        self._tagged = tagged
        # Optional whitelist of text ids; other texts are cut out.
        self._textids = textids

        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        # WARNING -- skip header
        StreamBackedCorpusView.__init__(self, corpus_file, startpos=head_len)

    # Number of bytes hinted to readlines() per block.
    _pagesize = 4096

    def read_block(self, stream):
        """Read one block of complete <text> elements and parse it."""
        block = stream.readlines(self._pagesize)
        block = concat(block)
        # Keep reading until every opened <text id...> is closed (or EOF),
        # so regexp parsing below never sees a truncated text.
        while (block.count("<text id") > block.count("</text>")) or block.count(
            "<text id"
        ) == 0:
            tmp = stream.readline()
            if len(tmp) <= 0:
                break
            block += tmp

        block = block.replace("\n", "")

        textids = TEXTID.findall(block)
        if self._textids:
            # Cut out every text whose id is not in the requested set.
            for tid in textids:
                if tid not in self._textids:
                    beg = block.find(tid) - 1
                    end = block[beg:].find("</text>") + len("</text>")
                    block = block[:beg] + block[beg + end :]

        output = []
        for para_str in PARA.findall(block):
            para = []
            for sent_str in SENT.findall(para_str):
                if not self._tagged:
                    sent = WORD.findall(sent_str)
                else:
                    sent = list(map(self._parse_tag, TAGGEDWORD.findall(sent_str)))
                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)
            if self._group_by_para:
                output.append(para)
            else:
                output.extend(para)

        return output

    def _parse_tag(self, tag_word_tuple):
        # <w ana="..."> carries the morphosyntactic tag; <c type="...">
        # carries the punctuation type.
        (tag, word) = tag_word_tuple
        if tag.startswith("w"):
            tag = ANA.search(tag).group(1)
        else:  # tag.startswith('c')
            tag = TYPE.search(tag).group(1)
        return word, tag
class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
    """
    Reader for the pl196x corpus, stored as TEI-encoded XML.  Each
    category lives in a single file; finer-grained selection is available
    through text identifiers ("textids"), optionally mapped to fileids
    via a ``textid_file`` keyword argument.
    """

    # Number of bytes of TEI header to skip at the start of each file.
    head_len = 2770

    def __init__(self, *args, **kwargs):
        if "textid_file" in kwargs:
            self._textids = kwargs["textid_file"]
        else:
            self._textids = None

        XMLCorpusReader.__init__(self, *args)
        CategorizedCorpusReader.__init__(self, kwargs)

        self._init_textids()

    def _init_textids(self):
        """Populate the fileid<->textid maps from the textid mapping file."""
        self._f2t = defaultdict(list)
        self._t2f = defaultdict(list)
        if self._textids is not None:
            with open(self._textids) as fp:
                for line in fp:
                    line = line.strip()
                    file_id, text_ids = line.split(" ", 1)
                    if file_id not in self.fileids():
                        raise ValueError(
                            "In text_id mapping file %s: %s not found"
                            % (self._textids, file_id)
                        )
                    # NOTE(review): ``self._delimiter`` is not defined in this
                    # class; presumably a base class provides it -- confirm
                    # before relying on textid mapping files.
                    for text_id in text_ids.split(self._delimiter):
                        self._add_textids(file_id, text_id)

    def _add_textids(self, file_id, text_id):
        """Record a (file_id, text_id) association in both directions."""
        self._f2t[file_id].append(text_id)
        self._t2f[text_id].append(file_id)

    def _resolve(self, fileids, categories, textids=None):
        """
        Normalize the mutually-exclusive selectors to ``(fileids, tdict)``,
        where ``tdict`` maps each fileid to the set of requested textids
        (or is None when selection is not textid-based).

        At most one of ``fileids``, ``categories`` and ``textids`` may be
        given; with no selector at all, ``(None, None)`` is returned and
        callers fall back to the full fileid list.

        :raises ValueError: if more than one selector is specified.
        """
        # Bug fix: the original raised unless *exactly one* selector was
        # None (i.e. two were specified), contradicting its own error
        # message and rejecting every normal single-selector call.  Raise
        # only when more than one selector is specified.
        specified = [s for s in (fileids, categories, textids) if s is not None]
        if len(specified) > 1:
            raise ValueError(
                "Specify exactly one of: fileids, " "categories or textids"
            )

        if fileids is not None:
            return fileids, None

        if categories is not None:
            return self.fileids(categories), None

        if textids is not None:
            if isinstance(textids, str):
                textids = [textids]
            files = sum((self._t2f[t] for t in textids), [])
            tdict = dict()
            for f in files:
                tdict[f] = set(self._f2t[f]) & set(textids)
            return files, tdict

        # No selector given: callers substitute self._fileids themselves.
        # (The original fell off the end here, returning a bare None that
        # broke the callers' tuple unpacking.)
        return None, None

    def decode_tag(self, tag):
        """Placeholder for tagset decoding; currently the identity map."""
        # to be implemented
        return tag

    def textids(self, fileids=None, categories=None):
        """
        In the pl196x corpus each category is stored in single
        file and thus both methods provide identical functionality. In order
        to accommodate finer granularity, a non-standard textids() method was
        implemented. All the main functions can be supplied with a list
        of required chunks---giving much more control to the user.
        """
        fileids, _ = self._resolve(fileids, categories)
        if fileids is None:
            return sorted(self._t2f)

        if isinstance(fileids, str):
            fileids = [fileids]
        return sorted(sum((self._f2t[d] for d in fileids), []))

    def _make_view(self, fileids, textids, tagged, group_by_sent, group_by_para):
        """
        Build a concatenated ``TEICorpusView`` over the resolved fileids.

        Shared backend for words/sents/paras and their tagged variants,
        which previously repeated this construction six times.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]

        if textids:
            return concat(
                [
                    TEICorpusView(
                        self.abspath(fileid),
                        tagged,
                        group_by_sent,
                        group_by_para,
                        head_len=self.head_len,
                        textids=textids[fileid],
                    )
                    for fileid in fileids
                ]
            )
        return concat(
            [
                TEICorpusView(
                    self.abspath(fileid),
                    tagged,
                    group_by_sent,
                    group_by_para,
                    head_len=self.head_len,
                )
                for fileid in fileids
            ]
        )

    def words(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as a flat list of words."""
        fileids, textids = self._resolve(fileids, categories, textids)
        return self._make_view(
            fileids, textids, tagged=False, group_by_sent=False, group_by_para=False
        )

    def sents(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as a list of sentences (word lists)."""
        fileids, textids = self._resolve(fileids, categories, textids)
        return self._make_view(
            fileids, textids, tagged=False, group_by_sent=True, group_by_para=False
        )

    def paras(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as paragraphs of sentences of words."""
        fileids, textids = self._resolve(fileids, categories, textids)
        return self._make_view(
            fileids, textids, tagged=False, group_by_sent=True, group_by_para=True
        )

    def tagged_words(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as a flat list of (word, tag) tuples."""
        fileids, textids = self._resolve(fileids, categories, textids)
        return self._make_view(
            fileids, textids, tagged=True, group_by_sent=False, group_by_para=False
        )

    def tagged_sents(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as sentences of (word, tag) tuples."""
        fileids, textids = self._resolve(fileids, categories, textids)
        return self._make_view(
            fileids, textids, tagged=True, group_by_sent=True, group_by_para=False
        )

    def tagged_paras(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as paragraphs of tagged sentences."""
        fileids, textids = self._resolve(fileids, categories, textids)
        return self._make_view(
            fileids, textids, tagged=True, group_by_sent=True, group_by_para=True
        )

    def xml(self, fileids=None, categories=None):
        """
        Return the XML parse of a single selected file.

        :raises TypeError: unless the selectors resolve to exactly one file.
        """
        fileids, _ = self._resolve(fileids, categories)
        if isinstance(fileids, str):
            fileids = [fileids]
        if fileids is not None and len(fileids) == 1:
            return XMLCorpusReader.xml(self, fileids[0])
        else:
            raise TypeError("Expected a single file")

View File

@@ -0,0 +1,237 @@
# Natural Language Toolkit: Plaintext Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# Nitin Madnani <nmadnani@umiacs.umd.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A reader for corpora that consist of plaintext documents.
"""
import nltk.data
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tokenize import *
class PlaintextCorpusReader(CorpusReader):
    """
    Reader for corpora that consist of plaintext documents.  Paragraphs
    are assumed to be split using blank lines.  Sentences and words can
    be tokenized using the default tokenizers, or by custom tokenizers
    specified as parameters to the constructor.

    This corpus reader can be customized (e.g., to skip preface
    sections of specific document formats) by creating a subclass and
    overriding the ``CorpusView`` class variable.
    """

    CorpusView = StreamBackedCorpusView
    """The corpus view class used by this reader.  Subclasses of
    ``PlaintextCorpusReader`` may specify alternative corpus view
    classes (e.g., to skip the preface sections of documents.)"""

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WordPunctTokenizer(),
        sent_tokenizer=None,
        para_block_reader=read_blankline_block,
        encoding="utf8",
    ):
        r"""
        Construct a new plaintext corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/usr/local/share/nltk_data/corpora/webtext/'
            >>> reader = PlaintextCorpusReader(root, '.*\.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking sentences or
            paragraphs into words.
        :param sent_tokenizer: Tokenizer for breaking paragraphs
            into sentences.
        :param para_block_reader: The block reader used to divide the
            corpus into paragraph blocks.
        :param encoding: The encoding used to read the corpus files.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader

    def _ensure_sent_tokenizer(self):
        """Lazily install the default Punkt sentence tokenizer, raising
        ``ValueError`` if none is available for this corpus."""
        if self._sent_tokenizer is None:
            try:
                self._sent_tokenizer = PunktTokenizer()
            # Bug fix: a bare ``except:`` here also swallowed SystemExit and
            # KeyboardInterrupt.  Catch ordinary exceptions only, and chain
            # the original cause for easier debugging.
            except Exception as err:
                raise ValueError("No sentence tokenizer for this corpus") from err

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        self._ensure_sent_tokenizer()
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        self._ensure_sent_tokenizer()
        return concat(
            [
                self.CorpusView(path, self._read_para_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_word_block(self, stream):
        """Tokenize and return the words from the next 20 lines."""
        words = []
        for i in range(20):  # Read 20 lines at a time.
            words.extend(self._word_tokenizer.tokenize(stream.readline()))
        return words

    def _read_sent_block(self, stream):
        """Return the tokenized sentences of the next paragraph block."""
        sents = []
        for para in self._para_block_reader(stream):
            sents.extend(
                [
                    self._word_tokenizer.tokenize(sent)
                    for sent in self._sent_tokenizer.tokenize(para)
                ]
            )
        return sents

    def _read_para_block(self, stream):
        """Return the next paragraph block as lists of tokenized sentences."""
        paras = []
        for para in self._para_block_reader(stream):
            paras.append(
                [
                    self._word_tokenizer.tokenize(sent)
                    for sent in self._sent_tokenizer.tokenize(para)
                ]
            )
        return paras
class CategorizedPlaintextCorpusReader(CategorizedCorpusReader, PlaintextCorpusReader):
    """
    A reader for plaintext corpora whose documents are divided into
    categories based on their file identifiers.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``PlaintextCorpusReader`` constructor.
        """
        # NOTE(review): CategorizedCorpusReader.__init__ receives the kwargs
        # dict itself (not **kwargs); it appears to consume the cat_* options
        # before the remainder is forwarded below, so the call order matters
        # -- confirm it mutates the dict in place.
        CategorizedCorpusReader.__init__(self, kwargs)
        PlaintextCorpusReader.__init__(self, *args, **kwargs)
class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader):
    """
    Identical to ``CategorizedPlaintextCorpusReader``, except that a
    Portuguese Punkt sentence tokenizer is installed:

        >>> from nltk.corpus import machado
        >>> print(machado._sent_tokenizer._lang)
        portuguese
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Fixed (@ekaf 2025), new way to invoke Punkt:
        self._sent_tokenizer = PunktTokenizer("portuguese")
class EuroparlCorpusReader(PlaintextCorpusReader):
    """
    Reader for Europarl corpora that consist of plaintext documents.
    Documents are divided into chapters instead of paragraphs as
    for regular plaintext documents.  Chapters are separated using blank
    lines.  Everything is inherited from ``PlaintextCorpusReader`` except
    that:

    - Since the corpus is pre-processed and pre-tokenized, the
      word tokenizer should just split the line at whitespaces.
    - For the same reason, the sentence tokenizer should just
      split the paragraph at line breaks.
    - There is a new 'chapters()' method that returns chapters
      instead of paragraphs.
    - The 'paras()' method inherited from PlaintextCorpusReader is
      made non-functional to remove any confusion between chapters
      and paragraphs for Europarl.
    """

    def _read_word_block(self, stream):
        # Pre-tokenized corpus: whitespace splitting is sufficient.
        tokens = []
        for _ in range(20):  # Process 20 lines per block.
            tokens.extend(stream.readline().split())
        return tokens

    def _read_sent_block(self, stream):
        # One sentence per line; flatten sentences across the block.
        return [
            line.split()
            for chunk in self._para_block_reader(stream)
            for line in chunk.splitlines()
        ]

    def _read_para_block(self, stream):
        # One chapter per blank-line-separated block.
        return [
            [line.split() for line in chunk.splitlines()]
            for chunk in self._para_block_reader(stream)
        ]

    def chapters(self, fileids=None):
        """
        :return: the given file(s) as a list of
            chapters, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        views = [
            self.CorpusView(fileid, self._read_para_block, encoding=enc)
            for (fileid, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)

    def paras(self, fileids=None):
        raise NotImplementedError(
            "The Europarl corpus reader does not support paragraphs. Please use chapters() instead."
        )

View File

@@ -0,0 +1,95 @@
# Natural Language Toolkit: PP Attachment Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Read lines from the Prepositional Phrase Attachment Corpus.
The PP Attachment Corpus contains several files having the format:
sentence_id verb noun1 preposition noun2 attachment
For example:
42960 gives authority to administration V
46742 gives inventors of microchip N
The PP attachment is to the verb phrase (V) or noun phrase (N), i.e.:
(VP gives (NP authority) (PP to administration))
(VP gives (NP inventors (PP of microchip)))
The corpus contains the following files:
training: training set
devset: development test set, used for algorithm development.
test: test set, used to report results
bitstrings: word classes derived from Mutual Information Clustering for the Wall Street Journal.
Ratnaparkhi, Adwait (1994). A Maximum Entropy Model for Prepositional
Phrase Attachment. Proceedings of the ARPA Human Language Technology
Conference. [http://www.cis.upenn.edu/~adwait/papers/hlt94.ps]
The PP Attachment Corpus is distributed with NLTK with the permission
of the author.
"""
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
class PPAttachment:
    """
    One prepositional-phrase attachment decision: a (verb, noun1,
    preposition, noun2) quadruple from a given sentence, labelled with
    its attachment site ('V' for verb phrase, 'N' for noun phrase).
    """

    def __init__(self, sent, verb, noun1, prep, noun2, attachment):
        self.sent = sent
        self.verb = verb
        self.noun1 = noun1
        self.prep = prep
        self.noun2 = noun2
        self.attachment = attachment

    def __repr__(self):
        return (
            f"PPAttachment(sent={self.sent!r}, verb={self.verb!r}, "
            f"noun1={self.noun1!r}, prep={self.prep!r}, "
            f"noun2={self.noun2!r}, attachment={self.attachment!r})"
        )
class PPAttachmentCorpusReader(CorpusReader):
    """
    Reader for corpus files whose lines have the form:

        sentence_id verb noun1 preposition noun2 attachment
    """

    def attachments(self, fileids):
        """Return the selected files as a list of ``PPAttachment`` objects."""
        views = [
            StreamBackedCorpusView(path, self._read_obj_block, encoding=enc)
            for (path, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)

    def tuples(self, fileids):
        """Return the selected files as a list of raw field tuples."""
        views = [
            StreamBackedCorpusView(path, self._read_tuple_block, encoding=enc)
            for (path, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)

    def _read_tuple_block(self, stream):
        # One line per block; an empty read signals end-of-file.
        line = stream.readline()
        return [tuple(line.split())] if line else []

    def _read_obj_block(self, stream):
        # One line per block; an empty read signals end-of-file.
        line = stream.readline()
        return [PPAttachment(*line.split())] if line else []

View File

@@ -0,0 +1,519 @@
# Natural Language Toolkit: PropBank Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re
from functools import total_ordering
from xml.etree import ElementTree
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.internals import raise_unorderable_types
from nltk.tree import Tree
class PropbankCorpusReader(CorpusReader):
    """
    Corpus reader for the propbank corpus, which augments the Penn
    Treebank with information about the predicate argument structure
    of every verb instance.  The corpus consists of two parts: the
    predicate-argument annotations themselves, and a set of "frameset
    files" which define the argument labels used by the annotations,
    on a per-verb basis.  Each "frameset file" contains one or more
    predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
    divided into coarse-grained word senses called "rolesets".  For
    each "roleset", the frameset file provides descriptions of the
    argument roles, along with examples.
    """

    def __init__(
        self,
        root,
        propfile,
        framefiles="",
        verbsfile=None,
        parse_fileid_xform=None,
        parse_corpus=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param propfile: The name of the file containing the predicate-
            argument annotations (relative to ``root``).
        :param framefiles: A list or regexp specifying the frameset
            fileids for this corpus.
        :param verbsfile: The name of the file listing all verb lemmas
            (relative to ``root``).
        :param parse_fileid_xform: A transform that should be applied
            to the fileids in this corpus.  This should be a function
            of one argument (a fileid) that returns a string (the new
            fileid).
        :param parse_corpus: The corpus containing the parse trees
            corresponding to this corpus.  These parse trees are
            necessary to resolve the tree pointers used by propbank.
        :param encoding: The encoding used by the annotation files.
        """
        # If framefiles is specified as a regexp, expand it.
        if isinstance(framefiles, str):
            framefiles = find_corpus_fileids(root, framefiles)
        framefiles = list(framefiles)
        # Initialize the corpus reader.
        CorpusReader.__init__(self, root, [propfile, verbsfile] + framefiles, encoding)

        # Record our frame fileids & prop file.
        self._propfile = propfile
        self._framefiles = framefiles
        self._verbsfile = verbsfile
        self._parse_fileid_xform = parse_fileid_xform
        self._parse_corpus = parse_corpus

    def instances(self, baseform=None):
        """
        :param baseform: If given, restrict the view to instances whose
            predicate has this base form (lemma).
        :return: a corpus view that acts as a list of
            ``PropBankInstance`` objects, one for each verb instance
            in the corpus.
        """
        kwargs = {}
        if baseform is not None:
            kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
        return StreamBackedCorpusView(
            self.abspath(self._propfile),
            lambda stream: self._read_instance_block(stream, **kwargs),
            encoding=self.encoding(self._propfile),
        )

    def lines(self):
        """
        :return: a corpus view that acts as a list of strings, one for
            each line in the predicate-argument annotation file.
        """
        return StreamBackedCorpusView(
            self.abspath(self._propfile),
            read_line_block,
            encoding=self.encoding(self._propfile),
        )

    def roleset(self, roleset_id):
        """
        :return: the xml description for the given roleset.
        :raises ValueError: if the frameset file or the roleset is missing.
        """
        baseform = roleset_id.split(".")[0]
        framefile = "frames/%s.xml" % baseform
        if framefile not in self._framefiles:
            raise ValueError("Frameset file for %s not found" % roleset_id)

        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        with self.abspath(framefile).open() as fp:
            etree = ElementTree.parse(fp).getroot()

        for roleset in etree.findall("predicate/roleset"):
            if roleset.attrib["id"] == roleset_id:
                return roleset
        raise ValueError(f"Roleset {roleset_id} not found in {framefile}")

    def rolesets(self, baseform=None):
        """
        :param baseform: If given, restrict the result to rolesets of
            this predicate lemma.
        :return: list of xml descriptions for rolesets.
        :raises ValueError: if the frameset file for ``baseform`` is missing.
        """
        if baseform is not None:
            framefile = "frames/%s.xml" % baseform
            if framefile not in self._framefiles:
                raise ValueError("Frameset file for %s not found" % baseform)
            framefiles = [framefile]
        else:
            framefiles = self._framefiles

        rsets = []
        for framefile in framefiles:
            # n.b.: The encoding for XML fileids is specified by the file
            # itself; so we ignore self._encoding here.
            with self.abspath(framefile).open() as fp:
                etree = ElementTree.parse(fp).getroot()
            rsets.append(etree.findall("predicate/roleset"))
        return LazyConcatenation(rsets)

    def verbs(self):
        """
        :return: a corpus view that acts as a list of all verb lemmas
            in this corpus (from the verbs.txt file).
        """
        return StreamBackedCorpusView(
            self.abspath(self._verbsfile),
            read_line_block,
            encoding=self.encoding(self._verbsfile),
        )

    def _read_instance_block(self, stream, instance_filter=lambda inst: True):
        """Parse up to 100 annotation lines, keeping those accepted by
        ``instance_filter``."""
        block = []

        # Read 100 at a time.
        for i in range(100):
            line = stream.readline().strip()
            if line:
                inst = PropbankInstance.parse(
                    line, self._parse_fileid_xform, self._parse_corpus
                )
                if instance_filter(inst):
                    block.append(inst)

        return block
######################################################################
# { Propbank Instance & related datatypes
######################################################################
class PropbankInstance:
    """
    A single predicate-argument annotation: one predicate occurrence
    within one treebank sentence, together with the locations and
    labels of its arguments.
    """

    def __init__(
        self,
        fileid,
        sentnum,
        wordnum,
        tagger,
        roleset,
        inflection,
        predicate,
        arguments,
        parse_corpus=None,
    ):
        self.fileid = fileid
        """The name of the file containing the parse tree for this
        instance's sentence."""

        self.sentnum = sentnum
        """The sentence number of this sentence within ``fileid``.
        Indexing starts from zero."""

        self.wordnum = wordnum
        """The word number of this instance's predicate within its
        containing sentence.  Word numbers are indexed starting from
        zero, and include traces and other empty parse elements."""

        self.tagger = tagger
        """An identifier for the tagger who tagged this instance; or
        ``'gold'`` if this is an adjudicated instance."""

        self.roleset = roleset
        """The name of the roleset used by this instance's predicate.
        Use ``propbank.roleset() <PropbankCorpusReader.roleset>`` to
        look up information about the roleset."""

        self.inflection = inflection
        """A ``PropbankInflection`` object describing the inflection of
        this instance's predicate."""

        self.predicate = predicate
        """A ``PropbankTreePointer`` indicating the position of this
        instance's predicate within its containing sentence."""

        self.arguments = tuple(arguments)
        """A list of tuples (argloc, argid), specifying the location
        and identifier for each of the predicate's argument in the
        containing sentence.  Argument identifiers are strings such as
        ``'ARG0'`` or ``'ARGM-TMP'``.  This list does *not* contain
        the predicate."""

        self.parse_corpus = parse_corpus
        """A corpus reader for the parse trees corresponding to the
        instances in this propbank corpus."""

    @property
    def baseform(self):
        """The baseform of the predicate."""
        return self.roleset.split(".")[0]

    @property
    def sensenumber(self):
        """The sense number of the predicate."""
        return self.roleset.split(".")[1]

    @property
    def predid(self):
        """Identifier of the predicate."""
        return "rel"

    def __repr__(self):
        return "<PropbankInstance: {}, sent {}, word {}>".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
        )

    def __str__(self):
        """Serialize back to the one-line annotation-file format."""
        s = "{} {} {} {} {} {}".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
            self.tagger,
            self.roleset,
            self.inflection,
        )
        # The predicate is emitted alongside the arguments, labelled 'rel',
        # with all locations sorted.
        items = self.arguments + ((self.predicate, "rel"),)
        for argloc, argid in sorted(items):
            s += f" {argloc}-{argid}"
        return s

    def _get_tree(self):
        # Resolve the pointer lazily; None when no parse corpus is attached
        # or the fileid is not part of it.
        if self.parse_corpus is None:
            return None
        if self.fileid not in self.parse_corpus.fileids():
            return None
        return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]

    tree = property(
        _get_tree,
        doc="""
        The parse tree corresponding to this instance, or None if
        the corresponding tree is not available.""",
    )

    @staticmethod
    def parse(s, parse_fileid_xform=None, parse_corpus=None):
        """
        Parse one line of a propbank annotation file into a
        ``PropbankInstance``.

        :raises ValueError: if the line is malformed (fewer than seven
            fields, or not exactly one ``-rel`` field).
        """
        pieces = s.split()
        if len(pieces) < 7:
            raise ValueError("Badly formatted propbank line: %r" % s)

        # Divide the line into its basic pieces.
        (fileid, sentnum, wordnum, tagger, roleset, inflection) = pieces[:6]

        rel = [p for p in pieces[6:] if p.endswith("-rel")]
        args = [p for p in pieces[6:] if not p.endswith("-rel")]
        if len(rel) != 1:
            raise ValueError("Badly formatted propbank line: %r" % s)

        # Apply the fileid selector, if any.
        if parse_fileid_xform is not None:
            fileid = parse_fileid_xform(fileid)

        # Convert sentence & word numbers to ints.
        sentnum = int(sentnum)
        wordnum = int(wordnum)

        # Parse the inflection
        inflection = PropbankInflection.parse(inflection)

        # Parse the predicate location.
        predicate = PropbankTreePointer.parse(rel[0][:-4])

        # Parse the arguments.
        arguments = []
        for arg in args:
            argloc, argid = arg.split("-", 1)
            arguments.append((PropbankTreePointer.parse(argloc), argid))

        # Put it all together.
        return PropbankInstance(
            fileid,
            sentnum,
            wordnum,
            tagger,
            roleset,
            inflection,
            predicate,
            arguments,
            parse_corpus,
        )
class PropbankPointer:
    """
    A pointer used by propbank to identify one or more constituents in
    a parse tree.  ``PropbankPointer`` is an abstract base class with
    three concrete subclasses:

    - ``PropbankTreePointer`` is used to point to single constituents.
    - ``PropbankSplitTreePointer`` is used to point to 'split'
      constituents, which consist of a sequence of two or more
      ``PropbankTreePointer`` pointers.
    - ``PropbankChainTreePointer`` is used to point to entire trace
      chains in a tree.  It consists of a sequence of pieces, which
      can be ``PropbankTreePointer`` or ``PropbankSplitTreePointer`` pointers.
    """

    def __init__(self):
        # Abstract base class: refuse direct instantiation, but allow
        # subclasses to construct themselves normally.
        if type(self) is PropbankPointer:
            raise NotImplementedError()
class PropbankChainTreePointer(PropbankPointer):
    """A pointer to an entire trace chain within a parse tree."""

    def __init__(self, pieces):
        # The pieces making up this chain; each may be a
        # PropbankSplitTreePointer or a PropbankTreePointer.
        self.pieces = pieces

    def __str__(self):
        return "*".join(str(piece) for piece in self.pieces)

    def __repr__(self):
        return "<PropbankChainTreePointer: %s>" % self

    def select(self, tree):
        """Return a synthetic *CHAIN* tree over the selected pieces."""
        if tree is None:
            raise ValueError("Parse tree not available")
        selected = [piece.select(tree) for piece in self.pieces]
        return Tree("*CHAIN*", selected)
class PropbankSplitTreePointer(PropbankPointer):
    """A pointer to a 'split' constituent, made of several sub-pointers."""

    def __init__(self, pieces):
        # The pieces making up this split constituent; each is a
        # PropbankTreePointer.
        self.pieces = pieces

    def __str__(self):
        return ",".join(str(piece) for piece in self.pieces)

    def __repr__(self):
        return "<PropbankSplitTreePointer: %s>" % self

    def select(self, tree):
        """Return a synthetic *SPLIT* tree over the selected pieces."""
        if tree is None:
            raise ValueError("Parse tree not available")
        selected = [piece.select(tree) for piece in self.pieces]
        return Tree("*SPLIT*", selected)
@total_ordering
class PropbankTreePointer(PropbankPointer):
    """
    A pointer to a single constituent, written ``wordnum:height``:
    ``wordnum`` is a zero-based leaf index (including traces and other
    empty elements), and the pointer denotes the constituent reached by
    dropping the last ``height + 1`` steps from the path to that leaf.

    ``parse()`` also recognizes the composite encodings::

        wordnum:height*wordnum:height*...   (trace chains)
        wordnum:height,wordnum:height,...   (split arguments)

    and returns the appropriate pointer type for them.
    """

    def __init__(self, wordnum, height):
        self.wordnum = wordnum
        self.height = height

    @staticmethod
    def parse(s):
        """Parse a pointer string; may return a chain or split pointer."""
        # Deal with chains (xx*yy*zz)
        pieces = s.split("*")
        if len(pieces) > 1:
            return PropbankChainTreePointer(
                [PropbankTreePointer.parse(elt) for elt in pieces]
            )

        # Deal with split args (xx,yy,zz)
        pieces = s.split(",")
        if len(pieces) > 1:
            return PropbankSplitTreePointer(
                [PropbankTreePointer.parse(elt) for elt in pieces]
            )

        # Deal with normal pointers.
        pieces = s.split(":")
        if len(pieces) != 2:
            raise ValueError("bad propbank pointer %r" % s)
        return PropbankTreePointer(int(pieces[0]), int(pieces[1]))

    def __str__(self):
        return f"{self.wordnum}:{self.height}"

    def __repr__(self):
        return "PropbankTreePointer(%d, %d)" % (self.wordnum, self.height)

    def __eq__(self, other):
        # Chain and split pointers are compared via their first piece.
        while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
            other = other.pieces[0]

        if not isinstance(other, PropbankTreePointer):
            return self is other

        return self.wordnum == other.wordnum and self.height == other.height

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        # Chain and split pointers are compared via their first piece.
        while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
            other = other.pieces[0]

        if not isinstance(other, PropbankTreePointer):
            # Arbitrary but consistent ordering for unrelated types.
            return id(self) < id(other)

        # Sort by word number first; for equal word numbers, greater
        # heights sort earlier (note the negated height in the key).
        return (self.wordnum, -self.height) < (other.wordnum, -other.height)

    def select(self, tree):
        """Return the constituent of ``tree`` that this pointer denotes."""
        if tree is None:
            raise ValueError("Parse tree not available")
        return tree[self.treepos(tree)]

    def treepos(self, tree):
        """
        Convert this pointer to a standard 'tree position' pointer,
        given that it points to the given tree.
        """
        if tree is None:
            raise ValueError("Parse tree not available")
        stack = [tree]
        treepos = []

        wordnum = 0
        # Depth-first walk of the tree's leaves, maintaining the current
        # path (treepos) alongside the stack of visited nodes.
        while True:
            # tree node:
            if isinstance(stack[-1], Tree):
                # Select the next child.
                if len(treepos) < len(stack):
                    treepos.append(0)
                else:
                    treepos[-1] += 1
                # Update the stack.
                if treepos[-1] < len(stack[-1]):
                    stack.append(stack[-1][treepos[-1]])
                else:
                    # End of node's child list: pop up a level.
                    stack.pop()
                    treepos.pop()
            # word node:
            else:
                if wordnum == self.wordnum:
                    # Found the target leaf: drop the last height+1 steps
                    # of its path to reach the enclosing constituent.
                    return tuple(treepos[: len(treepos) - self.height - 1])
                else:
                    wordnum += 1
                    stack.pop()
class PropbankInflection:
    """The five-character inflection field of a propbank instance."""

    # { Inflection Form
    INFINITIVE = "i"
    GERUND = "g"
    PARTICIPLE = "p"
    FINITE = "v"
    # { Inflection Tense
    FUTURE = "f"
    PAST = "p"
    PRESENT = "n"
    # { Inflection Aspect
    PERFECT = "p"
    PROGRESSIVE = "o"
    PERFECT_AND_PROGRESSIVE = "b"
    # { Inflection Person
    THIRD_PERSON = "3"
    # { Inflection Voice
    ACTIVE = "a"
    PASSIVE = "p"
    # { Inflection
    NONE = "-"
    # }

    def __init__(self, form="-", tense="-", aspect="-", person="-", voice="-"):
        self.form = form
        self.tense = tense
        self.aspect = aspect
        self.person = person
        self.voice = voice

    def __str__(self):
        return "".join((self.form, self.tense, self.aspect, self.person, self.voice))

    def __repr__(self):
        return "<PropbankInflection: %s>" % self

    # One character per field, in order: form, tense, aspect, person, voice.
    _VALIDATE = re.compile(r"[igpv\-][fpn\-][pob\-][3\-][ap\-]$")

    @staticmethod
    def parse(s):
        """Parse a five-character inflection string (e.g. ``'vp--a'``)."""
        if not isinstance(s, str):
            raise TypeError("expected a string")
        well_formed = len(s) == 5 and PropbankInflection._VALIDATE.match(s)
        if not well_formed:
            raise ValueError("Bad propbank inflection string %r" % s)
        return PropbankInflection(*s)

View File

@@ -0,0 +1,133 @@
# Natural Language Toolkit: Pros and Cons Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for the Pros and Cons dataset.
- Pros and Cons dataset information -
Contact: Bing Liu, liub@cs.uic.edu
https://www.cs.uic.edu/~liub
Distributed with permission.
Related papers:
- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
Proceedings of the 22nd International Conference on Computational Linguistics
(Coling-2008), Manchester, 18-22 August, 2008.
- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing
Opinions on the Web". Proceedings of the 14th international World Wide Web
conference (WWW-2005), May 10-14, 2005, in Chiba, Japan.
"""
import re
from nltk.corpus.reader.api import *
from nltk.tokenize import *
class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    Reader for the Pros and Cons sentence dataset.

        >>> from nltk.corpus import pros_cons
        >>> pros_cons.sents(categories='Cons') # doctest: +NORMALIZE_WHITESPACE
        [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy',
        'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'],
        ...]
        >>> pros_cons.words('IntegratedPros.txt')
        ['Easy', 'to', 'use', ',', 'economical', '!', ...]
    """

    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WordPunctTokenizer(),
        encoding="utf8",
        **kwargs
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
        self._word_tokenizer = word_tokenizer

    def sents(self, fileids=None, categories=None):
        """
        Return all sentences in the corpus or in the specified files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences
            have to be returned.
        :return: the given file(s) as a list of sentences. Each sentence is
            tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None, categories=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have
            to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_sent_block(self, stream):
        """Read up to 20 lines, returning the tokenized <Pros>/<Cons> sentences."""
        sents = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # NOTE(review): readline() returns "" only at EOF, so this
                # just idles through the remaining iterations; ``break``
                # would be equivalent and slightly cheaper.
                continue
            # Annotated lines look like "<Pros>text</Pros>" (or Cons);
            # group(2) is the sentence text between the tags.
            sent = re.match(r"^(?!\n)\s*<(Pros|Cons)>(.*)</(?:Pros|Cons)>", line)
            if sent:
                sents.append(self._word_tokenizer.tokenize(sent.group(2).strip()))
        return sents

    def _read_word_block(self, stream):
        """Flatten one sentence block into a list of tokens."""
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words

View File

@@ -0,0 +1,331 @@
# Natural Language Toolkit: Product Reviews Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for reviews corpora (syntax based on Customer Review Corpus).
Customer Review Corpus information
==================================
Annotated by: Minqing Hu and Bing Liu, 2004.
Department of Computer Science
University of Illinois at Chicago
Contact: Bing Liu, liub@cs.uic.edu
https://www.cs.uic.edu/~liub
Distributed with permission.
The "product_reviews_1" and "product_reviews_2" datasets respectively contain
annotated customer reviews of 5 and 9 products from amazon.com.
Related papers:
- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
Proceedings of the ACM SIGKDD International Conference on Knowledge
Discovery & Data Mining (KDD-04), 2004.
- Minqing Hu and Bing Liu. "Mining Opinion Features in Customer Reviews".
Proceedings of the Nineteenth National Conference on Artificial Intelligence
(AAAI-2004), 2004.
- Xiaowen Ding, Bing Liu and Philip S. Yu. "A Holistic Lexicon-Based Approach to
Opinion Mining." Proceedings of First ACM International Conference on Web
Search and Data Mining (WSDM-2008), Feb 11-12, 2008, Stanford University,
Stanford, California, USA.
Symbols used in the annotated reviews:
:[t]: the title of the review: Each [t] tag starts a review.
:xxxx[+|-n]: xxxx is a product feature.
:[+n]: Positive opinion, n is the opinion strength: 3 strongest, and 1 weakest.
Note that the strength is quite subjective.
You may want to ignore it and only consider + and -
:[-n]: Negative opinion
:##: start of each sentence. Each line is a sentence.
:[u]: feature not appeared in the sentence.
:[p]: feature not appeared in the sentence. Pronoun resolution is needed.
:[s]: suggestion or recommendation.
:[cc]: comparison with a competing product from a different brand.
:[cs]: comparison with a competing product from the same brand.
Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not
provide separation between different reviews. This is due to the fact that
the dataset was specifically designed for aspect/feature-based sentiment
analysis, for which sentence-level annotation is sufficient. For document-
level classification and analysis, this peculiarity should be taken into
consideration.
"""
import re
from nltk.corpus.reader.api import *
from nltk.tokenize import *
# Regexps for the markup used in the Customer Review Data files; see the
# module docstring above for the full annotation scheme.
TITLE = re.compile(r"^\[t\](.*)$")  # [t] Title
FEATURES = re.compile(
    r"((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]"
)  # find 'feature' in feature[+3]
NOTES = re.compile(r"\[(?!t)(p|u|s|cc|cs)\]")  # find 'p' in camera[+2][p]
SENT = re.compile(r"##(.*)$")  # find tokenized sentence
class Review:
    """
    A Review is the main block of a ReviewsCorpusReader.
    """

    def __init__(self, title=None, review_lines=None):
        """
        :param title: the title of the review.
        :param review_lines: the list of the ReviewLines that belong to the Review.
        """
        self.title = title
        self.review_lines = [] if review_lines is None else review_lines

    def add_line(self, review_line):
        """
        Add a line (ReviewLine) to the review.

        :param review_line: a ReviewLine instance that belongs to the Review.
        """
        assert isinstance(review_line, ReviewLine)
        self.review_lines.append(review_line)

    def features(self):
        """
        Return a list of features in the review. Each feature is a tuple made of
        the specific item feature and the opinion strength about that feature.

        :return: all features of the review as a list of tuples (feat, score).
        :rtype: list(tuple)
        """
        return [feat for line in self.review_lines for feat in line.features]

    def sents(self):
        """
        Return all tokenized sentences in the review.

        :return: all sentences of the review as lists of tokens.
        :rtype: list(list(str))
        """
        return [line.sent for line in self.review_lines]

    def __repr__(self):
        return 'Review(title="{}", review_lines={})'.format(
            self.title, self.review_lines
        )
class ReviewLine:
    """
    A ReviewLine represents a sentence of the review, together with (optional)
    annotations of its features and notes about the reviewed item.
    """

    def __init__(self, sent, features=None, notes=None):
        self.sent = sent
        self.features = [] if features is None else features
        self.notes = [] if notes is None else notes

    def __repr__(self):
        return "ReviewLine(features={}, notes={}, sent={})".format(
            self.features, self.notes, self.sent
        )
class ReviewsCorpusReader(CorpusReader):
    """
    Reader for the Customer Review Data dataset by Hu, Liu (2004).
    Note: we are not applying any sentence tokenization at the moment, just word
    tokenization.

    >>> from nltk.corpus import product_reviews_1
    >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt')
    >>> review = camera_reviews[0]
    >>> review.sents()[0] # doctest: +NORMALIZE_WHITESPACE
    ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am',
    'extremely', 'satisfied', 'with', 'the', 'purchase', '.']
    >>> review.features() # doctest: +NORMALIZE_WHITESPACE
    [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'),
    ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'),
    ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'),
    ('option', '+1')]

    We can also reach the same information directly from the stream:

    >>> product_reviews_1.features('Canon_G3.txt')
    [('canon powershot g3', '+3'), ('use', '+2'), ...]

    We can compute stats for specific product features:

    >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
    >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
    >>> mean = tot / n_reviews
    >>> print(n_reviews, tot, mean)
    15 24 1.6
    """

    # Streaming view class used by all of the reading methods below.
    CorpusView = StreamBackedCorpusView

    def __init__(
        self, root, fileids, word_tokenizer=WordPunctTokenizer(), encoding="utf8"
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._readme = "README.txt"

    def features(self, fileids=None):
        """
        Return a list of features. Each feature is a tuple made of the specific
        item feature and the opinion strength about that feature.

        :param fileids: a list or regexp specifying the ids of the files whose
            features have to be returned.
        :return: all features for the item(s) in the given file(s).
        :rtype: list(tuple)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(fileid, self._read_features, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def reviews(self, fileids=None):
        """
        Return all the reviews as a list of Review objects. If `fileids` is
        specified, return all the reviews from each of the specified files.

        :param fileids: a list or regexp specifying the ids of the files whose
            reviews have to be returned.
        :return: the given file(s) as a list of reviews.
        """
        # NOTE(review): unlike features(), a bare string fileid is not wrapped
        # in a list here; presumably self.abspaths() normalizes it — confirm.
        if fileids is None:
            fileids = self._fileids
        return concat(
            [
                self.CorpusView(fileid, self._read_review_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def sents(self, fileids=None):
        """
        Return all sentences in the corpus or in the specified files.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: the given file(s) as a list of sentences, each encoded as a
            list of word strings.
        :rtype: list(list(str))
        """
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        files.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_features(self, stream):
        # Collect every "feature[+n]"/"feature[-n]" annotation from up to 20
        # lines of the stream; returning early on EOF.
        features = []
        for i in range(20):
            line = stream.readline()
            if not line:
                return features
            features.extend(re.findall(FEATURES, line))
        return features

    def _read_review_block(self, stream):
        # Phase 1: scan forward for the "[t]" title line that opens a review.
        while True:
            line = stream.readline()
            if not line:
                return []  # end of file.
            title_match = re.match(TITLE, line)
            if title_match:
                review = Review(
                    title=title_match.group(1).strip()
                )  # We create a new review
                break

        # Phase 2: accumulate ReviewLines until the next review or EOF.
        # Scan until we find another line matching the regexp, or EOF.
        while True:
            oldpos = stream.tell()
            line = stream.readline()
            # End of file:
            if not line:
                return [review]
            # Start of a new review: backup to just before it starts, and
            # return the review we've already collected.
            if re.match(TITLE, line):
                stream.seek(oldpos)
                return [review]
            # Anything else is part of the review line.
            feats = re.findall(FEATURES, line)
            notes = re.findall(NOTES, line)
            sent = re.findall(SENT, line)
            if sent:
                # Tokenize the text that followed the "##" sentence marker.
                sent = self._word_tokenizer.tokenize(sent[0])
            review_line = ReviewLine(sent=sent, features=feats, notes=notes)
            review.add_line(review_line)

    def _read_sent_block(self, stream):
        # One block = the sentences of one review.
        sents = []
        for review in self._read_review_block(stream):
            sents.extend([sent for sent in review.sents()])
        return sents

    def _read_word_block(self, stream):
        # Tokenize the "##"-marked sentences found in up to 20 lines.
        # (At EOF readline() yields "" and the SENT regexp finds nothing,
        # so the loop is a harmless no-op for the remaining iterations.)
        words = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            sent = re.findall(SENT, line)
            if sent:
                words.extend(self._word_tokenizer.tokenize(sent[0]))
        return words

View File

@@ -0,0 +1,146 @@
# Natural Language Toolkit: RTE Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for the Recognizing Textual Entailment (RTE) Challenge Corpora.
The files were taken from the RTE1, RTE2 and RTE3 datasets and the files
were regularized.
Filenames are of the form rte*_dev.xml and rte*_test.xml. The latter are the
gold standard annotated files.
Each entailment corpus is a list of 'text'/'hypothesis' pairs. The following
example is taken from RTE3::
<pair id="1" entailment="YES" task="IE" length="short" >
<t>The sale was made to pay Yukos' US$ 27.5 billion tax bill,
Yuganskneftegaz was originally sold for US$ 9.4 billion to a little known
company Baikalfinansgroup which was later bought by the Russian
state-owned oil company Rosneft .</t>
<h>Baikalfinansgroup was sold to Rosneft.</h>
</pair>
In order to provide globally unique IDs for each pair, a new attribute
``challenge`` has been added to the root element ``entailment-corpus`` of each
file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the
challenge number and 'n' is the pair ID.
"""
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.xmldocs import *
def norm(value_string):
    """
    Normalize the string value in an RTE pair's ``value`` or ``entailment``
    attribute as an integer (1, 0).

    :param value_string: the label used to classify a text/hypothesis pair
    :type value_string: str
    :rtype: int
    """
    return {"TRUE": 1, "YES": 1, "FALSE": 0, "NO": 0}[value_string.upper()]
class RTEPair:
    """
    Container for RTE text-hypothesis pairs.

    The entailment relation is signalled by the ``value`` attribute in RTE1, and by
    ``entailment`` in RTE2 and RTE3. These both get mapped on to the ``entailment``
    attribute of this class.
    """

    def __init__(
        self,
        pair,
        challenge=None,
        id=None,
        text=None,
        hyp=None,
        value=None,
        task=None,
        length=None,
    ):
        """
        :param challenge: version of the RTE challenge (i.e., RTE1, RTE2 or RTE3)
        :param id: identifier for the pair
        :param text: the text component of the pair
        :param hyp: the hypothesis component of the pair
        :param value: classification label for the pair
        :param task: attribute for the particular NLP task that the data was drawn from
        :param length: attribute for the length of the text of the pair
        """
        attribs = pair.attrib
        self.challenge = challenge
        self.id = attribs["id"]
        self.gid = f"{self.challenge}-{self.id}"
        self.text = pair[0].text
        self.hyp = pair[1].text
        # RTE1 stores the label under "value"; RTE2/RTE3 under "entailment".
        # Fall back to the keyword argument when neither attribute exists.
        if "value" in attribs:
            self.value = norm(attribs["value"])
        elif "entailment" in attribs:
            self.value = norm(attribs["entailment"])
        else:
            self.value = value
        self.task = attribs.get("task", task)
        self.length = attribs.get("length", length)

    def __repr__(self):
        if self.challenge:
            return f"<RTEPair: gid={self.challenge}-{self.id}>"
        return "<RTEPair: id=%s>" % self.id
class RTECorpusReader(XMLCorpusReader):
    """
    Corpus reader for corpora in RTE challenges.

    This is just a wrapper around the XMLCorpusReader. See module docstring above for the expected
    structure of input documents.
    """

    def _read_etree(self, doc):
        """
        Map the XML input into RTEPairs.

        This uses ElementTree's ``iter()`` to find all the ``<pair>``
        elements of the document.

        :param doc: a parsed XML document
        :rtype: list(RTEPair)
        """
        challenge = doc.attrib.get("challenge")
        return [RTEPair(pair, challenge=challenge) for pair in doc.iter("pair")]

    def pairs(self, fileids):
        """
        Build a list of RTEPairs from a RTE corpus.

        :param fileids: a list of RTE corpus fileids
        :type: list
        :rtype: list(RTEPair)
        """
        if isinstance(fileids, str):
            fileids = [fileids]
        return concat([self._read_etree(self.xml(f)) for f in fileids])

View File

@@ -0,0 +1,296 @@
# Natural Language Toolkit: SemCor Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Nathan Schneider <nschneid@cs.cmu.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for the SemCor Corpus.
"""
__docformat__ = "epytext en"
from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
from nltk.tree import Tree
class SemcorCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the SemCor Corpus.
    For access to the complete XML data structure, use the ``xml()``
    method. For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
    """

    def __init__(self, root, fileids, wordnet, lazy=True):
        """
        :param root: the root directory of the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param wordnet: the WordNet corpus reader used to resolve sense keys
            into Lemma objects.
        :param lazy: if true, read files through a streaming XML view
            (SemcorWordView) instead of parsing each document eagerly.
        """
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy
        self._wordnet = wordnet

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return self._items(fileids, "word", False, False, False)

    def chunks(self, fileids=None):
        """
        :return: the given file(s) as a list of chunks,
            each of which is a list of words and punctuation symbols
            that form a unit.
        :rtype: list(list(str))
        """
        return self._items(fileids, "chunk", False, False, False)

    # NOTE(review): the default expression ("pos" or "sem" or "both") always
    # evaluates to "pos" (the first truthy operand); it reads like a set of
    # choices but is effectively tag="pos".
    def tagged_chunks(self, fileids=None, tag=("pos" or "sem" or "both")):
        """
        :return: the given file(s) as a list of tagged chunks, represented
            in tree form.
        :rtype: list(Tree)

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include. Semantic tags consist of
            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
            without a specific entry in WordNet. (Named entities of type 'other'
            have no lemma. Other chunks not in WordNet have no semantic tag.
            Punctuation tokens have `None` for their part of speech tag.)
        """
        return self._items(fileids, "chunk", False, tag != "sem", tag != "pos")

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of word strings.
        :rtype: list(list(str))
        """
        return self._items(fileids, "word", True, False, False)

    def chunk_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of chunks.
        :rtype: list(list(list(str)))
        """
        return self._items(fileids, "chunk", True, False, False)

    # NOTE(review): as in tagged_chunks(), this default evaluates to "pos".
    def tagged_sents(self, fileids=None, tag=("pos" or "sem" or "both")):
        """
        :return: the given file(s) as a list of sentences. Each sentence
            is represented as a list of tagged chunks (in tree form).
        :rtype: list(list(Tree))

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include. Semantic tags consist of
            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
            without a specific entry in WordNet. (Named entities of type 'other'
            have no lemma. Other chunks not in WordNet have no semantic tag.
            Punctuation tokens have `None` for their part of speech tag.)
        """
        return self._items(fileids, "chunk", True, tag != "sem", tag != "pos")

    def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag):
        # Choose between the lazy streaming view (SemcorWordView) and the
        # eager parser (self._words), then concatenate per-file results.
        if unit == "word" and not bracket_sent:
            # the result of the SemcorWordView may be a multiword unit, so the
            # LazyConcatenation will make sure the sentence is flattened
            _ = lambda *args: LazyConcatenation(
                (SemcorWordView if self._lazy else self._words)(*args)
            )
        else:
            _ = SemcorWordView if self._lazy else self._words
        return concat(
            [
                _(fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet)
                for fileid in self.abspaths(fileids)
            ]
        )

    def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
        """
        Helper used to implement the view methods -- returns a list of
        tokens, (segmented) words, chunks, or sentences. The tokens
        and chunks may optionally be tagged (with POS and sense
        information).

        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        assert unit in ("token", "word", "chunk")
        result = []

        xmldoc = ElementTree.parse(fileid).getroot()
        for xmlsent in xmldoc.findall(".//s"):
            sent = []
            for xmlword in _all_xmlwords_in(xmlsent):
                itm = SemcorCorpusReader._word(
                    xmlword, unit, pos_tag, sem_tag, self._wordnet
                )
                if unit == "word":
                    # a multiword token expands to several word strings
                    sent.extend(itm)
                else:
                    sent.append(itm)

            if bracket_sent:
                result.append(SemcorSentence(xmlsent.attrib["snum"], sent))
            else:
                result.extend(sent)

        assert None not in result
        return result

    @staticmethod
    def _word(xmlword, unit, pos_tag, sem_tag, wordnet):
        """
        Convert one <wf>/<punc> element into the requested representation:
        a plain token (or tuple of token + tags), a list of word strings,
        or a (possibly tagged) chunk.
        """
        tkn = xmlword.text
        if not tkn:
            tkn = ""  # fixes issue 337?

        lemma = xmlword.get("lemma", tkn)  # lemma or NE class
        lexsn = xmlword.get("lexsn")  # lex_sense (locator for the lemma's sense)
        if lexsn is not None:
            sense_key = lemma + "%" + lexsn
            wnpos = ("n", "v", "a", "r", "s")[
                int(lexsn.split(":")[0]) - 1
            ]  # see http://wordnet.princeton.edu/man/senseidx.5WN.html
        else:
            sense_key = wnpos = None

        # NOTE(review): `redef` is computed but never used below.
        redef = xmlword.get(
            "rdf", tkn
        )  # redefinition--this indicates the lookup string
        # does not exactly match the enclosed string, e.g. due to typographical adjustments
        # or discontinuity of a multiword expression. If a redefinition has occurred,
        # the "rdf" attribute holds its inflected form and "lemma" holds its lemma.
        # For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class).

        sensenum = xmlword.get("wnsn")  # WordNet sense number
        isOOVEntity = "pn" in xmlword.keys()  # a "personal name" (NE) not in WordNet
        pos = xmlword.get(
            "pos"
        )  # part of speech for the whole chunk (None for punctuation)

        if unit == "token":
            if not pos_tag and not sem_tag:
                itm = tkn
            else:
                itm = (
                    (tkn,)
                    + ((pos,) if pos_tag else ())
                    + ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ())
                )
            return itm
        else:
            # Multiword units are joined with "_" in the source XML.
            ww = tkn.split("_")  # TODO: case where punctuation intervenes in MWE

            if unit == "word":
                return ww
            else:
                if sensenum is not None:
                    try:
                        sense = wordnet.lemma_from_key(sense_key)  # Lemma object
                    except Exception:
                        # cannot retrieve the wordnet.Lemma object. possible reasons:
                        # (a) the wordnet corpus is not downloaded;
                        # (b) a nonexistent sense is annotated: e.g., such.s.00 triggers:
                        # nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00'
                        # solution: just use the lemma name as a string
                        try:
                            sense = "%s.%s.%02d" % (
                                lemma,
                                wnpos,
                                int(sensenum),
                            )  # e.g.: reach.v.02
                        except ValueError:
                            sense = (
                                lemma + "." + wnpos + "." + sensenum
                            )  # e.g. the sense number may be "2;1"

                bottom = [Tree(pos, ww)] if pos_tag else ww

                if sem_tag and isOOVEntity:
                    if sensenum is not None:
                        return Tree(sense, [Tree("NE", bottom)])
                    else:  # 'other' NE
                        return Tree("NE", bottom)
                elif sem_tag and sensenum is not None:
                    return Tree(sense, bottom)
                elif pos_tag:
                    return bottom[0]
                else:
                    return bottom  # chunk as a list
def _all_xmlwords_in(elt, result=None):
    """Collect, in document order, every <wf>/<punc> descendant of *elt*."""
    collected = [] if result is None else result
    for node in elt:
        if node.tag in ("wf", "punc"):
            collected.append(node)
        else:
            # Non-terminal wrapper element: recurse into it.
            _all_xmlwords_in(node, collected)
    return collected
class SemcorSentence(list):
    """
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (the ``n`` attribute from the XML).
    """

    def __init__(self, num, items):
        super().__init__(items)
        self.num = num
class SemcorWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the SemCor corpus.
    """

    def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet):
        """
        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        # Pull whole <s> elements when sentence bracketing is requested;
        # otherwise stream the individual word/punctuation elements.
        if bracket_sent:
            tagspec = ".*/s"
        else:
            tagspec = ".*/s/(punc|wf)"

        self._unit = unit
        self._sent = bracket_sent
        self._pos_tag = pos_tag
        self._sem_tag = sem_tag
        self._wordnet = wordnet

        XMLCorpusView.__init__(self, fileid, tagspec)

    def handle_elt(self, elt, context):
        # Called by XMLCorpusView for each element matching the tagspec.
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        # Delegate the per-token conversion to the reader's shared helper.
        return SemcorCorpusReader._word(
            elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet
        )

    def handle_sent(self, elt):
        sent = []
        for child in elt:
            if child.tag in ("wf", "punc"):
                itm = self.handle_word(child)
                if self._unit == "word":
                    # a multiword token expands to several word strings
                    sent.extend(itm)
                else:
                    sent.append(itm)
            else:
                raise ValueError("Unexpected element %s" % child.tag)
        return SemcorSentence(elt.attrib["snum"], sent)

View File

@@ -0,0 +1,196 @@
# Natural Language Toolkit: Senseval 2 Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Steven Bird <stevenbird1@gmail.com> (modifications)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Read from the Senseval 2 Corpus.
SENSEVAL [http://www.senseval.org/]
Evaluation exercises for Word Sense Disambiguation.
Organized by ACL-SIGLEX [https://www.siglex.org/]
Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,
https://www.d.umn.edu/~tpederse/data.html
Distributed with permission.
The NLTK version of the Senseval 2 files uses well-formed XML.
Each instance of the ambiguous words "hard", "interest", "line", and "serve"
is tagged with a sense identifier, and supplied with context.
"""
import re
from xml.etree import ElementTree
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tokenize import *
class SensevalInstance:
    """
    One tagged occurrence of an ambiguous word, with the position of the
    head word inside its tokenized context and the annotated sense ids.
    """

    def __init__(self, word, position, context, senses):
        self.word = word
        self.senses = tuple(senses)
        self.position = position
        self.context = context

    def __repr__(self):
        fmt = "SensevalInstance(word=%r, position=%r, context=%r, senses=%r)"
        return fmt % (self.word, self.position, self.context, self.senses)
class SensevalCorpusReader(CorpusReader):
    """Corpus reader for the NLTK version of the Senseval 2 corpus files."""

    def instances(self, fileids=None):
        """Return every SensevalInstance found in the given files."""
        views = [
            SensevalCorpusView(fileid, enc)
            for (fileid, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)

    def _entry(self, tree):
        """Extract (senseid, tagged-context) pairs from a parsed tree."""
        return [
            (inst[0].attrib["senseid"], [(w.text, w.attrib["pos"]) for w in inst[1]])
            for lexelt in tree.findall("lexelt")
            for inst in lexelt.findall("instance")
        ]
class SensevalCorpusView(StreamBackedCorpusView):
    """
    Stream-backed view that yields one ``SensevalInstance`` per block,
    remembering which ``<lexelt>`` element each stream offset falls in.
    """

    def __init__(self, fileid, encoding):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)

        self._word_tokenizer = WhitespaceTokenizer()
        # Parallel lists mapping stream offsets to lexical-element names, so
        # that a seek back into the file can recover the enclosing <lexelt>.
        self._lexelt_starts = [0]  # list of streampos
        self._lexelts = [None]  # list of lexelt names

    def read_block(self, stream):
        # Decide which lexical element we're in.
        lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell()) - 1
        lexelt = self._lexelts[lexelt_num]

        instance_lines = []
        in_instance = False
        while True:
            line = stream.readline()
            if line == "":
                # EOF must not occur in the middle of an <instance>.
                assert instance_lines == []
                return []

            # Start of a lexical element?
            if line.lstrip().startswith("<lexelt"):
                lexelt_num += 1
                m = re.search("item=(\"[^\"]+\"|'[^']+')", line)
                assert m is not None  # <lexelt> has no 'item=...'
                lexelt = m.group(1)[1:-1]
                if lexelt_num < len(self._lexelts):
                    # Re-visiting a lexelt we already indexed: sanity-check it.
                    assert lexelt == self._lexelts[lexelt_num]
                else:
                    self._lexelts.append(lexelt)
                    self._lexelt_starts.append(stream.tell())

            # Start of an instance?
            if line.lstrip().startswith("<instance"):
                assert instance_lines == []
                in_instance = True

            # Body of an instance?
            if in_instance:
                instance_lines.append(line)

            # End of an instance?
            if line.lstrip().startswith("</instance"):
                xml_block = "\n".join(instance_lines)
                # Repair the pseudo-XML before handing it to ElementTree.
                xml_block = _fixXML(xml_block)
                inst = ElementTree.fromstring(xml_block)
                return [self._parse_instance(inst, lexelt)]

    def _parse_instance(self, instance, lexelt):
        # Walk the <instance> element, collecting sense ids from <answer>
        # children and (word, pos) tokens from the <context> child; the
        # <head> element marks the position of the ambiguous word.
        senses = []
        context = []
        position = None
        for child in instance:
            if child.tag == "answer":
                senses.append(child.attrib["senseid"])
            elif child.tag == "context":
                context += self._word_tokenizer.tokenize(child.text)
                for cword in child:
                    if cword.tag == "compound":
                        cword = cword[0]  # is this ok to do?

                    if cword.tag == "head":
                        # Some sanity checks:
                        assert position is None, "head specified twice"
                        assert cword.text.strip() or len(cword) == 1
                        assert not (cword.text.strip() and len(cword) == 1)
                        # Record the position of the head:
                        position = len(context)
                        # Add on the head word itself:
                        if cword.text.strip():
                            context.append(cword.text.strip())
                        elif cword[0].tag == "wf":
                            context.append((cword[0].text, cword[0].attrib["pos"]))
                            if cword[0].tail:
                                context += self._word_tokenizer.tokenize(cword[0].tail)
                        else:
                            assert False, "expected CDATA or wf in <head>"
                    elif cword.tag == "wf":
                        context.append((cword.text, cword.attrib["pos"]))
                    elif cword.tag == "s":
                        pass  # Sentence boundary marker.

                    else:
                        print("ACK", cword.tag)
                        assert False, "expected CDATA or <wf> or <head>"
                    if cword.tail:
                        context += self._word_tokenizer.tokenize(cword.tail)
            else:
                assert False, "unexpected tag %s" % child.tag
        return SensevalInstance(lexelt, position, context, senses)
def _fixXML(text):
    """
    Fix the various issues with Senseval pseudo-XML.
    """
    # Each (pattern, replacement) pair repairs one known defect.  They are
    # applied strictly in order: e.g. lone '&' characters are escaped to
    # '&amp;' before the later pass strips every remaining bare '&'.
    repairs = (
        # <~> or <^> => ~ or ^
        (r"<([~\^])>", r"\1"),
        # fix lone &
        (r"(\s+)\&(\s+)", r"\1&amp;\2"),
        # fix """
        (r'"""', "'\"'"),
        # fix <s snum=dd> => <s snum="dd"/>
        (r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>'),
        # fix foreign word tag
        (r"<\&frasl>\s*<p[^>]*>", "FRASL"),
        # remove <&I .>
        (r"<\&I[^>]*>", ""),
        # fix <{word}>
        (r"<{([^}]+)}>", r"\1"),
        # remove <@>, <p>, </p>
        (r"<(@|/?p)>", r""),
        # remove <&M .> and <&T .> and <&Ms .>
        (r"<&\w+ \.>", r""),
        # remove <!DOCTYPE... > lines
        (r"<!DOCTYPE[^>]*>", r""),
        # remove <[hi]> and <[/p]> etc
        (r"<\[\/?[^>]+\]*>", r""),
        # take the thing out of the brackets: <&hellip;>
        (r"<(\&\w+;)>", r"\1"),
        # and remove the & for those patterns that aren't regular XML
        (r"&(?!amp|gt|lt|apos|quot)", r""),
        # fix 'abc <p="foo"/>' style tags - now <wf pos="foo">abc</wf>
        (r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>', r' <wf pos="\2">\1</wf>'),
        (r'\s*"\s*<p=\'"\'/>', " <wf pos='\"'>\"</wf>"),
    )
    for pattern, replacement in repairs:
        text = re.sub(pattern, replacement, text)
    return text

View File

@@ -0,0 +1,136 @@
# Natural Language Toolkit: SentiWordNet
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Christopher Potts <cgpotts@stanford.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
An NLTK interface for SentiWordNet
SentiWordNet is a lexical resource for opinion mining.
SentiWordNet assigns to each synset of WordNet three
sentiment scores: positivity, negativity, and objectivity.
For details about SentiWordNet see:
http://sentiwordnet.isti.cnr.it/
>>> from nltk.corpus import sentiwordnet as swn
>>> print(swn.senti_synset('breakdown.n.03'))
<breakdown.n.03: PosScore=0.0 NegScore=0.25>
>>> list(swn.senti_synsets('slow'))
[SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'),\
SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),\
SentiSynset('slow.a.02'), SentiSynset('dense.s.04'),\
SentiSynset('slow.a.04'), SentiSynset('boring.s.01'),\
SentiSynset('dull.s.05'), SentiSynset('slowly.r.01'),\
SentiSynset('behind.r.03')]
>>> happy = swn.senti_synsets('happy', 'a')
>>> happy0 = list(happy)[0]
>>> happy0.pos_score()
0.875
>>> happy0.neg_score()
0.0
>>> happy0.obj_score()
0.125
"""
import re
from nltk.corpus.reader import CorpusReader
class SentiWordNetCorpusReader(CorpusReader):
    """
    Corpus reader for the SentiWordNet lexical resource.

    Maps each WordNet synset, identified by its (pos, offset) pair, to a
    (positive score, negative score) pair parsed from the single data file.
    """

    def __init__(self, root, fileids, encoding="utf-8"):
        """
        Construct a new SentiWordNet Corpus Reader, using data from
        the specified file.

        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus;
            exactly one file is expected.
        :param encoding: the encoding used to read the corpus file.
        :raises ValueError: if zero or more than one fileid is resolved.
        """
        super().__init__(root, fileids, encoding=encoding)
        if len(self._fileids) != 1:
            raise ValueError("Exactly one file must be specified")
        self._db = {}  # (pos, offset) -> (pos_score, neg_score)
        self._parse_src_file()

    def _parse_src_file(self):
        """Populate ``self._db`` from the tab-separated source file."""
        # Close the stream explicitly: the previous version left the corpus
        # file handle open for the lifetime of the reader.
        fp = self.open(self._fileids[0])
        try:
            lines = fp.read().splitlines()
        finally:
            fp.close()
        # Skip comment lines (leading '#', possibly after whitespace).
        lines = filter((lambda x: not re.search(r"^\s*#", x)), lines)
        for i, line in enumerate(lines):
            fields = [field.strip() for field in re.split(r"\t+", line)]
            try:
                pos, offset, pos_score, neg_score, synset_terms, gloss = fields
            except ValueError as e:
                # Unpacking the wrong number of fields raises ValueError;
                # the previous `except BaseException` also swallowed
                # KeyboardInterrupt/SystemExit, which it should not.
                raise ValueError(f"Line {i} formatted incorrectly: {line}\n") from e
            if pos and offset:
                offset = int(offset)
                self._db[(pos, offset)] = (float(pos_score), float(neg_score))

    def senti_synset(self, *vals):
        """
        Return the SentiSynset for either a (pos, offset) pair or a synset
        name such as ``'breakdown.n.03'``; ``None`` if it is not in the db.
        """
        from nltk.corpus import wordnet as wn

        if tuple(vals) in self._db:
            pos_score, neg_score = self._db[tuple(vals)]
            pos, offset = vals
            # Satellite adjectives ("s") are looked up as adjectives ("a").
            if pos == "s":
                pos = "a"
            synset = wn.synset_from_pos_and_offset(pos, offset)
            return SentiSynset(pos_score, neg_score, synset)
        else:
            synset = wn.synset(vals[0])
            pos = synset.pos()
            if pos == "s":
                pos = "a"
            offset = synset.offset()
            if (pos, offset) in self._db:
                pos_score, neg_score = self._db[(pos, offset)]
                return SentiSynset(pos_score, neg_score, synset)
            else:
                return None

    def senti_synsets(self, string, pos=None):
        """
        Return an iterator of SentiSynsets for every WordNet synset of
        *string* (optionally restricted to *pos*) found in the database.
        """
        from nltk.corpus import wordnet as wn

        sentis = []
        synset_list = wn.synsets(string, pos)
        for synset in synset_list:
            sentis.append(self.senti_synset(synset.name()))
        # Drop synsets that are absent from the SentiWordNet database (None).
        sentis = filter(lambda x: x, sentis)
        return sentis

    def all_senti_synsets(self):
        """Yield a SentiSynset for every entry in the database."""
        from nltk.corpus import wordnet as wn

        for key, fields in self._db.items():
            pos, offset = key
            pos_score, neg_score = fields
            synset = wn.synset_from_pos_and_offset(pos, offset)
            yield SentiSynset(pos_score, neg_score, synset)
class SentiSynset:
    """Wrap a WordNet synset together with its SentiWordNet scores."""

    def __init__(self, pos_score, neg_score, synset):
        self._pos_score = pos_score
        self._neg_score = neg_score
        # Objectivity is defined as the mass left over after pos and neg.
        self._obj_score = 1.0 - (self._pos_score + self._neg_score)
        self.synset = synset

    def pos_score(self):
        """Return the positivity score."""
        return self._pos_score

    def neg_score(self):
        """Return the negativity score."""
        return self._neg_score

    def obj_score(self):
        """Return the objectivity score (1 - pos - neg)."""
        return self._obj_score

    def __str__(self):
        """Prints just the Pos/Neg scores for now."""
        return "<{}: PosScore={} NegScore={}>".format(
            self.synset.name(), self._pos_score, self._neg_score
        )

    def __repr__(self):
        return "Senti" + repr(self.synset)

View File

@@ -0,0 +1,75 @@
# Natural Language Toolkit: Sinica Treebank Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Sinica Treebank Corpus Sample
http://rocling.iis.sinica.edu.tw/CKIP/engversion/treebank.htm
10,000 parsed sentences, drawn from the Academia Sinica Balanced
Corpus of Modern Chinese. Parse tree notation is based on
Information-based Case Grammar. Tagset documentation is available
at https://www.sinica.edu.tw/SinicaCorpus/modern_e_wordtype.html
Language and Knowledge Processing Group, Institute of Information
Science, Academia Sinica
The data is distributed with the Natural Language Toolkit under the terms of
the Creative Commons Attribution-NonCommercial-ShareAlike License
[https://creativecommons.org/licenses/by-nc-sa/2.5/].
References:
Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999)
The Construction of Sinica Treebank. Computational Linguistics and
Chinese Language Processing, 4, pp 87-104.
Huang Chu-Ren, Keh-Jiann Chen, Feng-Yi Chen, Keh-Jiann Chen, Zhao-Ming
Gao, and Kuang-Yu Chen. 2000. Sinica Treebank: Design Criteria,
Annotation Guidelines, and On-line Interface. Proceedings of 2nd
Chinese Language Processing Workshop, Association for Computational
Linguistics.
Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar
Extraction, Proceedings of IJCNLP-04, pp560-565.
"""
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tag import map_tag
from nltk.tree import sinica_parse
# Strips the leading "#<sentence-id> " marker from each treebank line.
IDENTIFIER = re.compile(r"^#\S+\s")
# Strips the trailing "#..." appendix that follows the final close-paren.
APPENDIX = re.compile(r"(?<=\))#.*$")
# Captures (tag, word) pairs written as ":tag:word" in the parse string.
TAGWORD = re.compile(r":([^:()|]+):([^:()|]+)")
# Captures only the word component of each ":tag:word" pair.
WORD = re.compile(r":[^:()|]+:([^:()|]+)")
class SinicaTreebankCorpusReader(SyntaxCorpusReader):
    """
    Reader for the sinica treebank.
    """

    def _read_block(self, stream):
        # One sentence per line: drop the "#id " prefix and the trailing
        # "#..." appendix, then hand back the bare parse string.
        raw = stream.readline()
        raw = IDENTIFIER.sub("", raw)
        return [APPENDIX.sub("", raw)]

    def _parse(self, sent):
        # Bracket parsing is delegated to the dedicated sinica parser.
        return sinica_parse(sent)

    def _tag(self, sent, tagset=None):
        # TAGWORD yields (tag, word) pairs; flip each into (word, tag).
        pairs = [(word, tag) for (tag, word) in TAGWORD.findall(sent)]
        if tagset and tagset != self._tagset:
            # Convert tags to the requested target tagset.
            pairs = [
                (word, map_tag(self._tagset, tagset, tag)) for (word, tag) in pairs
            ]
        return pairs

    def _word(self, sent):
        # WORD captures just the token of each ":tag:word" group.
        return WORD.findall(sent)

View File

@@ -0,0 +1,56 @@
# Natural Language Toolkit: String Category Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Read tuples from a corpus consisting of categorized strings.
For example, from the question classification corpus:
NUM:dist How far is it from Denver to Aspen ?
LOC:city What county is Modesto , California in ?
HUM:desc Who was Galileo ?
DESC:def What is an atom ?
NUM:date When did Hawaii become a state ?
"""
from nltk.corpus.reader.api import *
# based on PPAttachmentCorpusReader
from nltk.corpus.reader.util import *
# [xx] Should the order of the tuple be reversed -- in most other places
# in nltk, we use the form (data, tag) -- e.g., tagged words and
# labeled texts for classifiers.
class StringCategoryCorpusReader(CorpusReader):
    def __init__(self, root, fileids, delimiter=" ", encoding="utf8"):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param delimiter: Field delimiter
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._delimiter = delimiter

    def tuples(self, fileids=None):
        """Return all (category, string) tuples from the given fileids."""
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        views = [
            StreamBackedCorpusView(path, self._read_tuple_block, encoding=enc)
            for (path, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)

    def _read_tuple_block(self, stream):
        # Each non-empty line splits once on the delimiter into
        # (category, rest-of-line); blank lines contribute nothing.
        line = stream.readline().strip()
        if not line:
            return []
        return [tuple(line.split(self._delimiter, 1))]

View File

@@ -0,0 +1,125 @@
# Natural Language Toolkit: Switchboard Corpus Reader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tag import map_tag, str2tuple
class SwitchboardTurn(list):
    """
    A specialized list object used to encode switchboard utterances.
    The elements of the list are the words in the utterance; and two
    attributes, ``speaker`` and ``id``, are provided to retrieve the
    speaker identifier and utterance id. Note that utterance ids
    are only unique within a given discourse.
    """

    def __init__(self, words, speaker, id):
        super().__init__(words)
        self.speaker = speaker
        # Utterance ids arrive as strings from the corpus file; store as int.
        self.id = int(id)

    def __repr__(self):
        if not self:
            text = ""
        elif isinstance(self[0], tuple):
            # Tagged utterances render each element as "word/tag".
            text = " ".join("%s/%s" % pair for pair in self)
        else:
            text = " ".join(self)
        return f"<{self.speaker}.{self.id}: {text!r}>"
class SwitchboardCorpusReader(CorpusReader):
    """Reader for the Switchboard corpus.

    All views are built from the single "tagged" file; untagged views
    simply drop the tags after parsing.
    """

    _FILES = ["tagged"]
    # Use the "tagged" file even for non-tagged data methods, since
    # it's tokenized.

    def __init__(self, root, tagset=None):
        """
        :param root: The root directory for this corpus.
        :param tagset: Name of the tagset used by the corpus file; acts as
            the source tagset when mapping tags for the ``tagged_*`` methods.
        """
        CorpusReader.__init__(self, root, self._FILES)
        self._tagset = tagset

    def words(self):
        """Return a corpus view of all words (untagged)."""
        return StreamBackedCorpusView(self.abspath("tagged"), self._words_block_reader)

    def tagged_words(self, tagset=None):
        """Return a corpus view of all (word, tag) pairs, optionally mapped
        to *tagset*."""

        def tagged_words_block_reader(stream):
            return self._tagged_words_block_reader(stream, tagset)

        return StreamBackedCorpusView(self.abspath("tagged"), tagged_words_block_reader)

    def turns(self):
        """Return a corpus view of utterances (SwitchboardTurns of words)."""
        return StreamBackedCorpusView(self.abspath("tagged"), self._turns_block_reader)

    def tagged_turns(self, tagset=None):
        """Return a corpus view of utterances (SwitchboardTurns of
        (word, tag) pairs), optionally mapped to *tagset*."""

        def tagged_turns_block_reader(stream):
            return self._tagged_turns_block_reader(stream, tagset)

        return StreamBackedCorpusView(self.abspath("tagged"), tagged_turns_block_reader)

    def discourses(self):
        """Return a corpus view of discourses (lists of turns), untagged."""
        return StreamBackedCorpusView(
            self.abspath("tagged"), self._discourses_block_reader
        )

    # NOTE: the default here was previously ``tagset=False``; normalized to
    # ``None`` for consistency with the other tagged_* methods. Both values
    # are falsy in the downstream ``if tagset and ...`` check, so behavior
    # is unchanged.
    def tagged_discourses(self, tagset=None):
        """Return a corpus view of discourses (lists of tagged turns),
        optionally mapped to *tagset*."""

        def tagged_discourses_block_reader(stream):
            return self._tagged_discourses_block_reader(stream, tagset)

        return StreamBackedCorpusView(
            self.abspath("tagged"), tagged_discourses_block_reader
        )

    def _discourses_block_reader(self, stream):
        # returns at most 1 discourse. (The other methods depend on this.)
        return [
            [
                self._parse_utterance(u, include_tag=False)
                for b in read_blankline_block(stream)
                for u in b.split("\n")
                if u.strip()
            ]
        ]

    def _tagged_discourses_block_reader(self, stream, tagset=None):
        # returns at most 1 discourse. (The other methods depend on this.)
        return [
            [
                self._parse_utterance(u, include_tag=True, tagset=tagset)
                for b in read_blankline_block(stream)
                for u in b.split("\n")
                if u.strip()
            ]
        ]

    def _turns_block_reader(self, stream):
        # Flatten one level: a block is the list of turns in one discourse.
        return self._discourses_block_reader(stream)[0]

    def _tagged_turns_block_reader(self, stream, tagset=None):
        return self._tagged_discourses_block_reader(stream, tagset)[0]

    def _words_block_reader(self, stream):
        # Flatten two levels: a block is the list of words in one discourse.
        return sum(self._discourses_block_reader(stream)[0], [])

    def _tagged_words_block_reader(self, stream, tagset=None):
        return sum(self._tagged_discourses_block_reader(stream, tagset)[0], [])

    # Utterance lines look like "A.1: word/tag word/tag ...".
    _UTTERANCE_RE = re.compile(r"(\w+)\.(\d+)\:\s*(.*)")
    _SEP = "/"

    def _parse_utterance(self, utterance, include_tag, tagset=None):
        """Parse one utterance line into a SwitchboardTurn.

        :raises ValueError: if the line does not match ``_UTTERANCE_RE``.
        """
        m = self._UTTERANCE_RE.match(utterance)
        if m is None:
            raise ValueError("Bad utterance %r" % utterance)
        speaker, id, text = m.groups()
        words = [str2tuple(s, self._SEP) for s in text.split()]
        if not include_tag:
            words = [w for (w, t) in words]
        elif tagset and tagset != self._tagset:
            # Map each tag from the corpus tagset to the requested one.
            words = [(w, map_tag(self._tagset, tagset, t)) for (w, t) in words]
        return SwitchboardTurn(words, speaker, id)

Some files were not shown because too many files have changed in this diff Show More