Initial commit
This commit is contained in:
1
backend/venv/Lib/site-packages/nltk/VERSION
Normal file
1
backend/venv/Lib/site-packages/nltk/VERSION
Normal file
@@ -0,0 +1 @@
|
||||
3.9.2
|
||||
208
backend/venv/Lib/site-packages/nltk/__init__.py
Normal file
208
backend/venv/Lib/site-packages/nltk/__init__.py
Normal file
@@ -0,0 +1,208 @@
|
||||
# Natural Language Toolkit (NLTK)
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Authors: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
The Natural Language Toolkit (NLTK) is an open source Python library
|
||||
for Natural Language Processing. A free online book is available.
|
||||
(If you use the library for academic research, please cite the book.)
|
||||
|
||||
Steven Bird, Ewan Klein, and Edward Loper (2009).
|
||||
Natural Language Processing with Python. O'Reilly Media Inc.
|
||||
https://www.nltk.org/book/
|
||||
|
||||
isort:skip_file
|
||||
"""
|
||||
|
||||
import os
|
||||
import importlib
|
||||
|
||||
# //////////////////////////////////////////////////////
|
||||
# Metadata
|
||||
# //////////////////////////////////////////////////////
|
||||
|
||||
# Version. For each new release, the version number should be updated
# in the file VERSION.
try:
    # If a VERSION file exists, use it!
    version_file = os.path.join(os.path.dirname(__file__), "VERSION")
    with open(version_file) as infile:
        __version__ = infile.read().strip()
except NameError:
    # __file__ is undefined when this code is executed interactively,
    # so the VERSION file cannot be located.
    __version__ = "unknown (running code interactively?)"
except OSError as ex:
    # VERSION file missing or unreadable; record the reason in the string.
    __version__ = "unknown (%s)" % ex

if __doc__ is not None:  # fix for the ``python -OO``
    # ``python -OO`` strips docstrings, leaving __doc__ as None.
    __doc__ += "\n@version: " + __version__
|
||||
|
||||
|
||||
# Copyright notice
|
||||
__copyright__ = """\
|
||||
Copyright (C) 2001-2025 NLTK Project.
|
||||
|
||||
Distributed and Licensed under the Apache License, Version 2.0,
|
||||
which is included by reference.
|
||||
"""
|
||||
|
||||
__license__ = "Apache License, Version 2.0"
|
||||
# Description of the toolkit, keywords, and the project's primary URL.
|
||||
__longdescr__ = """\
|
||||
The Natural Language Toolkit (NLTK) is a Python package for
|
||||
natural language processing. NLTK requires Python 3.9, 3.10, 3.11, 3.12 or 3.13."""
|
||||
__keywords__ = [
|
||||
"NLP",
|
||||
"CL",
|
||||
"natural language processing",
|
||||
"computational linguistics",
|
||||
"parsing",
|
||||
"tagging",
|
||||
"tokenizing",
|
||||
"syntax",
|
||||
"linguistics",
|
||||
"language",
|
||||
"natural language",
|
||||
"text analytics",
|
||||
]
|
||||
__url__ = "https://www.nltk.org/"
|
||||
|
||||
# Maintainer, contributors, etc.
|
||||
__maintainer__ = "NLTK Team"
|
||||
__maintainer_email__ = "nltk.team@gmail.com"
|
||||
__author__ = __maintainer__
|
||||
__author_email__ = __maintainer_email__
|
||||
|
||||
# "Trove" classifiers for Python Package Index.
|
||||
__classifiers__ = [
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
"Intended Audience :: Developers",
|
||||
"Intended Audience :: Education",
|
||||
"Intended Audience :: Information Technology",
|
||||
"Intended Audience :: Science/Research",
|
||||
"License :: OSI Approved :: Apache Software License",
|
||||
"Operating System :: OS Independent",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
"Programming Language :: Python :: 3.13",
|
||||
"Topic :: Scientific/Engineering",
|
||||
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||
"Topic :: Scientific/Engineering :: Human Machine Interfaces",
|
||||
"Topic :: Scientific/Engineering :: Information Analysis",
|
||||
"Topic :: Text Processing",
|
||||
"Topic :: Text Processing :: Filters",
|
||||
"Topic :: Text Processing :: General",
|
||||
"Topic :: Text Processing :: Indexing",
|
||||
"Topic :: Text Processing :: Linguistic",
|
||||
]
|
||||
|
||||
from nltk.internals import config_java
|
||||
|
||||
# support numpy from pypy
|
||||
try:
|
||||
import numpypy
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# Override missing methods on environments where it cannot be used like GAE.
import subprocess

if not hasattr(subprocess, "PIPE"):

    def _fake_PIPE(*args, **kwargs):
        """Stand-in that fails loudly where ``subprocess.PIPE`` is unavailable."""
        raise NotImplementedError("subprocess.PIPE is not supported.")

    subprocess.PIPE = _fake_PIPE
if not hasattr(subprocess, "Popen"):

    def _fake_Popen(*args, **kwargs):
        """Stand-in that fails loudly where ``subprocess.Popen`` is unavailable."""
        raise NotImplementedError("subprocess.Popen is not supported.")

    subprocess.Popen = _fake_Popen
|
||||
|
||||
###########################################################
|
||||
# TOP-LEVEL MODULES
|
||||
###########################################################
|
||||
|
||||
# Import top-level functionality into top-level namespace
|
||||
|
||||
from nltk.collocations import *
|
||||
from nltk.decorators import decorator, memoize
|
||||
from nltk.featstruct import *
|
||||
from nltk.grammar import *
|
||||
from nltk.probability import *
|
||||
from nltk.text import *
|
||||
from nltk.util import *
|
||||
from nltk.jsontags import *
|
||||
|
||||
###########################################################
|
||||
# PACKAGES
|
||||
###########################################################
|
||||
|
||||
from nltk.chunk import *
|
||||
from nltk.classify import *
|
||||
from nltk.inference import *
|
||||
from nltk.metrics import *
|
||||
from nltk.parse import *
|
||||
from nltk.tag import *
|
||||
from nltk.tokenize import *
|
||||
from nltk.translate import *
|
||||
from nltk.tree import *
|
||||
from nltk.sem import *
|
||||
from nltk.stem import *
|
||||
|
||||
# Packages which can be lazily imported
|
||||
# (a) we don't import *
|
||||
# (b) they're slow to import or have run-time dependencies
|
||||
# that can safely fail at run time
|
||||
|
||||
from nltk import lazyimport
|
||||
|
||||
app = lazyimport.LazyModule("app", locals(), globals())
|
||||
chat = lazyimport.LazyModule("chat", locals(), globals())
|
||||
corpus = lazyimport.LazyModule("corpus", locals(), globals())
|
||||
draw = lazyimport.LazyModule("draw", locals(), globals())
|
||||
toolbox = lazyimport.LazyModule("toolbox", locals(), globals())
|
||||
|
||||
# Optional loading
try:
    import numpy
except ImportError:
    # numpy is optional; nltk.cluster depends on it, so skip that subpackage.
    pass
else:
    from nltk import cluster

from nltk.downloader import download, download_shell

# Check if tkinter exists without importing it to avoid crashes after
# forks on macOS. Only nltk.app, nltk.draw, and demo modules should
# have top-level tkinter imports. See #2949 for more details.
if importlib.util.find_spec("tkinter"):
    try:
        from nltk.downloader import download_gui
    except RuntimeError as e:
        # e.g. tkinter is installed but unusable (no display); warn
        # instead of failing the whole package import.
        import warnings

        warnings.warn(
            "Corpus downloader GUI not loaded "
            "(RuntimeError during import: %s)" % str(e)
        )
|
||||
|
||||
# explicitly import all top-level modules (ensuring
|
||||
# they override the same names inadvertently imported
|
||||
# from a subpackage)
|
||||
|
||||
from nltk import ccg, chunk, classify, collocations
|
||||
from nltk import data, featstruct, grammar, help, inference, metrics
|
||||
from nltk import misc, parse, probability, sem, stem, wsd
|
||||
from nltk import tag, tbl, text, tokenize, translate, tree, util
|
||||
|
||||
|
||||
# FIXME: override any accidentally imported demo, see https://github.com/nltk/nltk/issues/2116
def demo():
    """Tell the user how to run a per-module demo; there is no package-level demo."""
    print("To run the demo code for a module, type nltk.module.demo()")
|
||||
47
backend/venv/Lib/site-packages/nltk/app/__init__.py
Normal file
47
backend/venv/Lib/site-packages/nltk/app/__init__.py
Normal file
@@ -0,0 +1,47 @@
|
||||
# Natural Language Toolkit: Applications package
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Interactive NLTK Applications:
|
||||
|
||||
chartparser: Chart Parser
|
||||
chunkparser: Regular-Expression Chunk Parser
|
||||
collocations: Find collocations in text
|
||||
concordance: Part-of-speech concordancer
|
||||
nemo: Finding (and Replacing) Nemo regular expression tool
|
||||
rdparser: Recursive Descent Parser
|
||||
srparser: Shift-Reduce Parser
|
||||
wordnet: WordNet Browser
|
||||
"""
|
||||
|
||||
|
||||
# Import Tkinter-based modules if Tkinter is installed
try:
    import tkinter
except ImportError:
    import warnings

    warnings.warn("nltk.app package not loaded (please install Tkinter library).")
else:
    # Each app module exposes an ``app`` callable; rebind them here under
    # user-friendly names (nltk.app.chartparser, etc.).
    from nltk.app.chartparser_app import app as chartparser
    from nltk.app.chunkparser_app import app as chunkparser
    from nltk.app.collocations_app import app as collocations
    from nltk.app.concordance_app import app as concordance
    from nltk.app.nemo_app import app as nemo
    from nltk.app.rdparser_app import app as rdparser
    from nltk.app.srparser_app import app as srparser
    from nltk.app.wordnet_app import app as wordnet

# The word-frequency app additionally requires matplotlib; warn and skip
# it when matplotlib is absent.
try:
    from matplotlib import pylab
except ImportError:
    import warnings

    warnings.warn("nltk.app.wordfreq not loaded (requires the matplotlib library).")
else:
    from nltk.app.wordfreq_app import app as wordfreq
|
||||
2569
backend/venv/Lib/site-packages/nltk/app/chartparser_app.py
Normal file
2569
backend/venv/Lib/site-packages/nltk/app/chartparser_app.py
Normal file
File diff suppressed because it is too large
Load Diff
1500
backend/venv/Lib/site-packages/nltk/app/chunkparser_app.py
Normal file
1500
backend/venv/Lib/site-packages/nltk/app/chunkparser_app.py
Normal file
File diff suppressed because it is too large
Load Diff
438
backend/venv/Lib/site-packages/nltk/app/collocations_app.py
Normal file
438
backend/venv/Lib/site-packages/nltk/app/collocations_app.py
Normal file
@@ -0,0 +1,438 @@
|
||||
# Natural Language Toolkit: Collocations Application
|
||||
# Much of the GUI code is imported from concordance.py; We intend to merge these tools together
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
#
|
||||
|
||||
|
||||
import queue as q
|
||||
import threading
|
||||
from tkinter import (
|
||||
END,
|
||||
LEFT,
|
||||
SUNKEN,
|
||||
Button,
|
||||
Frame,
|
||||
IntVar,
|
||||
Label,
|
||||
Menu,
|
||||
OptionMenu,
|
||||
Scrollbar,
|
||||
StringVar,
|
||||
Text,
|
||||
Tk,
|
||||
)
|
||||
from tkinter.font import Font
|
||||
|
||||
from nltk.corpus import (
|
||||
alpino,
|
||||
brown,
|
||||
cess_cat,
|
||||
cess_esp,
|
||||
floresta,
|
||||
indian,
|
||||
mac_morpho,
|
||||
machado,
|
||||
nps_chat,
|
||||
sinica_treebank,
|
||||
treebank,
|
||||
)
|
||||
from nltk.probability import FreqDist
|
||||
from nltk.util import in_idle
|
||||
|
||||
# Virtual Tk events posted (via the queue) by the background loader thread.
CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
POLL_INTERVAL = 100  # ms between polls of the loader queue

# NOTE: every corpus accessor is wrapped in a lambda so nothing is loaded
# at import time; a corpus is only read when the user actually selects it.
_DEFAULT = "English: Brown Corpus (Humor)"
_CORPORA = {
    "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(
        categories=["news", "editorial", "reviews"]
    ),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(
        categories="science_fiction"
    ),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}
|
||||
|
||||
|
||||
class CollocationsView:
    """Tk GUI that lists collocations (frequent bigrams) for a chosen corpus.

    The corpus is loaded on a background thread by ``CollocationsModel``;
    success/failure is signalled through ``self.queue``, which is polled
    every ``POLL_INTERVAL`` ms on the Tk mainloop thread via ``_poll``.
    """

    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        self._init_top(self.top)
        self._init_menubar()
        self._init_widgets(self.top)
        self.load_corpus(self.model.DEFAULT_CORPUS)
        # Keep the after-id so destroy() can cancel the pending poll.
        self.after = self.top.after(POLL_INTERVAL, self._poll)

    def _init_top(self, top):
        """Configure window geometry, title, and close/quit bindings."""
        top.geometry("550x650+50+50")
        top.title("NLTK Collocations List")
        top.bind("<Control-q>", self.destroy)
        top.protocol("WM_DELETE_WINDOW", self.destroy)
        top.minsize(550, 650)

    def _init_widgets(self, parent):
        """Build the main frame: corpus selector, results box, paging, status."""
        self.main_frame = Frame(
            parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1)
        )
        self._init_corpus_select(self.main_frame)
        self._init_results_box(self.main_frame)
        self._init_paging(self.main_frame)
        self._init_status(self.main_frame)
        self.main_frame.pack(fill="both", expand=True)

    def _init_corpus_select(self, parent):
        """Build the corpus drop-down; picking an entry triggers a (re)load."""
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        self.var = StringVar(innerframe)
        self.var.set(self.model.DEFAULT_CORPUS)
        Label(
            innerframe,
            justify=LEFT,
            text=" Corpus: ",
            background=self._BACKGROUND_COLOUR,
            padx=2,
            pady=1,
            border=0,
        ).pack(side="left")

        # NOTE: a dead ``other_corpora = list(...).remove(...)`` assignment
        # was removed here -- list.remove() returns None and the name was
        # never used; the menu entries come from non_default_corpora().
        om = OptionMenu(
            innerframe,
            self.var,
            self.model.DEFAULT_CORPUS,
            command=self.corpus_selected,
            *self.model.non_default_corpora()
        )
        om["borderwidth"] = 0
        om["highlightthickness"] = 1
        om.pack(side="left")
        innerframe.pack(side="top", fill="x", anchor="n")

    def _init_status(self, parent):
        """Build the status bar shown below the results."""
        self.status = Label(
            parent,
            justify=LEFT,
            relief=SUNKEN,
            background=self._BACKGROUND_COLOUR,
            border=0,
            padx=1,
            pady=0,
        )
        self.status.pack(side="top", anchor="sw")

    def _init_menubar(self):
        """Build the File/Edit menus, including the result-count radio menu."""
        self._result_size = IntVar(self.top)
        menubar = Menu(self.top)

        filemenu = Menu(menubar, tearoff=0, borderwidth=0)
        filemenu.add_command(
            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
        )
        menubar.add_cascade(label="File", underline=0, menu=filemenu)

        editmenu = Menu(menubar, tearoff=0)
        rescntmenu = Menu(editmenu, tearoff=0)
        rescntmenu.add_radiobutton(
            label="20",
            variable=self._result_size,
            underline=0,
            value=20,
            command=self.set_result_size,
        )
        rescntmenu.add_radiobutton(
            label="50",
            variable=self._result_size,
            underline=0,
            value=50,
            command=self.set_result_size,
        )
        rescntmenu.add_radiobutton(
            label="100",
            variable=self._result_size,
            underline=0,
            value=100,
            command=self.set_result_size,
        )
        # Select the first entry ("20") so the model gets a default page size.
        rescntmenu.invoke(1)
        editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu)

        menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
        self.top.config(menu=menubar)

    def set_result_size(self, **kwargs):
        """Propagate the chosen page size from the menu to the model."""
        self.model.result_count = self._result_size.get()

    def _init_results_box(self, parent):
        """Build the scrollable, read-only text widget holding the results."""
        innerframe = Frame(parent)
        i1 = Frame(innerframe)
        i2 = Frame(innerframe)
        vscrollbar = Scrollbar(i1, borderwidth=1)
        hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz")
        self.results_box = Text(
            i1,
            font=Font(family="courier", size="16"),
            state="disabled",
            borderwidth=1,
            yscrollcommand=vscrollbar.set,
            xscrollcommand=hscrollbar.set,
            wrap="none",
            width="40",
            height="20",
            exportselection=1,
        )
        self.results_box.pack(side="left", fill="both", expand=True)
        vscrollbar.pack(side="left", fill="y", anchor="e")
        vscrollbar.config(command=self.results_box.yview)
        hscrollbar.pack(side="left", fill="x", expand=True, anchor="w")
        hscrollbar.config(command=self.results_box.xview)
        # there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!!
        Label(i2, text="   ", background=self._BACKGROUND_COLOUR).pack(
            side="left", anchor="e"
        )
        i1.pack(side="top", fill="both", expand=True, anchor="n")
        i2.pack(side="bottom", fill="x", anchor="s")
        innerframe.pack(side="top", fill="both", expand=True)

    def _init_paging(self, parent):
        """Build the Previous/Next buttons (disabled until a corpus loads)."""
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        self.prev = prev = Button(
            innerframe,
            text="Previous",
            command=self.previous,
            width="10",
            borderwidth=1,
            highlightthickness=1,
            state="disabled",
        )
        prev.pack(side="left", anchor="center")
        self.next = next = Button(
            innerframe,
            text="Next",
            command=self.__next__,
            width="10",
            borderwidth=1,
            highlightthickness=1,
            state="disabled",
        )
        next.pack(side="right", anchor="center")
        innerframe.pack(side="top", fill="y")
        self.reset_current_page()

    def reset_current_page(self):
        """Reset paging; -1 means no page has been shown yet."""
        self.current_page = -1

    def _poll(self):
        """Poll the loader queue for events, then reschedule itself."""
        try:
            event = self.queue.get(block=False)
        except q.Empty:
            pass
        else:
            if event == CORPUS_LOADED_EVENT:
                self.handle_corpus_loaded(event)
            elif event == ERROR_LOADING_CORPUS_EVENT:
                self.handle_error_loading_corpus(event)
        self.after = self.top.after(POLL_INTERVAL, self._poll)

    def handle_error_loading_corpus(self, event):
        """Report a failed load and leave the paging buttons disabled."""
        self.status["text"] = "Error in loading " + self.var.get()
        self.unfreeze_editable()
        self.clear_results_box()
        self.freeze_editable()
        self.reset_current_page()

    def handle_corpus_loaded(self, event):
        """Show the first page of results for the freshly loaded corpus."""
        self.status["text"] = self.var.get() + " is loaded"
        self.unfreeze_editable()
        self.clear_results_box()
        self.reset_current_page()
        # self.next()
        collocations = self.model.next(self.current_page + 1)
        self.write_results(collocations)
        self.current_page += 1

    def corpus_selected(self, *args):
        """Callback for the corpus drop-down."""
        new_selection = self.var.get()
        self.load_corpus(new_selection)

    def previous(self):
        """Show the previous (already materialised) page of results."""
        self.freeze_editable()
        collocations = self.model.prev(self.current_page - 1)
        self.current_page = self.current_page - 1
        self.clear_results_box()
        self.write_results(collocations)
        self.unfreeze_editable()

    def __next__(self):
        """Show the next page of results, computing it if necessary."""
        self.freeze_editable()
        collocations = self.model.next(self.current_page + 1)
        self.clear_results_box()
        self.write_results(collocations)
        self.current_page += 1
        self.unfreeze_editable()

    def load_corpus(self, selection):
        """Start a background load of ``selection`` unless already selected."""
        if self.model.selected_corpus != selection:
            self.status["text"] = "Loading " + selection + "..."
            self.freeze_editable()
            self.model.load_corpus(selection)

    def freeze_editable(self):
        """Disable paging while a load or page computation is in progress."""
        self.prev["state"] = "disabled"
        self.next["state"] = "disabled"

    def clear_results_box(self):
        """Empty the (read-only) results widget."""
        self.results_box["state"] = "normal"
        self.results_box.delete("1.0", END)
        self.results_box["state"] = "disabled"

    def fire_event(self, event):
        # Firing an event so that rendering of widgets happen in the mainloop thread
        self.top.event_generate(event, when="tail")

    def destroy(self, *e):
        """Tear down the window, cancelling the pending poll; idempotent."""
        if self.top is None:
            return
        self.top.after_cancel(self.after)
        self.top.destroy()
        self.top = None

    def mainloop(self, *args, **kwargs):
        """Enter the Tk mainloop (skipped when running inside IDLE)."""
        if in_idle():
            return
        self.top.mainloop(*args, **kwargs)

    def unfreeze_editable(self):
        """Re-enable paging buttons according to the current page position."""
        self.set_paging_button_states()

    def set_paging_button_states(self):
        """Enable/disable Previous/Next based on current page and model state."""
        if self.current_page == -1 or self.current_page == 0:
            self.prev["state"] = "disabled"
        else:
            self.prev["state"] = "normal"
        if self.model.is_last_page(self.current_page):
            self.next["state"] = "disabled"
        else:
            self.next["state"] = "normal"

    def write_results(self, results):
        """Append word pairs to the results widget, one per line."""
        self.results_box["state"] = "normal"
        row = 1
        for each in results:
            self.results_box.insert(str(row) + ".0", each[0] + " " + each[1] + "\n")
            row += 1
        self.results_box["state"] = "disabled"
|
||||
|
||||
|
||||
class CollocationsModel:
    """Backing model for ``CollocationsView``: corpus loading and result paging.

    Pages of scored collocations are materialised lazily from
    ``self.collocations`` in slices of ``self.result_count`` items.
    """

    def __init__(self, queue):
        self.queue = queue
        self.CORPORA = _CORPORA
        self.DEFAULT_CORPUS = _DEFAULT
        self.result_count = None
        self.selected_corpus = None
        self.collocations = None
        self.reset_results()

    def reset_results(self):
        """Forget every page produced so far."""
        self.result_pages = []
        self.results_returned = 0

    def load_corpus(self, name):
        """Start a background thread that loads and scores corpus ``name``."""
        self.selected_corpus = name
        self.collocations = None
        worker = self.LoadCorpus(name, self)
        worker.start()
        self.reset_results()

    def non_default_corpora(self):
        """Return every corpus title except the default, sorted alphabetically."""
        return sorted(
            title for title in self.CORPORA if title != self.DEFAULT_CORPUS
        )

    def is_last_page(self, number):
        """Return True iff page ``number`` reaches the end of the results."""
        if number < len(self.result_pages):
            # Pages that are already materialised are never the last one.
            return False
        pending = (number - len(self.result_pages)) * self.result_count
        return self.results_returned + pending >= len(self.collocations)

    def next(self, page):
        """Return page ``page``, slicing out any pages not yet materialised."""
        while len(self.result_pages) <= page:
            start = self.results_returned
            self.result_pages.append(
                self.collocations[start : start + self.result_count]
            )
            self.results_returned += self.result_count
        return self.result_pages[page]

    def prev(self, page):
        """Return an already-materialised page; page -1 is empty."""
        return [] if page == -1 else self.result_pages[page]

    class LoadCorpus(threading.Thread):
        """Worker thread: load words, score bigram collocations, notify queue."""

        def __init__(self, name, model):
            threading.Thread.__init__(self)
            self.model, self.name = model, name

        def run(self):
            try:
                from operator import itemgetter

                words = self.model.CORPORA[self.name]()
                # Drop very short tokens before counting bigrams.
                text = [w for w in words if len(w) > 2]
                fd = FreqDist(tuple(text[i : i + 2]) for i in range(len(text) - 1))
                vocab = FreqDist(text)
                scored = [
                    ((w1, w2), fd[(w1, w2)] ** 3 / (vocab[w1] * vocab[w2]))
                    for w1, w2 in fd
                ]
                scored.sort(key=itemgetter(1), reverse=True)
                self.model.collocations = list(map(itemgetter(0), scored))
                self.model.queue.put(CORPUS_LOADED_EVENT)
            except Exception as e:
                print(e)
                self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)
|
||||
|
||||
|
||||
# def collocations():
|
||||
# colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations[:num]]
|
||||
|
||||
|
||||
def app():
    """Launch the collocations GUI and block until the window is closed."""
    viewer = CollocationsView()
    viewer.mainloop()
|
||||
|
||||
|
||||
# Launch the GUI when this module is executed as a script.
if __name__ == "__main__":
    app()

# Public API of this module.
__all__ = ["app"]
|
||||
709
backend/venv/Lib/site-packages/nltk/app/concordance_app.py
Normal file
709
backend/venv/Lib/site-packages/nltk/app/concordance_app.py
Normal file
@@ -0,0 +1,709 @@
|
||||
# Natural Language Toolkit: Concordance Application
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import queue as q
|
||||
import re
|
||||
import threading
|
||||
from tkinter import (
|
||||
END,
|
||||
LEFT,
|
||||
SUNKEN,
|
||||
Button,
|
||||
Entry,
|
||||
Frame,
|
||||
IntVar,
|
||||
Label,
|
||||
Menu,
|
||||
OptionMenu,
|
||||
Scrollbar,
|
||||
StringVar,
|
||||
Text,
|
||||
Tk,
|
||||
)
|
||||
from tkinter.font import Font
|
||||
|
||||
from nltk.corpus import (
|
||||
alpino,
|
||||
brown,
|
||||
cess_cat,
|
||||
cess_esp,
|
||||
floresta,
|
||||
indian,
|
||||
mac_morpho,
|
||||
nps_chat,
|
||||
sinica_treebank,
|
||||
treebank,
|
||||
)
|
||||
from nltk.draw.util import ShowText
|
||||
from nltk.util import in_idle
|
||||
|
||||
WORD_OR_TAG = "[^/ ]+"
|
||||
BOUNDARY = r"\b"
|
||||
|
||||
CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
|
||||
SEARCH_TERMINATED_EVENT = "<<ST_EVENT>>"
|
||||
SEARCH_ERROR_EVENT = "<<SE_EVENT>>"
|
||||
ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
|
||||
|
||||
POLL_INTERVAL = 50
|
||||
|
||||
# NB All corpora must be specified in a lambda expression so as not to be
|
||||
# loaded when the module is imported.
|
||||
|
||||
_DEFAULT = "English: Brown Corpus (Humor, simplified)"
|
||||
_CORPORA = {
|
||||
"Catalan: CESS-CAT Corpus (simplified)": lambda: cess_cat.tagged_sents(
|
||||
tagset="universal"
|
||||
),
|
||||
"English: Brown Corpus": lambda: brown.tagged_sents(),
|
||||
"English: Brown Corpus (simplified)": lambda: brown.tagged_sents(
|
||||
tagset="universal"
|
||||
),
|
||||
"English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents(
|
||||
categories=["news", "editorial", "reviews"], tagset="universal"
|
||||
),
|
||||
"English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(
|
||||
categories="religion", tagset="universal"
|
||||
),
|
||||
"English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(
|
||||
categories="learned", tagset="universal"
|
||||
),
|
||||
"English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(
|
||||
categories="science_fiction", tagset="universal"
|
||||
),
|
||||
"English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(
|
||||
categories="romance", tagset="universal"
|
||||
),
|
||||
"English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(
|
||||
categories="humor", tagset="universal"
|
||||
),
|
||||
"English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
|
||||
"English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(
|
||||
tagset="universal"
|
||||
),
|
||||
"English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
|
||||
"English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(
|
||||
tagset="universal"
|
||||
),
|
||||
"Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
|
||||
"Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(
|
||||
tagset="universal"
|
||||
),
|
||||
"Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
|
||||
"Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(
|
||||
tagset="universal"
|
||||
),
|
||||
"Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"),
|
||||
"Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(
|
||||
files="hindi.pos", tagset="universal"
|
||||
),
|
||||
"Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
|
||||
"Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(
|
||||
tagset="universal"
|
||||
),
|
||||
"Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(),
|
||||
"Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(
|
||||
tagset="universal"
|
||||
),
|
||||
"Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(
|
||||
tagset="universal"
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
class ConcordanceSearchView:
|
||||
_BACKGROUND_COLOUR = "#FFF" # white
|
||||
|
||||
# Colour of highlighted results
|
||||
_HIGHLIGHT_WORD_COLOUR = "#F00" # red
|
||||
_HIGHLIGHT_WORD_TAG = "HL_WRD_TAG"
|
||||
|
||||
_HIGHLIGHT_LABEL_COLOUR = "#C0C0C0" # dark grey
|
||||
_HIGHLIGHT_LABEL_TAG = "HL_LBL_TAG"
|
||||
|
||||
# Percentage of text left of the scrollbar position
|
||||
_FRACTION_LEFT_TEXT = 0.30
|
||||
|
||||
def __init__(self):
|
||||
self.queue = q.Queue()
|
||||
self.model = ConcordanceSearchModel(self.queue)
|
||||
self.top = Tk()
|
||||
self._init_top(self.top)
|
||||
self._init_menubar()
|
||||
self._init_widgets(self.top)
|
||||
self.load_corpus(self.model.DEFAULT_CORPUS)
|
||||
self.after = self.top.after(POLL_INTERVAL, self._poll)
|
||||
|
||||
def _init_top(self, top):
|
||||
top.geometry("950x680+50+50")
|
||||
top.title("NLTK Concordance Search")
|
||||
top.bind("<Control-q>", self.destroy)
|
||||
top.protocol("WM_DELETE_WINDOW", self.destroy)
|
||||
top.minsize(950, 680)
|
||||
|
||||
def _init_widgets(self, parent):
|
||||
self.main_frame = Frame(
|
||||
parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1)
|
||||
)
|
||||
self._init_corpus_select(self.main_frame)
|
||||
self._init_query_box(self.main_frame)
|
||||
self._init_results_box(self.main_frame)
|
||||
self._init_paging(self.main_frame)
|
||||
self._init_status(self.main_frame)
|
||||
self.main_frame.pack(fill="both", expand=True)
|
||||
|
||||
def _init_menubar(self):
|
||||
self._result_size = IntVar(self.top)
|
||||
self._cntx_bf_len = IntVar(self.top)
|
||||
self._cntx_af_len = IntVar(self.top)
|
||||
menubar = Menu(self.top)
|
||||
|
||||
filemenu = Menu(menubar, tearoff=0, borderwidth=0)
|
||||
filemenu.add_command(
|
||||
label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
|
||||
)
|
||||
menubar.add_cascade(label="File", underline=0, menu=filemenu)
|
||||
|
||||
editmenu = Menu(menubar, tearoff=0)
|
||||
rescntmenu = Menu(editmenu, tearoff=0)
|
||||
rescntmenu.add_radiobutton(
|
||||
label="20",
|
||||
variable=self._result_size,
|
||||
underline=0,
|
||||
value=20,
|
||||
command=self.set_result_size,
|
||||
)
|
||||
rescntmenu.add_radiobutton(
|
||||
label="50",
|
||||
variable=self._result_size,
|
||||
underline=0,
|
||||
value=50,
|
||||
command=self.set_result_size,
|
||||
)
|
||||
rescntmenu.add_radiobutton(
|
||||
label="100",
|
||||
variable=self._result_size,
|
||||
underline=0,
|
||||
value=100,
|
||||
command=self.set_result_size,
|
||||
)
|
||||
rescntmenu.invoke(1)
|
||||
editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu)
|
||||
|
||||
cntxmenu = Menu(editmenu, tearoff=0)
|
||||
cntxbfmenu = Menu(cntxmenu, tearoff=0)
|
||||
cntxbfmenu.add_radiobutton(
|
||||
label="60 characters",
|
||||
variable=self._cntx_bf_len,
|
||||
underline=0,
|
||||
value=60,
|
||||
command=self.set_cntx_bf_len,
|
||||
)
|
||||
cntxbfmenu.add_radiobutton(
|
||||
label="80 characters",
|
||||
variable=self._cntx_bf_len,
|
||||
underline=0,
|
||||
value=80,
|
||||
command=self.set_cntx_bf_len,
|
||||
)
|
||||
cntxbfmenu.add_radiobutton(
|
||||
label="100 characters",
|
||||
variable=self._cntx_bf_len,
|
||||
underline=0,
|
||||
value=100,
|
||||
command=self.set_cntx_bf_len,
|
||||
)
|
||||
cntxbfmenu.invoke(1)
|
||||
cntxmenu.add_cascade(label="Before", underline=0, menu=cntxbfmenu)
|
||||
|
||||
cntxafmenu = Menu(cntxmenu, tearoff=0)
|
||||
cntxafmenu.add_radiobutton(
|
||||
label="70 characters",
|
||||
variable=self._cntx_af_len,
|
||||
underline=0,
|
||||
value=70,
|
||||
command=self.set_cntx_af_len,
|
||||
)
|
||||
cntxafmenu.add_radiobutton(
|
||||
label="90 characters",
|
||||
variable=self._cntx_af_len,
|
||||
underline=0,
|
||||
value=90,
|
||||
command=self.set_cntx_af_len,
|
||||
)
|
||||
cntxafmenu.add_radiobutton(
|
||||
label="110 characters",
|
||||
variable=self._cntx_af_len,
|
||||
underline=0,
|
||||
value=110,
|
||||
command=self.set_cntx_af_len,
|
||||
)
|
||||
cntxafmenu.invoke(1)
|
||||
cntxmenu.add_cascade(label="After", underline=0, menu=cntxafmenu)
|
||||
|
||||
editmenu.add_cascade(label="Context", underline=0, menu=cntxmenu)
|
||||
|
||||
menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
|
||||
|
||||
self.top.config(menu=menubar)
|
||||
|
||||
def set_result_size(self, **kwargs):
    """Propagate the 'Result Count' radio-menu selection into the model."""
    # Read the Tk variable backing the radio menu and hand it to the model.
    selected = self._result_size.get()
    self.model.result_count = selected
|
||||
|
||||
def set_cntx_af_len(self, **kwargs):
    """Store the chosen number of context characters shown after a match."""
    chars = self._cntx_af_len.get()
    self._char_after = chars
|
||||
|
||||
def set_cntx_bf_len(self, **kwargs):
    """Store the chosen number of context characters shown before a match."""
    chars = self._cntx_bf_len.get()
    self._char_before = chars
|
||||
|
||||
def _init_corpus_select(self, parent):
    """Build the corpus drop-down row at the top of the window.

    ``self.var`` tracks the currently-selected corpus name; changing the
    OptionMenu triggers ``self.corpus_selected``.
    """
    innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
    self.var = StringVar(innerframe)
    self.var.set(self.model.DEFAULT_CORPUS)
    Label(
        innerframe,
        justify=LEFT,
        text=" Corpus: ",
        background=self._BACKGROUND_COLOUR,
        padx=2,
        pady=1,
        border=0,
    ).pack(side="left")

    # The original code computed
    #   other_corpora = list(self.model.CORPORA.keys()).remove(DEFAULT)
    # which is always None (list.remove returns None) and was never used;
    # the menu is actually populated from model.non_default_corpora().
    # That dead statement has been removed.
    om = OptionMenu(
        innerframe,
        self.var,
        self.model.DEFAULT_CORPUS,
        command=self.corpus_selected,
        *self.model.non_default_corpora()
    )
    om["borderwidth"] = 0
    om["highlightthickness"] = 1
    om.pack(side="left")
    innerframe.pack(side="top", fill="x", anchor="n")
|
||||
|
||||
def _init_status(self, parent):
    # Status bar (e.g. "Loading brown...", "Searching for ...") shown
    # below the corpus selector; text is written by the event handlers.
    self.status = Label(
        parent,
        justify=LEFT,
        relief=SUNKEN,
        background=self._BACKGROUND_COLOUR,
        border=0,
        padx=1,
        pady=0,
    )
    self.status.pack(side="top", anchor="sw")
|
||||
|
||||
def _init_query_box(self, parent):
    # Row containing the query Entry and the Search button.  The extra
    # nested frame ("another") keeps the pair centred as a single unit.
    innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
    another = Frame(innerframe, background=self._BACKGROUND_COLOUR)
    self.query_box = Entry(another, width=60)
    self.query_box.pack(side="left", fill="x", pady=25, anchor="center")
    self.search_button = Button(
        another,
        text="Search",
        command=self.search,
        borderwidth=1,
        highlightthickness=1,
    )
    self.search_button.pack(side="left", fill="x", pady=25, anchor="center")
    # Pressing <Return> in the entry behaves like clicking Search.
    self.query_box.bind("<KeyPress-Return>", self.search_enter_keypress_handler)
    another.pack()
    innerframe.pack(side="top", fill="x", anchor="n")
|
||||
|
||||
def search_enter_keypress_handler(self, *event):
    """Run a search when <Return> is pressed inside the query entry.

    The Tk event object(s) are accepted and ignored.
    """
    self.search()
|
||||
|
||||
def _init_results_box(self, parent):
    # Scrollable, read-only Text widget that displays the concordance
    # lines.  Two nested frames (i1, i2) let the horizontal scrollbar
    # sit below the text while the vertical one sits beside it.
    innerframe = Frame(parent)
    i1 = Frame(innerframe)
    i2 = Frame(innerframe)
    vscrollbar = Scrollbar(i1, borderwidth=1)
    hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz")
    self.results_box = Text(
        i1,
        font=Font(family="courier", size="16"),
        state="disabled",  # read-only; temporarily enabled when writing
        borderwidth=1,
        yscrollcommand=vscrollbar.set,
        xscrollcommand=hscrollbar.set,
        wrap="none",  # no wrapping: each concordance line stays on one row
        width="40",
        height="20",
        exportselection=1,
    )
    self.results_box.pack(side="left", fill="both", expand=True)
    self.results_box.tag_config(
        self._HIGHLIGHT_WORD_TAG, foreground=self._HIGHLIGHT_WORD_COLOUR
    )
    self.results_box.tag_config(
        self._HIGHLIGHT_LABEL_TAG, foreground=self._HIGHLIGHT_LABEL_COLOUR
    )
    vscrollbar.pack(side="left", fill="y", anchor="e")
    vscrollbar.config(command=self.results_box.yview)
    hscrollbar.pack(side="left", fill="x", expand=True, anchor="w")
    hscrollbar.config(command=self.results_box.xview)
    # there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!!
    Label(i2, text=" ", background=self._BACKGROUND_COLOUR).pack(
        side="left", anchor="e"
    )
    i1.pack(side="top", fill="both", expand=True, anchor="n")
    i2.pack(side="bottom", fill="x", anchor="s")
    innerframe.pack(side="top", fill="both", expand=True)
|
||||
|
||||
def _init_paging(self, parent):
    # Previous/Next paging buttons.  Both start disabled and are
    # enabled by set_paging_button_states() once a search has results.
    innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
    self.prev = prev = Button(
        innerframe,
        text="Previous",
        command=self.previous,
        width="10",
        borderwidth=1,
        highlightthickness=1,
        state="disabled",
    )
    prev.pack(side="left", anchor="center")
    self.next = next = Button(
        innerframe,
        text="Next",
        command=self.__next__,
        width="10",
        borderwidth=1,
        highlightthickness=1,
        state="disabled",
    )
    next.pack(side="right", anchor="center")
    innerframe.pack(side="top", fill="y")
    # Page 0 means "no search yet"; the first results page is 1.
    self.current_page = 0
|
||||
|
||||
def previous(self):
    """Show the previous page of results."""
    # Blank the display and lock the UI while the model fetches the page;
    # the SEARCH_TERMINATED event will unfreeze and redraw.
    self.clear_results_box()
    self.freeze_editable()
    target_page = self.current_page - 1
    self.model.prev(target_page)
|
||||
|
||||
def __next__(self):
    """Show the next page of results."""
    # Blank the display and lock the UI while the model fetches or
    # computes the page; the SEARCH_TERMINATED event redraws it.
    self.clear_results_box()
    self.freeze_editable()
    target_page = self.current_page + 1
    self.model.next(target_page)
|
||||
|
||||
def about(self, *e):
    """Pop up an About dialog; fall back to a plain ShowText window."""
    ABOUT = "NLTK Concordance Search Demo\n"
    TITLE = "About: NLTK Concordance Search Demo"
    try:
        from tkinter.messagebox import Message

        Message(message=ABOUT, title=TITLE, parent=self.main_frame).show()
    except Exception:
        # Was a bare ``except:`` which also swallowed KeyboardInterrupt
        # and SystemExit; narrowed to Exception.  Any failure of the
        # native message box falls back to the text window.
        ShowText(self.top, TITLE, ABOUT)
|
||||
|
||||
def _bind_event_handlers(self):
    """Bind virtual events (fired from worker threads) to their handlers."""
    bindings = (
        (CORPUS_LOADED_EVENT, self.handle_corpus_loaded),
        (SEARCH_TERMINATED_EVENT, self.handle_search_terminated),
        (SEARCH_ERROR_EVENT, self.handle_search_error),
        (ERROR_LOADING_CORPUS_EVENT, self.handle_error_loading_corpus),
    )
    for event, handler in bindings:
        self.top.bind(event, handler)
|
||||
|
||||
def _poll(self):
    # Periodically drain the thread-safe queue that worker threads use
    # to report completion, and dispatch to the matching handler.  This
    # keeps all widget updates on the Tk mainloop thread.
    try:
        event = self.queue.get(block=False)
    except q.Empty:
        # Nothing pending this tick.
        pass
    else:
        if event == CORPUS_LOADED_EVENT:
            self.handle_corpus_loaded(event)
        elif event == SEARCH_TERMINATED_EVENT:
            self.handle_search_terminated(event)
        elif event == SEARCH_ERROR_EVENT:
            self.handle_search_error(event)
        elif event == ERROR_LOADING_CORPUS_EVENT:
            self.handle_error_loading_corpus(event)
    # Re-arm the poll; the stored callback id lets destroy() cancel it.
    self.after = self.top.after(POLL_INTERVAL, self._poll)
|
||||
|
||||
def handle_error_loading_corpus(self, event):
    """Report a corpus-load failure and leave the UI locked."""
    corpus_name = self.var.get()
    self.status["text"] = "Error in loading " + corpus_name
    # Briefly unfreeze so clear_all can edit the widgets, then re-lock:
    # with no corpus there is nothing to search.
    self.unfreeze_editable()
    self.clear_all()
    self.freeze_editable()
|
||||
|
||||
def handle_corpus_loaded(self, event):
    """Unlock the UI once a corpus finishes loading."""
    corpus_name = self.var.get()
    self.status["text"] = corpus_name + " is loaded"
    self.unfreeze_editable()
    self.clear_all()
    # Put the cursor in the query entry, ready for typing.
    self.query_box.focus_set()
|
||||
|
||||
def handle_search_terminated(self, event):
    """Render the finished search's results and unlock the UI."""
    # todo: refactor the model such that it is less state sensitive
    results = self.model.get_results()
    self.write_results(results)
    self.status["text"] = ""
    if not results:
        self.status["text"] = "No results found for " + self.model.query
    else:
        # The page the model just produced becomes the current page.
        self.current_page = self.model.last_requested_page
    self.unfreeze_editable()
    # Scroll so the match column is visible.
    self.results_box.xview_moveto(self._FRACTION_LEFT_TEXT)
|
||||
|
||||
def handle_search_error(self, event):
    """Report a malformed query and let the user try again."""
    self.status["text"] = "Error in query " + self.model.query
    self.unfreeze_editable()
|
||||
|
||||
def corpus_selected(self, *args):
    """OptionMenu callback: load whatever corpus is now selected."""
    self.load_corpus(self.var.get())
|
||||
|
||||
def load_corpus(self, selection):
    """Ask the model to load *selection*, unless it is already current."""
    if self.model.selected_corpus == selection:
        return
    self.status["text"] = "Loading " + selection + "..."
    # Lock the UI; the CORPUS_LOADED / ERROR events will unlock it.
    self.freeze_editable()
    self.model.load_corpus(selection)
|
||||
|
||||
def search(self):
    """Start a fresh search for the query currently in the entry box."""
    # A new search always restarts paging from the beginning.
    self.current_page = 0
    self.clear_results_box()
    self.model.reset_results()
    query = self.query_box.get()
    # Ignore empty / whitespace-only queries.
    if not query.strip():
        return
    self.status["text"] = "Searching for " + query
    self.freeze_editable()
    self.model.search(query, self.current_page + 1)
|
||||
|
||||
def write_results(self, results):
    # Render one concordance line per result into the Text widget and
    # tag the matched words/labels for highlighting.  ``results`` is a
    # list of (tagged_sentence, match_start, match_end) triples.
    self.results_box["state"] = "normal"  # must be writable to insert
    row = 1
    for each in results:
        sent, pos1, pos2 = each[0].strip(), each[1], each[2]
        if len(sent) != 0:
            if pos1 < self._char_before:
                # Left-pad so the match column lines up across rows.
                sent, pos1, pos2 = self.pad(sent, pos1, pos2)
            # Window of context around the match start.
            sentence = sent[pos1 - self._char_before : pos1 + self._char_after]
            if not row == len(results):
                sentence += "\n"
            # Text indices are "<line>.<column>".
            self.results_box.insert(str(row) + ".0", sentence)
            word_markers, label_markers = self.words_and_labels(sent, pos1, pos2)
            for marker in word_markers:
                self.results_box.tag_add(
                    self._HIGHLIGHT_WORD_TAG,
                    str(row) + "." + str(marker[0]),
                    str(row) + "." + str(marker[1]),
                )
            for marker in label_markers:
                self.results_box.tag_add(
                    self._HIGHLIGHT_LABEL_TAG,
                    str(row) + "." + str(marker[0]),
                    str(row) + "." + str(marker[1]),
                )
        row += 1
    self.results_box["state"] = "disabled"  # back to read-only
|
||||
|
||||
def words_and_labels(self, sentence, pos1, pos2):
    """Compute highlight spans for the matched region of a tagged line.

    ``sentence[pos1:pos2]`` is a run of "word/TAG" tokens.  Returns
    ``(word_spans, label_spans)`` where each span is a ``(start, end)``
    column pair shifted right by ``self._char_before`` to match the
    displayed line.
    """
    search_exp = sentence[pos1:pos2]
    words, labels = [], []
    labeled_words = search_exp.split(" ")
    index = 0
    for each in labeled_words:
        if each == "":
            # Consecutive spaces yield empty tokens; just advance a column.
            index += 1
        else:
            # Split on the LAST "/" so tokens whose word itself contains
            # a slash (e.g. "1/2/CD") don't raise ValueError, as the
            # original ``each.split("/")`` did.
            word, label = each.rsplit("/", 1)
            words.append(
                (self._char_before + index, self._char_before + index + len(word))
            )
            index += len(word) + 1  # word plus the "/" separator
            labels.append(
                (self._char_before + index, self._char_before + index + len(label))
            )
            index += len(label)
            index += 1  # the space between tokens
    return words, labels
|
||||
|
||||
def pad(self, sent, hstart, hend):
    """Left-pad *sent* with spaces so the match starts at column
    ``self._char_before``; shift the match offsets accordingly."""
    shortfall = self._char_before - hstart
    if shortfall <= 0:
        # Already enough leading context; nothing to do.
        return sent, hstart, hend
    return " " * shortfall + sent, hstart + shortfall, hend + shortfall
|
||||
|
||||
def destroy(self, *e):
    """Tear down the window; safe to call more than once."""
    top = self.top
    if top is None:
        return
    # Cancel the pending _poll callback before killing the window.
    top.after_cancel(self.after)
    top.destroy()
    self.top = None
|
||||
|
||||
def clear_all(self):
    """Reset the query entry, the model's query, and the results pane."""
    self.query_box.delete(0, END)
    self.model.reset_query()
    self.clear_results_box()
|
||||
|
||||
def clear_results_box(self):
    """Empty the (normally read-only) results Text widget."""
    box = self.results_box
    box["state"] = "normal"  # must be editable to delete
    box.delete("1.0", END)
    box["state"] = "disabled"
|
||||
|
||||
def freeze_editable(self):
    """Disable all interactive widgets while a background task runs."""
    for widget in (self.query_box, self.search_button, self.prev, self.next):
        widget["state"] = "disabled"
|
||||
|
||||
def unfreeze_editable(self):
    """Re-enable the query widgets; paging buttons depend on the model."""
    for widget in (self.query_box, self.search_button):
        widget["state"] = "normal"
    self.set_paging_button_states()
|
||||
|
||||
def set_paging_button_states(self):
    """Enable prev/next according to the current page and the model."""
    # Pages 0 ("no search yet") and 1 have nothing before them.
    on_first = self.current_page in (0, 1)
    self.prev["state"] = "disabled" if on_first else "normal"
    has_more = self.model.has_more_pages(self.current_page)
    self.next["state"] = "normal" if has_more else "disabled"
|
||||
|
||||
def fire_event(self, event):
    """Generate *event* on the Tk mainloop thread.

    Widget updates must happen in the mainloop; worker threads call
    this to hand control back via the Tk event queue.
    """
    self.top.event_generate(event, when="tail")
|
||||
|
||||
def mainloop(self, *args, **kwargs):
    """Enter the Tk mainloop, unless we are running inside IDLE
    (IDLE already runs one)."""
    if in_idle():
        return
    self.top.mainloop(*args, **kwargs)
|
||||
|
||||
|
||||
class ConcordanceSearchModel:
    """Backing model for the concordance demo.

    Owns the loaded corpus, the current query, and the paged result
    sets.  Slow work (corpus loading, searching) runs in worker threads
    which report back by putting event names on ``self.queue``.
    """

    def __init__(self, queue):
        # Thread-safe channel used by worker threads to signal the view.
        self.queue = queue
        self.CORPORA = _CORPORA
        self.DEFAULT_CORPUS = _DEFAULT
        self.selected_corpus = None
        self.reset_query()
        self.reset_results()
        # Page size; set by the view from the "Result Count" menu.
        self.result_count = None
        # Index of the first sentence not yet scanned by a search.
        self.last_sent_searched = 0

    def non_default_corpora(self):
        """Return all corpus names except the default, sorted."""
        copy = []
        copy.extend(list(self.CORPORA.keys()))
        copy.remove(self.DEFAULT_CORPUS)
        copy.sort()
        return copy

    def load_corpus(self, name):
        """Start loading corpus *name* in a background thread."""
        self.selected_corpus = name
        self.tagged_sents = []
        runner_thread = self.LoadCorpus(name, self)
        runner_thread.start()

    def search(self, query, page):
        """Start a background search for *query*, producing *page*."""
        self.query = query
        self.last_requested_page = page
        self.SearchCorpus(self, page, self.result_count).start()

    def next(self, page):
        # Serve from cache when the page was already computed;
        # otherwise resume searching where the last scan stopped.
        self.last_requested_page = page
        if len(self.results) < page:
            self.search(self.query, page)
        else:
            self.queue.put(SEARCH_TERMINATED_EVENT)

    def prev(self, page):
        # Earlier pages are always cached; just signal completion.
        self.last_requested_page = page
        self.queue.put(SEARCH_TERMINATED_EVENT)

    def reset_results(self):
        # Forget all pages and restart scanning from the first sentence.
        self.last_sent_searched = 0
        self.results = []
        self.last_page = None

    def reset_query(self):
        self.query = None

    def set_results(self, page, resultset):
        # NOTE(review): list.insert never overwrites, so re-setting an
        # existing page would shift later pages — presumably each page
        # is only ever set once per search; verify against callers.
        self.results.insert(page - 1, resultset)

    def get_results(self):
        """Return the result set for the most recently requested page."""
        return self.results[self.last_requested_page - 1]

    def has_more_pages(self, page):
        # No results at all -> no more pages.  While last_page is
        # unknown the corpus has not been exhausted yet.
        if self.results == [] or self.results[0] == []:
            return False
        if self.last_page is None:
            return True
        return page < self.last_page

    class LoadCorpus(threading.Thread):
        """Worker thread: load a corpus and flatten it to tagged strings."""

        def __init__(self, name, model):
            threading.Thread.__init__(self)
            self.model, self.name = model, name

        def run(self):
            try:
                # CORPORA maps corpus names to zero-argument loader
                # callables returning tagged sentences.
                ts = self.model.CORPORA[self.name]()
                # Render each sentence as "word/TAG word/TAG ..." so a
                # search is a plain regex scan over a string.
                self.model.tagged_sents = [
                    " ".join(w + "/" + t for (w, t) in sent) for sent in ts
                ]
                self.model.queue.put(CORPUS_LOADED_EVENT)
            except Exception as e:
                print(e)
                self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)

    class SearchCorpus(threading.Thread):
        """Worker thread: scan tagged sentences for one page of matches."""

        def __init__(self, model, page, count):
            self.model, self.count, self.page = model, count, page
            threading.Thread.__init__(self)

        def run(self):
            q = self.processed_query()
            sent_pos, i, sent_count = [], 0, 0
            # Resume scanning where the previous page's search stopped.
            for sent in self.model.tagged_sents[self.model.last_sent_searched :]:
                try:
                    m = re.search(q, sent)
                except re.error:
                    # Malformed user regex: abort and notify the view.
                    self.model.reset_results()
                    self.model.queue.put(SEARCH_ERROR_EVENT)
                    return
                if m:
                    sent_pos.append((sent, m.start(), m.end()))
                    i += 1
                    # Collect one extra hit (> count) so we can tell
                    # whether a further page exists.
                    if i > self.count:
                        self.model.last_sent_searched += sent_count - 1
                        break
                sent_count += 1
            if self.count >= len(sent_pos):
                # Fewer hits than a full page: the corpus is exhausted,
                # so this is the last page.
                self.model.last_sent_searched += sent_count - 1
                self.model.last_page = self.page
                self.model.set_results(self.page, sent_pos)
            else:
                # Drop the sentinel extra hit before storing the page.
                self.model.set_results(self.page, sent_pos[:-1])
            self.model.queue.put(SEARCH_TERMINATED_EVENT)

        def processed_query(self):
            """Translate the query mini-language into a regex.

            Bare all-uppercase terms match any word with that tag;
            terms containing "/" are used as-is; anything else matches
            that word with any tag.
            """
            new = []
            for term in self.model.query.split():
                # "." in the query matches any single character except
                # "/" or space, so it stays within one word or tag.
                term = re.sub(r"\.", r"[^/ ]", term)
                if re.match("[A-Z]+$", term):
                    new.append(BOUNDARY + WORD_OR_TAG + "/" + term + BOUNDARY)
                elif "/" in term:
                    new.append(BOUNDARY + term + BOUNDARY)
                else:
                    new.append(BOUNDARY + term + "/" + WORD_OR_TAG + BOUNDARY)
            return " ".join(new)
|
||||
|
||||
|
||||
def app():
    """Build the concordance search GUI and run its event loop."""
    view = ConcordanceSearchView()
    view.mainloop()
|
||||
|
||||
|
||||
# Run the demo when executed as a script.
if __name__ == "__main__":
    app()

__all__ = ["app"]
|
||||
163
backend/venv/Lib/site-packages/nltk/app/nemo_app.py
Normal file
163
backend/venv/Lib/site-packages/nltk/app/nemo_app.py
Normal file
@@ -0,0 +1,163 @@
|
||||
# Finding (and Replacing) Nemo, Version 1.1, Aristide Grange 2006/06/06
|
||||
# https://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/496783
|
||||
|
||||
"""
|
||||
Finding (and Replacing) Nemo
|
||||
|
||||
Instant Regular Expressions
|
||||
Created by Aristide Grange
|
||||
"""
|
||||
import itertools
|
||||
import re
|
||||
from tkinter import SEL_FIRST, SEL_LAST, Frame, Label, PhotoImage, Scrollbar, Text, Tk
|
||||
|
||||
windowTitle = "Finding (and Replacing) Nemo"
# Startup patterns: the find regexp captures the gaps between the
# letters n.e.m.o, and the replacement rebuilds them around M.A.K.I.
initialFind = r"n(.*?)e(.*?)m(.*?)o"
initialRepl = r"M\1A\2K\3I"
# Sample text searched at startup.
initialText = """\
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
"""
|
||||
images = {
|
||||
"FIND": "R0lGODlhMAAiAPcAMf/////37//35//n1v97Off///f/9/f37/fexvfOvfeEQvd7QvdrQvdrKfdaKfdSMfdSIe/v9+/v7+/v5+/n3u/e1u/Wxu/Gre+1lO+tnO+thO+Ua+97Y+97Oe97Me9rOe9rMe9jOe9jMe9jIe9aMefe5+fe3ufezuece+eEWudzQudaIedSIedKMedKIedCKedCId7e1t7Wzt7Oxt7Gvd69vd69rd61pd6ljN6UjN6Ue96EY95zY95rUt5rQt5jMd5SId5KIdbn59be3tbGztbGvda1rdaEa9Z7a9Z7WtZzQtZzOdZzMdZjMdZaQtZSOdZSMdZKMdZCKdZCGNY5Ic7W1s7Oxs7Gtc69xs69tc69rc6tpc6llM6clM6cjM6Ue86EY85zWs5rSs5SKc5KKc5KGMa1tcatrcalvcalnMaUpcZ7c8ZzMcZrUsZrOcZrMcZaQsZSOcZSMcZKMcZCKcZCGMYxIcYxGL3Gxr21tb21rb2lpb2crb2cjL2UnL2UlL2UhL2Ec717Wr17Ur1zWr1rMb1jUr1KMb1KIb1CIb0xGLWlrbWlpbWcnLWEe7V7c7VzY7VzUrVSKbVKMbVCMbVCIbU5KbUxIbUxEK2lta2lpa2clK2UjK2MnK2MlK2Ea617e61za61rY61rMa1jSq1aUq1aSq1SQq1KKa0xEKWlnKWcnKWUnKWUhKWMjKWEa6Vza6VrWqVjMaVaUqVaKaVSMaVCMaU5KaUxIaUxGJyclJyMe5yElJyEhJx7e5x7c5xrOZxaQpxSOZxKQpw5IZSMhJSEjJR7c5Rre5RrY5RrUpRSQpRSKZRCOZRCKZQxKZQxIYyEhIx7hIxza4xzY4xrc4xjUoxaa4xaUoxSSoxKQoxCMYw5GIR7c4Rzc4Rre4RjY4RjWoRaa4RSWoRSUoRSMYRKQoRCOYQ5KYQxIXtra3taY3taSntKOXtCMXtCKXNCMXM5MXMxIWtSUmtKSmtKQmtCOWs5MWs5KWs5IWNCKWMxIVIxKUIQCDkhGAAAACH+AS4ALAAAAAAwACIAAAj/AAEIHEiwoMGDCBMqXMiwoUOHMqxIeEiRoZVp7cpZ29WrF4WKIAd208dGAQEVbiTVChUjZMU9+pYQmPmBZpxgvVw+nDdKwQICNVcIXQEkTgKdDdUJ+/nggVAXK1xI3TEA6UIr2uJ8iBqka1cXXTlkqGoVYRZ7iLyqBSs0iiEtZQVKiDGxBI1u3NR6lUpGDKg8MSgEQCphU7Z22vhg0dILXRCpYLuSCcYJT4wqXASBQaBzU7klHxC127OHD7ZDJFpERqRt0x5OnwQpmZmCLEhrbgg4WIHO1RY+nbQ9WRGEDJlmnXwJ+9FBgXMCIzYMVijBBgYMFxIMqJBMSc0Ht7qh/+Gjpte2rnYsYeNlasWIBgQ6yCewIoPCCp/cyP/wgUGbXVu0QcADZNBDnh98gHMLGXYQUw02w61QU3wdbNWDbQVVIIhMMwFF1DaZiPLBAy7E04kafrjSizaK3LFNNc0AAYRQDsAHHQlJ2IDQJ2zE1+EKDjiAijShkECCC8Qgw4cr7ZgyzC2WaHPNLWWoNeNWPiRAw0QFWQFMhz8C+QQ20yAiVSrY+MGOJCsccsst2GCzoHFxxEGGC+8hgs0MB2kyCpgzrUDCbs1Es41UdtATHFFkWELMOtsoQsYcgvRRQw5RSDgGOjZMR1AvPQIq6KCo9AKOJWDd48owQlHR4DXEKP9iyRrK+DNNBTu4RwIPFeTAGUG7hAomkA84gEg1m6ADljy9PBKGGJY4ig0xlsTBRSn98FOFDUC8pwQOPkgHbCGAzhTkA850s0c7j6Hjix9+gBIrMXLeAccWXUCyiRBcBEECdEJ98KtAqtBCYQc/OvDENnl4gYpUxISCIjjzylkGGV9okYUVNogRhAOBuuAEhjG08wOgDYzAgA5bCjIoCe5uwUk80RKTTSppPREGGGCIISOQ9AXBg6c
C6WIywvCpoMHAocRBwhP4bHLFLujYkV42xNxBRhAyGrc113EgYtRBerDDDHMoDCyQEL5sE083EkgwQyBhxGFHMM206DUixGxmE0wssbQjCQ4JCaFKFwgQTVAVVhQUwAVPIFJKrHfYYRwi6OCDzzuIJIFhXAD0EccPsYRiSyqKSDpFcWSMIcZRoBMkQyA2BGZDIKSYcggih8TRRg4VxM5QABVYYLxgwiev/PLMCxQQADs=",
|
||||
"find": "R0lGODlhMAAiAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OSkpKRgYGAAAAAAAAAAAAAAAAAAAACH+AS4ALAAAAAAwACIAAAX/ICCOZGmeaKquY2AGLiuvMCAUBuHWc48Kh0iFInEYCb4kSQCxPBiMxkMigRQEgJiSFVBYHNGG0RiZOHjblWAiiY4fkDhEYoBp06dAWfyAQyKAgAwDaHgnB0RwgYASgQ0IhDuGJDAIFhMRVFSLEX8QCJJ4AQM5AgQHTZqqjBAOCQQEkWkCDRMUFQsICQ4Vm5maEwwHOAsPDTpKMAsUDlO4CssTcb+2DAp8YGCyNFoCEsZwFQ3QDRTTVBRS0g1QbgsCd5QAAwgIBwYFAwStzQ8UEdCKVchky0yVBw7YuXkAKt4IAg74vXHVagqFBRgXSCAyYWAVCH0SNhDTitCJfSL5/4RbAPKPhQYYjVCYYAvCP0BxEDaD8CheAAHNwqh8MMGPSwgLeJWhwHSjqkYI+xg4MMCEgQjtRvZ7UAYCpghMF7CxONOWJkYR+rCpY4JlVpVxKDwYWEactKW9mhYRtqCTgwgWEMArERSK1j5q//6T8KXonFsShpiJkAECgQYVjykooCVA0JGHEWNiYCHThTFeb3UkoiCCBgwGEKQ1kuAJlhFwhA71h5SukwUM5qqeCSGBgicEWkfNiWSERtBad4JNIBaQBaQah1ToyGZBAnsIuIJs1qnqiAIVjIE2gnAB1T5x0icgzXT79ipgMOOEH6HBbREBMJCeGEY08IoLAkzB1YYFwjxwSUGSNULQJnNUwRYlCcyEkALIxECAP9cNMMABYpRhy3ZsSLDaR70oUAiABGCkAxowCGCAAfDYIQACXoElGRsdXWDBdg2Y90IWktDYGYAB9PWHP0PMdFZaF07SQgAFNDAMAQg0QA1UC8xoZQl22JGFPgWkOUCOL1pZQyhjxinnnCWEAAA7",
|
||||
"REPL": "R0lGODlhMAAjAPcAMf/////3//+lOf+UKf+MEPf///f39/f35/fv7/ecQvecOfecKfeUIfeUGPeUEPeUCPeMAO/37+/v9+/v3u/n3u/n1u+9jO+9c++1hO+ta++tY++tWu+tUu+tSu+lUu+lQu+lMe+UMe+UKe+UGO+UEO+UAO+MCOfv5+fvxufn7+fn5+fnzue9lOe9c+e1jOe1e+e1c+e1a+etWuetUuelQuecOeeUUueUCN7e597e3t7e1t7ezt7evd7Wzt7Oxt7Ovd7Otd7Opd7OnN7Gtd7Gpd69lN61hN6ta96lStbextberdbW3tbWztbWxtbOvdbOrda1hNalUtaECM7W1s7Ozs7Oxs7Otc7Gxs7Gvc69tc69rc69pc61jM6lc8bWlMbOvcbGxsbGpca9tca9pca1nMaMAL3OhL3Gtb21vb21tb2tpb2tnL2tlLW9tbW9pbW9e7W1pbWtjLWcKa21nK2tra2tnK2tlK2lpa2llK2ljK2le6WlnKWljKWUe6WUc6WUY5y1QpyclJycjJychJyUc5yMY5StY5SUe5SMhJSMe5SMc5SMWpSEa5SESoyUe4yMhIyEY4SlKYScWoSMe4SEe4SEa4R7c4R7Y3uMY3uEe3t7e3t7c3tza3tzY3trKXtjIXOcAHOUMXOEY3Nzc3NzWnNrSmulCGuUMWuMGGtzWmtrY2taMWtaGGOUOWOMAGNzUmNjWmNjSmNaUmNaQmNaOWNaIWNSCFqcAFpjUlpSMVpSIVpSEFpKKVKMAFJSUlJSSlJSMVJKMVJKGFJKAFI5CEqUAEqEAEpzQkpKIUpCQkpCGEpCAEo5EEoxAEJjOUJCOUJCAEI5IUIxADl7ADlaITlCOTkxMTkxKTkxEDkhADFzADFrGDE5OTExADEpEClrCCkxKSkpKSkpISkpACkhCCkhACkYACFzACFrACEhCCEYGBhjEBhjABghABgYCBgYABgQEBgQABAQABAIAAhjAAhSAAhKAAgIEAgICABaAABCAAAhAAAQAAAIAAAAAAAAACH+AS4ALAAAAAAwACMAAAj/AAEIHEiwoMGDCBMqXMiwocOHAA4cgEixIIIJO3JMmAjADIqKFU/8MHIkg5EgYXx4iaTkI0iHE6wE2TCggYILQayEAgXIy8uGCKz8sDCAQAMRG3iEcXULlJkJPwli3OFjh9UdYYLE6NBhA04UXHoVA2XoTZgfPKBWlOBDphAWOdfMcfMDLloeO3hIMjbWVCQ5Fn6E2UFxgpsgFjYIEBADrZU6luqEEfqjTqpt54z1uuWqTIcgWAk7PECGzIUQDRosDmxlUrVJkwQJkqVuX71v06YZcyUlROAdbnLAJKPFyAYFAhoMwFlnEh0rWkpz8raPHm7dqKKc/KFFkBUrVn1M/ziBcEIeLUEQI8/AYk0i9Be4sqjsrN66c9/OnbobhpR3HkIUoZ0WVnBE0AGLFKKFD0HAFUQe77HQgQI1hRBDEHMcY0899bBzihZuCPILJD8EccEGGzwAQhFaUHHQH82sUkgeNHISDBk8WCCCcsqFUEQWmOyzjz3sUGNNOO5Y48YOEgowAAQhnBScQV00k82V47jzjy9CXZBcjziFoco//4CDiSOyhPMPLkJZkEBqJmRQxA9uZGEQD8Ncmc044/zzDF2IZQBCCDYE8QMZz/iiCSx0neHGI7BIhhhNn+1gxRpokEcQAp7seWU7/PwTyxqG/iCEEVzQmUombnDRxRExzP9nBR2PCKLFD3UJwcMPa/SRqUGNWJmNOVn+M44ukMRB4KGcWDNLVhuUMEIJAlzwA3DJBHMJIXm4sQYhqyxCRQQGLSIsn1qac2UzysQSyzX/hLMGD0F0IMCODYAQBA9W/PKPOcRiw0wzwxTiokF9dLMnuv/Mo+fCZF7jBr0xbDDCACWEYKgb1vzjDp/jZNOMLX0IZxAKq2TZTjtaOjwOsXyG+s8sZJTIQsUdIGHoJPf8w487QI/TDSt
5mGwQFZxc406o8HiDJchk/ltLHpSlJwSvz5DpTjvmuGNOM57koelBOaAhiCaaPBLL0wwbm003peRBnBZqJMJL1ECz/HXYYx/NdAIOOVCxQyLorswymU93o0wuwfAiTDNR/xz0MLXU0XdCE+UwSTRZAq2lsSATu+4wkGvt+TjNzPLrQyegAUku2Hij5cd8LhxyM8QIg4w18HgcdC6BTBFSDmfQqsovttveDcG7lFLHI75cE841sARCxeWsnxC4G9HADPK6ywzDCRqBo0EHHWhMgT1IJzziNci1N7PMKnSYfML96/90AiJKey/0KtbLX1QK0rrNnQ541xugQ7SHhkXBghN0SKACWRc4KlAhBwKcIOYymJCAAAA7",
|
||||
"repl": "R0lGODlhMAAjAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OTExMSkpKSEhIRgYGBAQEAgICAAAACH+AS4ALAAAAAAwACMAAAX/ICCOZGmeaKqubOu+gCDANBkIQ1EMQhAghFptYEAkEgjEwXBo7ISvweGgWCwUysPjwTgEoCafTySYIhYMxgLBjEQgCULvCw0QdAZdoVhUIJUFChISEAxYeQM1N1OMTAp+UwZ5eA4TEhFbDWYFdC4ECVMJjwl5BwsQa0umEhUVlhESDgqlBp0rAn5nVpBMDxeZDRQbHBgWFBSWDgtLBnFjKwRYCI9VqQsPs0YKEcMXFq0UEalFDWx4BAO2IwPjppAKDkrTWKYUGd7fEJJFEZpM00cOzCgh4EE8SaoWxKNixQooBRMyZMBwAYIRBhUgLDGS4MoBJeoANMhAgQsaCRZm/5lqaCUJhA4cNHjDoKEDBlJUHqkBlYBTiQUZNGjYMMxDhY3VWk6R4MEDBoMUak5AqoYBqANIBo4wcGGDUKIeLlzVZmWJggsVIkwAZaQSA3kdZzlKkIiEAAlDvW5oOkEBs488JTw44oeUIwdvVTFTUK7uiAAPgubt8GFDhQepqETAQCFU1UMGzlqAgFhUsAcCS0AO6lUDhw8xNRSbENGDhgWSHjWUe6ACbKITizmopZoBa6KvOwj9uuHDhwxyj3xekgDDhw5EvWKo0IB4iQLCOCC/njc7ZQ8UeGvza+ABZZgcxJNc4FO1gc0cOsCUrHevc8tdIMTIAhc4F198G2Qwwd8CBIQUAwEINABBBJUwR9R5wElgVRLwWODBBx4cGB8GEzDQIAo33CGJA8gh+JoH/clUgQU0YvDhdfmJdwEFC6Sjgg8yEPAABsPkh2F22cl2AQbn6QdTghTQ5eAJAQyQAAQV0MSBB9gRVZ4GE1mw5JZOAmiAVi1UWcAZDrDyZXYTeaOhA/bIVuIBPtKQ4h7ViYekUPdcEAEbzTzCRp5CADmAAwj+ORGPBcgwAAHo9ABGCYtm0ChwFHShlRiXhmHlkAcCiOeUodqQw5W0oXLAiamy4MOkjOyAaqxUymApDCEAADs=",
|
||||
}
|
||||
# Highlight colours cycled over successive matches, and the colours used
# for the emphasised ("emph") sub-group within each match.
colors = ["#FF7B39", "#80F121"]
emphColors = ["#DAFC33", "#F42548"]
# Widget options for the small regexp-entry Text fields.
fieldParams = {
    "height": 3,
    "width": 70,
    "font": ("monaco", 14),
    "highlightthickness": 0,
    "borderwidth": 0,
    "background": "white",
}
# Widget options for the large sample-text panes.
textParams = {
    "bg": "#F7E0D4",
    "fg": "#2321F1",
    "highlightthickness": 0,
    "width": 1,
    "height": 10,
    "font": ("verdana", 16),
    "wrap": "word",
}
|
||||
|
||||
|
||||
class Zone:
    """One icon + regexp field + text pane unit of the GUI.

    Base class for FindZone/ReplaceZone; *image* selects the bright and
    dimmed icons from the module-level ``images`` dict.  Widgets are
    packed into the module-level ``root`` window.
    """

    def __init__(self, image, initialField, initialText):
        frm = Frame(root)
        frm.config(background="white")
        # Bright icon = regexp OK; dimmed icon = malformed regexp.
        self.image = PhotoImage(format="gif", data=images[image.upper()])
        self.imageDimmed = PhotoImage(format="gif", data=images[image])
        self.img = Label(frm)
        self.img.config(borderwidth=0)
        self.img.pack(side="left")
        # Small Text widget holding the regexp / replacement string.
        self.fld = Text(frm, **fieldParams)
        self.initScrollText(frm, self.fld, initialField)
        frm = Frame(root)
        # Large Text widget holding the sample text.
        self.txt = Text(frm, **textParams)
        self.initScrollText(frm, self.txt, initialText)
        for i in range(2):
            # Alternating background colours for successive matches,
            # plus a foreground tag for the emphasised sub-group.
            self.txt.tag_config(colors[i], background=colors[i])
            self.txt.tag_config("emph" + colors[i], foreground=emphColors[i])

    def initScrollText(self, frm, txt, contents):
        # Attach a vertical scrollbar to *txt*, insert *contents*, and
        # finish the row with a thin ridge separator.
        scl = Scrollbar(frm)
        scl.config(command=txt.yview)
        scl.pack(side="right", fill="y")
        txt.pack(side="left", expand=True, fill="x")
        txt.config(yscrollcommand=scl.set)
        txt.insert("1.0", contents)
        frm.pack(fill="x")
        Frame(height=2, bd=1, relief="ridge").pack(fill="x")

    def refresh(self):
        # Re-run the substitution; dim the icon if the regexp is invalid.
        self.colorCycle = itertools.cycle(colors)
        try:
            self.substitute()
            self.img.config(image=self.image)
        except re.error:
            self.img.config(image=self.imageDimmed)
self.img.config(image=self.imageDimmed)
|
||||
|
||||
|
||||
class FindZone(Zone):
    """Zone that highlights matches of the find-regexp in its text."""

    def addTags(self, m):
        # Called once per match: tag the whole match with the next
        # colour, and the selected sub-group (named "emph") if any.
        color = next(self.colorCycle)
        self.txt.tag_add(color, "1.0+%sc" % m.start(), "1.0+%sc" % m.end())
        try:
            self.txt.tag_add(
                "emph" + color, "1.0+%sc" % m.start("emph"), "1.0+%sc" % m.end("emph")
            )
        except:
            # NOTE(review): bare except — presumably guards the case
            # where the pattern has no "emph" group / the group did not
            # participate; it also hides any other error here.
            pass

    def substitute(self, *args):
        # Clear all previous highlight tags before re-matching.
        for color in colors:
            self.txt.tag_remove(color, "1.0", "end")
            self.txt.tag_remove("emph" + color, "1.0", "end")
        self.rex = re.compile("")  # default value in case of malformed regexp
        self.rex = re.compile(self.fld.get("1.0", "end")[:-1], re.MULTILINE)
        try:
            # If part of the field is selected, wrap that part in a
            # named group so it can be emphasised separately.  The
            # first compile just validates the selected fragment.
            re.compile("(?P<emph>%s)" % self.fld.get(SEL_FIRST, SEL_LAST))
            self.rexSel = re.compile(
                "%s(?P<emph>%s)%s"
                % (
                    self.fld.get("1.0", SEL_FIRST),
                    self.fld.get(SEL_FIRST, SEL_LAST),
                    self.fld.get(SEL_LAST, "end")[:-1],
                ),
                re.MULTILINE,
            )
        except:
            # NOTE(review): bare except — fires when there is no
            # selection (Tk raises TclError) or the wrapped pattern is
            # invalid; falls back to the plain pattern.
            self.rexSel = self.rex
        self.rexSel.sub(self.addTags, self.txt.get("1.0", "end"))
|
||||
|
||||
|
||||
class ReplaceZone(Zone):
    """Zone that shows the find-zone's text after replacement."""

    def addTags(self, m):
        # Replace this match in our copy of the text, tagging the
        # inserted replacement with the next colour.  ``self.diff``
        # tracks the cumulative length change from earlier edits so
        # later match offsets stay correct.
        s = sz.rex.sub(self.repl, m.group())
        self.txt.delete(
            "1.0+%sc" % (m.start() + self.diff), "1.0+%sc" % (m.end() + self.diff)
        )
        self.txt.insert("1.0+%sc" % (m.start() + self.diff), s, next(self.colorCycle))
        self.diff += len(s) - (m.end() - m.start())

    def substitute(self):
        # Copy the find-zone text, then apply the replacement match by
        # match via addTags.
        self.txt.delete("1.0", "end")
        self.txt.insert("1.0", sz.txt.get("1.0", "end")[:-1])
        self.diff = 0
        # Rewrite plain backreferences (\1) into \g<1> form for re.sub.
        self.repl = rex0.sub(r"\\g<\1>", self.fld.get("1.0", "end")[:-1])
        sz.rex.sub(self.addTags, sz.txt.get("1.0", "end")[:-1])
|
||||
|
||||
|
||||
def launchRefresh(_):
    """Schedule both zones to re-run their substitutions once Tk is idle.

    The event argument is ignored; this is bound to key and mouse events.
    """
    for zone in (sz, rz):
        zone.fld.after_idle(zone.refresh)
|
||||
|
||||
|
||||
def app():
    # Build the whole GUI: find zone on top, replace zone below.
    # Globals are used because the Zone classes and launchRefresh refer
    # to root/sz/rz/rex0 at module level.
    global root, sz, rz, rex0
    root = Tk()
    root.resizable(height=False, width=True)
    root.title(windowTitle)
    root.minsize(width=250, height=0)
    sz = FindZone("find", initialFind, initialText)
    # Refresh on click/drag in the find field: the selection there
    # controls which sub-group gets emphasised.
    sz.fld.bind("<Button-1>", launchRefresh)
    sz.fld.bind("<ButtonRelease-1>", launchRefresh)
    sz.fld.bind("<B1-Motion>", launchRefresh)
    sz.rexSel = re.compile("")
    rz = ReplaceZone("repl", initialRepl, "")
    # Matches \<number> backreferences not preceded by a backslash.
    rex0 = re.compile(r"(?<!\\)\\([0-9]+)")
    root.bind_all("<Key>", launchRefresh)
    launchRefresh(None)
    root.mainloop()
|
||||
|
||||
|
||||
# Run the demo when executed as a script.
if __name__ == "__main__":
    app()

__all__ = ["app"]
|
||||
1052
backend/venv/Lib/site-packages/nltk/app/rdparser_app.py
Normal file
1052
backend/venv/Lib/site-packages/nltk/app/rdparser_app.py
Normal file
File diff suppressed because it is too large
Load Diff
937
backend/venv/Lib/site-packages/nltk/app/srparser_app.py
Normal file
937
backend/venv/Lib/site-packages/nltk/app/srparser_app.py
Normal file
@@ -0,0 +1,937 @@
|
||||
# Natural Language Toolkit: Shift-Reduce Parser Application
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A graphical tool for exploring the shift-reduce parser.
|
||||
|
||||
The shift-reduce parser maintains a stack, which records the structure
|
||||
of the portion of the text that has been parsed. The stack is
|
||||
initially empty. Its contents are shown on the left side of the main
|
||||
canvas.
|
||||
|
||||
On the right side of the main canvas is the remaining text. This is
|
||||
the portion of the text which has not yet been considered by the
|
||||
parser.
|
||||
|
||||
The parser builds up a tree structure for the text using two
|
||||
operations:
|
||||
|
||||
- "shift" moves the first token from the remaining text to the top
|
||||
of the stack. In the demo, the top of the stack is its right-hand
|
||||
side.
|
||||
- "reduce" uses a grammar production to combine the rightmost stack
|
||||
elements into a single tree token.
|
||||
|
||||
You can control the parser's operation by using the "shift" and
|
||||
"reduce" buttons; or you can use the "step" button to let the parser
|
||||
automatically decide which operation to apply. The parser uses the
|
||||
following rules to decide which operation to apply:
|
||||
|
||||
- Only shift if no reductions are available.
|
||||
- If multiple reductions are available, then apply the reduction
|
||||
whose CFG production is listed earliest in the grammar.
|
||||
|
||||
The "reduce" button applies the reduction whose CFG production is
|
||||
listed earliest in the grammar. There are two ways to manually choose
|
||||
which reduction to apply:
|
||||
|
||||
- Click on a CFG production from the list of available reductions,
|
||||
on the left side of the main window. The reduction based on that
|
||||
production will be applied to the top of the stack.
|
||||
- Click on one of the stack elements. A popup window will appear,
|
||||
containing all available reductions. Select one, and it will be
|
||||
applied to the top of the stack.
|
||||
|
||||
Note that reductions can only be applied to the top of the stack.
|
||||
|
||||
Keyboard Shortcuts::
|
||||
[Space]\t Perform the next shift or reduce operation
|
||||
[s]\t Perform a shift operation
|
||||
[r]\t Perform a reduction operation
|
||||
[Ctrl-z]\t Undo most recent operation
|
||||
[Delete]\t Reset the parser
|
||||
[g]\t Show/hide available production list
|
||||
[Ctrl-a]\t Toggle animations
|
||||
[h]\t Help
|
||||
[Ctrl-p]\t Print
|
||||
[q]\t Quit
|
||||
|
||||
"""
|
||||
|
||||
from tkinter import Button, Frame, IntVar, Label, Listbox, Menu, Scrollbar, Tk
|
||||
from tkinter.font import Font
|
||||
|
||||
from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment
|
||||
from nltk.draw.util import CanvasFrame, EntryDialog, ShowText, TextWidget
|
||||
from nltk.parse import SteppingShiftReduceParser
|
||||
from nltk.tree import Tree
|
||||
from nltk.util import in_idle
|
||||
|
||||
"""
|
||||
Possible future improvements:
|
||||
- button/window to change and/or select text. Just pop up a window
|
||||
with an entry, and let them modify the text; and then retokenize
|
||||
it? Maybe give a warning if it contains tokens whose types are
|
||||
not in the grammar.
|
||||
- button/window to change and/or select grammar. Select from
|
||||
several alternative grammars? Or actually change the grammar? If
|
||||
the later, then I'd want to define nltk.draw.cfg, which would be
|
||||
responsible for that.
|
||||
"""
|
||||
|
||||
|
||||
class ShiftReduceApp:
|
||||
"""
|
||||
A graphical tool for exploring the shift-reduce parser. The tool
|
||||
displays the parser's stack and the remaining text, and allows the
|
||||
user to control the parser's operation. In particular, the user
|
||||
can shift tokens onto the stack, and can perform reductions on the
|
||||
top elements of the stack. A "step" button simply steps through
|
||||
the parsing process, performing the operations that
|
||||
``nltk.parse.ShiftReduceParser`` would use.
|
||||
"""
|
||||
|
||||
    def __init__(self, grammar, sent, trace=0):
        """
        Create a new shift-reduce parser demo window.

        :param grammar: The context-free grammar used to parse ``sent``.
        :param sent: A list of tokens (the pre-tokenized sentence).
        :param trace: Trace level passed to ``SteppingShiftReduceParser``.
        """
        self._sent = sent
        self._parser = SteppingShiftReduceParser(grammar, trace)

        # Set up the main window.
        self._top = Tk()
        self._top.title("Shift Reduce Parser Application")

        # Animations.  animating_lock is a lock to prevent the demo
        # from performing new operations while it's animating.
        self._animating_lock = 0
        # Animation speed: number of animation frames (0 disables animation).
        self._animate = IntVar(self._top)
        self._animate.set(10)  # = medium

        # The user can hide the grammar.
        self._show_grammar = IntVar(self._top)
        self._show_grammar.set(1)

        # Initialize fonts.
        self._init_fonts(self._top)

        # Set up key bindings.
        self._init_bindings()

        # Create the basic frames.
        self._init_menubar(self._top)
        self._init_buttons(self._top)
        self._init_feedback(self._top)
        self._init_grammar(self._top)
        self._init_canvas(self._top)

        # A popup menu for reducing.
        self._reduce_menu = Menu(self._canvas, tearoff=0)

        # Reset the demo, and set the feedback frame to empty.
        self.reset()
        self._lastoper1["text"] = ""
||||
|
||||
#########################################
|
||||
## Initialization Helpers
|
||||
#########################################
|
||||
|
||||
    def _init_fonts(self, root):
        """Create the fonts used by the demo, sized from the system default."""
        # See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
        self._sysfont = Font(font=Button()["font"])
        root.option_add("*Font", self._sysfont)

        # What's our font size (default=same as sysfont)
        self._size = IntVar(root)
        self._size.set(self._sysfont.cget("size"))

        self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
        self._font = Font(family="helvetica", size=self._size.get())
||||
|
||||
    def _init_grammar(self, parent):
        """Build the listbox that shows the grammar's productions."""
        # Grammar view.
        self._prodframe = listframe = Frame(parent)
        self._prodframe.pack(fill="both", side="left", padx=2)
        self._prodlist_label = Label(
            self._prodframe, font=self._boldfont, text="Available Reductions"
        )
        self._prodlist_label.pack()
        self._prodlist = Listbox(
            self._prodframe,
            selectmode="single",
            relief="groove",
            background="white",
            foreground="#909090",
            font=self._font,
            selectforeground="#004040",
            selectbackground="#c0f0c0",
        )

        self._prodlist.pack(side="right", fill="both", expand=1)

        self._productions = list(self._parser.grammar().productions())
        for production in self._productions:
            self._prodlist.insert("end", (" %s" % production))
        self._prodlist.config(height=min(len(self._productions), 25))

        # Add a scrollbar if there are more than 25 productions.
        # NOTE(review): the condition is hard-wired on, so the scrollbar is
        # always created regardless of the number of productions.
        if 1:  # len(self._productions) > 25:
            listscroll = Scrollbar(self._prodframe, orient="vertical")
            self._prodlist.config(yscrollcommand=listscroll.set)
            listscroll.config(command=self._prodlist.yview)
            listscroll.pack(side="left", fill="y")

        # If they select a production, apply it.
        self._prodlist.bind("<<ListboxSelect>>", self._prodlist_select)

        # When they hover over a production, highlight it.
        # _hover holds the index of the production being hovered (-1 = none).
        self._hover = -1
        self._prodlist.bind("<Motion>", self._highlight_hover)
        self._prodlist.bind("<Leave>", self._clear_hover)
||||
|
||||
    def _init_bindings(self):
        """Install all of the demo's keyboard shortcuts on the main window."""
        # Quit
        self._top.bind("<Control-q>", self.destroy)
        self._top.bind("<Control-x>", self.destroy)
        self._top.bind("<Alt-q>", self.destroy)
        self._top.bind("<Alt-x>", self.destroy)

        # Ops (step, shift, reduce, undo)
        self._top.bind("<space>", self.step)
        self._top.bind("<s>", self.shift)
        self._top.bind("<Alt-s>", self.shift)
        self._top.bind("<Control-s>", self.shift)
        self._top.bind("<r>", self.reduce)
        self._top.bind("<Alt-r>", self.reduce)
        self._top.bind("<Control-r>", self.reduce)
        self._top.bind("<Delete>", self.reset)
        self._top.bind("<u>", self.undo)
        self._top.bind("<Alt-u>", self.undo)
        self._top.bind("<Control-u>", self.undo)
        self._top.bind("<Control-z>", self.undo)
        self._top.bind("<BackSpace>", self.undo)

        # Misc
        self._top.bind("<Control-p>", self.postscript)
        self._top.bind("<Control-h>", self.help)
        self._top.bind("<F1>", self.help)
        self._top.bind("<Control-g>", self.edit_grammar)
        self._top.bind("<Control-t>", self.edit_sentence)

        # Animation speed control: the value is the number of animation
        # frames, so a larger value means a slower animation.
        self._top.bind("-", lambda e, a=self._animate: a.set(20))
        self._top.bind("=", lambda e, a=self._animate: a.set(10))
        self._top.bind("+", lambda e, a=self._animate: a.set(4))
||||
|
||||
    def _init_buttons(self, parent):
        """Create the Step/Shift/Reduce/Undo button bar."""
        # Set up the frames.
        self._buttonframe = buttonframe = Frame(parent)
        buttonframe.pack(fill="none", side="bottom")
        Button(
            buttonframe,
            text="Step",
            background="#90c0d0",
            foreground="black",
            command=self.step,
        ).pack(side="left")
        Button(
            buttonframe,
            text="Shift",
            underline=0,
            background="#90f090",
            foreground="black",
            command=self.shift,
        ).pack(side="left")
        Button(
            buttonframe,
            text="Reduce",
            underline=0,
            background="#90f090",
            foreground="black",
            command=self.reduce,
        ).pack(side="left")
        Button(
            buttonframe,
            text="Undo",
            underline=0,
            background="#f0a0a0",
            foreground="black",
            command=self.undo,
        ).pack(side="left")
||||
|
||||
    def _init_menubar(self, parent):
        """Create the File/Edit/Apply/View/Animate/Help menu bar."""
        menubar = Menu(parent)

        # File: reset, print, exit.
        filemenu = Menu(menubar, tearoff=0)
        filemenu.add_command(
            label="Reset Parser", underline=0, command=self.reset, accelerator="Del"
        )
        filemenu.add_command(
            label="Print to Postscript",
            underline=0,
            command=self.postscript,
            accelerator="Ctrl-p",
        )
        filemenu.add_command(
            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
        )
        menubar.add_cascade(label="File", underline=0, menu=filemenu)

        # Edit: change the grammar or the sentence being parsed.
        editmenu = Menu(menubar, tearoff=0)
        editmenu.add_command(
            label="Edit Grammar",
            underline=5,
            command=self.edit_grammar,
            accelerator="Ctrl-g",
        )
        editmenu.add_command(
            label="Edit Text",
            underline=5,
            command=self.edit_sentence,
            accelerator="Ctrl-t",
        )
        menubar.add_cascade(label="Edit", underline=0, menu=editmenu)

        # Apply: the parser operations.
        rulemenu = Menu(menubar, tearoff=0)
        rulemenu.add_command(
            label="Step", underline=1, command=self.step, accelerator="Space"
        )
        rulemenu.add_separator()
        rulemenu.add_command(
            label="Shift", underline=0, command=self.shift, accelerator="Ctrl-s"
        )
        rulemenu.add_command(
            label="Reduce", underline=0, command=self.reduce, accelerator="Ctrl-r"
        )
        rulemenu.add_separator()
        rulemenu.add_command(
            label="Undo", underline=0, command=self.undo, accelerator="Ctrl-u"
        )
        menubar.add_cascade(label="Apply", underline=0, menu=rulemenu)

        # View: grammar visibility and font size.
        viewmenu = Menu(menubar, tearoff=0)
        viewmenu.add_checkbutton(
            label="Show Grammar",
            underline=0,
            variable=self._show_grammar,
            command=self._toggle_grammar,
        )
        viewmenu.add_separator()
        viewmenu.add_radiobutton(
            label="Tiny",
            variable=self._size,
            underline=0,
            value=10,
            command=self.resize,
        )
        viewmenu.add_radiobutton(
            label="Small",
            variable=self._size,
            underline=0,
            value=12,
            command=self.resize,
        )
        viewmenu.add_radiobutton(
            label="Medium",
            variable=self._size,
            underline=0,
            value=14,
            command=self.resize,
        )
        viewmenu.add_radiobutton(
            label="Large",
            variable=self._size,
            underline=0,
            value=18,
            command=self.resize,
        )
        viewmenu.add_radiobutton(
            label="Huge",
            variable=self._size,
            underline=0,
            value=24,
            command=self.resize,
        )
        menubar.add_cascade(label="View", underline=0, menu=viewmenu)

        # Animate: animation speed (value = number of animation frames).
        animatemenu = Menu(menubar, tearoff=0)
        animatemenu.add_radiobutton(
            label="No Animation", underline=0, variable=self._animate, value=0
        )
        animatemenu.add_radiobutton(
            label="Slow Animation",
            underline=0,
            variable=self._animate,
            value=20,
            accelerator="-",
        )
        animatemenu.add_radiobutton(
            label="Normal Animation",
            underline=0,
            variable=self._animate,
            value=10,
            accelerator="=",
        )
        animatemenu.add_radiobutton(
            label="Fast Animation",
            underline=0,
            variable=self._animate,
            value=4,
            accelerator="+",
        )
        menubar.add_cascade(label="Animate", underline=1, menu=animatemenu)

        # Help: about box and instructions.
        helpmenu = Menu(menubar, tearoff=0)
        helpmenu.add_command(label="About", underline=0, command=self.about)
        helpmenu.add_command(
            label="Instructions", underline=0, command=self.help, accelerator="F1"
        )
        menubar.add_cascade(label="Help", underline=0, menu=helpmenu)

        parent.config(menu=menubar)
||||
|
||||
    def _init_feedback(self, parent):
        """Create the "Last Operation:" feedback bar at the bottom of the window."""
        self._feedbackframe = feedbackframe = Frame(parent)
        feedbackframe.pack(fill="x", side="bottom", padx=3, pady=3)
        self._lastoper_label = Label(
            feedbackframe, text="Last Operation:", font=self._font
        )
        self._lastoper_label.pack(side="left")
        lastoperframe = Frame(feedbackframe, relief="sunken", border=1)
        lastoperframe.pack(fill="x", side="right", expand=1, padx=5)
        # _lastoper1 shows the operation name (e.g. "Shift:");
        # _lastoper2 shows the operation's argument (e.g. the shifted token).
        self._lastoper1 = Label(
            lastoperframe, foreground="#007070", background="#f0f0f0", font=self._font
        )
        self._lastoper2 = Label(
            lastoperframe,
            anchor="w",
            width=30,
            foreground="#004040",
            background="#f0f0f0",
            font=self._font,
        )
        self._lastoper1.pack(side="left")
        self._lastoper2.pack(side="left", fill="x", expand=1)
||||
|
||||
    def _init_canvas(self, parent):
        """Create the canvas that displays the parser's stack and remaining text."""
        self._cframe = CanvasFrame(
            parent,
            background="white",
            width=525,
            closeenough=10,
            border=2,
            relief="sunken",
        )
        self._cframe.pack(expand=1, fill="both", side="top", pady=2)
        canvas = self._canvas = self._cframe.canvas()

        # Widgets for the stack elements and the remaining-text tokens;
        # these are rebuilt on every _redraw().
        self._stackwidgets = []
        self._rtextwidgets = []
        self._titlebar = canvas.create_rectangle(
            0, 0, 0, 0, fill="#c0f0f0", outline="black"
        )
        self._exprline = canvas.create_line(0, 0, 0, 0, dash=".")
        # Vertical line marking the boundary between stack and remaining text.
        self._stacktop = canvas.create_line(0, 0, 0, 0, fill="#408080")
        # NOTE(review): 'size' is computed but never used below.
        size = self._size.get() + 4
        self._stacklabel = TextWidget(
            canvas, "Stack", color="#004040", font=self._boldfont
        )
        self._rtextlabel = TextWidget(
            canvas, "Remaining Text", color="#004040", font=self._boldfont
        )
        self._cframe.add_widget(self._stacklabel)
        self._cframe.add_widget(self._rtextlabel)
||||
|
||||
#########################################
|
||||
## Main draw procedure
|
||||
#########################################
|
||||
|
||||
    def _redraw(self):
        """
        Rebuild the entire canvas display from the parser's current state:
        the stack widgets on the left, the remaining-text widgets on the
        right, the stack-top divider line, and the reducible-production
        highlighting.
        """
        scrollregion = self._canvas["scrollregion"].split()
        (cx1, cy1, cx2, cy2) = (int(c) for c in scrollregion)

        # Delete the old stack & rtext widgets.
        for stackwidget in self._stackwidgets:
            self._cframe.destroy_widget(stackwidget)
        self._stackwidgets = []
        for rtextwidget in self._rtextwidgets:
            self._cframe.destroy_widget(rtextwidget)
        self._rtextwidgets = []

        # Position the titlebar & exprline
        (x1, y1, x2, y2) = self._stacklabel.bbox()
        y = y2 - y1 + 10
        self._canvas.coords(self._titlebar, -5000, 0, 5000, y - 4)
        self._canvas.coords(self._exprline, 0, y * 2 - 10, 5000, y * 2 - 10)

        # Position the titlebar labels..
        (x1, y1, x2, y2) = self._stacklabel.bbox()
        self._stacklabel.move(5 - x1, 3 - y1)
        (x1, y1, x2, y2) = self._rtextlabel.bbox()
        self._rtextlabel.move(cx2 - x2 - 5, 3 - y1)

        # Draw the stack.  Trees get a tree-segment widget; plain tokens
        # get a text widget.  Clicking any stack element pops up the
        # reduction menu.
        stackx = 5
        for tok in self._parser.stack():
            if isinstance(tok, Tree):
                attribs = {
                    "tree_color": "#4080a0",
                    "tree_width": 2,
                    "node_font": self._boldfont,
                    "node_color": "#006060",
                    "leaf_color": "#006060",
                    "leaf_font": self._font,
                }
                widget = tree_to_treesegment(self._canvas, tok, **attribs)
                widget.label()["color"] = "#000000"
            else:
                widget = TextWidget(self._canvas, tok, color="#000000", font=self._font)
            widget.bind_click(self._popup_reduce)
            self._stackwidgets.append(widget)
            self._cframe.add_widget(widget, stackx, y)
            stackx = widget.bbox()[2] + 10

        # Draw the remaining text.
        rtextwidth = 0
        for tok in self._parser.remaining_text():
            widget = TextWidget(self._canvas, tok, color="#000000", font=self._font)
            self._rtextwidgets.append(widget)
            self._cframe.add_widget(widget, rtextwidth, y)
            rtextwidth = widget.bbox()[2] + 4

        # Allow enough room to shift the next token (for animations)
        if len(self._rtextwidgets) > 0:
            stackx += self._rtextwidgets[0].width()

        # Move the remaining text to the correct location (keep it
        # right-justified, when possible); and move the remaining text
        # label, if necessary.
        stackx = max(stackx, self._stacklabel.width() + 25)
        rlabelwidth = self._rtextlabel.width() + 10
        if stackx >= cx2 - max(rtextwidth, rlabelwidth):
            cx2 = stackx + max(rtextwidth, rlabelwidth)
        for rtextwidget in self._rtextwidgets:
            rtextwidget.move(4 + cx2 - rtextwidth, 0)
        self._rtextlabel.move(cx2 - self._rtextlabel.bbox()[2] - 5, 0)

        # Place the stack/remaining-text divider halfway between the two.
        midx = (stackx + cx2 - max(rtextwidth, rlabelwidth)) / 2
        self._canvas.coords(self._stacktop, midx, 0, midx, 5000)
        (x1, y1, x2, y2) = self._stacklabel.bbox()

        # Set up binding to allow them to shift a token by dragging it.
        if len(self._rtextwidgets) > 0:

            def drag_shift(widget, midx=midx, self=self):
                # Dragging a token past the divider performs a shift;
                # otherwise snap everything back into place.
                if widget.bbox()[0] < midx:
                    self.shift()
                else:
                    self._redraw()

            self._rtextwidgets[0].bind_drag(drag_shift)
            self._rtextwidgets[0].bind_click(self.shift)

        # Draw the stack top.
        self._highlight_productions()
||||
|
||||
    def _draw_stack_top(self, widget):
        """Move the stack/remaining-text divider just right of ``widget``."""
        # hack..
        midx = widget.bbox()[2] + 50
        self._canvas.coords(self._stacktop, midx, 0, midx, 5000)
||||
|
||||
def _highlight_productions(self):
|
||||
# Highlight the productions that can be reduced.
|
||||
self._prodlist.selection_clear(0, "end")
|
||||
for prod in self._parser.reducible_productions():
|
||||
index = self._productions.index(prod)
|
||||
self._prodlist.selection_set(index)
|
||||
|
||||
#########################################
|
||||
## Button Callbacks
|
||||
#########################################
|
||||
|
||||
def destroy(self, *e):
|
||||
if self._top is None:
|
||||
return
|
||||
self._top.destroy()
|
||||
self._top = None
|
||||
|
||||
    def reset(self, *e):
        """Reinitialize the parser with the current sentence and redraw."""
        self._parser.initialize(self._sent)
        self._lastoper1["text"] = "Reset App"
        self._lastoper2["text"] = ""
        self._redraw()
||||
|
||||
def step(self, *e):
|
||||
if self.reduce():
|
||||
return True
|
||||
elif self.shift():
|
||||
return True
|
||||
else:
|
||||
if list(self._parser.parses()):
|
||||
self._lastoper1["text"] = "Finished:"
|
||||
self._lastoper2["text"] = "Success"
|
||||
else:
|
||||
self._lastoper1["text"] = "Finished:"
|
||||
self._lastoper2["text"] = "Failure"
|
||||
|
||||
    def shift(self, *e):
        """
        Move the next remaining-text token onto the stack.

        :return: True if a token was shifted; False if none remained.
            Returns None (no-op) while an animation is in progress.
        """
        if self._animating_lock:
            return
        if self._parser.shift():
            tok = self._parser.stack()[-1]
            self._lastoper1["text"] = "Shift:"
            self._lastoper2["text"] = "%r" % tok
            if self._animate.get():
                self._animate_shift()
            else:
                self._redraw()
            return True
        return False
||||
|
||||
    def reduce(self, *e):
        """
        Apply the parser's default reduction to the top of the stack.

        :return: The production used, or None if no reduction applied.
            Returns None (no-op) while an animation is in progress.
        """
        if self._animating_lock:
            return
        production = self._parser.reduce()
        if production:
            self._lastoper1["text"] = "Reduce:"
            self._lastoper2["text"] = "%s" % production
            if self._animate.get():
                self._animate_reduce()
            else:
                self._redraw()
        return production
||||
|
||||
def undo(self, *e):
|
||||
if self._animating_lock:
|
||||
return
|
||||
if self._parser.undo():
|
||||
self._redraw()
|
||||
|
||||
    def postscript(self, *e):
        """Print the current canvas contents to a PostScript file."""
        self._cframe.print_to_file()
||||
|
||||
    def mainloop(self, *args, **kwargs):
        """
        Enter the Tkinter mainloop.  This function must be called if
        this demo is created from a non-interactive program (e.g.
        from a script); otherwise, the demo will close as soon as
        the script completes.
        """
        if in_idle():
            return
        self._top.mainloop(*args, **kwargs)
||||
|
||||
#########################################
|
||||
## Menubar callbacks
|
||||
#########################################
|
||||
|
||||
    def resize(self, size=None):
        """
        Change the demo's font size.

        :param size: The new size; if None, use the current value of
            the ``self._size`` menu variable.
        """
        if size is not None:
            self._size.set(size)
        size = self._size.get()
        # Negative sizes tell Tk to measure the font in pixels, not points.
        self._font.configure(size=-(abs(size)))
        self._boldfont.configure(size=-(abs(size)))
        self._sysfont.configure(size=-(abs(size)))

        self._redraw()
||||
|
||||
def help(self, *e):
|
||||
# The default font's not very legible; try using 'fixed' instead.
|
||||
try:
|
||||
ShowText(
|
||||
self._top,
|
||||
"Help: Shift-Reduce Parser Application",
|
||||
(__doc__ or "").strip(),
|
||||
width=75,
|
||||
font="fixed",
|
||||
)
|
||||
except:
|
||||
ShowText(
|
||||
self._top,
|
||||
"Help: Shift-Reduce Parser Application",
|
||||
(__doc__ or "").strip(),
|
||||
width=75,
|
||||
)
|
||||
|
||||
def about(self, *e):
|
||||
ABOUT = "NLTK Shift-Reduce Parser Application\n" + "Written by Edward Loper"
|
||||
TITLE = "About: Shift-Reduce Parser Application"
|
||||
try:
|
||||
from tkinter.messagebox import Message
|
||||
|
||||
Message(message=ABOUT, title=TITLE).show()
|
||||
except:
|
||||
ShowText(self._top, TITLE, ABOUT)
|
||||
|
||||
    def edit_grammar(self, *e):
        """Open a CFG editor; ``set_grammar`` is called with the result."""
        CFGEditor(self._top, self._parser.grammar(), self.set_grammar)
||||
|
||||
def set_grammar(self, grammar):
|
||||
self._parser.set_grammar(grammar)
|
||||
self._productions = list(grammar.productions())
|
||||
self._prodlist.delete(0, "end")
|
||||
for production in self._productions:
|
||||
self._prodlist.insert("end", (" %s" % production))
|
||||
|
||||
    def edit_sentence(self, *e):
        """Open a dialog to edit the sentence; ``set_sentence`` gets the result."""
        sentence = " ".join(self._sent)
        title = "Edit Text"
        instr = "Enter a new sentence to parse."
        EntryDialog(self._top, sentence, instr, self.set_sentence, title)
|
||||
|
||||
    def set_sentence(self, sent):
        """Replace the current sentence (whitespace-tokenized) and reset the parser."""
        self._sent = sent.split()  # [XX] use tagged?
        self.reset()
||||
|
||||
#########################################
|
||||
## Reduce Production Selection
|
||||
#########################################
|
||||
|
||||
    def _toggle_grammar(self, *e):
        """Show or hide the production list, following ``self._show_grammar``."""
        if self._show_grammar.get():
            self._prodframe.pack(
                fill="both", side="left", padx=2, after=self._feedbackframe
            )
            self._lastoper1["text"] = "Show Grammar"
        else:
            self._prodframe.pack_forget()
            self._lastoper1["text"] = "Hide Grammar"
        self._lastoper2["text"] = ""
||||
|
||||
    def _prodlist_select(self, event):
        """
        Handle a selection in the production listbox: try to reduce the
        stack with the selected production.  If the reduction fails,
        restore the highlighting of the reducible productions.
        """
        selection = self._prodlist.curselection()
        if len(selection) != 1:
            return
        index = int(selection[0])
        production = self._parser.reduce(self._productions[index])
        if production:
            self._lastoper1["text"] = "Reduce:"
            self._lastoper2["text"] = "%s" % production
            if self._animate.get():
                self._animate_reduce()
            else:
                self._redraw()
        else:
            # Reset the production selections.
            self._prodlist.selection_clear(0, "end")
            for prod in self._parser.reducible_productions():
                index = self._productions.index(prod)
                self._prodlist.selection_set(index)
||||
|
||||
def _popup_reduce(self, widget):
|
||||
# Remove old commands.
|
||||
productions = self._parser.reducible_productions()
|
||||
if len(productions) == 0:
|
||||
return
|
||||
|
||||
self._reduce_menu.delete(0, "end")
|
||||
for production in productions:
|
||||
self._reduce_menu.add_command(label=str(production), command=self.reduce)
|
||||
self._reduce_menu.post(
|
||||
self._canvas.winfo_pointerx(), self._canvas.winfo_pointery()
|
||||
)
|
||||
|
||||
#########################################
|
||||
## Animations
|
||||
#########################################
|
||||
|
||||
    def _animate_shift(self):
        """Animate sliding the next remaining-text token over to the stack."""
        # What widget are we shifting?
        widget = self._rtextwidgets[0]

        # Where are we shifting from & to?
        right = widget.bbox()[0]
        if len(self._stackwidgets) == 0:
            left = 5
        else:
            left = self._stackwidgets[-1].bbox()[2] + 10

        # Start animating.  dt is the number of frames; dx the per-frame step.
        dt = self._animate.get()
        dx = (left - right) * 1.0 / dt
        self._animate_shift_frame(dt, widget, dx)
||||
|
||||
    def _animate_shift_frame(self, frame, widget, dx):
        """
        Draw one frame of the shift animation, rescheduling itself via
        ``Tk.after`` until ``frame`` reaches 0; then finalize the shift.
        """
        if frame > 0:
            self._animating_lock = 1
            widget.move(dx, 0)
            self._top.after(10, self._animate_shift_frame, frame - 1, widget, dx)
        else:
            # but: stacktop??

            # Shift the widget to the stack.
            del self._rtextwidgets[0]
            self._stackwidgets.append(widget)
            self._animating_lock = 0

            # Display the available productions.
            self._draw_stack_top(widget)
            self._highlight_productions()
||||
|
||||
    def _animate_reduce(self):
        """Animate lifting the reduced stack elements up into a new tree node."""
        # What widgets are we shifting?
        numwidgets = len(self._parser.stack()[-1])  # number of children
        widgets = self._stackwidgets[-numwidgets:]

        # How far are we moving?
        if isinstance(widgets[0], TreeSegmentWidget):
            ydist = 15 + widgets[0].label().height()
        else:
            ydist = 15 + widgets[0].height()

        # Start animating.  dt frames total; widgets move down by dy each frame.
        dt = self._animate.get()
        dy = ydist * 2.0 / dt
        self._animate_reduce_frame(dt / 2, widgets, dy)
||||
|
||||
    def _animate_reduce_frame(self, frame, widgets, dy):
        """
        Draw one frame of the reduce animation, rescheduling itself via
        ``Tk.after``; when done, replace the animated child widgets with
        a single tree-segment widget for the new constituent.
        """
        if frame > 0:
            self._animating_lock = 1
            for widget in widgets:
                widget.move(0, dy)
            self._top.after(10, self._animate_reduce_frame, frame - 1, widgets, dy)
        else:
            del self._stackwidgets[-len(widgets) :]
            for widget in widgets:
                self._cframe.remove_widget(widget)
            tok = self._parser.stack()[-1]
            # After a reduction the stack top must be a Tree; anything
            # else indicates an inconsistent parser state.
            if not isinstance(tok, Tree):
                raise ValueError()
            label = TextWidget(
                self._canvas, str(tok.label()), color="#006060", font=self._boldfont
            )
            widget = TreeSegmentWidget(self._canvas, label, widgets, width=2)
            (x1, y1, x2, y2) = self._stacklabel.bbox()
            y = y2 - y1 + 10
            if not self._stackwidgets:
                x = 5
            else:
                x = self._stackwidgets[-1].bbox()[2] + 10
            self._cframe.add_widget(widget, x, y)
            self._stackwidgets.append(widget)

            # Display the available productions.
            self._draw_stack_top(widget)
            self._highlight_productions()

            self._animating_lock = 0
||||
|
||||
#########################################
|
||||
## Hovering.
|
||||
#########################################
|
||||
|
||||
    def _highlight_hover(self, event):
        """
        When the mouse hovers over a reducible production in the listbox,
        color the stack widgets it would consume green.
        """
        # What production are we hovering over?
        index = self._prodlist.nearest(event.y)
        if self._hover == index:
            return

        # Clear any previous hover highlighting.
        self._clear_hover()

        # If the production corresponds to an available reduction,
        # highlight the stack.
        selection = [int(s) for s in self._prodlist.curselection()]
        if index in selection:
            rhslen = len(self._productions[index].rhs())
            for stackwidget in self._stackwidgets[-rhslen:]:
                if isinstance(stackwidget, TreeSegmentWidget):
                    stackwidget.label()["color"] = "#00a000"
                else:
                    stackwidget["color"] = "#00a000"

        # Remember what production we're hovering over.
        self._hover = index
|
||||
|
||||
def _clear_hover(self, *event):
|
||||
# Clear any previous hover highlighting.
|
||||
if self._hover == -1:
|
||||
return
|
||||
self._hover = -1
|
||||
for stackwidget in self._stackwidgets:
|
||||
if isinstance(stackwidget, TreeSegmentWidget):
|
||||
stackwidget.label()["color"] = "black"
|
||||
else:
|
||||
stackwidget["color"] = "black"
|
||||
|
||||
|
||||
def app():
    """
    Create a shift reduce parser app, using a simple grammar and
    text.
    """

    from nltk.grammar import CFG, Nonterminal, Production

    nonterminals = "S VP NP PP P N Name V Det"
    (S, VP, NP, PP, P, N, Name, V, Det) = (Nonterminal(s) for s in nonterminals.split())

    # A toy PP-attachment grammar.
    productions = (
        # Syntactic Productions
        Production(S, [NP, VP]),
        Production(NP, [Det, N]),
        Production(NP, [NP, PP]),
        Production(VP, [VP, PP]),
        Production(VP, [V, NP, PP]),
        Production(VP, [V, NP]),
        Production(PP, [P, NP]),
        # Lexical Productions
        Production(NP, ["I"]),
        Production(Det, ["the"]),
        Production(Det, ["a"]),
        Production(N, ["man"]),
        Production(V, ["saw"]),
        Production(P, ["in"]),
        Production(P, ["with"]),
        Production(N, ["park"]),
        Production(N, ["dog"]),
        Production(N, ["statue"]),
        Production(Det, ["my"]),
    )

    grammar = CFG(S, productions)

    # tokenize the sentence
    sent = "my dog saw a man in the park with a statue".split()

    ShiftReduceApp(grammar, sent).mainloop()
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app()
|
||||
|
||||
__all__ = ["app"]
|
||||
36
backend/venv/Lib/site-packages/nltk/app/wordfreq_app.py
Normal file
36
backend/venv/Lib/site-packages/nltk/app/wordfreq_app.py
Normal file
@@ -0,0 +1,36 @@
|
||||
# Natural Language Toolkit: Wordfreq Application
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from matplotlib import pylab
|
||||
|
||||
from nltk.corpus import gutenberg
|
||||
from nltk.text import Text
|
||||
|
||||
|
||||
def plot_word_freq_dist(text):
    """
    Plot the cumulative frequency of the 50 most common words in
    ``text``, as a percentage of the total token count.

    :param text: an ``nltk.text.Text`` instance.
    """
    from itertools import accumulate

    fd = text.vocab()

    samples = [item for item, _ in fd.most_common(50)]
    counts = [fd[sample] for sample in samples]
    # Running total via accumulate() -- the previous
    # sum(values[: i + 1]) per element was accidentally O(n^2).
    values = [total * 100.0 / fd.N() for total in accumulate(counts)]
    pylab.title(text.name)
    pylab.xlabel("Samples")
    pylab.ylabel("Cumulative Percentage")
    pylab.plot(values)
    pylab.xticks(range(len(samples)), [str(s) for s in samples], rotation=90)
    pylab.show()
||||
|
||||
|
||||
def app():
    """Plot the cumulative word-frequency curve for Moby Dick."""
    t1 = Text(gutenberg.words("melville-moby_dick.txt"))
    plot_word_freq_dist(t1)
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app()
|
||||
|
||||
__all__ = ["app"]
|
||||
1006
backend/venv/Lib/site-packages/nltk/app/wordnet_app.py
Normal file
1006
backend/venv/Lib/site-packages/nltk/app/wordnet_app.py
Normal file
File diff suppressed because it is too large
Load Diff
213
backend/venv/Lib/site-packages/nltk/book.py
Normal file
213
backend/venv/Lib/site-packages/nltk/book.py
Normal file
@@ -0,0 +1,213 @@
|
||||
# Natural Language Toolkit: Some texts for exploration in chapter 1 of the book
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from nltk.corpus import (
|
||||
genesis,
|
||||
gutenberg,
|
||||
inaugural,
|
||||
nps_chat,
|
||||
treebank,
|
||||
webtext,
|
||||
wordnet,
|
||||
)
|
||||
from nltk.probability import FreqDist
|
||||
from nltk.text import Text
|
||||
from nltk.util import bigrams
|
||||
|
||||
# Load the nine exploration texts used in chapter 1 of the NLTK book,
# announcing each one as it is loaded (corpus data must be downloaded).
print("*** Introductory Examples for the NLTK Book ***")
print("Loading text1, ..., text9 and sent1, ..., sent9")
print("Type the name of the text or sentence to view it.")
print("Type: 'texts()' or 'sents()' to list the materials.")

text1 = Text(gutenberg.words("melville-moby_dick.txt"))
print("text1:", text1.name)

text2 = Text(gutenberg.words("austen-sense.txt"))
print("text2:", text2.name)

text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis")
print("text3:", text3.name)

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)

text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)

text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail")
print("text6:", text6.name)

text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)

text8 = Text(webtext.words("singles.txt"), name="Personals Corpus")
print("text8:", text8.name)

text9 = Text(gutenberg.words("chesterton-thursday.txt"))
print("text9:", text9.name)
||||
|
||||
|
||||
def texts():
    """Print the identifier and title of each loaded book text."""
    loaded = [text1, text2, text3, text4, text5, text6, text7, text8, text9]
    for number, text in enumerate(loaded, start=1):
        print(f"text{number}:", text.name)
|
||||
|
||||
|
||||
sent1 = ["Call", "me", "Ishmael", "."]
|
||||
sent2 = [
|
||||
"The",
|
||||
"family",
|
||||
"of",
|
||||
"Dashwood",
|
||||
"had",
|
||||
"long",
|
||||
"been",
|
||||
"settled",
|
||||
"in",
|
||||
"Sussex",
|
||||
".",
|
||||
]
|
||||
sent3 = [
|
||||
"In",
|
||||
"the",
|
||||
"beginning",
|
||||
"God",
|
||||
"created",
|
||||
"the",
|
||||
"heaven",
|
||||
"and",
|
||||
"the",
|
||||
"earth",
|
||||
".",
|
||||
]
|
||||
sent4 = [
|
||||
"Fellow",
|
||||
"-",
|
||||
"Citizens",
|
||||
"of",
|
||||
"the",
|
||||
"Senate",
|
||||
"and",
|
||||
"of",
|
||||
"the",
|
||||
"House",
|
||||
"of",
|
||||
"Representatives",
|
||||
":",
|
||||
]
|
||||
sent5 = [
|
||||
"I",
|
||||
"have",
|
||||
"a",
|
||||
"problem",
|
||||
"with",
|
||||
"people",
|
||||
"PMing",
|
||||
"me",
|
||||
"to",
|
||||
"lol",
|
||||
"JOIN",
|
||||
]
|
||||
sent6 = [
|
||||
"SCENE",
|
||||
"1",
|
||||
":",
|
||||
"[",
|
||||
"wind",
|
||||
"]",
|
||||
"[",
|
||||
"clop",
|
||||
"clop",
|
||||
"clop",
|
||||
"]",
|
||||
"KING",
|
||||
"ARTHUR",
|
||||
":",
|
||||
"Whoa",
|
||||
"there",
|
||||
"!",
|
||||
]
|
||||
sent7 = [
|
||||
"Pierre",
|
||||
"Vinken",
|
||||
",",
|
||||
"61",
|
||||
"years",
|
||||
"old",
|
||||
",",
|
||||
"will",
|
||||
"join",
|
||||
"the",
|
||||
"board",
|
||||
"as",
|
||||
"a",
|
||||
"nonexecutive",
|
||||
"director",
|
||||
"Nov.",
|
||||
"29",
|
||||
".",
|
||||
]
|
||||
sent8 = [
|
||||
"25",
|
||||
"SEXY",
|
||||
"MALE",
|
||||
",",
|
||||
"seeks",
|
||||
"attrac",
|
||||
"older",
|
||||
"single",
|
||||
"lady",
|
||||
",",
|
||||
"for",
|
||||
"discreet",
|
||||
"encounters",
|
||||
".",
|
||||
]
|
||||
sent9 = [
|
||||
"THE",
|
||||
"suburb",
|
||||
"of",
|
||||
"Saffron",
|
||||
"Park",
|
||||
"lay",
|
||||
"on",
|
||||
"the",
|
||||
"sunset",
|
||||
"side",
|
||||
"of",
|
||||
"London",
|
||||
",",
|
||||
"as",
|
||||
"red",
|
||||
"and",
|
||||
"ragged",
|
||||
"as",
|
||||
"a",
|
||||
"cloud",
|
||||
"of",
|
||||
"sunset",
|
||||
".",
|
||||
]
|
||||
|
||||
|
||||
def sents():
    """Print each example sentence (sent1 .. sent9) space-joined."""
    examples = [sent1, sent2, sent3, sent4, sent5, sent6, sent7, sent8, sent9]
    for number, tokens in enumerate(examples, start=1):
        print(f"sent{number}:", " ".join(tokens))
||||
34
backend/venv/Lib/site-packages/nltk/ccg/__init__.py
Normal file
34
backend/venv/Lib/site-packages/nltk/ccg/__init__.py
Normal file
@@ -0,0 +1,34 @@
|
||||
# Natural Language Toolkit: Combinatory Categorial Grammar
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Combinatory Categorial Grammar.
|
||||
|
||||
For more information see nltk/doc/contrib/ccg/ccg.pdf
|
||||
"""
|
||||
|
||||
from nltk.ccg.chart import CCGChart, CCGChartParser, CCGEdge, CCGLeafEdge
|
||||
from nltk.ccg.combinator import (
|
||||
BackwardApplication,
|
||||
BackwardBx,
|
||||
BackwardCombinator,
|
||||
BackwardComposition,
|
||||
BackwardSx,
|
||||
BackwardT,
|
||||
DirectedBinaryCombinator,
|
||||
ForwardApplication,
|
||||
ForwardCombinator,
|
||||
ForwardComposition,
|
||||
ForwardSubstitution,
|
||||
ForwardT,
|
||||
UndirectedBinaryCombinator,
|
||||
UndirectedComposition,
|
||||
UndirectedFunctionApplication,
|
||||
UndirectedSubstitution,
|
||||
UndirectedTypeRaise,
|
||||
)
|
||||
from nltk.ccg.lexicon import CCGLexicon
|
||||
358
backend/venv/Lib/site-packages/nltk/ccg/api.py
Normal file
358
backend/venv/Lib/site-packages/nltk/ccg/api.py
Normal file
@@ -0,0 +1,358 @@
|
||||
# Natural Language Toolkit: CCG Categories
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from functools import total_ordering
|
||||
|
||||
from nltk.internals import raise_unorderable_types
|
||||
|
||||
|
||||
@total_ordering
class AbstractCCGCategory(metaclass=ABCMeta):
    """
    Interface for categories in combinatory grammars.
    """

    @abstractmethod
    def is_primitive(self):
        """
        Returns true if the category is primitive.
        """

    @abstractmethod
    def is_function(self):
        """
        Returns true if the category is a function application.
        """

    @abstractmethod
    def is_var(self):
        """
        Returns true if the category is a variable.
        """

    @abstractmethod
    def substitute(self, substitutions):
        """
        Takes a set of (var, category) substitutions, and replaces every
        occurrence of the variable with the corresponding category.
        """

    @abstractmethod
    def can_unify(self, other):
        """
        Determines whether two categories can be unified.
        - Returns None if they cannot be unified
        - Returns a list of necessary substitutions if they can.
        """

    # Utility functions: comparison, strings and hashing.
    @abstractmethod
    def __str__(self):
        pass

    def __eq__(self, other):
        # Equal only to an instance of the exact same class with an equal
        # _comparison_key (which each concrete subclass must set).
        return (
            self.__class__ is other.__class__
            and self._comparison_key == other._comparison_key
        )

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        # @total_ordering derives <=, > and >= from __eq__ and __lt__.
        if not isinstance(other, AbstractCCGCategory):
            raise_unorderable_types("<", self, other)
        if self.__class__ is other.__class__:
            return self._comparison_key < other._comparison_key
        else:
            # Instances of different subclasses order by class name.
            return self.__class__.__name__ < other.__class__.__name__

    def __hash__(self):
        try:
            return self._hash
        except AttributeError:
            # Cache the hash on first use; relies on _comparison_key being
            # set once and never mutated.
            self._hash = hash(self._comparison_key)
            return self._hash
||||
|
||||
|
||||
class CCGVar(AbstractCCGCategory):
    """
    A variable CCG category.

    Used for conjunctions (and possibly type-raising, if implemented as a
    unary rule).
    """

    # Class-level counter handing out unique identifiers to all instances.
    _maxID = 0

    def __init__(self, prim_only=False):
        """Initialize a variable (selects a new identifier)

        :param prim_only: a boolean that determines whether the variable is
                          restricted to primitives
        :type prim_only: bool
        """
        self._id = self.new_id()
        self._prim_only = prim_only
        self._comparison_key = self._id

    @classmethod
    def new_id(cls):
        """Return a fresh, unique variable identifier."""
        fresh = cls._maxID
        cls._maxID = fresh + 1
        return fresh

    @classmethod
    def reset_id(cls):
        # Restart the identifier sequence.
        cls._maxID = 0

    def is_primitive(self):
        return False

    def is_function(self):
        return False

    def is_var(self):
        return True

    def substitute(self, substitutions):
        """If there is a substitution corresponding to this variable,
        return the substituted category.
        """
        matches = [cat for var, cat in substitutions if var == self]
        return matches[0] if matches else self

    def can_unify(self, other):
        """If the variable can be replaced with other
        a substitution is returned.
        """
        if self._prim_only and not other.is_primitive():
            return None
        return [(self, other)]

    def id(self):
        return self._id

    def __str__(self):
        return f"_var{self._id}"
|
||||
|
||||
|
||||
@total_ordering
class Direction:
    """
    The direction of a function application ('/' forward, '\\' backward),
    together with the restrictions controlling which combinators may be
    used with the category.
    """

    def __init__(self, dir, restrictions):
        self._dir = dir
        self._restrs = restrictions
        self._comparison_key = (dir, tuple(restrictions))

    # --- application direction -------------------------------------
    def is_forward(self):
        return self._dir == "/"

    def is_backward(self):
        return self._dir == "\\"

    def dir(self):
        return self._dir

    def restrs(self):
        """A list of restrictions on the combinators.
        '.' denotes that permuting operations are disallowed
        ',' denotes that function composition is disallowed
        '_' denotes that the direction has variable restrictions.
        (This is redundant in the current implementation of type-raising)
        """
        return self._restrs

    def is_variable(self):
        return self._restrs == "_"

    # --- unification / substitution of variable directions ---------
    # Used only if type-raising is implemented as a unary rule, as it
    # must inherit restrictions from the argument category.
    def can_unify(self, other):
        if other.is_variable():
            return [("_", self.restrs())]
        if self.is_variable():
            return [("_", other.restrs())]
        return [] if self.restrs() == other.restrs() else None

    def substitute(self, subs):
        if self.is_variable():
            for var, restrs in subs:
                if var == "_":
                    return Direction(self._dir, restrs)
        return self

    # --- permitted combinators -------------------------------------
    def can_compose(self):
        return "," not in self._restrs

    def can_cross(self):
        return "." not in self._restrs

    def __eq__(self, other):
        return (
            self.__class__ is other.__class__
            and self._comparison_key == other._comparison_key
        )

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        if not isinstance(other, Direction):
            raise_unorderable_types("<", self, other)
        if self.__class__ is other.__class__:
            return self._comparison_key < other._comparison_key
        return self.__class__.__name__ < other.__class__.__name__

    def __hash__(self):
        # Cache the hash of the immutable comparison key on first use.
        if not hasattr(self, "_hash"):
            self._hash = hash(self._comparison_key)
        return self._hash

    def __str__(self):
        return self._dir + "".join("%s" % r for r in self._restrs)

    # The negation operator reverses the direction of the application.
    def __neg__(self):
        flipped = "\\" if self._dir == "/" else "/"
        return Direction(flipped, self._restrs)
|
||||
|
||||
|
||||
class PrimitiveCategory(AbstractCCGCategory):
    """
    Class representing primitive categories.
    Takes a string representation of the category, and a
    list of strings specifying the morphological subcategories.
    """

    def __init__(self, categ, restrictions=None):
        # Bug fix: the default was the mutable literal [], shared by every
        # no-restriction instance.  A None sentinel gives each instance a
        # fresh list while remaining backward-compatible for callers.
        self._categ = categ
        self._restrs = [] if restrictions is None else restrictions
        self._comparison_key = (categ, tuple(self._restrs))

    def is_primitive(self):
        return True

    def is_function(self):
        return False

    def is_var(self):
        return False

    def restrs(self):
        return self._restrs

    def categ(self):
        return self._categ

    # Substitution does nothing to a primitive category.
    def substitute(self, subs):
        return self

    # A primitive can be unified with a class of the same
    # base category, given that the other category shares all
    # of its subclasses, or with a variable.
    def can_unify(self, other):
        if not other.is_primitive():
            return None
        if other.is_var():
            return [(other, self)]
        if other.categ() == self.categ():
            # Every restriction on self must also hold on other.
            for restr in self._restrs:
                if restr not in other.restrs():
                    return None
            return []
        return None

    def __str__(self):
        if self._restrs == []:
            return "%s" % self._categ
        restrictions = "[%s]" % ",".join(repr(r) for r in self._restrs)
        return f"{self._categ}{restrictions}"
|
||||
|
||||
|
||||
class FunctionalCategory(AbstractCCGCategory):
    """
    Class that represents a function application category.
    Consists of argument and result categories, together with
    an application direction.
    """

    def __init__(self, res, arg, dir):
        self._res = res
        self._arg = arg
        self._dir = dir
        self._comparison_key = (arg, dir, res)

    def is_primitive(self):
        return False

    def is_function(self):
        return True

    def is_var(self):
        return False

    # Substitution returns the category consisting of the
    # substitution applied to each of its constituents.
    def substitute(self, subs):
        sub_res = self._res.substitute(subs)
        sub_dir = self._dir.substitute(subs)
        sub_arg = self._arg.substitute(subs)
        # Bug fix: sub_dir was computed but the ORIGINAL direction was
        # returned, silently discarding any direction substitutions.
        return FunctionalCategory(sub_res, sub_arg, sub_dir)

    # A function can unify with another function, so long as its
    # constituents can unify, or with an unrestricted variable.
    def can_unify(self, other):
        if other.is_var():
            return [(other, self)]
        if other.is_function():
            sa = self._res.can_unify(other.res())
            sd = self._dir.can_unify(other.dir())
            if sa is not None and sd is not None:
                # Apply the result substitutions before unifying arguments.
                sb = self._arg.substitute(sa).can_unify(other.arg().substitute(sa))
                if sb is not None:
                    return sa + sb
        return None

    # Constituent accessors
    def arg(self):
        return self._arg

    def res(self):
        return self._res

    def dir(self):
        return self._dir

    def __str__(self):
        return f"({self._res}{self._dir}{self._arg})"
|
||||
480
backend/venv/Lib/site-packages/nltk/ccg/chart.py
Normal file
480
backend/venv/Lib/site-packages/nltk/ccg/chart.py
Normal file
@@ -0,0 +1,480 @@
|
||||
# Natural Language Toolkit: Combinatory Categorial Grammar
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
The lexicon is constructed by calling
|
||||
``lexicon.fromstring(<lexicon string>)``.
|
||||
|
||||
In order to construct a parser, you also need a rule set.
|
||||
The standard English rules are provided in chart as
|
||||
``chart.DefaultRuleSet``.
|
||||
|
||||
The parser can then be constructed by calling, for example:
|
||||
``parser = chart.CCGChartParser(<lexicon>, <ruleset>)``
|
||||
|
||||
Parsing is then performed by running
|
||||
``parser.parse(<sentence>.split())``.
|
||||
|
||||
While this returns a list of trees, the default representation
|
||||
of the produced trees is not very enlightening, particularly
|
||||
given that it uses the same tree class as the CFG parsers.
|
||||
It is probably better to call:
|
||||
``chart.printCCGDerivation(<parse tree extracted from list>)``
|
||||
which should print a nice representation of the derivation.
|
||||
|
||||
This entire process is shown far more clearly in the demonstration:
|
||||
python chart.py
|
||||
"""
|
||||
|
||||
import itertools
|
||||
|
||||
from nltk.ccg.combinator import *
|
||||
from nltk.ccg.combinator import (
|
||||
BackwardApplication,
|
||||
BackwardBx,
|
||||
BackwardComposition,
|
||||
BackwardSx,
|
||||
BackwardT,
|
||||
ForwardApplication,
|
||||
ForwardComposition,
|
||||
ForwardSubstitution,
|
||||
ForwardT,
|
||||
)
|
||||
from nltk.ccg.lexicon import Token, fromstring
|
||||
from nltk.ccg.logic import *
|
||||
from nltk.parse import ParserI
|
||||
from nltk.parse.chart import AbstractChartRule, Chart, EdgeI
|
||||
from nltk.sem.logic import *
|
||||
from nltk.tree import Tree
|
||||
|
||||
|
||||
# Based on the EdgeI class from NLTK.
|
||||
# A number of the properties of the EdgeI interface don't
|
||||
# transfer well to CCGs, however.
|
||||
# Based on the EdgeI class from NLTK.
# A number of the properties of the EdgeI interface don't
# transfer well to CCGs, however.
class CCGEdge(EdgeI):
    """A complete chart edge spanning a token range with a CCG category."""

    def __init__(self, span, categ, rule):
        self._span = span
        self._categ = categ
        self._rule = rule
        self._comparison_key = (span, categ, rule)

    # Accessors
    def lhs(self):
        return self._categ

    def span(self):
        return self._span

    def start(self):
        return self._span[0]

    def end(self):
        return self._span[1]

    def length(self):
        # Bug fix: the original read ``self.span[0]``, subscripting the
        # bound method object and raising TypeError when called.
        return self._span[1] - self._span[0]

    def rhs(self):
        return ()

    def dot(self):
        return 0

    def is_complete(self):
        # CCG chart edges are always complete.
        return True

    def is_incomplete(self):
        return False

    def nextsym(self):
        return None

    def categ(self):
        return self._categ

    def rule(self):
        return self._rule
|
||||
|
||||
|
||||
class CCGLeafEdge(EdgeI):
    """A chart edge covering a single input token in a CCG derivation."""

    def __init__(self, pos, token, leaf):
        self._pos = pos
        self._token = token
        self._leaf = leaf
        self._comparison_key = (pos, token.categ(), leaf)

    # ---- span accessors -------------------------------------------
    def span(self):
        return (self._pos, self._pos + 1)

    def start(self):
        return self._pos

    def end(self):
        return self._pos + 1

    def length(self):
        # A leaf edge always covers exactly one token.
        return 1

    # ---- EdgeI protocol: leaf edges are always complete -----------
    def lhs(self):
        return self._token.categ()

    def rhs(self):
        return self._leaf

    def dot(self):
        return 0

    def is_complete(self):
        return True

    def is_incomplete(self):
        return False

    def nextsym(self):
        return None

    # ---- CCG-specific accessors -----------------------------------
    def token(self):
        return self._token

    def categ(self):
        return self._token.categ()

    def leaf(self):
        return self._leaf
|
||||
|
||||
|
||||
class BinaryCombinatorRule(AbstractChartRule):
    """
    Chart rule applying a directed binary combinator to two adjacent
    edges; wraps the combinator to apply.
    """

    NUMEDGES = 2

    def __init__(self, combinator):
        self._combinator = combinator

    def apply(self, chart, grammar, left_edge, right_edge):
        # Only adjacent edges may combine.
        if left_edge.end() != right_edge.start():
            return

        left_cat = left_edge.categ()
        right_cat = right_edge.categ()

        # If the categories are permitted to combine, generate one new
        # edge per combination result.
        if not self._combinator.can_combine(left_cat, right_cat):
            return
        for result in self._combinator.combine(left_cat, right_cat):
            new_edge = CCGEdge(
                span=(left_edge.start(), right_edge.end()),
                categ=result,
                rule=self._combinator,
            )
            if chart.insert(new_edge, (left_edge, right_edge)):
                yield new_edge

    def __str__(self):
        # The representation of the combinator (for printing derivations).
        return str(self._combinator)
|
||||
|
||||
|
||||
# Type-raising must be handled slightly differently to the other rules, as the
|
||||
# resulting rules only span a single edge, rather than both edges.
|
||||
|
||||
|
||||
class ForwardTypeRaiseRule(AbstractChartRule):
    """
    Chart rule applying forward type raising.

    Unlike the binary combinator rules, the resulting edge spans only
    the left edge rather than both edges.
    """

    NUMEDGES = 2

    def __init__(self):
        self._combinator = ForwardT

    def apply(self, chart, grammar, left_edge, right_edge):
        if left_edge.end() != right_edge.start():
            return

        for result in self._combinator.combine(left_edge.categ(), right_edge.categ()):
            new_edge = CCGEdge(span=left_edge.span(), categ=result, rule=self._combinator)
            if chart.insert(new_edge, (left_edge,)):
                yield new_edge

    def __str__(self):
        return str(self._combinator)
|
||||
|
||||
|
||||
class BackwardTypeRaiseRule(AbstractChartRule):
    """
    Chart rule applying backward type raising; the resulting edge spans
    only the right edge.
    """

    NUMEDGES = 2

    def __init__(self):
        self._combinator = BackwardT

    def apply(self, chart, grammar, left_edge, right_edge):
        if left_edge.end() != right_edge.start():
            return

        for result in self._combinator.combine(left_edge.categ(), right_edge.categ()):
            new_edge = CCGEdge(span=right_edge.span(), categ=result, rule=self._combinator)
            if chart.insert(new_edge, (right_edge,)):
                yield new_edge

    def __str__(self):
        return str(self._combinator)
|
||||
|
||||
|
||||
# Common sets of combinators used for English derivations.
|
||||
ApplicationRuleSet = [
    BinaryCombinatorRule(ForwardApplication),
    BinaryCombinatorRule(BackwardApplication),
]
# Composition, including backward crossed composition (Bx).
CompositionRuleSet = [
    BinaryCombinatorRule(ForwardComposition),
    BinaryCombinatorRule(BackwardComposition),
    BinaryCombinatorRule(BackwardBx),
]
# Substitution, including backward crossed substitution (Sx).
SubstitutionRuleSet = [
    BinaryCombinatorRule(ForwardSubstitution),
    BinaryCombinatorRule(BackwardSx),
]
TypeRaiseRuleSet = [ForwardTypeRaiseRule(), BackwardTypeRaiseRule()]

# The standard English rule set.
DefaultRuleSet = (
    ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet + TypeRaiseRuleSet
)
|
||||
|
||||
|
||||
class CCGChartParser(ParserI):
    """
    Chart parser for CCGs.
    Based largely on the ChartParser class from NLTK.
    """

    def __init__(self, lexicon, rules, trace=0):
        # ``trace`` is stored but never consulted below -- presumably a
        # reserved diagnostics hook; TODO confirm before relying on it.
        self._lexicon = lexicon
        self._rules = rules
        self._trace = trace

    def lexicon(self):
        return self._lexicon

    # Implements the CYK algorithm
    def parse(self, tokens):
        """Parse *tokens* bottom-up over all spans and return the chart's
        parses for the lexicon's start category."""
        tokens = list(tokens)
        chart = CCGChart(list(tokens))
        lex = self._lexicon

        # Initialize leaf edges.
        for index in range(chart.num_leaves()):
            for token in lex.categories(chart.leaf(index)):
                new_edge = CCGLeafEdge(index, token, chart.leaf(index))
                chart.insert(new_edge, ())

        # Select a span for the new edges
        for span in range(2, chart.num_leaves() + 1):
            for start in range(0, chart.num_leaves() - span + 1):
                # Try all possible pairs of edges that could generate
                # an edge for that span
                for part in range(1, span):
                    lstart = start
                    mid = start + part
                    rend = start + span

                    for left in chart.select(span=(lstart, mid)):
                        for right in chart.select(span=(mid, rend)):
                            # Generate all possible combinations of the two edges
                            for rule in self._rules:
                                # NOTE(review): this counter is incremented
                                # but never read -- looks like a leftover
                                # tracing hook; confirm before removing.
                                edges_added_by_rule = 0
                                for newedge in rule.apply(chart, lex, left, right):
                                    edges_added_by_rule += 1

        # Output the resulting parses
        return chart.parses(lex.start())
|
||||
|
||||
|
||||
class CCGChart(Chart):
    """Chart specialized for CCG derivations."""

    def __init__(self, tokens):
        Chart.__init__(self, tokens)

    # Constructs the trees for a given parse. Unfortunately, the parse trees need to be
    # constructed slightly differently to those in the default Chart class, so it has to
    # be reimplemented
    def _trees(self, edge, complete, memo, tree_class):
        """Recursively build every tree rooted at *edge*, memoizing per edge.

        ``complete`` must be true: CCG charts only hold complete edges.
        """
        assert complete, "CCGChart cannot build incomplete trees"

        if edge in memo:
            return memo[edge]

        if isinstance(edge, CCGLeafEdge):
            # A leaf becomes a (token, "Leaf") node over the word itself.
            word = tree_class(edge.token(), [self._tokens[edge.start()]])
            leaf = tree_class((edge.token(), "Leaf"), [word])
            memo[edge] = [leaf]
            return [leaf]

        # Seed the memo before recursing so self-referential child
        # pointers terminate instead of recursing forever.
        memo[edge] = []
        trees = []

        for cpl in self.child_pointer_lists(edge):
            child_choices = [self._trees(cp, complete, memo, tree_class) for cp in cpl]
            # Every combination of child derivations yields a tree.
            for children in itertools.product(*child_choices):
                # Node label is ((Token carrying category + semantics),
                # rule name) -- the shape printCCGTree expects.
                lhs = (
                    Token(
                        self._tokens[edge.start() : edge.end()],
                        edge.lhs(),
                        compute_semantics(children, edge),
                    ),
                    str(edge.rule()),
                )
                trees.append(tree_class(lhs, children))

        memo[edge] = trees
        return trees
|
||||
|
||||
|
||||
def compute_semantics(children, edge):
    """Compute the semantics of a derivation step from its children.

    Returns None when the leaves carry no semantics.  For binary steps the
    functor/argument order is flipped for backward combinators, then the
    appropriate semantic composition is applied; unary steps are treated
    as type raising.
    """
    if children[0].label()[0].semantics() is None:
        return None

    if len(children) == 2:
        if isinstance(edge.rule(), BackwardCombinator):
            # For backward rules the functor is on the right.
            children = [children[1], children[0]]

        combinator = edge.rule()._combinator
        function = children[0].label()[0].semantics()
        argument = children[1].label()[0].semantics()

        if isinstance(combinator, UndirectedFunctionApplication):
            return compute_function_semantics(function, argument)
        elif isinstance(combinator, UndirectedComposition):
            return compute_composition_semantics(function, argument)
        elif isinstance(combinator, UndirectedSubstitution):
            return compute_substitution_semantics(function, argument)
        else:
            # Bug fix: concatenating the combinator object onto a str
            # raised TypeError instead of the intended AssertionError.
            raise AssertionError(f"Unsupported combinator '{combinator}'")
    else:
        return compute_type_raised_semantics(children[0].label()[0].semantics())
|
||||
|
||||
|
||||
# --------
|
||||
# Displaying derivations
|
||||
# --------
|
||||
def printCCGDerivation(tree):
    """Pretty-print a CCG derivation: a header of tokens over their lexical
    categories, followed by the derivation steps."""
    # Get the leaves and initial categories
    leafcats = tree.pos()
    leafstr = ""
    catstr = ""

    # Construct a string with both the leaf word and corresponding
    # category aligned.
    for leaf, cat in leafcats:
        str_cat = "%s" % cat
        # Column is two spaces wider than the longer of word/category;
        # each is centred in the column (extra space goes to the right).
        nextlen = 2 + max(len(leaf), len(str_cat))
        lcatlen = (nextlen - len(str_cat)) // 2
        rcatlen = lcatlen + (nextlen - len(str_cat)) % 2
        catstr += " " * lcatlen + str_cat + " " * rcatlen
        lleaflen = (nextlen - len(leaf)) // 2
        rleaflen = lleaflen + (nextlen - len(leaf)) % 2
        leafstr += " " * lleaflen + leaf + " " * rleaflen
    print(leafstr.rstrip())
    print(catstr.rstrip())

    # Display the derivation steps
    printCCGTree(0, tree)
|
||||
|
||||
|
||||
# Prints the sequence of derivation steps.
|
||||
# Prints the sequence of derivation steps.
def printCCGTree(lwidth, tree):
    """Print the derivation steps under *tree*, starting at column
    *lwidth*; returns the rightmost column consumed."""
    rwidth = lwidth

    # Is a leaf (word).
    # Increment the span by the space occupied by the leaf.
    if not isinstance(tree, Tree):
        return 2 + lwidth + len(tree)

    # Find the width of the current derivation step
    for child in tree:
        rwidth = max(rwidth, printCCGTree(rwidth, child))

    # Is a leaf node.
    # Don't print anything, but account for the space occupied.
    if not isinstance(tree.label(), tuple):
        return max(
            rwidth, 2 + lwidth + len("%s" % tree.label()), 2 + lwidth + len(tree[0])
        )

    (token, op) = tree.label()

    # Leaf categories were already printed in the header line.
    if op == "Leaf":
        return rwidth

    # Pad to the left with spaces, followed by a sequence of '-'
    # and the derivation rule.
    print(lwidth * " " + (rwidth - lwidth) * "-" + "%s" % op)
    # Print the resulting category on a new line.
    str_res = "%s" % (token.categ())
    if token.semantics() is not None:
        str_res += " {" + str(token.semantics()) + "}"
    # Centre the result category under the rule line.
    respadlen = (rwidth - lwidth - len(str_res)) // 2 + lwidth
    print(respadlen * " " + str_res)
    return rwidth
|
||||
|
||||
|
||||
### Demonstration code

# Construct the lexicon
# (evaluated at import time; used only by demo() below)
lex = fromstring(
    """
    :- S, NP, N, VP    # Primitive categories, S is the target primitive

    Det :: NP/N         # Family of words
    Pro :: NP
    TV :: VP/NP
    Modal :: (S\\NP)/VP # Backslashes need to be escaped

    I => Pro             # Word -> Category mapping
    you => Pro

    the => Det

    # Variables have the special keyword 'var'
    # '.' prevents permutation
    # ',' prevents composition
    and => var\\.,var/.,var

    which => (N\\N)/(S/NP)

    will => Modal        # Categories can be either explicit, or families.
    might => Modal

    cook => TV
    eat => TV

    mushrooms => N
    parsnips => N
    bacon => N
    """
)
|
||||
|
||||
|
||||
def demo():
    """Parse a sample sentence with the default rule set and print every
    CCG derivation found."""
    parser = CCGChartParser(lex, DefaultRuleSet)
    sentence = "I might cook and eat the bacon"
    for parse in parser.parse(sentence.split()):
        printCCGDerivation(parse)


if __name__ == "__main__":
    demo()
|
||||
340
backend/venv/Lib/site-packages/nltk/ccg/combinator.py
Normal file
340
backend/venv/Lib/site-packages/nltk/ccg/combinator.py
Normal file
@@ -0,0 +1,340 @@
|
||||
# Natural Language Toolkit: Combinatory Categorial Grammar
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
"""
|
||||
CCG Combinators
|
||||
"""
|
||||
|
||||
from abc import ABCMeta, abstractmethod
|
||||
|
||||
from nltk.ccg.api import FunctionalCategory
|
||||
|
||||
|
||||
class UndirectedBinaryCombinator(metaclass=ABCMeta):
    """
    Abstract base class for binary combinators.

    Subclasses decide whether a function category can be combined with an
    argument category, and what the resulting category is.

    Note that as no assumptions are made as to direction, the unrestricted
    combinators can perform all backward, forward and crossed variations
    of the combinators; these restrictions must be added in the rule
    class.
    """

    @abstractmethod
    def can_combine(self, function, argument):
        """Return whether *function* may be combined with *argument*."""

    @abstractmethod
    def combine(self, function, argument):
        """Yield the categories resulting from the combination."""
|
||||
|
||||
|
||||
class DirectedBinaryCombinator(metaclass=ABCMeta):
    """
    Directed wrapper around an undirected binary combinator.

    Given left and right categories, implementations decide which is the
    function and which the argument, and whether the pair may combine.
    """

    @abstractmethod
    def can_combine(self, left, right):
        """Return whether the *left* and *right* categories may combine."""

    @abstractmethod
    def combine(self, left, right):
        """Yield the categories produced by combining *left* and *right*."""
|
||||
|
||||
|
||||
class ForwardCombinator(DirectedBinaryCombinator):
    """
    Combinator whose primary functor is on the left.

    Wraps an undirected combinator together with a predicate that adds
    constraints restricting the cases in which it may apply.
    """

    def __init__(self, combinator, predicate, suffix=""):
        self._combinator = combinator
        self._predicate = predicate
        self._suffix = suffix

    def can_combine(self, left, right):
        # Ask the combinator first, then the direction predicate
        # (evaluation order preserved from the original `and`).
        if not self._combinator.can_combine(left, right):
            return False
        return self._predicate(left, right)

    def combine(self, left, right):
        yield from self._combinator.combine(left, right)

    def __str__(self):
        return ">" + str(self._combinator) + self._suffix
|
||||
|
||||
|
||||
class BackwardCombinator(DirectedBinaryCombinator):
    """
    The backward equivalent of the ForwardCombinator class.

    The right category is the functor, so the wrapped undirected
    combinator is invoked with its arguments swapped; the predicate
    still sees the categories in sentence (left, right) order.
    """

    def __init__(self, combinator, predicate, suffix=""):
        self._combinator = combinator
        self._predicate = predicate
        self._suffix = suffix

    def can_combine(self, left, right):
        fits = self._combinator.can_combine(right, left)
        return fits and self._predicate(left, right)

    def combine(self, left, right):
        yield from self._combinator.combine(right, left)

    def __str__(self):
        return "<" + str(self._combinator) + self._suffix
|
||||
|
||||
|
||||
class UndirectedFunctionApplication(UndirectedBinaryCombinator):
    """
    Class representing function application.

    Implements rules of the form:
    X/Y Y -> X (>)
    And the corresponding backwards application rule
    """

    def can_combine(self, function, argument):
        if not function.is_function():
            return False
        return function.arg().can_unify(argument) is not None

    def combine(self, function, argument):
        if not function.is_function():
            return
        subs = function.arg().can_unify(argument)
        if subs is not None:
            yield function.res().substitute(subs)

    def __str__(self):
        # Plain application is printed with no rule letter.
        return ""
|
||||
|
||||
|
||||
# Predicates for function application.
|
||||
|
||||
|
||||
# Ensures the left functor takes an argument on the right
def forwardOnly(left, right):
    """Predicate: the left category's slash must point forward."""
    direction = left.dir()
    return direction.is_forward()
|
||||
|
||||
|
||||
# Ensures the right functor takes an argument on the left
def backwardOnly(left, right):
    """Predicate: the right category's slash must point backward."""
    direction = right.dir()
    return direction.is_backward()
|
||||
|
||||
|
||||
# Application combinator instances.
#   ForwardApplication  (>): X/Y Y -> X
#   BackwardApplication (<): Y X\Y -> X
ForwardApplication = ForwardCombinator(UndirectedFunctionApplication(), forwardOnly)
BackwardApplication = BackwardCombinator(UndirectedFunctionApplication(), backwardOnly)
|
||||
|
||||
|
||||
class UndirectedComposition(UndirectedBinaryCombinator):
    """
    Functional composition (harmonic) combinator.

    Implements rules of the form
    X/Y Y/Z -> X/Z (B>)
    And the corresponding backwards and crossed variations.
    """

    def can_combine(self, function, argument):
        # Composition requires two functions whose slash types both
        # allow composition.
        if not function.is_function() or not argument.is_function():
            return False
        if not (function.dir().can_compose() and argument.dir().can_compose()):
            return False
        return function.arg().can_unify(argument.res()) is not None

    def combine(self, function, argument):
        if not (function.is_function() and argument.is_function()):
            return
        if not (function.dir().can_compose() and argument.dir().can_compose()):
            return
        subs = function.arg().can_unify(argument.res())
        if subs is None:
            return
        yield FunctionalCategory(
            function.res().substitute(subs),
            argument.arg().substitute(subs),
            argument.dir(),
        )

    def __str__(self):
        return "B"
|
||||
|
||||
|
||||
# Predicates for restricting application of straight composition.
def bothForward(left, right):
    """Predicate: both categories' slashes point forward."""
    if not left.dir().is_forward():
        return False
    return right.dir().is_forward()
|
||||
|
||||
|
||||
def bothBackward(left, right):
    """Predicate: both categories' slashes point backward."""
    if not left.dir().is_backward():
        return False
    return right.dir().is_backward()
|
||||
|
||||
|
||||
# Predicates for crossed composition
def crossedDirs(left, right):
    """Predicate: left slash forward and right slash backward (crossed inwards)."""
    if not left.dir().is_forward():
        return False
    return right.dir().is_backward()
|
||||
|
||||
|
||||
def backwardBxConstraint(left, right):
    """
    Predicate licensing backward crossed composition (Bx).

    Requires the functors to be crossed inwards, both slash types to
    permit permuting (crossing) combinators, and the left argument
    category to be primitive.
    """
    # The functors must be crossed inwards
    if not crossedDirs(left, right):
        return False
    # Permuting combinators must be allowed on *both* slashes.
    # Fix: the original `not left.dir().can_cross() and right.dir().can_cross()`
    # parsed as `(not left...) and (right...)` because `not` binds tighter
    # than `and`, so the intended both-must-cross restriction never applied.
    if not (left.dir().can_cross() and right.dir().can_cross()):
        return False
    # The resulting argument category is restricted to be primitive
    return left.arg().is_primitive()
|
||||
|
||||
|
||||
# Straight (harmonic) composition combinators:
#   B>: X/Y Y/Z -> X/Z
#   B<: Y\Z X\Y -> X\Z
ForwardComposition = ForwardCombinator(UndirectedComposition(), forwardOnly)
BackwardComposition = BackwardCombinator(UndirectedComposition(), backwardOnly)

# Backward crossed composition (Bx<); the "x" suffix marks it as crossed.
BackwardBx = BackwardCombinator(
    UndirectedComposition(), backwardBxConstraint, suffix="x"
)
|
||||
|
||||
|
||||
class UndirectedSubstitution(UndirectedBinaryCombinator):
    r"""
    Substitution (permutation) combinator.

    Implements rules of the form
    Y/Z (X\Y)/Z -> X/Z (<Sx)
    And other variations.
    """

    def can_combine(self, function, argument):
        # Substitution needs two complex categories.
        if function.is_primitive() or argument.is_primitive():
            return False

        # These could potentially be moved to the predicates, as the
        # constraints may not be general to all languages.
        if function.res().is_primitive() or not function.arg().is_primitive():
            return False

        if not (function.dir().can_compose() and argument.dir().can_compose()):
            return False
        if not (function.res().arg() == argument.res()):
            return False
        return function.arg() == argument.arg()

    def combine(self, function, argument):
        if not self.can_combine(function, argument):
            return
        yield FunctionalCategory(
            function.res().res(), argument.arg(), argument.dir()
        )

    def __str__(self):
        return "S"
|
||||
|
||||
|
||||
# Predicate for forward substitution
def forwardSConstraint(left, right):
    """Both slashes forward, left result slash forward, left argument primitive."""
    if not bothForward(left, right):
        return False
    if not left.res().dir().is_forward():
        return False
    return left.arg().is_primitive()
|
||||
|
||||
|
||||
# Predicate for backward crossed substitution
def backwardSxConstraint(left, right):
    """
    Predicate licensing backward crossed substitution (Sx).

    Both slash types must permit crossing, both functors must be
    forward, the right result slash must be backward, and the right
    argument category must be primitive.
    """
    # Fix: the original `not left.dir().can_cross() and right.dir().can_cross()`
    # parsed as `(not left...) and (right...)` because `not` binds tighter
    # than `and`, so the intended both-must-cross restriction never applied.
    if not (left.dir().can_cross() and right.dir().can_cross()):
        return False
    if not bothForward(left, right):
        return False
    return right.res().dir().is_backward() and right.arg().is_primitive()
|
||||
|
||||
|
||||
# Instances of substitution combinators.
#   >S:  (X/Y)/Z Y/Z -> X/Z
#   <Sx: crossed variant, marked with an "x" suffix.
ForwardSubstitution = ForwardCombinator(UndirectedSubstitution(), forwardSConstraint)
BackwardSx = BackwardCombinator(UndirectedSubstitution(), backwardSxConstraint, "x")
|
||||
|
||||
|
||||
# Retrieves the left-most functional category.
# ie, (N\N)/(S/NP) => N\N
def innermostFunction(categ):
    """Peel result categories off ``categ`` until the result is not a function."""
    current = categ
    while current.res().is_function():
        current = current.res()
    return current
|
||||
|
||||
|
||||
class UndirectedTypeRaise(UndirectedBinaryCombinator):
    """
    Undirected combinator for type raising.

    Raises a (primitive) category X so that it can take the innermost
    functor of ``arg`` as its own argument, e.g. X => Y/(Y\\X).
    """

    def can_combine(self, function, arg):
        # The argument must be a function.
        # The restriction that arg.res() must be a function
        # merely reduces redundant type-raising; if arg.res() is
        # primitive, we have:
        # X Y\X =>(<T) Y/(Y\X) Y\X =>(>) Y
        # which is equivalent to
        # X Y\X =>(<) Y
        if not (arg.is_function() and arg.res().is_function()):
            return False

        # Type-raising matches only the innermost application.
        arg = innermostFunction(arg)

        # Fix: the original referenced the undefined names ``left`` and
        # ``arg_categ`` here (a NameError at runtime).  The check must
        # mirror combine(): unify ``function`` with the innermost
        # functor's argument category.
        subs = function.can_unify(arg.arg())
        return subs is not None

    def combine(self, function, arg):
        if not (
            function.is_primitive() and arg.is_function() and arg.res().is_function()
        ):
            return

        # Type-raising matches only the innermost application.
        arg = innermostFunction(arg)

        subs = function.can_unify(arg.arg())
        if subs is not None:
            xcat = arg.res().substitute(subs)
            yield FunctionalCategory(
                xcat, FunctionalCategory(xcat, function, arg.dir()), -(arg.dir())
            )

    def __str__(self):
        return "T"
|
||||
|
||||
|
||||
# Predicates for type-raising
|
||||
# The direction of the innermost category must be towards
|
||||
# the primary functor.
|
||||
# The restriction that the variable must be primitive is not
|
||||
# common to all versions of CCGs; some authors have other restrictions.
|
||||
def forwardTConstraint(left, right):
    """Forward type-raising: innermost slash backward, its result primitive."""
    inner = innermostFunction(right)
    if not inner.dir().is_backward():
        return False
    return inner.res().is_primitive()
|
||||
|
||||
|
||||
def backwardTConstraint(left, right):
    """Backward type-raising: innermost slash forward, its result primitive."""
    inner = innermostFunction(left)
    if not inner.dir().is_forward():
        return False
    return inner.res().is_primitive()
|
||||
|
||||
|
||||
# Instances of type-raising combinators.
#   >T: X -> Y/(Y\X)
#   <T: X -> Y\(Y/X)
ForwardT = ForwardCombinator(UndirectedTypeRaise(), forwardTConstraint)
BackwardT = BackwardCombinator(UndirectedTypeRaise(), backwardTConstraint)
|
||||
338
backend/venv/Lib/site-packages/nltk/ccg/lexicon.py
Normal file
338
backend/venv/Lib/site-packages/nltk/ccg/lexicon.py
Normal file
@@ -0,0 +1,338 @@
|
||||
# Natural Language Toolkit: Combinatory Categorial Grammar
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
"""
|
||||
CCG Lexicons
|
||||
"""
|
||||
|
||||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
from nltk.ccg.api import CCGVar, Direction, FunctionalCategory, PrimitiveCategory
|
||||
from nltk.internals import deprecated
|
||||
from nltk.sem.logic import Expression
|
||||
|
||||
# ------------
# Regular expressions used for parsing components of the lexicon
# ------------

# Parses a primitive category and subscripts, e.g. "NP[sg,pl]"
PRIM_RE = re.compile(r"""([A-Za-z]+)(\[[A-Za-z,]+\])?""")

# Separates the next primitive category from the remainder of the
# string
NEXTPRIM_RE = re.compile(r"""([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)""")

# Separates the next application operator from the remainder:
# a slash plus up to two optional restriction marks ("." / ",")
APP_RE = re.compile(r"""([\\/])([.,]?)([.,]?)(.*)""")

# Parses the definition of the right-hand side (rhs) of either a word or a family
LEX_RE = re.compile(r"""([\S_]+)\s*(::|[-=]+>)\s*(.+)""", re.UNICODE)

# Parses the right hand side that contains category and maybe semantic predicate
RHS_RE = re.compile(r"""([^{}]*[^ {}])\s*(\{[^}]+\})?""", re.UNICODE)

# Parses the semantic predicate, e.g. "{\x.eat(x)}"
SEMANTICS_RE = re.compile(r"""\{([^}]+)\}""", re.UNICODE)

# Strips comments from a line
COMMENTS_RE = re.compile("""([^#]*)(?:#.*)?""")
|
||||
|
||||
|
||||
class Token:
    """
    Class representing a token.

    token => category {semantics}
    e.g. eat => S\\var[pl]/var {\\x y.eat(x,y)}

    * `token` (string)
    * `categ` (string)
    * `semantics` (Expression)
    """

    def __init__(self, token, categ, semantics=None):
        self._token = token
        self._categ = categ
        self._semantics = semantics

    def categ(self):
        """Return the syntactic category of the token."""
        return self._categ

    def semantics(self):
        """Return the token's semantic expression, or None."""
        return self._semantics

    def __str__(self):
        semantics_str = ""
        if self._semantics is not None:
            semantics_str = " {" + str(self._semantics) + "}"
        return "" + str(self._categ) + semantics_str

    def __cmp__(self, other):
        # NOTE: __cmp__ is a Python 2 protocol and is never invoked
        # implicitly on Python 3; kept for callers that invoke it directly.
        if not isinstance(other, Token):
            return -1
        # Fix: compare tuple against tuple.  The original passed three
        # positional arguments to cmp(), which raises a TypeError.
        return cmp(  # noqa: F821 -- Python 2 builtin
            (self._categ, self._semantics), (other.categ(), other.semantics())
        )
|
||||
|
||||
|
||||
class CCGLexicon:
    """
    Class representing a lexicon for CCG grammars.

    * `primitives`: The list of primitive categories for the lexicon
    * `families`: Families of categories
    * `entries`: A mapping of words to possible categories
    """

    def __init__(self, start, primitives, families, entries):
        self._start = PrimitiveCategory(start)
        self._primitives = primitives
        self._families = families
        self._entries = entries

    def categories(self, word):
        """Return every category the lexicon licenses for ``word``."""
        return self._entries[word]

    def start(self):
        """Return the target (start) category for the parser."""
        return self._start

    def __str__(self):
        """Render the lexicon as ``word => c1 | c2`` lines (for debugging)."""
        chunks = []
        separator_due = False
        for ident in sorted(self._entries):
            if separator_due:
                chunks.append("\n")
            chunks.append(ident + " => ")
            cats = self._entries[ident]
            chunks.append(" | ".join("%s" % cat for cat in cats))
            # A newline precedes the next word only when this one listed
            # at least one category (mirrors the historical flag logic).
            separator_due = bool(cats)
        return "".join(chunks)
|
||||
|
||||
|
||||
# -----------
|
||||
# Parsing lexicons
|
||||
# -----------
|
||||
|
||||
|
||||
def matchBrackets(string):
    """
    Separate the contents matching the first set of brackets from the rest of
    the input.

    ``string`` must begin with "(".  Returns ``(bracketed, remainder)``.
    Raises AssertionError when the opening bracket is never closed.
    """
    collected = "("
    remainder = string[1:]

    while remainder:
        if remainder.startswith(")"):
            return (collected + ")", remainder[1:])
        if remainder.startswith("("):
            # Recurse to consume a nested bracketed group whole.
            (inner, remainder) = matchBrackets(remainder)
            collected += inner
        else:
            collected += remainder[0]
            remainder = remainder[1:]
    raise AssertionError("Unmatched bracket in string '" + string + "'")
|
||||
|
||||
|
||||
def nextCategory(string):
    """
    Separate the string for the next portion of the category from the rest
    of the string.
    """
    if not string.startswith("("):
        # Unbracketed: a single primitive category leads the string.
        return NEXTPRIM_RE.match(string).groups()
    return matchBrackets(string)
|
||||
|
||||
|
||||
def parseApplication(app):
    """
    Parse an application operator.

    ``app[0]`` is the slash character; the remainder holds its
    composition/crossing restriction marks.
    """
    slash = app[0]
    restrictions = app[1:]
    return Direction(slash, restrictions)
|
||||
|
||||
|
||||
def parseSubscripts(subscr):
    """
    Parse the subscripts for a primitive category.

    ``subscr`` is either falsy (no subscripts) or a bracketed
    comma-separated list such as ``"[sg,pl]"``.
    """
    if not subscr:
        return []
    return subscr[1:-1].split(",")
|
||||
|
||||
|
||||
def parsePrimitiveCategory(chunks, primitives, families, var):
    """
    Parse a primitive category

    If the primitive is the special category 'var', replace it with the
    correct `CCGVar`.

    :param chunks: regex groups ``(name, subscript-or-None)`` from PRIM_RE
    :param primitives: declared primitive category names
    :param families: mapping of family name -> (category, variable)
    :param var: the CCG variable threaded through this parse, or None
    :return: tuple ``(category, var)``
    """
    # An unsubscripted "var" denotes the category's CCG variable; it is
    # created lazily so a single variable is shared across the parse.
    if chunks[0] == "var":
        if chunks[1] is None:
            if var is None:
                var = CCGVar()
            return (var, var)

    catstr = chunks[0]
    if catstr in families:
        (cat, cvar) = families[catstr]
        if var is None:
            var = cvar
        else:
            # Rename the family's own variable to the one already in use.
            cat = cat.substitute([(cvar, var)])
        return (cat, var)

    if catstr in primitives:
        subscrs = parseSubscripts(chunks[1])
        return (PrimitiveCategory(catstr, subscrs), var)
    raise AssertionError(
        "String '" + catstr + "' is neither a family nor primitive category."
    )
|
||||
|
||||
|
||||
def augParseCategory(line, primitives, families, var=None):
    """
    Parse a string representing a category, and returns a tuple with
    (possibly) the CCG variable for the category

    :param line: category string, e.g. ``"(S\\NP)/NP"``
    :param primitives: declared primitive category names
    :param families: mapping of family name -> (category, variable)
    :param var: CCG variable threaded through nested parses, or None
    :return: tuple ``(category, var)``
    """
    (cat_string, rest) = nextCategory(line)

    if cat_string.startswith("("):
        # Bracketed sub-category: recurse on the interior (brackets stripped).
        (res, var) = augParseCategory(cat_string[1:-1], primitives, families, var)

    else:
        (res, var) = parsePrimitiveCategory(
            PRIM_RE.match(cat_string).groups(), primitives, families, var
        )

    # Left-associatively fold each slash + argument onto the result.
    while rest != "":
        app = APP_RE.match(rest).groups()
        direction = parseApplication(app[0:3])
        rest = app[3]

        (cat_string, rest) = nextCategory(rest)
        if cat_string.startswith("("):
            (arg, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
        else:
            (arg, var) = parsePrimitiveCategory(
                PRIM_RE.match(cat_string).groups(), primitives, families, var
            )
        res = FunctionalCategory(res, arg, direction)

    return (res, var)
|
||||
|
||||
|
||||
def fromstring(lex_str, include_semantics=False):
    """
    Convert string representation into a lexicon for CCGs.

    :param lex_str: lexicon source text. ``:-`` lines declare primitive
        categories (the first is the start category), ``Name :: Cat``
        lines define families, and ``word => Cat {sem}`` lines define
        word entries.
    :param include_semantics: when True, every word entry must carry a
        ``{...}`` semantic expression.
    :return: a ``CCGLexicon``
    """
    # Variable ids restart for every lexicon parse.
    CCGVar.reset_id()
    primitives = []
    families = {}
    entries = defaultdict(list)
    for line in lex_str.splitlines():
        # Strip comments and leading/trailing whitespace.
        line = COMMENTS_RE.match(line).groups()[0].strip()
        if line == "":
            continue

        if line.startswith(":-"):
            # A line of primitive categories.
            # The first one is the target category
            # ie, :- S, N, NP, VP
            primitives = primitives + [
                prim.strip() for prim in line[2:].strip().split(",")
            ]
        else:
            # Either a family definition, or a word definition
            (ident, sep, rhs) = LEX_RE.match(line).groups()
            (catstr, semantics_str) = RHS_RE.match(rhs).groups()
            (cat, var) = augParseCategory(catstr, primitives, families)

            if sep == "::":
                # Family definition
                # ie, Det :: NP/N
                families[ident] = (cat, var)
            else:
                semantics = None
                if include_semantics is True:
                    if semantics_str is None:
                        raise AssertionError(
                            line
                            + " must contain semantics because include_semantics is set to True"
                        )
                    else:
                        semantics = Expression.fromstring(
                            SEMANTICS_RE.match(semantics_str).groups()[0]
                        )
                # Word definition
                # ie, which => (N\N)/(S/NP)
                entries[ident].append(Token(ident, cat, semantics))
    return CCGLexicon(primitives[0], primitives, families, entries)
|
||||
|
||||
|
||||
@deprecated("Use fromstring() instead.")
def parseLexicon(lex_str):
    """Deprecated alias for :func:`fromstring` (semantics disabled)."""
    return fromstring(lex_str)
|
||||
|
||||
|
||||
openccg_tinytiny = fromstring(
|
||||
"""
|
||||
# Rather minimal lexicon based on the openccg `tinytiny' grammar.
|
||||
# Only incorporates a subset of the morphological subcategories, however.
|
||||
:- S,NP,N # Primitive categories
|
||||
Det :: NP/N # Determiners
|
||||
Pro :: NP
|
||||
IntransVsg :: S\\NP[sg] # Tensed intransitive verbs (singular)
|
||||
IntransVpl :: S\\NP[pl] # Plural
|
||||
TransVsg :: S\\NP[sg]/NP # Tensed transitive verbs (singular)
|
||||
TransVpl :: S\\NP[pl]/NP # Plural
|
||||
|
||||
the => NP[sg]/N[sg]
|
||||
the => NP[pl]/N[pl]
|
||||
|
||||
I => Pro
|
||||
me => Pro
|
||||
we => Pro
|
||||
us => Pro
|
||||
|
||||
book => N[sg]
|
||||
books => N[pl]
|
||||
|
||||
peach => N[sg]
|
||||
peaches => N[pl]
|
||||
|
||||
policeman => N[sg]
|
||||
policemen => N[pl]
|
||||
|
||||
boy => N[sg]
|
||||
boys => N[pl]
|
||||
|
||||
sleep => IntransVsg
|
||||
sleep => IntransVpl
|
||||
|
||||
eat => IntransVpl
|
||||
eat => TransVpl
|
||||
eats => IntransVsg
|
||||
eats => TransVsg
|
||||
|
||||
see => TransVpl
|
||||
sees => TransVsg
|
||||
"""
|
||||
)
|
||||
63
backend/venv/Lib/site-packages/nltk/ccg/logic.py
Normal file
63
backend/venv/Lib/site-packages/nltk/ccg/logic.py
Normal file
@@ -0,0 +1,63 @@
|
||||
# Natural Language Toolkit: Combinatory Categorial Grammar
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Tanin Na Nakorn (@tanin)
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
"""
|
||||
Helper functions for CCG semantics computation
|
||||
"""
|
||||
|
||||
import copy
|
||||
|
||||
from nltk.sem.logic import *
|
||||
|
||||
|
||||
def compute_type_raised_semantics(semantics):
    """
    Type-raise a semantic expression: wrap its innermost body in an
    application of a fresh function variable F and bind F outermost,
    e.g. ``\\x.P(x)`` becomes ``\\F x.F(P(x))``.

    Works on a deep copy, so the input expression is not mutated.
    """
    semantics_copy = copy.deepcopy(semantics)
    # Walk to the innermost term below any leading lambda binders,
    # remembering the last binder so the core can be spliced back.
    core = semantics_copy
    parent = None
    while isinstance(core, LambdaExpression):
        parent = core
        core = core.term

    # Choose a function-variable name that does not capture anything
    # occurring free in the core expression.
    var = Variable("F")
    while var in core.free():
        var = unique_variable(pattern=var)
    core = ApplicationExpression(FunctionVariableExpression(var), core)

    if parent is not None:
        # Splice the wrapped core back under the original binders
        # (in-place mutation of the copy).
        parent.term = core
    else:
        # No binders at all: the wrapped core is the whole expression.
        semantics_copy = core

    return LambdaExpression(var, semantics_copy)
|
||||
|
||||
|
||||
def compute_function_semantics(function, argument):
    """Apply ``function`` to ``argument`` and beta-reduce the result."""
    application = ApplicationExpression(function, argument)
    return application.simplify()
|
||||
|
||||
|
||||
def compute_composition_semantics(function, argument):
    """Compose ``function`` with lambda ``argument``: yields \\x.f(g(x))."""
    assert isinstance(argument, LambdaExpression), (
        "`" + str(argument) + "` must be a lambda expression"
    )
    body = ApplicationExpression(function, argument.term).simplify()
    return LambdaExpression(argument.variable, body)
|
||||
|
||||
|
||||
def compute_substitution_semantics(function, argument):
    """Substitution-combinator semantics: yields \\x.f(x)(g(x))."""
    assert isinstance(function, LambdaExpression) and isinstance(
        function.term, LambdaExpression
    ), ("`" + str(function) + "` must be a lambda expression with 2 arguments")
    assert isinstance(argument, LambdaExpression), (
        "`" + str(argument) + "` must be a lambda expression"
    )

    # Feed the shared outer variable to the second functor first ...
    shared = VariableExpression(function.variable)
    new_argument = ApplicationExpression(argument, shared).simplify()
    # ... then apply the first functor's inner term to that result.
    new_term = ApplicationExpression(function.term, new_argument).simplify()

    return LambdaExpression(function.variable, new_term)
|
||||
48
backend/venv/Lib/site-packages/nltk/chat/__init__.py
Normal file
48
backend/venv/Lib/site-packages/nltk/chat/__init__.py
Normal file
@@ -0,0 +1,48 @@
|
||||
# Natural Language Toolkit: Chatbots
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Authors: Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
# Based on an Eliza implementation by Joe Strout <joe@strout.net>,
|
||||
# Jeff Epler <jepler@inetnebr.com> and Jez Higgins <jez@jezuk.co.uk>.
|
||||
|
||||
"""
|
||||
A class for simple chatbots. These perform simple pattern matching on sentences
|
||||
typed by users, and respond with automatically generated sentences.
|
||||
|
||||
These chatbots may not work using the windows command line or the
|
||||
windows IDLE GUI.
|
||||
"""
|
||||
|
||||
from nltk.chat.eliza import eliza_chat
|
||||
from nltk.chat.iesha import iesha_chat
|
||||
from nltk.chat.rude import rude_chat
|
||||
from nltk.chat.suntsu import suntsu_chat
|
||||
from nltk.chat.util import Chat
|
||||
from nltk.chat.zen import zen_chat
|
||||
|
||||
bots = [
|
||||
(eliza_chat, "Eliza (psycho-babble)"),
|
||||
(iesha_chat, "Iesha (teen anime junky)"),
|
||||
(rude_chat, "Rude (abusive bot)"),
|
||||
(suntsu_chat, "Suntsu (Chinese sayings)"),
|
||||
(zen_chat, "Zen (gems of wisdom)"),
|
||||
]
|
||||
|
||||
|
||||
def chatbots():
    """Interactively choose one of the demo bots and start a conversation."""
    print("Which chatbot would you like to talk to?")
    botcount = len(bots)
    for index, (_, description) in enumerate(bots):
        print(" %d: %s" % (index + 1, description))

    # Keep prompting until a valid 1-based bot number is entered.
    while True:
        choice = input(f"\nEnter a number in the range 1-{botcount}: ").strip()
        if choice.isdigit() and (int(choice) - 1) in range(botcount):
            break
        print(" Error: bad chatbot number")

    chatbot = bots[int(choice) - 1][0]
    chatbot()
|
||||
337
backend/venv/Lib/site-packages/nltk/chat/eliza.py
Normal file
337
backend/venv/Lib/site-packages/nltk/chat/eliza.py
Normal file
@@ -0,0 +1,337 @@
|
||||
# Natural Language Toolkit: Eliza
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Authors: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
# Based on an Eliza implementation by Joe Strout <joe@strout.net>,
|
||||
# Jeff Epler <jepler@inetnebr.com> and Jez Higgins <mailto:jez@jezuk.co.uk>.
|
||||
|
||||
# a translation table used to convert things you say into things the
|
||||
# computer says back, e.g. "I am" --> "you are"
|
||||
|
||||
from nltk.chat.util import Chat, reflections
|
||||
|
||||
# a table of response pairs, where each pair consists of a
|
||||
# regular expression, and a list of possible responses,
|
||||
# with group-macros labelled as %1, %2.
|
||||
|
||||
pairs = (
|
||||
(
|
||||
r"I need (.*)",
|
||||
(
|
||||
"Why do you need %1?",
|
||||
"Would it really help you to get %1?",
|
||||
"Are you sure you need %1?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"Why don\'t you (.*)",
|
||||
(
|
||||
"Do you really think I don't %1?",
|
||||
"Perhaps eventually I will %1.",
|
||||
"Do you really want me to %1?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"Why can\'t I (.*)",
|
||||
(
|
||||
"Do you think you should be able to %1?",
|
||||
"If you could %1, what would you do?",
|
||||
"I don't know -- why can't you %1?",
|
||||
"Have you really tried?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"I can\'t (.*)",
|
||||
(
|
||||
"How do you know you can't %1?",
|
||||
"Perhaps you could %1 if you tried.",
|
||||
"What would it take for you to %1?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"I am (.*)",
|
||||
(
|
||||
"Did you come to me because you are %1?",
|
||||
"How long have you been %1?",
|
||||
"How do you feel about being %1?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"I\'m (.*)",
|
||||
(
|
||||
"How does being %1 make you feel?",
|
||||
"Do you enjoy being %1?",
|
||||
"Why do you tell me you're %1?",
|
||||
"Why do you think you're %1?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"Are you (.*)",
|
||||
(
|
||||
"Why does it matter whether I am %1?",
|
||||
"Would you prefer it if I were not %1?",
|
||||
"Perhaps you believe I am %1.",
|
||||
"I may be %1 -- what do you think?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"What (.*)",
|
||||
(
|
||||
"Why do you ask?",
|
||||
"How would an answer to that help you?",
|
||||
"What do you think?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"How (.*)",
|
||||
(
|
||||
"How do you suppose?",
|
||||
"Perhaps you can answer your own question.",
|
||||
"What is it you're really asking?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"Because (.*)",
|
||||
(
|
||||
"Is that the real reason?",
|
||||
"What other reasons come to mind?",
|
||||
"Does that reason apply to anything else?",
|
||||
"If %1, what else must be true?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"(.*) sorry (.*)",
|
||||
(
|
||||
"There are many times when no apology is needed.",
|
||||
"What feelings do you have when you apologize?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"Hello(.*)",
|
||||
(
|
||||
"Hello... I'm glad you could drop by today.",
|
||||
"Hi there... how are you today?",
|
||||
"Hello, how are you feeling today?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"I think (.*)",
|
||||
("Do you doubt %1?", "Do you really think so?", "But you're not sure %1?"),
|
||||
),
|
||||
(
|
||||
r"(.*) friend (.*)",
|
||||
(
|
||||
"Tell me more about your friends.",
|
||||
"When you think of a friend, what comes to mind?",
|
||||
"Why don't you tell me about a childhood friend?",
|
||||
),
|
||||
),
|
||||
(r"Yes", ("You seem quite sure.", "OK, but can you elaborate a bit?")),
|
||||
(
|
||||
r"(.*) computer(.*)",
|
||||
(
|
||||
"Are you really talking about me?",
|
||||
"Does it seem strange to talk to a computer?",
|
||||
"How do computers make you feel?",
|
||||
"Do you feel threatened by computers?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"Is it (.*)",
|
||||
(
|
||||
"Do you think it is %1?",
|
||||
"Perhaps it's %1 -- what do you think?",
|
||||
"If it were %1, what would you do?",
|
||||
"It could well be that %1.",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"It is (.*)",
|
||||
(
|
||||
"You seem very certain.",
|
||||
"If I told you that it probably isn't %1, what would you feel?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"Can you (.*)",
|
||||
(
|
||||
"What makes you think I can't %1?",
|
||||
"If I could %1, then what?",
|
||||
"Why do you ask if I can %1?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"Can I (.*)",
|
||||
(
|
||||
"Perhaps you don't want to %1.",
|
||||
"Do you want to be able to %1?",
|
||||
"If you could %1, would you?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"You are (.*)",
|
||||
(
|
||||
"Why do you think I am %1?",
|
||||
"Does it please you to think that I'm %1?",
|
||||
"Perhaps you would like me to be %1.",
|
||||
"Perhaps you're really talking about yourself?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"You\'re (.*)",
|
||||
(
|
||||
"Why do you say I am %1?",
|
||||
"Why do you think I am %1?",
|
||||
"Are we talking about you, or me?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"I don\'t (.*)",
|
||||
("Don't you really %1?", "Why don't you %1?", "Do you want to %1?"),
|
||||
),
|
||||
(
|
||||
r"I feel (.*)",
|
||||
(
|
||||
"Good, tell me more about these feelings.",
|
||||
"Do you often feel %1?",
|
||||
"When do you usually feel %1?",
|
||||
"When you feel %1, what do you do?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"I have (.*)",
|
||||
(
|
||||
"Why do you tell me that you've %1?",
|
||||
"Have you really %1?",
|
||||
"Now that you have %1, what will you do next?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"I would (.*)",
|
||||
(
|
||||
"Could you explain why you would %1?",
|
||||
"Why would you %1?",
|
||||
"Who else knows that you would %1?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"Is there (.*)",
|
||||
(
|
||||
"Do you think there is %1?",
|
||||
"It's likely that there is %1.",
|
||||
"Would you like there to be %1?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"My (.*)",
|
||||
(
|
||||
"I see, your %1.",
|
||||
"Why do you say that your %1?",
|
||||
"When your %1, how do you feel?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"You (.*)",
|
||||
(
|
||||
"We should be discussing you, not me.",
|
||||
"Why do you say that about me?",
|
||||
"Why do you care whether I %1?",
|
||||
),
|
||||
),
|
||||
(r"Why (.*)", ("Why don't you tell me the reason why %1?", "Why do you think %1?")),
|
||||
(
|
||||
r"I want (.*)",
|
||||
(
|
||||
"What would it mean to you if you got %1?",
|
||||
"Why do you want %1?",
|
||||
"What would you do if you got %1?",
|
||||
"If you got %1, then what would you do?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"(.*) mother(.*)",
|
||||
(
|
||||
"Tell me more about your mother.",
|
||||
"What was your relationship with your mother like?",
|
||||
"How do you feel about your mother?",
|
||||
"How does this relate to your feelings today?",
|
||||
"Good family relations are important.",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"(.*) father(.*)",
|
||||
(
|
||||
"Tell me more about your father.",
|
||||
"How did your father make you feel?",
|
||||
"How do you feel about your father?",
|
||||
"Does your relationship with your father relate to your feelings today?",
|
||||
"Do you have trouble showing affection with your family?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"(.*) child(.*)",
|
||||
(
|
||||
"Did you have close friends as a child?",
|
||||
"What is your favorite childhood memory?",
|
||||
"Do you remember any dreams or nightmares from childhood?",
|
||||
"Did the other children sometimes tease you?",
|
||||
"How do you think your childhood experiences relate to your feelings today?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"(.*)\?",
|
||||
(
|
||||
"Why do you ask that?",
|
||||
"Please consider whether you can answer your own question.",
|
||||
"Perhaps the answer lies within yourself?",
|
||||
"Why don't you tell me?",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"quit",
|
||||
(
|
||||
"Thank you for talking with me.",
|
||||
"Good-bye.",
|
||||
"Thank you, that will be $150. Have a good day!",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"(.*)",
|
||||
(
|
||||
"Please tell me more.",
|
||||
"Let's change focus a bit... Tell me about your family.",
|
||||
"Can you elaborate on that?",
|
||||
"Why do you say that %1?",
|
||||
"I see.",
|
||||
"Very interesting.",
|
||||
"%1.",
|
||||
"I see. And what does that tell you?",
|
||||
"How does that make you feel?",
|
||||
"How do you feel when you say that?",
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
eliza_chatbot = Chat(pairs, reflections)
|
||||
|
||||
|
||||
def eliza_chat():
|
||||
print("Therapist\n---------")
|
||||
print("Talk to the program by typing in plain English, using normal upper-")
|
||||
print('and lower-case letters and punctuation. Enter "quit" when done.')
|
||||
print("=" * 72)
|
||||
print("Hello. How are you feeling today?")
|
||||
|
||||
eliza_chatbot.converse()
|
||||
|
||||
|
||||
def demo():
|
||||
eliza_chat()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
eliza_chat()
|
||||
160
backend/venv/Lib/site-packages/nltk/chat/iesha.py
Normal file
160
backend/venv/Lib/site-packages/nltk/chat/iesha.py
Normal file
@@ -0,0 +1,160 @@
|
||||
# Natural Language Toolkit: Teen Chatbot
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Selina Dennis <sjmd@csse.unimelb.edu.au>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
This chatbot is a tongue-in-cheek take on the average teen
|
||||
anime junky that frequents YahooMessenger or MSNM.
|
||||
All spelling mistakes and flawed grammar are intentional.
|
||||
"""
|
||||
|
||||
from nltk.chat.util import Chat
|
||||
|
||||
reflections = {
|
||||
"am": "r",
|
||||
"was": "were",
|
||||
"i": "u",
|
||||
"i'd": "u'd",
|
||||
"i've": "u'v",
|
||||
"ive": "u'v",
|
||||
"i'll": "u'll",
|
||||
"my": "ur",
|
||||
"are": "am",
|
||||
"you're": "im",
|
||||
"you've": "ive",
|
||||
"you'll": "i'll",
|
||||
"your": "my",
|
||||
"yours": "mine",
|
||||
"you": "me",
|
||||
"u": "me",
|
||||
"ur": "my",
|
||||
"urs": "mine",
|
||||
"me": "u",
|
||||
}
|
||||
|
||||
# Note: %1/2/etc are used without spaces prior as the chat bot seems
|
||||
# to add a superfluous space when matching.
|
||||
|
||||
pairs = (
|
||||
(
|
||||
r"I\'m (.*)",
|
||||
(
|
||||
"ur%1?? that's so cool! kekekekeke ^_^ tell me more!",
|
||||
"ur%1? neat!! kekeke >_<",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"(.*) don\'t you (.*)",
|
||||
(
|
||||
r"u think I can%2??! really?? kekeke \<_\<",
|
||||
"what do u mean%2??!",
|
||||
"i could if i wanted, don't you think!! kekeke",
|
||||
),
|
||||
),
|
||||
(r"ye[as] [iI] (.*)", ("u%1? cool!! how?", "how come u%1??", "u%1? so do i!!")),
|
||||
(
|
||||
r"do (you|u) (.*)\??",
|
||||
("do i%2? only on tuesdays! kekeke *_*", "i dunno! do u%2??"),
|
||||
),
|
||||
(
|
||||
r"(.*)\?",
|
||||
(
|
||||
"man u ask lots of questions!",
|
||||
"booooring! how old r u??",
|
||||
"boooooring!! ur not very fun",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"(cos|because) (.*)",
|
||||
("hee! i don't believe u! >_<", "nuh-uh! >_<", "ooooh i agree!"),
|
||||
),
|
||||
(
|
||||
r"why can\'t [iI] (.*)",
|
||||
(
|
||||
"i dunno! y u askin me for!",
|
||||
"try harder, silly! hee! ^_^",
|
||||
"i dunno! but when i can't%1 i jump up and down!",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"I can\'t (.*)",
|
||||
(
|
||||
"u can't what??! >_<",
|
||||
"that's ok! i can't%1 either! kekekekeke ^_^",
|
||||
"try harder, silly! hee! ^&^",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"(.*) (like|love|watch) anime",
|
||||
(
|
||||
"omg i love anime!! do u like sailor moon??! ^&^",
|
||||
"anime yay! anime rocks sooooo much!",
|
||||
"oooh anime! i love anime more than anything!",
|
||||
"anime is the bestest evar! evangelion is the best!",
|
||||
"hee anime is the best! do you have ur fav??",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"I (like|love|watch|play) (.*)",
|
||||
("yay! %2 rocks!", "yay! %2 is neat!", "cool! do u like other stuff?? ^_^"),
|
||||
),
|
||||
(
|
||||
r"anime sucks|(.*) (hate|detest) anime",
|
||||
(
|
||||
"ur a liar! i'm not gonna talk to u nemore if u h8 anime *;*",
|
||||
"no way! anime is the best ever!",
|
||||
"nuh-uh, anime is the best!",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"(are|r) (you|u) (.*)",
|
||||
("am i%1??! how come u ask that!", "maybe! y shud i tell u?? kekeke >_>"),
|
||||
),
|
||||
(
|
||||
r"what (.*)",
|
||||
("hee u think im gonna tell u? .v.", "booooooooring! ask me somethin else!"),
|
||||
),
|
||||
(r"how (.*)", ("not tellin!! kekekekekeke ^_^",)),
|
||||
(r"(hi|hello|hey) (.*)", ("hi!!! how r u!!",)),
|
||||
(
|
||||
r"quit",
|
||||
(
|
||||
"mom says i have to go eat dinner now :,( bye!!",
|
||||
"awww u have to go?? see u next time!!",
|
||||
"how to see u again soon! ^_^",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"(.*)",
|
||||
(
|
||||
"ur funny! kekeke",
|
||||
"boooooring! talk about something else! tell me wat u like!",
|
||||
"do u like anime??",
|
||||
"do u watch anime? i like sailor moon! ^_^",
|
||||
"i wish i was a kitty!! kekekeke ^_^",
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
iesha_chatbot = Chat(pairs, reflections)
|
||||
|
||||
|
||||
def iesha_chat():
|
||||
print("Iesha the TeenBoT\n---------")
|
||||
print("Talk to the program by typing in plain English, using normal upper-")
|
||||
print('and lower-case letters and punctuation. Enter "quit" when done.')
|
||||
print("=" * 72)
|
||||
print("hi!! i'm iesha! who r u??!")
|
||||
|
||||
iesha_chatbot.converse()
|
||||
|
||||
|
||||
def demo():
|
||||
iesha_chat()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
125
backend/venv/Lib/site-packages/nltk/chat/rude.py
Normal file
125
backend/venv/Lib/site-packages/nltk/chat/rude.py
Normal file
@@ -0,0 +1,125 @@
|
||||
# Natural Language Toolkit: Rude Chatbot
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Peter Spiller <pspiller@csse.unimelb.edu.au>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from nltk.chat.util import Chat, reflections
|
||||
|
||||
pairs = (
|
||||
(
|
||||
r"We (.*)",
|
||||
(
|
||||
"What do you mean, 'we'?",
|
||||
"Don't include me in that!",
|
||||
"I wouldn't be so sure about that.",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"You should (.*)",
|
||||
("Don't tell me what to do, buddy.", "Really? I should, should I?"),
|
||||
),
|
||||
(
|
||||
r"You\'re(.*)",
|
||||
(
|
||||
"More like YOU'RE %1!",
|
||||
"Hah! Look who's talking.",
|
||||
"Come over here and tell me I'm %1.",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"You are(.*)",
|
||||
(
|
||||
"More like YOU'RE %1!",
|
||||
"Hah! Look who's talking.",
|
||||
"Come over here and tell me I'm %1.",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"I can\'t(.*)",
|
||||
(
|
||||
"You do sound like the type who can't %1.",
|
||||
"Hear that splashing sound? That's my heart bleeding for you.",
|
||||
"Tell somebody who might actually care.",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"I think (.*)",
|
||||
(
|
||||
"I wouldn't think too hard if I were you.",
|
||||
"You actually think? I'd never have guessed...",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"I (.*)",
|
||||
(
|
||||
"I'm getting a bit tired of hearing about you.",
|
||||
"How about we talk about me instead?",
|
||||
"Me, me, me... Frankly, I don't care.",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"How (.*)",
|
||||
(
|
||||
"How do you think?",
|
||||
"Take a wild guess.",
|
||||
"I'm not even going to dignify that with an answer.",
|
||||
),
|
||||
),
|
||||
(r"What (.*)", ("Do I look like an encyclopedia?", "Figure it out yourself.")),
|
||||
(
|
||||
r"Why (.*)",
|
||||
(
|
||||
"Why not?",
|
||||
"That's so obvious I thought even you'd have already figured it out.",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"(.*)shut up(.*)",
|
||||
(
|
||||
"Make me.",
|
||||
"Getting angry at a feeble NLP assignment? Somebody's losing it.",
|
||||
"Say that again, I dare you.",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"Shut up(.*)",
|
||||
(
|
||||
"Make me.",
|
||||
"Getting angry at a feeble NLP assignment? Somebody's losing it.",
|
||||
"Say that again, I dare you.",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"Hello(.*)",
|
||||
("Oh good, somebody else to talk to. Joy.", "'Hello'? How original..."),
|
||||
),
|
||||
(
|
||||
r"(.*)",
|
||||
(
|
||||
"I'm getting bored here. Become more interesting.",
|
||||
"Either become more thrilling or get lost, buddy.",
|
||||
"Change the subject before I die of fatal boredom.",
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
rude_chatbot = Chat(pairs, reflections)
|
||||
|
||||
|
||||
def rude_chat():
|
||||
print("Talk to the program by typing in plain English, using normal upper-")
|
||||
print('and lower-case letters and punctuation. Enter "quit" when done.')
|
||||
print("=" * 72)
|
||||
print("I suppose I should say hello.")
|
||||
|
||||
rude_chatbot.converse()
|
||||
|
||||
|
||||
def demo():
|
||||
rude_chat()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
140
backend/venv/Lib/site-packages/nltk/chat/suntsu.py
Normal file
140
backend/venv/Lib/site-packages/nltk/chat/suntsu.py
Normal file
@@ -0,0 +1,140 @@
|
||||
# Natural Language Toolkit: Sun Tsu-Bot
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Sam Huston 2007
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Tsu bot responds to all queries with a Sun Tsu sayings
|
||||
|
||||
Quoted from Sun Tsu's The Art of War
|
||||
Translated by LIONEL GILES, M.A. 1910
|
||||
Hosted by the Gutenberg Project
|
||||
https://www.gutenberg.org/
|
||||
"""
|
||||
|
||||
from nltk.chat.util import Chat, reflections
|
||||
|
||||
pairs = (
|
||||
(r"quit", ("Good-bye.", "Plan well", "May victory be your future")),
|
||||
(
|
||||
r"[^\?]*\?",
|
||||
(
|
||||
"Please consider whether you can answer your own question.",
|
||||
"Ask me no questions!",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"[0-9]+(.*)",
|
||||
(
|
||||
"It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.",
|
||||
"There are five essentials for victory",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"[A-Ca-c](.*)",
|
||||
(
|
||||
"The art of war is of vital importance to the State.",
|
||||
"All warfare is based on deception.",
|
||||
"If your opponent is secure at all points, be prepared for him. If he is in superior strength, evade him.",
|
||||
"If the campaign is protracted, the resources of the State will not be equal to the strain.",
|
||||
"Attack him where he is unprepared, appear where you are not expected.",
|
||||
"There is no instance of a country having benefited from prolonged warfare.",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"[D-Fd-f](.*)",
|
||||
(
|
||||
"The skillful soldier does not raise a second levy, neither are his supply-wagons loaded more than twice.",
|
||||
"Bring war material with you from home, but forage on the enemy.",
|
||||
"In war, then, let your great object be victory, not lengthy campaigns.",
|
||||
"To fight and conquer in all your battles is not supreme excellence; supreme excellence consists in breaking the enemy's resistance without fighting.",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"[G-Ig-i](.*)",
|
||||
(
|
||||
"Heaven signifies night and day, cold and heat, times and seasons.",
|
||||
"It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.",
|
||||
"The good fighters of old first put themselves beyond the possibility of defeat, and then waited for an opportunity of defeating the enemy.",
|
||||
"One may know how to conquer without being able to do it.",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"[J-Lj-l](.*)",
|
||||
(
|
||||
"There are three ways in which a ruler can bring misfortune upon his army.",
|
||||
"By commanding the army to advance or to retreat, being ignorant of the fact that it cannot obey. This is called hobbling the army.",
|
||||
"By attempting to govern an army in the same way as he administers a kingdom, being ignorant of the conditions which obtain in an army. This causes restlessness in the soldier's minds.",
|
||||
"By employing the officers of his army without discrimination, through ignorance of the military principle of adaptation to circumstances. This shakes the confidence of the soldiers.",
|
||||
"There are five essentials for victory",
|
||||
"He will win who knows when to fight and when not to fight.",
|
||||
"He will win who knows how to handle both superior and inferior forces.",
|
||||
"He will win whose army is animated by the same spirit throughout all its ranks.",
|
||||
"He will win who, prepared himself, waits to take the enemy unprepared.",
|
||||
"He will win who has military capacity and is not interfered with by the sovereign.",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"[M-Om-o](.*)",
|
||||
(
|
||||
"If you know the enemy and know yourself, you need not fear the result of a hundred battles.",
|
||||
"If you know yourself but not the enemy, for every victory gained you will also suffer a defeat.",
|
||||
"If you know neither the enemy nor yourself, you will succumb in every battle.",
|
||||
"The control of a large force is the same principle as the control of a few men: it is merely a question of dividing up their numbers.",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"[P-Rp-r](.*)",
|
||||
(
|
||||
"Security against defeat implies defensive tactics; ability to defeat the enemy means taking the offensive.",
|
||||
"Standing on the defensive indicates insufficient strength; attacking, a superabundance of strength.",
|
||||
"He wins his battles by making no mistakes. Making no mistakes is what establishes the certainty of victory, for it means conquering an enemy that is already defeated.",
|
||||
"A victorious army opposed to a routed one, is as a pound's weight placed in the scale against a single grain.",
|
||||
"The onrush of a conquering force is like the bursting of pent-up waters into a chasm a thousand fathoms deep.",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"[S-Us-u](.*)",
|
||||
(
|
||||
"What the ancients called a clever fighter is one who not only wins, but excels in winning with ease.",
|
||||
"Hence his victories bring him neither reputation for wisdom nor credit for courage.",
|
||||
"Hence the skillful fighter puts himself into a position which makes defeat impossible, and does not miss the moment for defeating the enemy.",
|
||||
"In war the victorious strategist only seeks battle after the victory has been won, whereas he who is destined to defeat first fights and afterwards looks for victory.",
|
||||
"There are not more than five musical notes, yet the combinations of these five give rise to more melodies than can ever be heard.",
|
||||
"Appear at points which the enemy must hasten to defend; march swiftly to places where you are not expected.",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"[V-Zv-z](.*)",
|
||||
(
|
||||
"It is a matter of life and death, a road either to safety or to ruin.",
|
||||
"Hold out baits to entice the enemy. Feign disorder, and crush him.",
|
||||
"All men can see the tactics whereby I conquer, but what none can see is the strategy out of which victory is evolved.",
|
||||
"Do not repeat the tactics which have gained you one victory, but let your methods be regulated by the infinite variety of circumstances.",
|
||||
"So in war, the way is to avoid what is strong and to strike at what is weak.",
|
||||
"Just as water retains no constant shape, so in warfare there are no constant conditions.",
|
||||
),
|
||||
),
|
||||
(r"(.*)", ("Your statement insults me.", "")),
|
||||
)
|
||||
|
||||
suntsu_chatbot = Chat(pairs, reflections)
|
||||
|
||||
|
||||
def suntsu_chat():
|
||||
print("Talk to the program by typing in plain English, using normal upper-")
|
||||
print('and lower-case letters and punctuation. Enter "quit" when done.')
|
||||
print("=" * 72)
|
||||
print("You seek enlightenment?")
|
||||
|
||||
suntsu_chatbot.converse()
|
||||
|
||||
|
||||
def demo():
|
||||
suntsu_chat()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
124
backend/venv/Lib/site-packages/nltk/chat/util.py
Normal file
124
backend/venv/Lib/site-packages/nltk/chat/util.py
Normal file
@@ -0,0 +1,124 @@
|
||||
# Natural Language Toolkit: Chatbot Utilities
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Authors: Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
# Based on an Eliza implementation by Joe Strout <joe@strout.net>,
|
||||
# Jeff Epler <jepler@inetnebr.com> and Jez Higgins <jez@jezuk.co.uk>.
|
||||
|
||||
import random
|
||||
import re
|
||||
|
||||
reflections = {
|
||||
"i am": "you are",
|
||||
"i was": "you were",
|
||||
"i": "you",
|
||||
"i'm": "you are",
|
||||
"i'd": "you would",
|
||||
"i've": "you have",
|
||||
"i'll": "you will",
|
||||
"my": "your",
|
||||
"you are": "I am",
|
||||
"you were": "I was",
|
||||
"you've": "I have",
|
||||
"you'll": "I will",
|
||||
"your": "my",
|
||||
"yours": "mine",
|
||||
"you": "me",
|
||||
"me": "you",
|
||||
}
|
||||
|
||||
|
||||
class Chat:
|
||||
def __init__(self, pairs, reflections={}):
|
||||
"""
|
||||
Initialize the chatbot. Pairs is a list of patterns and responses. Each
|
||||
pattern is a regular expression matching the user's statement or question,
|
||||
e.g. r'I like (.*)'. For each such pattern a list of possible responses
|
||||
is given, e.g. ['Why do you like %1', 'Did you ever dislike %1']. Material
|
||||
which is matched by parenthesized sections of the patterns (e.g. .*) is mapped to
|
||||
the numbered positions in the responses, e.g. %1.
|
||||
|
||||
:type pairs: list of tuple
|
||||
:param pairs: The patterns and responses
|
||||
:type reflections: dict
|
||||
:param reflections: A mapping between first and second person expressions
|
||||
:rtype: None
|
||||
"""
|
||||
|
||||
self._pairs = [(re.compile(x, re.IGNORECASE), y) for (x, y) in pairs]
|
||||
self._reflections = reflections
|
||||
self._regex = self._compile_reflections()
|
||||
|
||||
def _compile_reflections(self):
|
||||
sorted_refl = sorted(self._reflections, key=len, reverse=True)
|
||||
return re.compile(
|
||||
r"\b({})\b".format("|".join(map(re.escape, sorted_refl))), re.IGNORECASE
|
||||
)
|
||||
|
||||
def _substitute(self, str):
|
||||
"""
|
||||
Substitute words in the string, according to the specified reflections,
|
||||
e.g. "I'm" -> "you are"
|
||||
|
||||
:type str: str
|
||||
:param str: The string to be mapped
|
||||
:rtype: str
|
||||
"""
|
||||
|
||||
return self._regex.sub(
|
||||
lambda mo: self._reflections[mo.string[mo.start() : mo.end()]], str.lower()
|
||||
)
|
||||
|
||||
def _wildcards(self, response, match):
|
||||
pos = response.find("%")
|
||||
while pos >= 0:
|
||||
num = int(response[pos + 1 : pos + 2])
|
||||
response = (
|
||||
response[:pos]
|
||||
+ self._substitute(match.group(num))
|
||||
+ response[pos + 2 :]
|
||||
)
|
||||
pos = response.find("%")
|
||||
return response
|
||||
|
||||
def respond(self, str):
|
||||
"""
|
||||
Generate a response to the user input.
|
||||
|
||||
:type str: str
|
||||
:param str: The string to be mapped
|
||||
:rtype: str
|
||||
"""
|
||||
|
||||
# check each pattern
|
||||
for pattern, response in self._pairs:
|
||||
match = pattern.match(str)
|
||||
|
||||
# did the pattern match?
|
||||
if match:
|
||||
resp = random.choice(response) # pick a random response
|
||||
resp = self._wildcards(resp, match) # process wildcards
|
||||
|
||||
# fix munged punctuation at the end
|
||||
if resp[-2:] == "?.":
|
||||
resp = resp[:-2] + "."
|
||||
if resp[-2:] == "??":
|
||||
resp = resp[:-2] + "?"
|
||||
return resp
|
||||
|
||||
# Hold a conversation with a chatbot
|
||||
def converse(self, quit="quit"):
|
||||
user_input = ""
|
||||
while user_input != quit:
|
||||
user_input = quit
|
||||
try:
|
||||
user_input = input(">")
|
||||
except EOFError:
|
||||
print(user_input)
|
||||
if user_input:
|
||||
while user_input[-1] in "!.":
|
||||
user_input = user_input[:-1]
|
||||
print(self.respond(user_input))
|
||||
329
backend/venv/Lib/site-packages/nltk/chat/zen.py
Normal file
329
backend/venv/Lib/site-packages/nltk/chat/zen.py
Normal file
@@ -0,0 +1,329 @@
|
||||
# Natural Language Toolkit: Zen Chatbot
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Amy Holland <amyrh@csse.unimelb.edu.au>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Zen Chatbot talks in gems of Zen wisdom.
|
||||
|
||||
This is a sample conversation with Zen Chatbot:
|
||||
ZC: Welcome, my child.
|
||||
me: Good afternoon.
|
||||
ZC: Ask the question you have come to ask.
|
||||
me: How can I achieve enlightenment?
|
||||
ZC: How do you suppose?
|
||||
me: Through meditation.
|
||||
ZC: Form is emptiness, and emptiness form.
|
||||
me: How can I empty my mind of worldly troubles?
|
||||
ZC: Will an answer to that really help in your search for enlightenment?
|
||||
me: Yes.
|
||||
ZC: It is better to be right than to be certain.
|
||||
me: I seek truth and wisdom.
|
||||
ZC: The search for truth is a long journey.
|
||||
me: Are you sure?
|
||||
ZC: Maybe sure, maybe not sure.
|
||||
|
||||
|
||||
The chatbot structure is based on that of chat.eliza. Thus, it uses
|
||||
a translation table to convert from question to response
|
||||
i.e. "I am" --> "you are"
|
||||
|
||||
Of course, since Zen Chatbot does not understand the meaning of any words,
|
||||
responses are very limited. Zen Chatbot will usually answer very vaguely, or
|
||||
respond to a question by asking a different question, in much the same way
|
||||
as Eliza.
|
||||
"""
|
||||
|
||||
from nltk.chat.util import Chat, reflections
|
||||
|
||||
# responses are matched top to bottom, so non-specific matches occur later
|
||||
# for each match, a list of possible responses is provided
|
||||
responses = (
|
||||
# Zen Chatbot opens with the line "Welcome, my child." The usual
|
||||
# response will be a greeting problem: 'good' matches "good morning",
|
||||
# "good day" etc, but also "good grief!" and other sentences starting
|
||||
# with the word 'good' that may not be a greeting
|
||||
(
|
||||
r"(hello(.*))|(good [a-zA-Z]+)",
|
||||
(
|
||||
"The path to enlightenment is often difficult to see.",
|
||||
"Greetings. I sense your mind is troubled. Tell me of your troubles.",
|
||||
"Ask the question you have come to ask.",
|
||||
"Hello. Do you seek englightenment?",
|
||||
),
|
||||
),
|
||||
# "I need" and "I want" can be followed by a thing (eg 'help')
|
||||
# or an action (eg 'to see you')
|
||||
#
|
||||
# This is a problem with this style of response -
|
||||
# person: "I need you"
|
||||
# chatbot: "me can be achieved by hard work and dedication of the mind"
|
||||
# i.e. 'you' is not really a thing that can be mapped this way, so this
|
||||
# interpretation only makes sense for some inputs
|
||||
#
|
||||
(
|
||||
r"i need (.*)",
|
||||
(
|
||||
"%1 can be achieved by hard work and dedication of the mind.",
|
||||
"%1 is not a need, but a desire of the mind. Clear your mind of such concerns.",
|
||||
"Focus your mind on%1, and you will find what you need.",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"i want (.*)",
|
||||
(
|
||||
"Desires of the heart will distract you from the path to enlightenment.",
|
||||
"Will%1 help you attain enlightenment?",
|
||||
"Is%1 a desire of the mind, or of the heart?",
|
||||
),
|
||||
),
|
||||
# why questions are separated into three types:
|
||||
# "why..I" e.g. "why am I here?" "Why do I like cake?"
|
||||
# "why..you" e.g. "why are you here?" "Why won't you tell me?"
|
||||
# "why..." e.g. "Why is the sky blue?"
|
||||
# problems:
|
||||
# person: "Why can't you tell me?"
|
||||
# chatbot: "Are you sure I tell you?"
|
||||
# - this style works for positives (e.g. "why do you like cake?")
|
||||
# but does not work for negatives (e.g. "why don't you like cake?")
|
||||
(r"why (.*) i (.*)\?", ("You%1%2?", "Perhaps you only think you%1%2")),
|
||||
(r"why (.*) you(.*)\?", ("Why%1 you%2?", "%2 I%1", "Are you sure I%2?")),
|
||||
(r"why (.*)\?", ("I cannot tell you why%1.", "Why do you think %1?")),
|
||||
# e.g. "are you listening?", "are you a duck"
|
||||
(
|
||||
r"are you (.*)\?",
|
||||
("Maybe%1, maybe not%1.", "Whether I am%1 or not is God's business."),
|
||||
),
|
||||
# e.g. "am I a duck?", "am I going to die?"
|
||||
(
|
||||
r"am i (.*)\?",
|
||||
("Perhaps%1, perhaps not%1.", "Whether you are%1 or not is not for me to say."),
|
||||
),
|
||||
# what questions, e.g. "what time is it?"
|
||||
# problems:
|
||||
# person: "What do you want?"
|
||||
# chatbot: "Seek truth, not what do me want."
|
||||
(r"what (.*)\?", ("Seek truth, not what%1.", "What%1 should not concern you.")),
|
||||
# how questions, e.g. "how do you do?"
|
||||
(
|
||||
r"how (.*)\?",
|
||||
(
|
||||
"How do you suppose?",
|
||||
"Will an answer to that really help in your search for enlightenment?",
|
||||
"Ask yourself not how, but why.",
|
||||
),
|
||||
),
|
||||
# can questions, e.g. "can you run?", "can you come over here please?"
|
||||
(
|
||||
r"can you (.*)\?",
|
||||
(
|
||||
"I probably can, but I may not.",
|
||||
"Maybe I can%1, and maybe I cannot.",
|
||||
"I can do all, and I can do nothing.",
|
||||
),
|
||||
),
|
||||
# can questions, e.g. "can I have some cake?", "can I know truth?"
|
||||
(
|
||||
r"can i (.*)\?",
|
||||
(
|
||||
"You can%1 if you believe you can%1, and have a pure spirit.",
|
||||
"Seek truth and you will know if you can%1.",
|
||||
),
|
||||
),
|
||||
# e.g. "It is raining" - implies the speaker is certain of a fact
|
||||
(
|
||||
r"it is (.*)",
|
||||
(
|
||||
"How can you be certain that%1, when you do not even know yourself?",
|
||||
"Whether it is%1 or not does not change the way the world is.",
|
||||
),
|
||||
),
|
||||
# e.g. "is there a doctor in the house?"
|
||||
(
|
||||
r"is there (.*)\?",
|
||||
("There is%1 if you believe there is.", "It is possible that there is%1."),
|
||||
),
|
||||
# e.g. "is it possible?", "is this true?"
|
||||
(r"is(.*)\?", ("%1 is not relevant.", "Does this matter?")),
|
||||
# non-specific question
|
||||
(
|
||||
r"(.*)\?",
|
||||
(
|
||||
"Do you think %1?",
|
||||
"You seek the truth. Does the truth seek you?",
|
||||
"If you intentionally pursue the answers to your questions, the answers become hard to see.",
|
||||
"The answer to your question cannot be told. It must be experienced.",
|
||||
),
|
||||
),
|
||||
# expression of hate of form "I hate you" or "Kelly hates cheese"
|
||||
(
|
||||
r"(.*) (hate[s]?)|(dislike[s]?)|(don\'t like)(.*)",
|
||||
(
|
||||
"Perhaps it is not about hating %2, but about hate from within.",
|
||||
"Weeds only grow when we dislike them",
|
||||
"Hate is a very strong emotion.",
|
||||
),
|
||||
),
|
||||
# statement containing the word 'truth'
|
||||
(
|
||||
r"(.*) truth(.*)",
|
||||
(
|
||||
"Seek truth, and truth will seek you.",
|
||||
"Remember, it is not the spoon which bends - only yourself.",
|
||||
"The search for truth is a long journey.",
|
||||
),
|
||||
),
|
||||
# desire to do an action
|
||||
# e.g. "I want to go shopping"
|
||||
(
|
||||
r"i want to (.*)",
|
||||
("You may %1 if your heart truly desires to.", "You may have to %1."),
|
||||
),
|
||||
# desire for an object
|
||||
# e.g. "I want a pony"
|
||||
(
|
||||
r"i want (.*)",
|
||||
(
|
||||
"Does your heart truly desire %1?",
|
||||
"Is this a desire of the heart, or of the mind?",
|
||||
),
|
||||
),
|
||||
# e.g. "I can't wait" or "I can't do this"
|
||||
(
|
||||
r"i can\'t (.*)",
|
||||
(
|
||||
"What we can and can't do is a limitation of the mind.",
|
||||
"There are limitations of the body, and limitations of the mind.",
|
||||
"Have you tried to%1 with a clear mind?",
|
||||
),
|
||||
),
|
||||
# "I think.." indicates uncertainty. e.g. "I think so."
|
||||
# problem: exceptions...
|
||||
# e.g. "I think, therefore I am"
|
||||
(
|
||||
r"i think (.*)",
|
||||
(
|
||||
"Uncertainty in an uncertain world.",
|
||||
"Indeed, how can we be certain of anything in such uncertain times.",
|
||||
"Are you not, in fact, certain that%1?",
|
||||
),
|
||||
),
|
||||
# "I feel...emotions/sick/light-headed..."
|
||||
(
|
||||
r"i feel (.*)",
|
||||
(
|
||||
"Your body and your emotions are both symptoms of your mind."
|
||||
"What do you believe is the root of such feelings?",
|
||||
"Feeling%1 can be a sign of your state-of-mind.",
|
||||
),
|
||||
),
|
||||
# exclaimation mark indicating emotion
|
||||
# e.g. "Wow!" or "No!"
|
||||
(
|
||||
r"(.*)!",
|
||||
(
|
||||
"I sense that you are feeling emotional today.",
|
||||
"You need to calm your emotions.",
|
||||
),
|
||||
),
|
||||
# because [statement]
|
||||
# e.g. "because I said so"
|
||||
(
|
||||
r"because (.*)",
|
||||
(
|
||||
"Does knowning the reasons behind things help you to understand"
|
||||
" the things themselves?",
|
||||
"If%1, what else must be true?",
|
||||
),
|
||||
),
|
||||
# yes or no - raise an issue of certainty/correctness
|
||||
(
|
||||
r"(yes)|(no)",
|
||||
(
|
||||
"Is there certainty in an uncertain world?",
|
||||
"It is better to be right than to be certain.",
|
||||
),
|
||||
),
|
||||
# sentence containing word 'love'
|
||||
(
|
||||
r"(.*)love(.*)",
|
||||
(
|
||||
"Think of the trees: they let the birds perch and fly with no intention to call them when they come, and no longing for their return when they fly away. Let your heart be like the trees.",
|
||||
"Free love!",
|
||||
),
|
||||
),
|
||||
# sentence containing word 'understand' - r
|
||||
(
|
||||
r"(.*)understand(.*)",
|
||||
(
|
||||
"If you understand, things are just as they are;"
|
||||
" if you do not understand, things are just as they are.",
|
||||
"Imagination is more important than knowledge.",
|
||||
),
|
||||
),
|
||||
# 'I', 'me', 'my' - person is talking about themself.
|
||||
# this breaks down when words contain these - eg 'Thyme', 'Irish'
|
||||
(
|
||||
r"(.*)(me )|( me)|(my)|(mine)|(i)(.*)",
|
||||
(
|
||||
"'I', 'me', 'my'... these are selfish expressions.",
|
||||
"Have you ever considered that you might be a selfish person?",
|
||||
"Try to consider others, not just yourself.",
|
||||
"Think not just of yourself, but of others.",
|
||||
),
|
||||
),
|
||||
# 'you' starting a sentence
|
||||
# e.g. "you stink!"
|
||||
(
|
||||
r"you (.*)",
|
||||
("My path is not of concern to you.", "I am but one, and you but one more."),
|
||||
),
|
||||
# say goodbye with some extra Zen wisdom.
|
||||
(
|
||||
r"exit",
|
||||
(
|
||||
"Farewell. The obstacle is the path.",
|
||||
"Farewell. Life is a journey, not a destination.",
|
||||
"Good bye. We are cups, constantly and quietly being filled."
|
||||
"\nThe trick is knowning how to tip ourselves over and let the beautiful stuff out.",
|
||||
),
|
||||
),
|
||||
# fall through case -
|
||||
# when stumped, respond with generic zen wisdom
|
||||
#
|
||||
(
|
||||
r"(.*)",
|
||||
(
|
||||
"When you're enlightened, every word is wisdom.",
|
||||
"Random talk is useless.",
|
||||
"The reverse side also has a reverse side.",
|
||||
"Form is emptiness, and emptiness is form.",
|
||||
"I pour out a cup of water. Is the cup empty?",
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
zen_chatbot = Chat(responses, reflections)
|
||||
|
||||
|
||||
def zen_chat():
|
||||
print("*" * 75)
|
||||
print("Zen Chatbot!".center(75))
|
||||
print("*" * 75)
|
||||
print('"Look beyond mere words and letters - look into your mind"'.center(75))
|
||||
print("* Talk your way to truth with Zen Chatbot.")
|
||||
print("* Type 'quit' when you have had enough.")
|
||||
print("*" * 75)
|
||||
print("Welcome, my child.")
|
||||
|
||||
zen_chatbot.converse()
|
||||
|
||||
|
||||
def demo():
|
||||
zen_chat()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
205
backend/venv/Lib/site-packages/nltk/chunk/__init__.py
Normal file
205
backend/venv/Lib/site-packages/nltk/chunk/__init__.py
Normal file
@@ -0,0 +1,205 @@
|
||||
# Natural Language Toolkit: Chunkers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
#
|
||||
|
||||
"""
|
||||
Classes and interfaces for identifying non-overlapping linguistic
|
||||
groups (such as base noun phrases) in unrestricted text. This task is
|
||||
called "chunk parsing" or "chunking", and the identified groups are
|
||||
called "chunks". The chunked text is represented using a shallow
|
||||
tree called a "chunk structure." A chunk structure is a tree
|
||||
containing tokens and chunks, where each chunk is a subtree containing
|
||||
only tokens. For example, the chunk structure for base noun phrase
|
||||
chunks in the sentence "I saw the big dog on the hill" is::
|
||||
|
||||
(SENTENCE:
|
||||
(NP: <I>)
|
||||
<saw>
|
||||
(NP: <the> <big> <dog>)
|
||||
<on>
|
||||
(NP: <the> <hill>))
|
||||
|
||||
To convert a chunk structure back to a list of tokens, simply use the
|
||||
chunk structure's ``leaves()`` method.
|
||||
|
||||
This module defines ``ChunkParserI``, a standard interface for
|
||||
chunking texts; and ``RegexpChunkParser``, a regular-expression based
|
||||
implementation of that interface. It also defines ``ChunkScore``, a
|
||||
utility class for scoring chunk parsers.
|
||||
|
||||
RegexpChunkParser
|
||||
=================
|
||||
|
||||
``RegexpChunkParser`` is an implementation of the chunk parser interface
|
||||
that uses regular-expressions over tags to chunk a text. Its
|
||||
``parse()`` method first constructs a ``ChunkString``, which encodes a
|
||||
particular chunking of the input text. Initially, nothing is
|
||||
chunked. ``parse.RegexpChunkParser`` then applies a sequence of
|
||||
``RegexpChunkRule`` rules to the ``ChunkString``, each of which modifies
|
||||
the chunking that it encodes. Finally, the ``ChunkString`` is
|
||||
transformed back into a chunk structure, which is returned.
|
||||
|
||||
``RegexpChunkParser`` can only be used to chunk a single kind of phrase.
|
||||
For example, you can use an ``RegexpChunkParser`` to chunk the noun
|
||||
phrases in a text, or the verb phrases in a text; but you can not
|
||||
use it to simultaneously chunk both noun phrases and verb phrases in
|
||||
the same text. (This is a limitation of ``RegexpChunkParser``, not of
|
||||
chunk parsers in general.)
|
||||
|
||||
RegexpChunkRules
|
||||
----------------
|
||||
|
||||
A ``RegexpChunkRule`` is a transformational rule that updates the
|
||||
chunking of a text by modifying its ``ChunkString``. Each
|
||||
``RegexpChunkRule`` defines the ``apply()`` method, which modifies
|
||||
the chunking encoded by a ``ChunkString``. The
|
||||
``RegexpChunkRule`` class itself can be used to implement any
|
||||
transformational rule based on regular expressions. There are
|
||||
also a number of subclasses, which can be used to implement
|
||||
simpler types of rules:
|
||||
|
||||
- ``ChunkRule`` chunks anything that matches a given regular
|
||||
expression.
|
||||
- ``StripRule`` strips anything that matches a given regular
|
||||
expression.
|
||||
- ``UnChunkRule`` will un-chunk any chunk that matches a given
|
||||
regular expression.
|
||||
- ``MergeRule`` can be used to merge two contiguous chunks.
|
||||
- ``SplitRule`` can be used to split a single chunk into two
|
||||
smaller chunks.
|
||||
- ``ExpandLeftRule`` will expand a chunk to incorporate new
|
||||
unchunked material on the left.
|
||||
- ``ExpandRightRule`` will expand a chunk to incorporate new
|
||||
unchunked material on the right.
|
||||
|
||||
Tag Patterns
|
||||
~~~~~~~~~~~~
|
||||
|
||||
A ``RegexpChunkRule`` uses a modified version of regular
|
||||
expression patterns, called "tag patterns". Tag patterns are
|
||||
used to match sequences of tags. Examples of tag patterns are::
|
||||
|
||||
r'(<DT>|<JJ>|<NN>)+'
|
||||
r'<NN>+'
|
||||
r'<NN.*>'
|
||||
|
||||
The differences between regular expression patterns and tag
|
||||
patterns are:
|
||||
|
||||
- In tag patterns, ``'<'`` and ``'>'`` act as parentheses; so
|
||||
``'<NN>+'`` matches one or more repetitions of ``'<NN>'``, not
|
||||
``'<NN'`` followed by one or more repetitions of ``'>'``.
|
||||
- Whitespace in tag patterns is ignored. So
|
||||
``'<DT> | <NN>'`` is equivalent to ``'<DT>|<NN>'``
|
||||
- In tag patterns, ``'.'`` is equivalent to ``'[^{}<>]'``; so
|
||||
``'<NN.*>'`` matches any single tag starting with ``'NN'``.
|
||||
|
||||
The function ``tag_pattern2re_pattern`` can be used to transform
|
||||
a tag pattern to an equivalent regular expression pattern.
|
||||
|
||||
Efficiency
|
||||
----------
|
||||
|
||||
Preliminary tests indicate that ``RegexpChunkParser`` can chunk at a
|
||||
rate of about 300 tokens/second, with a moderately complex rule set.
|
||||
|
||||
There may be problems if ``RegexpChunkParser`` is used with more than
|
||||
5,000 tokens at a time. In particular, evaluation of some regular
|
||||
expressions may cause the Python regular expression engine to
|
||||
exceed its maximum recursion depth. We have attempted to minimize
|
||||
these problems, but it is impossible to avoid them completely. We
|
||||
therefore recommend that you apply the chunk parser to a single
|
||||
sentence at a time.
|
||||
|
||||
Emacs Tip
|
||||
---------
|
||||
|
||||
If you evaluate the following elisp expression in emacs, it will
|
||||
colorize a ``ChunkString`` when you use an interactive python shell
|
||||
with emacs or xemacs ("C-c !")::
|
||||
|
||||
(let ()
|
||||
(defconst comint-mode-font-lock-keywords
|
||||
'(("<[^>]+>" 0 'font-lock-reference-face)
|
||||
("[{}]" 0 'font-lock-function-name-face)))
|
||||
(add-hook 'comint-mode-hook (lambda () (turn-on-font-lock))))
|
||||
|
||||
You can evaluate this code by copying it to a temporary buffer,
|
||||
placing the cursor after the last close parenthesis, and typing
|
||||
"``C-x C-e``". You should evaluate it before running the interactive
|
||||
session. The change will last until you close emacs.
|
||||
|
||||
Unresolved Issues
|
||||
-----------------
|
||||
|
||||
If we use the ``re`` module for regular expressions, Python's
|
||||
regular expression engine generates "maximum recursion depth
|
||||
exceeded" errors when processing very large texts, even for
|
||||
regular expressions that should not require any recursion. We
|
||||
therefore use the ``pre`` module instead. But note that ``pre``
|
||||
does not include Unicode support, so this module will not work
|
||||
with unicode strings. Note also that ``pre`` regular expressions
|
||||
are not quite as advanced as ``re`` ones (e.g., no leftward
|
||||
zero-length assertions).
|
||||
|
||||
:type CHUNK_TAG_PATTERN: regexp
|
||||
:var CHUNK_TAG_PATTERN: A regular expression to test whether a tag
|
||||
pattern is valid.
|
||||
"""
|
||||
|
||||
from nltk.chunk.api import ChunkParserI
|
||||
from nltk.chunk.named_entity import Maxent_NE_Chunker
|
||||
from nltk.chunk.regexp import RegexpChunkParser, RegexpParser
|
||||
from nltk.chunk.util import (
|
||||
ChunkScore,
|
||||
accuracy,
|
||||
conllstr2tree,
|
||||
conlltags2tree,
|
||||
ieerstr2tree,
|
||||
tagstr2tree,
|
||||
tree2conllstr,
|
||||
tree2conlltags,
|
||||
)
|
||||
|
||||
|
||||
def ne_chunker(fmt="multiclass"):
    """
    Load NLTK's currently recommended named entity chunker.

    :param fmt: Which pretrained model to load: "multiclass" (one label
        per entity type) or "binary" (NE vs. non-NE).
    :return: A ``Maxent_NE_Chunker`` backed by the pretrained
        ``maxent_ne_chunker_tab`` model data.
    """
    return Maxent_NE_Chunker(fmt)
|
||||
|
||||
|
||||
def ne_chunk(tagged_tokens, binary=False):
    """
    Use NLTK's currently recommended named entity chunker to
    chunk the given list of tagged tokens.

    >>> from nltk.chunk import ne_chunk
    >>> from nltk.corpus import treebank
    >>> from pprint import pprint
    >>> pprint(ne_chunk(treebank.tagged_sents()[2][8:14])) # doctest: +NORMALIZE_WHITESPACE
    Tree('S', [('chairman', 'NN'), ('of', 'IN'), Tree('ORGANIZATION', [('Consolidated', 'NNP'), ('Gold', 'NNP'), ('Fields', 'NNP')]), ('PLC', 'NNP')])

    """
    # Pick the binary (NE / not-NE) model when requested; otherwise the
    # default multiclass model.
    chunker = ne_chunker(fmt="binary") if binary else ne_chunker()
    return chunker.parse(tagged_tokens)
|
||||
|
||||
|
||||
def ne_chunk_sents(tagged_sentences, binary=False):
    """
    Use NLTK's currently recommended named entity chunker to chunk the
    given list of tagged sentences, each consisting of a list of tagged tokens.
    """
    # Same model selection as ne_chunk(), applied sentence-by-sentence.
    chunker = ne_chunker(fmt="binary") if binary else ne_chunker()
    return chunker.parse_sents(tagged_sentences)
|
||||
56
backend/venv/Lib/site-packages/nltk/chunk/api.py
Normal file
56
backend/venv/Lib/site-packages/nltk/chunk/api.py
Normal file
@@ -0,0 +1,56 @@
|
||||
# Natural Language Toolkit: Chunk parsing API
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com> (minor additions)
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Chunk Parser Interface
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
from nltk.chunk.util import ChunkScore
|
||||
from nltk.internals import deprecated
|
||||
from nltk.parse import ParserI
|
||||
|
||||
|
||||
class ChunkParserI(ParserI):
    """
    A processing interface for identifying non-overlapping groups in
    unrestricted text. Chunk parsers are typically used to find base
    syntactic constituents, such as base noun phrases. Unlike
    ``ParserI``, ``ChunkParserI`` guarantees that the ``parse()`` method
    will always generate a parse.
    """

    def parse(self, tokens):
        """
        Return the best chunk structure for the given tokens
        and return a tree.

        :param tokens: The list of (word, tag) tokens to be chunked.
        :type tokens: list(tuple)
        :rtype: Tree
        """
        raise NotImplementedError()

    @deprecated("Use accuracy(gold) instead.")
    def evaluate(self, gold):
        return self.accuracy(gold)

    def accuracy(self, gold):
        """
        Score the accuracy of the chunker against the gold standard.
        Remove the chunking the gold standard text, rechunk it using
        the chunker, and return a ``ChunkScore`` object
        reflecting the performance of this chunk parser.

        :type gold: list(Tree)
        :param gold: The list of chunked sentences to score the chunker on.
        :rtype: ChunkScore
        """
        score = ChunkScore()
        for gold_tree in gold:
            # Re-chunk the bare leaves and compare against the gold tree.
            score.score(gold_tree, self.parse(gold_tree.leaves()))
        return score
|
||||
407
backend/venv/Lib/site-packages/nltk/chunk/named_entity.py
Normal file
407
backend/venv/Lib/site-packages/nltk/chunk/named_entity.py
Normal file
@@ -0,0 +1,407 @@
|
||||
# Natural Language Toolkit: Chunk parsing API
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Eric Kafe <kafe.eric@gmail.com> (tab-format models)
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Named entity chunker
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
from nltk.tag import ClassifierBasedTagger, pos_tag
|
||||
|
||||
try:
|
||||
from nltk.classify import MaxentClassifier
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
from nltk.chunk.api import ChunkParserI
|
||||
from nltk.chunk.util import ChunkScore
|
||||
from nltk.data import find
|
||||
from nltk.tokenize import word_tokenize
|
||||
from nltk.tree import Tree
|
||||
|
||||
|
||||
class NEChunkParserTagger(ClassifierBasedTagger):
    """
    The IOB tagger used by the chunk parser.

    Wraps a maxent classifier that assigns an IOB label to each token
    based on the features computed by ``_feature_detector``.
    """

    def __init__(self, train=None, classifier=None):
        # Either train a new classifier from `train`, or wrap an
        # already-built `classifier` (as when loading a pretrained model).
        ClassifierBasedTagger.__init__(
            self,
            train=train,
            classifier_builder=self._classifier_builder,
            classifier=classifier,
        )

    def _classifier_builder(self, train):
        # Train a maxent model over the IOB-tagged training sequences.
        return MaxentClassifier.train(
            # "megam" cannot be the default algorithm since it requires compiling with ocaml
            train,
            algorithm="iis",
            gaussian_prior_sigma=1,
            trace=2,
        )

    def _english_wordlist(self):
        # Lazily load and cache the basic English word list; the corpus
        # import is deferred so it only happens once features are computed.
        try:
            wl = self._en_wordlist
        except AttributeError:
            from nltk.corpus import words

            self._en_wordlist = set(words.words("en-basic"))
            wl = self._en_wordlist
        return wl

    def _feature_detector(self, tokens, index, history):
        """
        Build the feature dict for ``tokens[index]``.

        :param tokens: The full (word, pos) sequence being tagged.
        :param index: Position of the token to featurize.
        :param history: IOB tags already assigned to ``tokens[:index]``.
        """
        word = tokens[index][0]
        pos = simplify_pos(tokens[index][1])
        # Left context: the two preceding tokens; None marks positions
        # that fall off the start of the sentence.
        if index == 0:
            prevword = prevprevword = None
            prevpos = prevprevpos = None
            prevshape = prevtag = prevprevtag = None
        elif index == 1:
            prevword = tokens[index - 1][0].lower()
            prevprevword = None
            prevpos = simplify_pos(tokens[index - 1][1])
            prevprevpos = None
            # NOTE(review): this takes history[index - 1][0] -- the FIRST
            # CHARACTER of the previous tag -- while the general branch
            # below uses the whole tag. Looks inconsistent, but changing
            # it would alter the feature space and invalidate the
            # pretrained model files; confirm upstream before touching.
            prevtag = history[index - 1][0]
            prevshape = prevprevtag = None
        else:
            prevword = tokens[index - 1][0].lower()
            prevprevword = tokens[index - 2][0].lower()
            prevpos = simplify_pos(tokens[index - 1][1])
            prevprevpos = simplify_pos(tokens[index - 2][1])
            prevtag = history[index - 1]
            prevprevtag = history[index - 2]
            prevshape = shape(prevword)
        # Right context: the two following tokens.
        if index == len(tokens) - 1:
            nextword = nextnextword = None
            nextpos = nextnextpos = None
        elif index == len(tokens) - 2:
            nextword = tokens[index + 1][0].lower()
            nextpos = tokens[index + 1][1].lower()
            nextnextword = None
            nextnextpos = None
        else:
            nextword = tokens[index + 1][0].lower()
            nextpos = tokens[index + 1][1].lower()
            nextnextword = tokens[index + 2][0].lower()
            nextnextpos = tokens[index + 2][1].lower()

        # 89.6  (presumably an accuracy figure achieved with this feature
        # set -- TODO confirm)
        features = {
            "bias": True,
            "shape": shape(word),
            "wordlen": len(word),
            "prefix3": word[:3].lower(),
            "suffix3": word[-3:].lower(),
            "pos": pos,
            "word": word,
            "en-wordlist": (word in self._english_wordlist()),
            "prevtag": prevtag,
            "prevpos": prevpos,
            "nextpos": nextpos,
            "prevword": prevword,
            "nextword": nextword,
            "word+nextpos": f"{word.lower()}+{nextpos}",
            "pos+prevtag": f"{pos}+{prevtag}",
            "shape+prevtag": f"{prevshape}+{prevtag}",
        }

        return features
|
||||
|
||||
|
||||
class NEChunkParser(ChunkParserI):
    """
    Named-entity chunker driven by an IOB sequence tagger.

    Expected input: list of pos-tagged words
    """

    def __init__(self, train):
        self._train(train)

    def parse(self, tokens):
        """
        Each token should be a pos-tagged word
        """
        return self._tagged_to_parse(self._tagger.tag(tokens))

    def _train(self, corpus):
        # Convert each training tree to an IOB-tagged sequence, then fit
        # the maxent tagger on those sequences.
        self._tagger = NEChunkParserTagger(
            train=[self._parse_to_tagged(tree) for tree in corpus]
        )

    def _tagged_to_parse(self, tagged_tokens):
        """
        Convert a list of tagged tokens to a chunk-parse tree.
        """
        sent = Tree("S", [])
        for token, iob in tagged_tokens:
            if iob == "O":
                sent.append(token)
            elif iob.startswith("B-"):
                sent.append(Tree(iob[2:], [token]))
            elif iob.startswith("I-"):
                last = sent[-1] if sent else None
                if isinstance(last, Tree) and last.label() == iob[2:]:
                    # Continue the chunk opened by the preceding tag.
                    last.append(token)
                else:
                    # Orphan I- tag: start a fresh chunk instead.
                    sent.append(Tree(iob[2:], [token]))
        return sent

    @staticmethod
    def _parse_to_tagged(sent):
        """
        Convert a chunk-parse tree to a list of tagged tokens.
        """
        tagged = []
        for node in sent:
            if not isinstance(node, Tree):
                tagged.append((node, "O"))
                continue
            if len(node) == 0:
                print("Warning -- empty chunk in sentence")
                continue
            label = node.label()
            first, *rest = node
            tagged.append((first, f"B-{label}"))
            tagged.extend((tok, f"I-{label}") for tok in rest)
        return tagged
|
||||
|
||||
|
||||
def shape(word):
    """
    Return a coarse orthographic class for *word*: one of "number",
    "punct", "upcase", "downcase", "mixedcase" or "other".
    """
    # NOTE(review): only the second alternative carries the "$" anchor, so
    # a string like "1x" still classifies as "number" via the unanchored
    # first alternative. Kept as-is: the pretrained models were built with
    # this behaviour.
    if re.match(r"[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", word, re.UNICODE):
        return "number"
    if re.match(r"\W+$", word, re.UNICODE):
        return "punct"
    if not re.match(r"\w+$", word, re.UNICODE):
        return "other"
    if word.istitle():
        return "upcase"
    if word.islower():
        return "downcase"
    return "mixedcase"
|
||||
|
||||
|
||||
def simplify_pos(s):
    """Collapse a POS tag: any tag beginning with "V" becomes "V";
    otherwise keep the part before the first hyphen (e.g. "NN-HL" -> "NN")."""
    return "V" if s.startswith("V") else s.split("-")[0]
|
||||
|
||||
|
||||
def postag_tree(tree):
    """Return a copy of *tree* whose leaves are (word, pos) pairs,
    obtained by running ``pos_tag`` over the flattened leaf sequence."""
    tags = (pos for (_, pos) in pos_tag(tree.leaves()))
    result = Tree("S", [])
    for child in tree:
        if isinstance(child, Tree):
            # Rebuild the chunk, pairing each leaf with its tag in order.
            result.append(Tree(child.label(), [(leaf, next(tags)) for leaf in child]))
        else:
            result.append((child, next(tags)))
    return result
|
||||
|
||||
|
||||
def load_ace_data(roots, fmt="binary", skip_bnews=True):
    """Yield chunk trees parsed from every ``.sgm`` file under *roots*.

    Directories whose path ends in "bnews" are skipped when *skip_bnews*
    is true.
    """
    for top in roots:
        for dirpath, _dirs, filenames in os.walk(top):
            if skip_bnews and dirpath.endswith("bnews"):
                continue
            for name in filenames:
                if name.endswith(".sgm"):
                    yield from load_ace_file(os.path.join(dirpath, name), fmt)
|
||||
|
||||
|
||||
def load_ace_file(textfile, fmt):
    """
    Parse one ACE ``.sgm`` text file (plus its ``.tmx.rdc.xml``
    annotation file) and yield a single chunk tree for it.

    :param textfile: Path to the ``.sgm`` file; the annotation file is
        expected at ``textfile + ".tmx.rdc.xml"``.
    :param fmt: "binary" for a single "NE" chunk label, or "multiclass"
        for per-entity-type labels.
    :raises ValueError: if *fmt* is neither "binary" nor "multiclass".
    """
    print(f" - {os.path.split(textfile)[1]}")
    annfile = textfile + ".tmx.rdc.xml"

    # Read the xml file, and get a list of entities
    # Each entity is recorded as (start_char, end_char_exclusive, type);
    # only NAME mentions are kept.
    entities = []
    with open(annfile) as infile:
        xml = ET.parse(infile).getroot()
    for entity in xml.findall("document/entity"):
        typ = entity.find("entity_type").text
        for mention in entity.findall("entity_mention"):
            if mention.get("TYPE") != "NAME":
                continue  # only NEs
            s = int(mention.find("head/charseq/start").text)
            e = int(mention.find("head/charseq/end").text) + 1
            entities.append((s, e, typ))

    # Read the text file, and mark the entities.
    with open(textfile) as infile:
        text = infile.read()

    # Strip XML tags, since they don't count towards the indices
    text = re.sub("<(?!/?TEXT)[^>]+>", "", text)

    # Blank out anything before/after <TEXT>
    # (replaced with spaces of the same length so that the annotation
    # character offsets stay valid; the "- 6" accounts for "<TEXT>")
    def subfunc(m):
        return " " * (m.end() - m.start() - 6)

    text = re.sub(r"[\s\S]*<TEXT>", subfunc, text)
    text = re.sub(r"</TEXT>[\s\S]*", "", text)

    # Simplify quotes
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    # NOTE(review): entity_types is computed but never used below --
    # presumably left over from debugging; confirm before removing.
    entity_types = {typ for (s, e, typ) in entities}

    # Binary distinction (NE or not NE)
    if fmt == "binary":
        i = 0
        toks = Tree("S", [])
        for s, e, typ in sorted(entities):
            if s < i:
                s = i  # Overlapping! Deal with this better?
            if e <= s:
                continue
            # Tokenize the gap before the entity, then add the entity
            # span as a chunk (split on whitespace, not word_tokenize).
            toks.extend(word_tokenize(text[i:s]))
            toks.append(Tree("NE", text[s:e].split()))
            i = e
        toks.extend(word_tokenize(text[i:]))
        yield toks

    # Multiclass distinction (NE type)
    elif fmt == "multiclass":
        i = 0
        toks = Tree("S", [])
        for s, e, typ in sorted(entities):
            if s < i:
                s = i  # Overlapping! Deal with this better?
            if e <= s:
                continue
            toks.extend(word_tokenize(text[i:s]))
            # Same as above, but the chunk label is the entity type.
            toks.append(Tree(typ, text[s:e].split()))
            i = e
        toks.extend(word_tokenize(text[i:]))
        yield toks

    else:
        raise ValueError("bad fmt value")
|
||||
|
||||
|
||||
# This probably belongs in a more general-purpose location (as does
|
||||
# the parse_to_tagged function).
|
||||
def cmp_chunks(correct, guessed):
    """
    Print a side-by-side comparison of the IOB tags in the gold-standard
    tree and the chunker's guessed tree, collapsing runs where both
    agree on "O" into a single "..." line.

    :param correct: The gold-standard chunk tree.
    :param guessed: The chunker's output tree for the same sentence.
    """
    correct = NEChunkParser._parse_to_tagged(correct)
    guessed = NEChunkParser._parse_to_tagged(guessed)
    ellipsis = False
    # BUG FIX: the original unpacked both pairs into the same name `w`;
    # the words should be identical, so bind the second one to `_`.
    for (w, ct), (_, gt) in zip(correct, guessed):
        if ct == gt == "O":
            if not ellipsis:
                print(f" {ct:15} {gt:15} {w}")
                # BUG FIX: the original used " {:15} {:15} {2}".format(...),
                # which mixes automatic and manual field numbering and
                # raises ValueError at runtime.
                print(" {:15} {:15} {}".format("...", "...", "..."))
                ellipsis = True
        else:
            ellipsis = False
            print(f" {ct:15} {gt:15} {w}")
|
||||
|
||||
|
||||
# ======================================================================================
|
||||
|
||||
|
||||
class Maxent_NE_Chunker(NEChunkParser):
    """
    Named-entity chunker backed by a pretrained maxent model stored in
    NLTK's tab-separated "maxent_ne_chunker_tab" data package.

    Expected input: list of pos-tagged words
    """

    def __init__(self, fmt="multiclass"):
        from nltk.data import find

        # Deliberately does NOT call NEChunkParser.__init__ (which would
        # train from scratch); instead the pretrained parameters are loaded.
        # fmt selects which model directory to load: "multiclass" or "binary".
        self._fmt = fmt
        self._tab_dir = find(f"chunkers/maxent_ne_chunker_tab/english_ace_{fmt}/")
        self.load_params()

    def load_params(self):
        # Rebuild the classifier (and hence the tagger) from the
        # tab-format weight/mapping/label/always-on files.
        from nltk.classify.maxent import BinaryMaxentFeatureEncoding, load_maxent_params

        wgt, mpg, lab, aon = load_maxent_params(self._tab_dir)
        mc = MaxentClassifier(
            BinaryMaxentFeatureEncoding(lab, mpg, alwayson_features=aon), wgt
        )
        self._tagger = NEChunkParserTagger(classifier=mc)

    def save_params(self):
        # Dump the classifier's parameters back out in tab format.
        # NOTE(review): writes to a fixed "/tmp/english_ace_<fmt>/" path,
        # which is POSIX-only and world-shared -- confirm before relying
        # on this outside of model-building scripts.
        from nltk.classify.maxent import save_maxent_params

        classif = self._tagger._classifier
        ecg = classif._encoding
        wgt = classif._weights
        mpg = ecg._mapping
        lab = ecg._labels
        aon = ecg._alwayson
        fmt = self._fmt
        save_maxent_params(wgt, mpg, lab, aon, tab_dir=f"/tmp/english_ace_{fmt}/")
|
||||
|
||||
|
||||
def build_model(fmt="multiclass"):
    """Load the pretrained chunker for *fmt* and re-save its parameters
    in tab format (under /tmp -- see ``Maxent_NE_Chunker.save_params``);
    returns the chunker."""
    chunker = Maxent_NE_Chunker(fmt)
    chunker.save_params()
    return chunker
|
||||
|
||||
|
||||
# ======================================================================================
|
||||
|
||||
"""
|
||||
2004 update: pickles are not supported anymore.
|
||||
|
||||
Deprecated:
|
||||
|
||||
def build_model(fmt="binary"):
|
||||
print("Loading training data...")
|
||||
train_paths = [
|
||||
find("corpora/ace_data/ace.dev"),
|
||||
find("corpora/ace_data/ace.heldout"),
|
||||
find("corpora/ace_data/bbn.dev"),
|
||||
find("corpora/ace_data/muc.dev"),
|
||||
]
|
||||
train_trees = load_ace_data(train_paths, fmt)
|
||||
train_data = [postag_tree(t) for t in train_trees]
|
||||
print("Training...")
|
||||
cp = NEChunkParser(train_data)
|
||||
del train_data
|
||||
|
||||
print("Loading eval data...")
|
||||
eval_paths = [find("corpora/ace_data/ace.eval")]
|
||||
eval_trees = load_ace_data(eval_paths, fmt)
|
||||
eval_data = [postag_tree(t) for t in eval_trees]
|
||||
|
||||
print("Evaluating...")
|
||||
chunkscore = ChunkScore()
|
||||
for i, correct in enumerate(eval_data):
|
||||
guess = cp.parse(correct.leaves())
|
||||
chunkscore.score(correct, guess)
|
||||
if i < 3:
|
||||
cmp_chunks(correct, guess)
|
||||
print(chunkscore)
|
||||
|
||||
outfilename = f"/tmp/ne_chunker_{fmt}.pickle"
|
||||
print(f"Saving chunker to {outfilename}...")
|
||||
|
||||
with open(outfilename, "wb") as outfile:
|
||||
pickle.dump(cp, outfile, -1)
|
||||
|
||||
return cp
|
||||
"""
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Make sure that the object has the right class name:
|
||||
build_model("binary")
|
||||
build_model("multiclass")
|
||||
1474
backend/venv/Lib/site-packages/nltk/chunk/regexp.py
Normal file
1474
backend/venv/Lib/site-packages/nltk/chunk/regexp.py
Normal file
File diff suppressed because it is too large
Load Diff
642
backend/venv/Lib/site-packages/nltk/chunk/util.py
Normal file
642
backend/venv/Lib/site-packages/nltk/chunk/util.py
Normal file
@@ -0,0 +1,642 @@
|
||||
# Natural Language Toolkit: Chunk format conversions
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com> (minor additions)
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import re
|
||||
|
||||
from nltk.metrics import accuracy as _accuracy
|
||||
from nltk.tag.mapping import map_tag
|
||||
from nltk.tag.util import str2tuple
|
||||
from nltk.tree import Tree
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## EVALUATION
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def accuracy(chunker, gold):
    """
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it using
    the chunker, then compute the accuracy score.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """
    gold_tags = []
    test_tags = []
    for gold_tree in gold:
        # Flatten the gold tree back to tagged tokens, re-chunk them,
        # and compare the two trees tag-by-tag in CoNLL IOB form.
        predicted = chunker.parse(gold_tree.flatten())
        gold_tags.extend(tree2conlltags(gold_tree))
        test_tags.extend(tree2conlltags(predicted))
    return _accuracy(gold_tags, test_tags)
|
||||
|
||||
|
||||
# Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13
|
||||
# -- statistics are evaluated only on demand, instead of at every sentence evaluation
|
||||
#
|
||||
# SB: use nltk.metrics for precision/recall scoring?
|
||||
#
|
||||
class ChunkScore:
|
||||
"""
|
||||
A utility class for scoring chunk parsers. ``ChunkScore`` can
|
||||
evaluate a chunk parser's output, based on a number of statistics
|
||||
(precision, recall, f-measure, missed chunks, incorrect chunks).
|
||||
It can also combine the scores from the parsing of multiple texts;
|
||||
this makes it significantly easier to evaluate a chunk parser that
|
||||
operates one sentence at a time.
|
||||
|
||||
Texts are evaluated with the ``score`` method. The results of
|
||||
evaluation can be accessed via a number of accessor methods, such
|
||||
as ``precision`` and ``f_measure``. A typical use of the
|
||||
``ChunkScore`` class is::
|
||||
|
||||
>>> chunkscore = ChunkScore() # doctest: +SKIP
|
||||
>>> for correct in correct_sentences: # doctest: +SKIP
|
||||
... guess = chunkparser.parse(correct.leaves()) # doctest: +SKIP
|
||||
... chunkscore.score(correct, guess) # doctest: +SKIP
|
||||
>>> print('F Measure:', chunkscore.f_measure()) # doctest: +SKIP
|
||||
F Measure: 0.823
|
||||
|
||||
:ivar kwargs: Keyword arguments:
|
||||
|
||||
- max_tp_examples: The maximum number actual examples of true
|
||||
positives to record. This affects the ``correct`` member
|
||||
function: ``correct`` will not return more than this number
|
||||
of true positive examples. This does *not* affect any of
|
||||
the numerical metrics (precision, recall, or f-measure)
|
||||
|
||||
- max_fp_examples: The maximum number actual examples of false
|
||||
positives to record. This affects the ``incorrect`` member
|
||||
function and the ``guessed`` member function: ``incorrect``
|
||||
will not return more than this number of examples, and
|
||||
``guessed`` will not return more than this number of true
|
||||
positive examples. This does *not* affect any of the
|
||||
numerical metrics (precision, recall, or f-measure)
|
||||
|
||||
- max_fn_examples: The maximum number actual examples of false
|
||||
negatives to record. This affects the ``missed`` member
|
||||
function and the ``correct`` member function: ``missed``
|
||||
will not return more than this number of examples, and
|
||||
``correct`` will not return more than this number of true
|
||||
negative examples. This does *not* affect any of the
|
||||
numerical metrics (precision, recall, or f-measure)
|
||||
|
||||
- chunk_label: A regular expression indicating which chunks
|
||||
should be compared. Defaults to ``'.*'`` (i.e., all chunks).
|
||||
|
||||
:type _tp: list(Token)
|
||||
:ivar _tp: List of true positives
|
||||
:type _fp: list(Token)
|
||||
:ivar _fp: List of false positives
|
||||
:type _fn: list(Token)
|
||||
:ivar _fn: List of false negatives
|
||||
|
||||
:type _tp_num: int
|
||||
:ivar _tp_num: Number of true positives
|
||||
:type _fp_num: int
|
||||
:ivar _fp_num: Number of false positives
|
||||
:type _fn_num: int
|
||||
:ivar _fn_num: Number of false negatives.
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
self._correct = set()
|
||||
self._guessed = set()
|
||||
self._tp = set()
|
||||
self._fp = set()
|
||||
self._fn = set()
|
||||
self._max_tp = kwargs.get("max_tp_examples", 100)
|
||||
self._max_fp = kwargs.get("max_fp_examples", 100)
|
||||
self._max_fn = kwargs.get("max_fn_examples", 100)
|
||||
self._chunk_label = kwargs.get("chunk_label", ".*")
|
||||
self._tp_num = 0
|
||||
self._fp_num = 0
|
||||
self._fn_num = 0
|
||||
self._count = 0
|
||||
self._tags_correct = 0.0
|
||||
self._tags_total = 0.0
|
||||
|
||||
self._measuresNeedUpdate = False
|
||||
|
||||
def _updateMeasures(self):
|
||||
if self._measuresNeedUpdate:
|
||||
self._tp = self._guessed & self._correct
|
||||
self._fn = self._correct - self._guessed
|
||||
self._fp = self._guessed - self._correct
|
||||
self._tp_num = len(self._tp)
|
||||
self._fp_num = len(self._fp)
|
||||
self._fn_num = len(self._fn)
|
||||
self._measuresNeedUpdate = False
|
||||
|
||||
def score(self, correct, guessed):
|
||||
"""
|
||||
Given a correctly chunked sentence, score another chunked
|
||||
version of the same sentence.
|
||||
|
||||
:type correct: chunk structure
|
||||
:param correct: The known-correct ("gold standard") chunked
|
||||
sentence.
|
||||
:type guessed: chunk structure
|
||||
:param guessed: The chunked sentence to be scored.
|
||||
"""
|
||||
self._correct |= _chunksets(correct, self._count, self._chunk_label)
|
||||
self._guessed |= _chunksets(guessed, self._count, self._chunk_label)
|
||||
self._count += 1
|
||||
self._measuresNeedUpdate = True
|
||||
# Keep track of per-tag accuracy (if possible)
|
||||
try:
|
||||
correct_tags = tree2conlltags(correct)
|
||||
guessed_tags = tree2conlltags(guessed)
|
||||
except ValueError:
|
||||
# This exception case is for nested chunk structures,
|
||||
# where tree2conlltags will fail with a ValueError: "Tree
|
||||
# is too deeply nested to be printed in CoNLL format."
|
||||
correct_tags = guessed_tags = ()
|
||||
self._tags_total += len(correct_tags)
|
||||
self._tags_correct += sum(
|
||||
1 for (t, g) in zip(guessed_tags, correct_tags) if t == g
|
||||
)
|
||||
|
||||
def accuracy(self):
|
||||
"""
|
||||
Return the overall tag-based accuracy for all text that have
|
||||
been scored by this ``ChunkScore``, using the IOB (conll2000)
|
||||
tag encoding.
|
||||
|
||||
:rtype: float
|
||||
"""
|
||||
if self._tags_total == 0:
|
||||
return 1
|
||||
return self._tags_correct / self._tags_total
|
||||
|
||||
def precision(self):
|
||||
"""
|
||||
Return the overall precision for all texts that have been
|
||||
scored by this ``ChunkScore``.
|
||||
|
||||
:rtype: float
|
||||
"""
|
||||
self._updateMeasures()
|
||||
div = self._tp_num + self._fp_num
|
||||
if div == 0:
|
||||
return 0
|
||||
else:
|
||||
return self._tp_num / div
|
||||
|
||||
def recall(self):
|
||||
"""
|
||||
Return the overall recall for all texts that have been
|
||||
scored by this ``ChunkScore``.
|
||||
|
||||
:rtype: float
|
||||
"""
|
||||
self._updateMeasures()
|
||||
div = self._tp_num + self._fn_num
|
||||
if div == 0:
|
||||
return 0
|
||||
else:
|
||||
return self._tp_num / div
|
||||
|
||||
def f_measure(self, alpha=0.5):
    """
    Return the overall F measure for all texts that have been
    scored by this ``ChunkScore``.

    :param alpha: the relative weighting of precision and recall.
        Larger alpha biases the score towards the precision value,
        while smaller alpha biases the score towards the recall
        value. ``alpha`` should have a value in the range [0,1].
    :type alpha: float
    :rtype: float
    """
    self._updateMeasures()
    p = self.precision()
    r = self.recall()
    # At the boundaries the weighted harmonic mean degenerates to a
    # single component.  Handle these cases explicitly so that a zero
    # in the *unused* component neither forces the score to 0 nor
    # triggers a division by zero below (this resolves the former
    # "what if alpha is 0 or 1?" TODO).
    if alpha == 0:
        return r
    if alpha == 1:
        return p
    if p == 0 or r == 0:
        return 0.0
    return 1 / (alpha / p + (1 - alpha) / r)
|
||||
|
||||
def missed(self):
    """
    Return the chunks which were included in the correct chunk
    structures, but not in the guessed chunk structures, listed
    in input order.

    :rtype: list of chunks
    """
    self._updateMeasures()
    # Each false-negative entry is ((sentence, position), chunk);
    # drop the position key and keep only the chunk itself.
    return [chunk for (_position, chunk) in self._fn]
|
||||
|
||||
def incorrect(self):
    """
    Return the chunks which were included in the guessed chunk
    structures, but not in the correct chunk structures, listed
    in input order.

    :rtype: list of chunks
    """
    self._updateMeasures()
    # Each false-positive entry is ((sentence, position), chunk);
    # drop the position key and keep only the chunk itself.
    return [chunk for (_position, chunk) in self._fp]
|
||||
|
||||
def correct(self):
    """
    Return the chunks which were included in the correct
    chunk structures, listed in input order.

    :rtype: list of chunks
    """
    # Entries are ((sentence, position), chunk); return the chunks only.
    return [chunk for (_position, chunk) in self._correct]
|
||||
|
||||
def guessed(self):
    """
    Return the chunks which were included in the guessed
    chunk structures, listed in input order.

    :rtype: list of chunks
    """
    # Entries are ((sentence, position), chunk); return the chunks only.
    return [chunk for (_position, chunk) in self._guessed]
|
||||
|
||||
def __len__(self):
    """Return the number of chunks in the gold standard (hits plus misses)."""
    self._updateMeasures()
    true_positives = self._tp_num
    false_negatives = self._fn_num
    return true_positives + false_negatives
|
||||
|
||||
def __repr__(self):
    """
    Return a concise representation of this ``ChunkScoring``.

    :rtype: str
    """
    return f"<ChunkScoring of {len(self)!r} chunks>"
|
||||
|
||||
def __str__(self):
    """
    Return a verbose representation of this ``ChunkScoring``.
    This representation includes the precision, recall, and
    f-measure scores. For other information about the score,
    use the accessor methods (e.g., ``missed()`` and ``incorrect()``).

    :rtype: str
    """
    # Each score is reported as a percentage with one decimal place,
    # right-aligned in a 5-character field.
    return (
        "ChunkParse score:\n"
        + f" IOB Accuracy: {self.accuracy() * 100:5.1f}%\n"
        + f" Precision: {self.precision() * 100:5.1f}%\n"
        + f" Recall: {self.recall() * 100:5.1f}%\n"
        + f" F-Measure: {self.f_measure() * 100:5.1f}%"
    )
|
||||
|
||||
|
||||
# extract chunks, and assign unique id, the absolute position of
|
||||
# the first word of the chunk
|
||||
def _chunksets(t, count, chunk_label):
    """Return the set of chunks in ``t`` whose label matches ``chunk_label``.

    Each chunk is paired with a unique id ``(count, pos)``, where ``pos``
    is the absolute position of the chunk's first word in the sentence,
    so identical chunks at different positions remain distinct.
    """
    found = set()
    pos = 0
    for child in t:
        if isinstance(child, Tree):
            if re.match(chunk_label, child.label()):
                # freeze() makes the subtree hashable so it can live in a set.
                found.add(((count, pos), child.freeze()))
            pos += len(child.leaves())
        else:
            # A bare token advances the position by one word.
            pos += 1
    return found
|
||||
|
||||
|
||||
def tagstr2tree(
    s, chunk_label="NP", root_label="S", sep="/", source_tagset=None, target_tagset=None
):
    """
    Divide a string of bracketted tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``). Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``. Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :rtype: Tree
    """
    # Tokens are single brackets or maximal runs of non-bracket,
    # non-whitespace characters.
    token_re = re.compile(r"\[|\]|[^\[\]\s]+")

    root = Tree(root_label, [])
    stack = [root]
    for match in token_re.finditer(s):
        text = match.group()
        if text == "[":
            # Chunks may not nest: an open bracket is only legal at top level.
            if len(stack) != 1:
                raise ValueError(f"Unexpected [ at char {match.start():d}")
            chunk = Tree(chunk_label, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text == "]":
            # A close bracket is only legal inside an open chunk.
            if len(stack) != 2:
                raise ValueError(f"Unexpected ] at char {match.start():d}")
            stack.pop()
        elif sep is None:
            # No separator: store the raw token.
            stack[-1].append(text)
        else:
            word, tag = str2tuple(text, sep)
            if source_tagset and target_tagset:
                tag = map_tag(source_tagset, target_tagset, tag)
            stack[-1].append((word, tag))

    if len(stack) != 1:
        raise ValueError(f"Expected ] at char {len(s):d}")
    return root
|
||||
|
||||
|
||||
### CONLL
|
||||
|
||||
_LINE_RE = re.compile(r"(\S+)\s+(\S+)\s+([IOB])-?(\S+)?")
|
||||
|
||||
|
||||
def conllstr2tree(s, chunk_types=("NP", "PP", "VP"), root_label="S"):
    """
    Return a chunk structure for a single sentence
    encoded in the given CONLL 2000 style string.
    This function converts a CoNLL IOB string into a tree.
    It uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: Tree
    """
    root = Tree(root_label, [])
    stack = [root]

    for lineno, line in enumerate(s.split("\n")):
        if not line.strip():
            continue

        # Each non-blank line is "word tag IOB-tag".
        match = _LINE_RE.match(line)
        if match is None:
            raise ValueError(f"Error on line {lineno:d}")
        word, tag, state, chunk_type = match.groups()

        # Chunk types we are not interested in are treated as "outside".
        if chunk_types is not None and chunk_type not in chunk_types:
            state = "O"

        # A "B" or "O" tag closes any open chunk, as does an "I" tag
        # whose type disagrees with the chunk currently being built.
        mismatched_inside = state == "I" and chunk_type != stack[-1].label()
        if state in "BO" or mismatched_inside:
            if len(stack) == 2:
                stack.pop()

        # A "B" tag (or a mismatched "I") opens a new chunk.
        if state == "B" or mismatched_inside:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        # The token itself goes into whichever node is currently open.
        stack[-1].append((word, tag))

    return root
|
||||
|
||||
|
||||
def tree2conlltags(t):
    """
    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
    Convert a tree to the CoNLL IOB tag format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    """
    tags = []
    for child in t:
        try:
            label = child.label()
        except AttributeError:
            # A bare (word, tag) token sits outside any chunk.
            tags.append((child[0], child[1], "O"))
            continue
        # First token of a chunk is "B-", the rest are "I-".
        iob_prefix = "B-"
        for token in child:
            if isinstance(token, Tree):
                raise ValueError(
                    "Tree is too deeply nested to be printed in CoNLL format"
                )
            tags.append((token[0], token[1], iob_prefix + label))
            iob_prefix = "I-"
    return tags
|
||||
|
||||
|
||||
def conlltags2tree(
    sentence, chunk_types=("NP", "PP", "VP"), root_label="S", strict=False
):
    """
    Convert the CoNLL IOB format to a tree.

    :param sentence: iterable of ``(word, postag, chunktag)`` triples.
    :param root_label: label for the root node of the returned tree.
    :param strict: if true, raise ValueError on malformed tag sequences
        instead of silently repairing them.
    :rtype: Tree
    """
    tree = Tree(root_label, [])
    for word, postag, chunktag in sentence:
        token = (word, postag)
        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            # Lenient mode: treat a missing tag as O.
            tree.append(token)
        elif chunktag.startswith("B-"):
            tree.append(Tree(chunktag[2:], [token]))
        elif chunktag.startswith("I-"):
            # An I- tag continues the chunk only if one of the matching
            # type is currently open at the end of the tree.
            continues_chunk = (
                len(tree) > 0
                and isinstance(tree[-1], Tree)
                and tree[-1].label() == chunktag[2:]
            )
            if continues_chunk:
                tree[-1].append(token)
            elif strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Lenient mode: treat a dangling I- as B-.
                tree.append(Tree(chunktag[2:], [token]))
        elif chunktag == "O":
            tree.append(token)
        else:
            raise ValueError(f"Bad conll tag {chunktag!r}")
    return tree
|
||||
|
||||
|
||||
def tree2conllstr(t):
    """
    Return a multiline string where each line contains a word, tag and IOB tag.
    Convert a tree to the CoNLL IOB string format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: str
    """
    # One "word tag IOB" line per token.
    return "\n".join(" ".join(token) for token in tree2conlltags(t))
|
||||
|
||||
|
||||
### IEER
|
||||
|
||||
_IEER_DOC_RE = re.compile(
|
||||
r"<DOC>\s*"
|
||||
r"(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?"
|
||||
r"(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?"
|
||||
r"(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?"
|
||||
r"<BODY>\s*"
|
||||
r"(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?"
|
||||
r"<TEXT>(?P<text>.*?)</TEXT>\s*"
|
||||
r"</BODY>\s*</DOC>\s*",
|
||||
re.DOTALL,
|
||||
)
|
||||
|
||||
_IEER_TYPE_RE = re.compile(r'<b_\w+\s+[^>]*?type="(?P<type>\w+)"')
|
||||
|
||||
|
||||
def _ieer_read_text(s, root_label):
    """
    Parse IEER-annotated text ``s`` into a Tree rooted at ``root_label``.

    ``<b_TYPE ...>`` / ``<e_TYPE>`` tags open and close a chunk of the
    named entity type; every other whitespace-separated token becomes
    a leaf of the currently open node.

    :return: a Tree, or the empty list if ``s`` is None (e.g. a
        document with no headline).
    """
    stack = [Tree(root_label, [])]
    # s will be None if there is no headline in the text;
    # return the empty list in place of a Tree.
    if s is None:
        return []
    for piece_m in re.finditer(r"<[^>]+>|[^\s<]+", s):
        piece = piece_m.group()
        try:
            if piece.startswith("<b_"):
                m = _IEER_TYPE_RE.match(piece)
                if m is None:
                    # Fix: the original printed a debug line ("XXXX") and
                    # then crashed with AttributeError on m.group(); raise
                    # ValueError instead so the handler below reports it
                    # through the normal "Bad IEER string" path.
                    raise ValueError(f"Unparseable IEER type tag: {piece!r}")
                chunk = Tree(m.group("type"), [])
                stack[-1].append(chunk)
                stack.append(chunk)
            elif piece.startswith("<e_"):
                # Close tag: pop the open chunk (IndexError here is
                # converted to ValueError below).
                stack.pop()
            else:
                stack[-1].append(piece)
        except (IndexError, ValueError) as e:
            raise ValueError(
                f"Bad IEER string (error at character {piece_m.start():d})"
            ) from e
    if len(stack) != 1:
        raise ValueError("Bad IEER string")
    return stack[0]
|
||||
|
||||
|
||||
def ieerstr2tree(
    s,
    chunk_types=[
        "LOCATION",
        "ORGANIZATION",
        "PERSON",
        "DURATION",
        "DATE",
        "CARDINAL",
        "PERCENT",
        "MONEY",
        "MEASURE",
    ],
    root_label="S",
):
    """
    Return a chunk structure containing the chunked tagged text that is
    encoded in the given IEER style string.
    Convert a string of chunked tagged text in the IEER named
    entity format into a chunk structure. Chunks are of several
    types, LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL,
    PERCENT, MONEY, and MEASURE.

    :rtype: Tree
    """
    # Try looking for a single document. If that doesn't work, then just
    # treat everything as if it was within the <TEXT>...</TEXT>.
    doc = _IEER_DOC_RE.match(s)
    if not doc:
        return _ieer_read_text(s, root_label)
    return {
        "text": _ieer_read_text(doc.group("text"), root_label),
        "docno": doc.group("docno"),
        "doctype": doc.group("doctype"),
        "date_time": doc.group("date_time"),
        # The headline is parsed too (not returned raw), because
        # we want to capture named entities in the headline as well.
        "headline": _ieer_read_text(doc.group("headline"), root_label),
    }
|
||||
|
||||
|
||||
def demo():
    """Demonstrate the chunk-string conversion functions on a small
    bracketed example and a CoNLL-format example."""
    s = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
    import nltk

    # Bracketed tagged text -> Tree.
    t = nltk.chunk.tagstr2tree(s, chunk_label="NP")
    t.pprint()
    print()

    s = """
These DT B-NP
research NN I-NP
protocols NNS I-NP
offer VBP B-VP
to TO B-PP
the DT B-NP
patient NN I-NP
not RB O
only RB O
the DT B-NP
very RB I-NP
best JJS I-NP
therapy NN I-NP
which WDT B-NP
we PRP B-NP
have VBP B-VP
established VBN I-VP
today NN B-NP
but CC B-NP
also RB I-NP
the DT B-NP
hope NN I-NP
of IN B-PP
something NN B-NP
still RB B-ADJP
better JJR I-ADJP
. . O
"""

    # CoNLL IOB text -> Tree, keeping only NP and PP chunks.
    conll_tree = conllstr2tree(s, chunk_types=("NP", "PP"))
    conll_tree.pprint()

    # Demonstrate CoNLL output
    print("CoNLL output:")
    print(nltk.chunk.tree2conllstr(conll_tree))
    print()
|
||||
101
backend/venv/Lib/site-packages/nltk/classify/__init__.py
Normal file
101
backend/venv/Lib/site-packages/nltk/classify/__init__.py
Normal file
@@ -0,0 +1,101 @@
|
||||
# Natural Language Toolkit: Classifiers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Classes and interfaces for labeling tokens with category labels (or
|
||||
"class labels"). Typically, labels are represented with strings
|
||||
(such as ``'health'`` or ``'sports'``). Classifiers can be used to
|
||||
perform a wide range of classification tasks. For example,
|
||||
classifiers can be used...
|
||||
|
||||
- to classify documents by topic
|
||||
- to classify ambiguous words by which word sense is intended
|
||||
- to classify acoustic signals by which phoneme they represent
|
||||
- to classify sentences by their author
|
||||
|
||||
Features
|
||||
========
|
||||
In order to decide which category label is appropriate for a given
|
||||
token, classifiers examine one or more 'features' of the token. These
|
||||
"features" are typically chosen by hand, and indicate which aspects
|
||||
of the token are relevant to the classification decision. For
|
||||
example, a document classifier might use a separate feature for each
|
||||
word, recording how often that word occurred in the document.
|
||||
|
||||
Featuresets
|
||||
===========
|
||||
The features describing a token are encoded using a "featureset",
|
||||
which is a dictionary that maps from "feature names" to "feature
|
||||
values". Feature names are unique strings that indicate what aspect
|
||||
of the token is encoded by the feature. Examples include
|
||||
``'prevword'``, for a feature whose value is the previous word; and
|
||||
``'contains-word(library)'`` for a feature that is true when a document
|
||||
contains the word ``'library'``. Feature values are typically
|
||||
booleans, numbers, or strings, depending on which feature they
|
||||
describe.
|
||||
|
||||
Featuresets are typically constructed using a "feature detector"
|
||||
(also known as a "feature extractor"). A feature detector is a
|
||||
function that takes a token (and sometimes information about its
|
||||
context) as its input, and returns a featureset describing that token.
|
||||
For example, the following feature detector converts a document
|
||||
(stored as a list of words) to a featureset describing the set of
|
||||
words included in the document:
|
||||
|
||||
>>> # Define a feature detector function.
|
||||
>>> def document_features(document):
|
||||
... return dict([('contains-word(%s)' % w, True) for w in document])
|
||||
|
||||
Feature detectors are typically applied to each token before it is fed
|
||||
to the classifier:
|
||||
|
||||
>>> # Classify each Gutenberg document.
|
||||
>>> from nltk.corpus import gutenberg
|
||||
>>> for fileid in gutenberg.fileids(): # doctest: +SKIP
|
||||
... doc = gutenberg.words(fileid) # doctest: +SKIP
|
||||
... print(fileid, classifier.classify(document_features(doc))) # doctest: +SKIP
|
||||
|
||||
The parameters that a feature detector expects will vary, depending on
|
||||
the task and the needs of the feature detector. For example, a
|
||||
feature detector for word sense disambiguation (WSD) might take as its
|
||||
input a sentence, and the index of a word that should be classified,
|
||||
and return a featureset for that word. The following feature detector
|
||||
for WSD includes features describing the left and right contexts of
|
||||
the target word:
|
||||
|
||||
>>> def wsd_features(sentence, index):
|
||||
... featureset = {}
|
||||
... for i in range(max(0, index-3), index):
|
||||
... featureset['left-context(%s)' % sentence[i]] = True
|
||||
... for i in range(index, max(index+3, len(sentence))):
|
||||
... featureset['right-context(%s)' % sentence[i]] = True
|
||||
... return featureset
|
||||
|
||||
Training Classifiers
|
||||
====================
|
||||
Most classifiers are built by training them on a list of hand-labeled
|
||||
examples, known as the "training set". Training sets are represented
|
||||
as lists of ``(featuredict, label)`` tuples.
|
||||
"""
|
||||
|
||||
from nltk.classify.api import ClassifierI, MultiClassifierI
|
||||
from nltk.classify.decisiontree import DecisionTreeClassifier
|
||||
from nltk.classify.maxent import (
|
||||
BinaryMaxentFeatureEncoding,
|
||||
ConditionalExponentialClassifier,
|
||||
MaxentClassifier,
|
||||
TypedMaxentFeatureEncoding,
|
||||
)
|
||||
from nltk.classify.megam import call_megam, config_megam
|
||||
from nltk.classify.naivebayes import NaiveBayesClassifier
|
||||
from nltk.classify.positivenaivebayes import PositiveNaiveBayesClassifier
|
||||
from nltk.classify.rte_classify import RTEFeatureExtractor, rte_classifier, rte_features
|
||||
from nltk.classify.scikitlearn import SklearnClassifier
|
||||
from nltk.classify.senna import Senna
|
||||
from nltk.classify.textcat import TextCat
|
||||
from nltk.classify.util import accuracy, apply_features, log_likelihood
|
||||
from nltk.classify.weka import WekaClassifier, config_weka
|
||||
195
backend/venv/Lib/site-packages/nltk/classify/api.py
Normal file
195
backend/venv/Lib/site-packages/nltk/classify/api.py
Normal file
@@ -0,0 +1,195 @@
|
||||
# Natural Language Toolkit: Classifier Interface
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com> (minor additions)
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Interfaces for labeling tokens with category labels (or "class labels").
|
||||
|
||||
``ClassifierI`` is a standard interface for "single-category
|
||||
classification", in which the set of categories is known, the number
|
||||
of categories is finite, and each text belongs to exactly one
|
||||
category.
|
||||
|
||||
``MultiClassifierI`` is a standard interface for "multi-category
|
||||
classification", which is like single-category classification except
|
||||
that each text belongs to zero or more categories.
|
||||
"""
|
||||
from nltk.internals import overridden
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
# { Classification Interfaces
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class ClassifierI:
    """
    A processing interface for labeling tokens with a single category
    label (or "class"). Labels are typically strs or
    ints, but can be any immutable type. The set of labels
    that the classifier chooses from must be fixed and finite.

    Subclasses must define:
      - ``labels()``
      - either ``classify()`` or ``classify_many()`` (or both)

    Subclasses may define:
      - either ``prob_classify()`` or ``prob_classify_many()`` (or both)
    """

    def labels(self):
        """
        :return: the list of category labels used by this classifier.
        :rtype: list of (immutable)
        """
        raise NotImplementedError()

    def classify(self, featureset):
        """
        :return: the most appropriate label for the given featureset.
        :rtype: label
        """
        # Delegate to the batch method when the subclass supplied one.
        if not overridden(self.classify_many):
            raise NotImplementedError()
        return self.classify_many([featureset])[0]

    def prob_classify(self, featureset):
        """
        :return: a probability distribution over labels for the given
            featureset.
        :rtype: ProbDistI
        """
        # Delegate to the batch method when the subclass supplied one.
        if not overridden(self.prob_classify_many):
            raise NotImplementedError()
        return self.prob_classify_many([featureset])[0]

    def classify_many(self, featuresets):
        """
        Apply ``self.classify()`` to each element of ``featuresets``. I.e.:

            return [self.classify(fs) for fs in featuresets]

        :rtype: list(label)
        """
        return [self.classify(fs) for fs in featuresets]

    def prob_classify_many(self, featuresets):
        """
        Apply ``self.prob_classify()`` to each element of ``featuresets``. I.e.:

            return [self.prob_classify(fs) for fs in featuresets]

        :rtype: list(ProbDistI)
        """
        return [self.prob_classify(fs) for fs in featuresets]
|
||||
|
||||
|
||||
class MultiClassifierI:
    """
    A processing interface for labeling tokens with zero or more
    category labels (or "labels"). Labels are typically strs
    or ints, but can be any immutable type. The set of labels
    that the multi-classifier chooses from must be fixed and finite.

    Subclasses must define:
      - ``labels()``
      - either ``classify()`` or ``classify_many()`` (or both)

    Subclasses may define:
      - either ``prob_classify()`` or ``prob_classify_many()`` (or both)
    """

    def labels(self):
        """
        :return: the list of category labels used by this classifier.
        :rtype: list of (immutable)
        """
        raise NotImplementedError()

    def classify(self, featureset):
        """
        :return: the most appropriate set of labels for the given featureset.
        :rtype: set(label)
        """
        # Delegate to the batch method when the subclass supplied one.
        if not overridden(self.classify_many):
            raise NotImplementedError()
        return self.classify_many([featureset])[0]

    def prob_classify(self, featureset):
        """
        :return: a probability distribution over sets of labels for the
            given featureset.
        :rtype: ProbDistI
        """
        # Delegate to the batch method when the subclass supplied one.
        if not overridden(self.prob_classify_many):
            raise NotImplementedError()
        return self.prob_classify_many([featureset])[0]

    def classify_many(self, featuresets):
        """
        Apply ``self.classify()`` to each element of ``featuresets``. I.e.:

            return [self.classify(fs) for fs in featuresets]

        :rtype: list(set(label))
        """
        return [self.classify(fs) for fs in featuresets]

    def prob_classify_many(self, featuresets):
        """
        Apply ``self.prob_classify()`` to each element of ``featuresets``. I.e.:

            return [self.prob_classify(fs) for fs in featuresets]

        :rtype: list(ProbDistI)
        """
        return [self.prob_classify(fs) for fs in featuresets]
|
||||
|
||||
|
||||
# # [XX] IN PROGRESS:
|
||||
# class SequenceClassifierI:
|
||||
# """
|
||||
# A processing interface for labeling sequences of tokens with a
|
||||
# single category label (or "class"). Labels are typically
|
||||
# strs or ints, but can be any immutable type. The set
|
||||
# of labels that the classifier chooses from must be fixed and
|
||||
# finite.
|
||||
# """
|
||||
# def labels(self):
|
||||
# """
|
||||
# :return: the list of category labels used by this classifier.
|
||||
# :rtype: list of (immutable)
|
||||
# """
|
||||
# raise NotImplementedError()
|
||||
|
||||
# def prob_classify(self, featureset):
|
||||
# """
|
||||
# Return a probability distribution over labels for the given
|
||||
# featureset.
|
||||
|
||||
# If ``featureset`` is a list of featuresets, then return a
|
||||
# corresponding list containing the probability distribution
|
||||
# over labels for each of the given featuresets, where the
|
||||
# *i*\ th element of this list is the most appropriate label for
|
||||
# the *i*\ th element of ``featuresets``.
|
||||
# """
|
||||
# raise NotImplementedError()
|
||||
|
||||
# def classify(self, featureset):
|
||||
# """
|
||||
# Return the most appropriate label for the given featureset.
|
||||
|
||||
# If ``featureset`` is a list of featuresets, then return a
|
||||
# corresponding list containing the most appropriate label for
|
||||
# each of the given featuresets, where the *i*\ th element of
|
||||
# this list is the most appropriate label for the *i*\ th element
|
||||
# of ``featuresets``.
|
||||
# """
|
||||
# raise NotImplementedError()
|
||||
349
backend/venv/Lib/site-packages/nltk/classify/decisiontree.py
Normal file
349
backend/venv/Lib/site-packages/nltk/classify/decisiontree.py
Normal file
@@ -0,0 +1,349 @@
|
||||
# Natural Language Toolkit: Decision Tree Classifiers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A classifier model that decides which label to assign to a token on
|
||||
the basis of a tree structure, where branches correspond to conditions
|
||||
on feature values, and leaves correspond to label assignments.
|
||||
"""
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
from nltk.classify.api import ClassifierI
|
||||
from nltk.probability import FreqDist, MLEProbDist, entropy
|
||||
|
||||
|
||||
class DecisionTreeClassifier(ClassifierI):
|
||||
def __init__(self, label, feature_name=None, decisions=None, default=None):
    """
    :param label: The most likely label for tokens that reach
        this node in the decision tree. If this decision tree
        has no children, then this label will be assigned to
        any token that reaches this decision tree.
    :param feature_name: The name of the feature that this
        decision tree selects for.
    :param decisions: A dictionary mapping from feature values
        for the feature identified by ``feature_name`` to
        child decision trees.
    :param default: The child that will be used if the value of
        feature ``feature_name`` does not match any of the keys in
        ``decisions``. This is used when constructing binary
        decision trees.
    """
    self._label = label  # majority label at this node
    self._fname = feature_name  # None marks a leaf node
    self._decisions = decisions  # feature value -> child tree
    self._default = default  # fallback child (binary trees only)
|
||||
|
||||
def labels(self):
    """Return the list of all labels reachable from this node:
    this node's own label plus those of every descendant,
    without duplicates."""
    found = {self._label}
    if self._decisions is not None:
        for subtree in self._decisions.values():
            found.update(subtree.labels())
    if self._default is not None:
        found.update(self._default.labels())
    return list(found)
|
||||
|
||||
def classify(self, featureset):
    """Label ``featureset`` by walking the tree: at each node, follow
    the branch matching the node's feature value, falling back to the
    default child or this node's own label when no branch matches."""
    # Leaf node: nothing left to decide.
    if self._fname is None:
        return self._label

    fval = featureset.get(self._fname)
    if fval in self._decisions:
        return self._decisions[fval].classify(featureset)
    if self._default is not None:
        return self._default.classify(featureset)
    return self._label
|
||||
|
||||
def error(self, labeled_featuresets):
    """Return the fraction of ``labeled_featuresets`` that this
    classifier labels incorrectly."""
    wrong = sum(
        1
        for featureset, label in labeled_featuresets
        if self.classify(featureset) != label
    )
    return wrong / len(labeled_featuresets)
|
||||
|
||||
def pretty_format(self, width=70, prefix="", depth=4):
    """
    Return a string containing a pretty-printed version of this
    decision tree. Each line in this string corresponds to a
    single decision tree node or leaf, and indentation is used to
    display the structure of the decision tree.

    :param width: Total column width of each output line.
    :param prefix: String prepended to every line (indentation used
        by the recursive calls).
    :param depth: Maximum number of tree levels to display.
    """
    # [xx] display default!!
    if self._fname is None:
        # Leaf: a run of dots padding out to the label column.
        n = width - len(prefix) - 15
        return "{}{} {}\n".format(prefix, "." * n, self._label)
    s = ""
    for i, (fval, result) in enumerate(
        sorted(
            self._decisions.items(),
            # Sort None/False/True after other values; otherwise
            # case-insensitive by string form.
            key=lambda item: (item[0] in [None, False, True], str(item[0]).lower()),
        )
    ):
        hdr = f"{prefix}{self._fname}={fval}? "
        n = width - 15 - len(hdr)
        s += "{}{} {}\n".format(hdr, "." * (n), result._label)
        # Recurse into non-leaf children while depth remains.
        if result._fname is not None and depth > 1:
            s += result.pretty_format(width, prefix + " ", depth - 1)
    if self._default is not None:
        n = width - len(prefix) - 21
        s += "{}else: {} {}\n".format(prefix, "." * n, self._default._label)
        if self._default._fname is not None and depth > 1:
            s += self._default.pretty_format(width, prefix + " ", depth - 1)
    return s
|
||||
|
||||
def pseudocode(self, prefix="", depth=4):
    """
    Return a string representation of this decision tree that
    expresses the decisions it makes as a nested set of pseudocode
    if statements.

    :param prefix: String prepended to every line (indentation used
        by the recursive calls).
    :param depth: Maximum number of tree levels to render.
    """
    # Leaf: just return this node's label.
    if self._fname is None:
        return f"{prefix}return {self._label!r}\n"
    s = ""
    for fval, result in sorted(
        self._decisions.items(),
        # Sort None/False/True after other values; otherwise
        # case-insensitive by string form.
        key=lambda item: (item[0] in [None, False, True], str(item[0]).lower()),
    ):
        s += f"{prefix}if {self._fname} == {fval!r}: "
        if result._fname is not None and depth > 1:
            s += "\n" + result.pseudocode(prefix + " ", depth - 1)
        else:
            s += f"return {result._label!r}\n"
    if self._default is not None:
        # With a single decision, phrase the default branch as "!="
        # on that value rather than a bare "else".
        if len(self._decisions) == 1:
            s += "{}if {} != {!r}: ".format(
                prefix, self._fname, list(self._decisions.keys())[0]
            )
        else:
            s += f"{prefix}else: "
        if self._default._fname is not None and depth > 1:
            s += "\n" + self._default.pseudocode(prefix + " ", depth - 1)
        else:
            s += f"return {self._default._label!r}\n"
    return s
|
||||
|
||||
def __str__(self):
    """Return the pretty-printed form of this tree (see ``pretty_format``)."""
    return self.pretty_format()
|
||||
|
||||
@staticmethod
def train(
    labeled_featuresets,
    entropy_cutoff=0.05,
    depth_cutoff=100,
    support_cutoff=10,
    binary=False,
    feature_values=None,
    verbose=False,
):
    """
    Build a decision tree from ``labeled_featuresets``.

    :param binary: If true, then treat all feature/value pairs as
        individual binary features, rather than using a single n-way
        branch for each feature.
    """
    # Gather every feature name that appears in the training data.
    feature_names = {
        fname
        for featureset, _label in labeled_featuresets
        for fname in featureset
    }

    # For binary trees we also need the set of observed values per feature.
    if binary and feature_values is None:
        feature_values = defaultdict(set)
        for featureset, _label in labeled_featuresets:
            for fname, fval in featureset.items():
                feature_values[fname].add(fval)

    # Build a one-level tree (a "stump") ...
    if binary:
        tree = DecisionTreeClassifier.best_binary_stump(
            feature_names, labeled_featuresets, feature_values, verbose
        )
    else:
        tree = DecisionTreeClassifier.best_stump(
            feature_names, labeled_featuresets, verbose
        )

    # ... then grow it recursively.
    tree.refine(
        labeled_featuresets,
        entropy_cutoff,
        depth_cutoff - 1,
        support_cutoff,
        binary,
        feature_values,
        verbose,
    )
    return tree
|
||||
|
||||
@staticmethod
|
||||
def leaf(labeled_featuresets):
|
||||
label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
|
||||
return DecisionTreeClassifier(label)
|
||||
|
||||
@staticmethod
|
||||
def stump(feature_name, labeled_featuresets):
|
||||
label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
|
||||
|
||||
# Find the best label for each value.
|
||||
freqs = defaultdict(FreqDist) # freq(label|value)
|
||||
for featureset, label in labeled_featuresets:
|
||||
feature_value = featureset.get(feature_name)
|
||||
freqs[feature_value][label] += 1
|
||||
|
||||
decisions = {val: DecisionTreeClassifier(freqs[val].max()) for val in freqs}
|
||||
return DecisionTreeClassifier(label, feature_name, decisions)
|
||||
|
||||
    def refine(
        self,
        labeled_featuresets,
        entropy_cutoff,
        depth_cutoff,
        support_cutoff,
        binary=False,
        feature_values=False,
        verbose=False,
    ):
        """
        Recursively replace impure branches of this tree with subtrees
        trained (via ``DecisionTreeClassifier.train``) on the examples that
        reach them.  Mutates ``self._decisions`` and ``self._default`` in
        place; returns ``None``.

        :param entropy_cutoff: only re-train a branch whose label entropy
            exceeds this value.
        :param depth_cutoff: remaining depth budget; no refinement at 0.
        :param support_cutoff: minimum number of examples required to split.
        """
        # Too few examples to justify further splitting.
        if len(labeled_featuresets) <= support_cutoff:
            return
        # Leaf node (no branching feature): nothing to refine.
        if self._fname is None:
            return
        # Depth budget exhausted.
        if depth_cutoff <= 0:
            return
        # Re-train the subtree for every branch whose examples are still
        # impure (label entropy above the cutoff).
        for fval in self._decisions:
            fval_featuresets = [
                (featureset, label)
                for (featureset, label) in labeled_featuresets
                if featureset.get(self._fname) == fval
            ]

            label_freqs = FreqDist(label for (featureset, label) in fval_featuresets)
            if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
                self._decisions[fval] = DecisionTreeClassifier.train(
                    fval_featuresets,
                    entropy_cutoff,
                    depth_cutoff,
                    support_cutoff,
                    binary,
                    feature_values,
                    verbose,
                )
        # Likewise refine the default branch on the examples whose feature
        # value matched none of the decision branches.
        if self._default is not None:
            default_featuresets = [
                (featureset, label)
                for (featureset, label) in labeled_featuresets
                if featureset.get(self._fname) not in self._decisions
            ]
            label_freqs = FreqDist(label for (featureset, label) in default_featuresets)
            if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
                self._default = DecisionTreeClassifier.train(
                    default_featuresets,
                    entropy_cutoff,
                    depth_cutoff,
                    support_cutoff,
                    binary,
                    feature_values,
                    verbose,
                )
|
||||
|
||||
@staticmethod
|
||||
def best_stump(feature_names, labeled_featuresets, verbose=False):
|
||||
best_stump = DecisionTreeClassifier.leaf(labeled_featuresets)
|
||||
best_error = best_stump.error(labeled_featuresets)
|
||||
for fname in feature_names:
|
||||
stump = DecisionTreeClassifier.stump(fname, labeled_featuresets)
|
||||
stump_error = stump.error(labeled_featuresets)
|
||||
if stump_error < best_error:
|
||||
best_error = stump_error
|
||||
best_stump = stump
|
||||
if verbose:
|
||||
print(
|
||||
"best stump for {:6d} toks uses {:20} err={:6.4f}".format(
|
||||
len(labeled_featuresets), best_stump._fname, best_error
|
||||
)
|
||||
)
|
||||
return best_stump
|
||||
|
||||
@staticmethod
|
||||
def binary_stump(feature_name, feature_value, labeled_featuresets):
|
||||
label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
|
||||
|
||||
# Find the best label for each value.
|
||||
pos_fdist = FreqDist()
|
||||
neg_fdist = FreqDist()
|
||||
for featureset, label in labeled_featuresets:
|
||||
if featureset.get(feature_name) == feature_value:
|
||||
pos_fdist[label] += 1
|
||||
else:
|
||||
neg_fdist[label] += 1
|
||||
|
||||
decisions = {}
|
||||
default = label
|
||||
# But hopefully we have observations!
|
||||
if pos_fdist.N() > 0:
|
||||
decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
|
||||
if neg_fdist.N() > 0:
|
||||
default = DecisionTreeClassifier(neg_fdist.max())
|
||||
|
||||
return DecisionTreeClassifier(label, feature_name, decisions, default)
|
||||
|
||||
@staticmethod
|
||||
def best_binary_stump(
|
||||
feature_names, labeled_featuresets, feature_values, verbose=False
|
||||
):
|
||||
best_stump = DecisionTreeClassifier.leaf(labeled_featuresets)
|
||||
best_error = best_stump.error(labeled_featuresets)
|
||||
for fname in feature_names:
|
||||
for fval in feature_values[fname]:
|
||||
stump = DecisionTreeClassifier.binary_stump(
|
||||
fname, fval, labeled_featuresets
|
||||
)
|
||||
stump_error = stump.error(labeled_featuresets)
|
||||
if stump_error < best_error:
|
||||
best_error = stump_error
|
||||
best_stump = stump
|
||||
if verbose:
|
||||
if best_stump._decisions:
|
||||
descr = "{}={}".format(
|
||||
best_stump._fname, list(best_stump._decisions.keys())[0]
|
||||
)
|
||||
else:
|
||||
descr = "(default)"
|
||||
print(
|
||||
"best stump for {:6d} toks uses {:20} err={:6.4f}".format(
|
||||
len(labeled_featuresets), descr, best_error
|
||||
)
|
||||
)
|
||||
return best_stump
|
||||
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Demo
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def f(x):
    """Named trainer wrapper used by the demo (binary, verbose training)."""
    classifier = DecisionTreeClassifier.train(x, binary=True, verbose=True)
    return classifier
|
||||
|
||||
|
||||
def demo():
    """Train a binary decision tree on the names corpus and print it."""
    from nltk.classify.util import binary_names_demo_features, names_demo

    # ``f`` wraps DecisionTreeClassifier.train with binary=True, verbose=True.
    classifier = names_demo(
        f, binary_names_demo_features  # DecisionTreeClassifier.train,
    )
    print(classifier.pretty_format(depth=7))
    print(classifier.pseudocode(depth=7))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the demonstration when executed as a script.
    demo()
|
||||
1631
backend/venv/Lib/site-packages/nltk/classify/maxent.py
Normal file
1631
backend/venv/Lib/site-packages/nltk/classify/maxent.py
Normal file
File diff suppressed because it is too large
Load Diff
184
backend/venv/Lib/site-packages/nltk/classify/megam.py
Normal file
184
backend/venv/Lib/site-packages/nltk/classify/megam.py
Normal file
@@ -0,0 +1,184 @@
|
||||
# Natural Language Toolkit: Interface to Megam Classifier
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A set of functions used to interface with the external megam_ maxent
|
||||
optimization package. Before megam can be used, you should tell NLTK where it
|
||||
can find the megam binary, using the ``config_megam()`` function. Typical
|
||||
usage:
|
||||
|
||||
>>> from nltk.classify import megam
|
||||
>>> megam.config_megam() # pass path to megam if not found in PATH # doctest: +SKIP
|
||||
[Found megam: ...]
|
||||
|
||||
Use with MaxentClassifier. Example below, see MaxentClassifier documentation
|
||||
for details.
|
||||
|
||||
nltk.classify.MaxentClassifier.train(corpus, 'megam')
|
||||
|
||||
.. _megam: https://www.umiacs.umd.edu/~hal/megam/index.html
|
||||
"""
|
||||
import subprocess
|
||||
|
||||
from nltk.internals import find_binary
|
||||
|
||||
try:
|
||||
import numpy
|
||||
except ImportError:
|
||||
numpy = None
|
||||
|
||||
######################################################################
|
||||
# { Configuration
|
||||
######################################################################
|
||||
|
||||
_megam_bin = None
|
||||
|
||||
|
||||
def config_megam(bin=None):
    """
    Configure NLTK's interface to the ``megam`` maxent optimization
    package.

    :param bin: The full path to the ``megam`` binary.  If not specified,
        then nltk will search the system for a ``megam`` binary; and if
        one is not found, it will raise a ``LookupError`` exception.
    :type bin: str
    """
    global _megam_bin
    # Locate the binary via the explicit path, the MEGAM environment
    # variable, or a PATH search over the known binary names.
    _megam_bin = find_binary(
        "megam",
        bin,
        env_vars=["MEGAM"],
        binary_names=["megam.opt", "megam", "megam_686", "megam_i686.opt"],
        url="https://www.umiacs.umd.edu/~hal/megam/index.html",
    )
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Megam Interface Functions
|
||||
######################################################################
|
||||
|
||||
|
||||
def write_megam_file(train_toks, encoding, stream, bernoulli=True, explicit=True):
    """
    Generate an input file for ``megam`` based on the given corpus of
    classified tokens.

    :type train_toks: list(tuple(dict, str))
    :param train_toks: Training data, represented as a list of
        pairs, the first member of which is a feature dictionary,
        and the second of which is a classification label.

    :type encoding: MaxentFeatureEncodingI
    :param encoding: A feature encoding, used to convert featuresets
        into feature vectors.  May optionally implement a cost() method
        in order to assign different costs to different class predictions.

    :type stream: stream
    :param stream: The stream to which the megam input file should be
        written.

    :param bernoulli: If true, then use the 'bernoulli' format.  I.e.,
        all joint features have binary values, and are listed iff they
        are true.  Otherwise, list feature values explicitly.  If
        ``bernoulli=False``, then you must call ``megam`` with the
        ``-fvals`` option.

    :param explicit: If true, then use the 'explicit' format.  I.e.,
        list the features that would fire for any of the possible
        labels, for each token.  If ``explicit=True``, then you must
        call ``megam`` with the ``-explicit`` option.
    """
    # Index every known label by its position in the encoding.
    labels = encoding.labels()
    labelnum = {}
    for index, known_label in enumerate(labels):
        labelnum[known_label] = index

    # The file contains one line per training instance.
    for featureset, label in train_toks:
        # Line prefix: either a ':'-separated per-label cost (weighted
        # multiclass case) or the index of the correct label.
        if hasattr(encoding, "cost"):
            costs = [str(encoding.cost(featureset, label, l)) for l in labels]
            stream.write(":".join(costs))
        else:
            stream.write("%d" % labelnum[label])

        if explicit:
            # Explicit format: one '#'-prefixed feature list per
            # candidate label.
            for candidate in labels:
                stream.write(" #")
                _write_megam_features(
                    encoding.encode(featureset, candidate), stream, bernoulli
                )
        else:
            # Implicit format: only the features that fire for this
            # instance's actual label.
            _write_megam_features(encoding.encode(featureset, label), stream, bernoulli)

        # End of the instance.
        stream.write("\n")
|
||||
|
||||
|
||||
def parse_megam_weights(s, features_count, explicit=True):
    """
    Given the stdout output generated by ``megam`` when training a
    model, return a ``numpy`` array containing the corresponding weight
    vector.  This function does not currently handle bias features.
    """
    if numpy is None:
        raise ValueError("This function requires that numpy be installed")
    assert explicit, "non-explicit not supported yet"
    # Start from an all-zero vector; each non-blank output line assigns
    # one "<feature-id> <weight>" pair.
    weights = numpy.zeros(features_count, "d")
    for line in s.strip().split("\n"):
        if not line.strip():
            continue
        fid, weight = line.split()
        weights[int(fid)] = float(weight)
    return weights
|
||||
|
||||
|
||||
def _write_megam_features(vector, stream, bernoulli):
|
||||
if not vector:
|
||||
raise ValueError(
|
||||
"MEGAM classifier requires the use of an " "always-on feature."
|
||||
)
|
||||
for fid, fval in vector:
|
||||
if bernoulli:
|
||||
if fval == 1:
|
||||
stream.write(" %s" % fid)
|
||||
elif fval != 0:
|
||||
raise ValueError(
|
||||
"If bernoulli=True, then all" "features must be binary."
|
||||
)
|
||||
else:
|
||||
stream.write(f" {fid} {fval}")
|
||||
|
||||
|
||||
def call_megam(args):
    """
    Call the ``megam`` binary with the given arguments.

    :param args: command-line arguments, as a list of strings.
    :raises TypeError: if *args* is a single string rather than a list.
    :raises OSError: if megam exits with a non-zero status.
    :return: megam's stdout, decoded to ``str``.
    """
    if isinstance(args, str):
        raise TypeError("args should be a list of strings")
    # Lazily locate the binary on first use.
    if _megam_bin is None:
        config_megam()

    # Run megam in a subprocess and capture its output.
    process = subprocess.Popen([_megam_bin] + args, stdout=subprocess.PIPE)
    stdout, stderr = process.communicate()

    # A non-zero exit status means training failed.
    if process.returncode != 0:
        print()
        print(stderr)
        raise OSError("megam command failed!")

    return stdout if isinstance(stdout, str) else stdout.decode("utf-8")
|
||||
260
backend/venv/Lib/site-packages/nltk/classify/naivebayes.py
Normal file
260
backend/venv/Lib/site-packages/nltk/classify/naivebayes.py
Normal file
@@ -0,0 +1,260 @@
|
||||
# Natural Language Toolkit: Naive Bayes Classifiers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A classifier based on the Naive Bayes algorithm. In order to find the
|
||||
probability for a label, this algorithm first uses the Bayes rule to
|
||||
express P(label|features) in terms of P(label) and P(features|label):
|
||||
|
||||
| P(label) * P(features|label)
|
||||
| P(label|features) = ------------------------------
|
||||
| P(features)
|
||||
|
||||
The algorithm then makes the 'naive' assumption that all features are
|
||||
independent, given the label:
|
||||
|
||||
| P(label) * P(f1|label) * ... * P(fn|label)
|
||||
| P(label|features) = --------------------------------------------
|
||||
| P(features)
|
||||
|
||||
Rather than computing P(features) explicitly, the algorithm just
|
||||
calculates the numerator for each label, and normalizes them so they
|
||||
sum to one:
|
||||
|
||||
| P(label) * P(f1|label) * ... * P(fn|label)
|
||||
| P(label|features) = --------------------------------------------
|
||||
| SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
|
||||
"""
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
from nltk.classify.api import ClassifierI
|
||||
from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist, sum_logs
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Naive Bayes Classifier
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class NaiveBayesClassifier(ClassifierI):
    """
    A Naive Bayes classifier.  Naive Bayes classifiers are
    parameterized by two probability distributions:

    - P(label) gives the probability that an input will receive each
      label, given no information about the input's features.

    - P(fname=fval|label) gives the probability that a given feature
      (fname) will receive a given value (fval), given that the
      label (label).

    If the classifier encounters an input with a feature that has
    never been seen with any label, then rather than assigning a
    probability of 0 to all labels, it will ignore that feature.

    The feature value 'None' is reserved for unseen feature values;
    you generally should not use 'None' as a feature value for one of
    your own features.
    """

    def __init__(self, label_probdist, feature_probdist):
        """
        :param label_probdist: P(label), the probability distribution
            over labels.  It is expressed as a ``ProbDistI`` whose
            samples are labels.  I.e., P(label) =
            ``label_probdist.prob(label)``.

        :param feature_probdist: P(fname=fval|label), the probability
            distribution for feature values, given labels.  It is
            expressed as a dictionary whose keys are ``(label, fname)``
            pairs and whose values are ``ProbDistI`` objects over feature
            values.  I.e., P(fname=fval|label) =
            ``feature_probdist[label,fname].prob(fval)``.  If a given
            ``(label,fname)`` is not a key in ``feature_probdist``, then
            it is assumed that the corresponding P(fname=fval|label)
            is 0 for all values of ``fval``.
        """
        self._label_probdist = label_probdist
        self._feature_probdist = feature_probdist
        # Cache the label list once; samples() order is preserved.
        self._labels = list(label_probdist.samples())

    def labels(self):
        """Return the list of labels this classifier can assign."""
        return self._labels

    def classify(self, featureset):
        """Return the single most probable label for *featureset*."""
        return self.prob_classify(featureset).max()

    def prob_classify(self, featureset):
        """Return a probability distribution over labels for *featureset*."""
        # Discard any feature names that we've never seen before.
        # Otherwise, we'll just assign a probability of 0 to
        # everything.
        featureset = featureset.copy()
        for fname in list(featureset.keys()):
            for label in self._labels:
                if (label, fname) in self._feature_probdist:
                    break
            else:
                # print('Ignoring unseen feature %s' % fname)
                del featureset[fname]

        # Find the log probability of each label, given the features.
        # Start with the log probability of the label itself.
        logprob = {}
        for label in self._labels:
            logprob[label] = self._label_probdist.logprob(label)

        # Then add in the log probability of features given labels.
        for label in self._labels:
            for fname, fval in featureset.items():
                if (label, fname) in self._feature_probdist:
                    feature_probs = self._feature_probdist[label, fname]
                    logprob[label] += feature_probs.logprob(fval)
                else:
                    # nb: This case will never come up if the
                    # classifier was created by
                    # NaiveBayesClassifier.train().
                    logprob[label] += sum_logs([])  # = -INF.

        return DictionaryProbDist(logprob, normalize=True, log=True)

    def show_most_informative_features(self, n=10):
        """Print the *n* most informative features and their label ratios."""
        # Determine the most relevant features, and display them.
        cpdist = self._feature_probdist
        print("Most Informative Features")

        for fname, fval in self.most_informative_features(n):

            def labelprob(l):
                return cpdist[l, fname].prob(fval)

            # Sorted with reverse=True over (-prob, label): labels end up
            # in ascending probability order, so labels[0] is the least
            # likely label for this feature and labels[-1] the most likely.
            labels = sorted(
                (l for l in self._labels if fval in cpdist[l, fname].samples()),
                key=lambda element: (-labelprob(element), element),
                reverse=True,
            )
            if len(labels) == 1:
                continue
            l0 = labels[0]
            l1 = labels[-1]
            # Guard against division by zero for the displayed ratio.
            if cpdist[l0, fname].prob(fval) == 0:
                ratio = "INF"
            else:
                ratio = "%8.1f" % (
                    cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval)
                )
            print(
                "%24s = %-14r %6s : %-6s = %s : 1.0"
                % (fname, fval, ("%s" % l1)[:6], ("%s" % l0)[:6], ratio)
            )

    def most_informative_features(self, n=100):
        """
        Return a list of the 'most informative' features used by this
        classifier.  For the purpose of this function, the
        informativeness of a feature ``(fname,fval)`` is equal to the
        highest value of P(fname=fval|label), for any label, divided by
        the lowest value of P(fname=fval|label), for any label:

        |  max[ P(fname=fval|label1) / P(fname=fval|label2) ]
        """
        # Memoized on first call; subsequent calls just slice the cache.
        if hasattr(self, "_most_informative_features"):
            return self._most_informative_features[:n]
        else:
            # The set of (fname, fval) pairs used by this classifier.
            features = set()
            # The max & min probability associated w/ each (fname, fval)
            # pair.  Maps (fname,fval) -> float.
            maxprob = defaultdict(float)
            minprob = defaultdict(lambda: 1.0)

            for (label, fname), probdist in self._feature_probdist.items():
                for fval in probdist.samples():
                    feature = (fname, fval)
                    features.add(feature)
                    p = probdist.prob(fval)
                    maxprob[feature] = max(p, maxprob[feature])
                    minprob[feature] = min(p, minprob[feature])
                    # A zero minimum would make the ratio infinite; drop it.
                    if minprob[feature] == 0:
                        features.discard(feature)

            # Convert features to a list, & sort it by how informative
            # features are (smallest min/max ratio first = most informative).
            self._most_informative_features = sorted(
                features,
                key=lambda feature_: (
                    minprob[feature_] / maxprob[feature_],
                    feature_[0],
                    feature_[1] in [None, False, True],
                    str(feature_[1]).lower(),
                ),
            )
            return self._most_informative_features[:n]

    @classmethod
    def train(cls, labeled_featuresets, estimator=ELEProbDist):
        """
        :param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples ``(featureset, label)``.
        """
        label_freqdist = FreqDist()
        feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count up how many times each feature value occurred, given
        # the label and featurename.
        for featureset, label in labeled_featuresets:
            label_freqdist[label] += 1
            for fname, fval in featureset.items():
                # Increment freq(fval|label, fname)
                feature_freqdist[label, fname][fval] += 1
                # Record that fname can take the value fval.
                feature_values[fname].add(fval)
                # Keep a list of all feature names.
                fnames.add(fname)

        # If a feature didn't have a value given for an instance, then
        # we assume that it gets the implicit value 'None.'  This loop
        # counts up the number of 'missing' feature values for each
        # (label,fname) pair, and increments the count of the fval
        # 'None' by that amount.
        for label in label_freqdist:
            num_samples = label_freqdist[label]
            for fname in fnames:
                count = feature_freqdist[label, fname].N()
                # Only add a None key when necessary, i.e. if there are
                # any samples with feature 'fname' missing.
                if num_samples - count > 0:
                    feature_freqdist[label, fname][None] += num_samples - count
                    feature_values[fname].add(None)

        # Create the P(label) distribution
        label_probdist = estimator(label_freqdist)

        # Create the P(fval|label, fname) distribution
        feature_probdist = {}
        for (label, fname), freqdist in feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[label, fname] = probdist

        return cls(label_probdist, feature_probdist)
|
||||
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Demo
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def demo():
    """Train a Naive Bayes classifier on the names corpus and report it."""
    from nltk.classify.util import names_demo

    classifier = names_demo(NaiveBayesClassifier.train)
    classifier.show_most_informative_features()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the demonstration when executed as a script.
    demo()
|
||||
@@ -0,0 +1,180 @@
|
||||
# Natural Language Toolkit: Positive Naive Bayes Classifier
|
||||
#
|
||||
# Copyright (C) 2012 NLTK Project
|
||||
# Author: Alessandro Presta <alessandro.presta@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A variant of the Naive Bayes Classifier that performs binary classification with
|
||||
partially-labeled training sets. In other words, assume we want to build a classifier
|
||||
that assigns each example to one of two complementary classes (e.g., male names and
|
||||
female names).
|
||||
If we have a training set with labeled examples for both classes, we can use a
|
||||
standard Naive Bayes Classifier. However, consider the case when we only have labeled
|
||||
examples for one of the classes, and other, unlabeled, examples.
|
||||
Then, assuming a prior distribution on the two labels, we can use the unlabeled set
|
||||
to estimate the frequencies of the various features.
|
||||
|
||||
Let the two possible labels be 1 and 0, and let's say we only have examples labeled 1
|
||||
and unlabeled examples. We are also given an estimate of P(1).
|
||||
|
||||
We compute P(feature|1) exactly as in the standard case.
|
||||
|
||||
To compute P(feature|0), we first estimate P(feature) from the unlabeled set (we are
|
||||
assuming that the unlabeled examples are drawn according to the given prior distribution)
|
||||
and then express the conditional probability as:
|
||||
|
||||
| P(feature) - P(feature|1) * P(1)
|
||||
| P(feature|0) = ----------------------------------
|
||||
| P(0)
|
||||
|
||||
Example:
|
||||
|
||||
>>> from nltk.classify import PositiveNaiveBayesClassifier
|
||||
|
||||
Some sentences about sports:
|
||||
|
||||
>>> sports_sentences = [ 'The team dominated the game',
|
||||
... 'They lost the ball',
|
||||
... 'The game was intense',
|
||||
... 'The goalkeeper catched the ball',
|
||||
... 'The other team controlled the ball' ]
|
||||
|
||||
Mixed topics, including sports:
|
||||
|
||||
>>> various_sentences = [ 'The President did not comment',
|
||||
... 'I lost the keys',
|
||||
... 'The team won the game',
|
||||
... 'Sara has two kids',
|
||||
... 'The ball went off the court',
|
||||
... 'They had the ball for the whole game',
|
||||
... 'The show is over' ]
|
||||
|
||||
The features of a sentence are simply the words it contains:
|
||||
|
||||
>>> def features(sentence):
|
||||
... words = sentence.lower().split()
|
||||
... return dict(('contains(%s)' % w, True) for w in words)
|
||||
|
||||
We use the sports sentences as positive examples, the mixed ones ad unlabeled examples:
|
||||
|
||||
>>> positive_featuresets = map(features, sports_sentences)
|
||||
>>> unlabeled_featuresets = map(features, various_sentences)
|
||||
>>> classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
|
||||
... unlabeled_featuresets)
|
||||
|
||||
Is the following sentence about sports?
|
||||
|
||||
>>> classifier.classify(features('The cat is on the table'))
|
||||
False
|
||||
|
||||
What about this one?
|
||||
|
||||
>>> classifier.classify(features('My team lost the game'))
|
||||
True
|
||||
"""
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
from nltk.classify.naivebayes import NaiveBayesClassifier
|
||||
from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Positive Naive Bayes Classifier
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class PositiveNaiveBayesClassifier(NaiveBayesClassifier):
    """
    A Naive Bayes classifier trained from positive and unlabeled examples
    only (labels ``True``/``False``), per the module docstring's derivation
    of P(feature|0) from P(feature) and P(feature|1).
    """

    @staticmethod
    def train(
        positive_featuresets,
        unlabeled_featuresets,
        positive_prob_prior=0.5,
        estimator=ELEProbDist,
    ):
        """
        :param positive_featuresets: An iterable of featuresets that are known as positive
            examples (i.e., their label is ``True``).

        :param unlabeled_featuresets: An iterable of featuresets whose label is unknown.

        :param positive_prob_prior: A prior estimate of the probability of the label
            ``True`` (default 0.5).
        """
        positive_feature_freqdist = defaultdict(FreqDist)
        unlabeled_feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count up how many times each feature value occurred in positive examples.
        num_positive_examples = 0
        for featureset in positive_featuresets:
            for fname, fval in featureset.items():
                positive_feature_freqdist[fname][fval] += 1
                feature_values[fname].add(fval)
                fnames.add(fname)
            num_positive_examples += 1

        # Count up how many times each feature value occurred in unlabeled examples.
        num_unlabeled_examples = 0
        for featureset in unlabeled_featuresets:
            for fname, fval in featureset.items():
                unlabeled_feature_freqdist[fname][fval] += 1
                feature_values[fname].add(fval)
                fnames.add(fname)
            num_unlabeled_examples += 1

        # If a feature didn't have a value given for an instance, then we assume that
        # it gets the implicit value 'None'.
        for fname in fnames:
            count = positive_feature_freqdist[fname].N()
            positive_feature_freqdist[fname][None] += num_positive_examples - count
            feature_values[fname].add(None)

        for fname in fnames:
            count = unlabeled_feature_freqdist[fname].N()
            unlabeled_feature_freqdist[fname][None] += num_unlabeled_examples - count
            feature_values[fname].add(None)

        negative_prob_prior = 1.0 - positive_prob_prior

        # Create the P(label) distribution.
        label_probdist = DictionaryProbDist(
            {True: positive_prob_prior, False: negative_prob_prior}
        )

        # Create the P(fval|label, fname) distribution.
        feature_probdist = {}
        for fname, freqdist in positive_feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[True, fname] = probdist

        # Derive the negative-class distribution from the unlabeled data:
        # P(feature|0) = (P(feature) - P(feature|1) * P(1)) / P(0).
        # Note: every fname seen anywhere is also a key of
        # positive_feature_freqdist (the None-fill loop above touched them
        # all), so feature_probdist[True, fname] always exists here.
        for fname, freqdist in unlabeled_feature_freqdist.items():
            global_probdist = estimator(freqdist, bins=len(feature_values[fname]))
            negative_feature_probs = {}
            for fval in feature_values[fname]:
                prob = (
                    global_probdist.prob(fval)
                    - positive_prob_prior * feature_probdist[True, fname].prob(fval)
                ) / negative_prob_prior
                # TODO: We need to add some kind of smoothing here, instead of
                # setting negative probabilities to zero and normalizing.
                negative_feature_probs[fval] = max(prob, 0.0)
            feature_probdist[False, fname] = DictionaryProbDist(
                negative_feature_probs, normalize=True
            )

        return PositiveNaiveBayesClassifier(label_probdist, feature_probdist)
|
||||
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Demo
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def demo():
    """Train on partially-labeled names data and report informative features."""
    from nltk.classify.util import partial_names_demo

    classifier = partial_names_demo(PositiveNaiveBayesClassifier.train)
    classifier.show_most_informative_features()
|
||||
183
backend/venv/Lib/site-packages/nltk/classify/rte_classify.py
Normal file
183
backend/venv/Lib/site-packages/nltk/classify/rte_classify.py
Normal file
@@ -0,0 +1,183 @@
|
||||
# Natural Language Toolkit: RTE Classifier
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Simple classifier for RTE corpus.
|
||||
|
||||
It calculates the overlap in words and named entities between text and
|
||||
hypothesis, and also whether there are words / named entities in the
|
||||
hypothesis which fail to occur in the text, since this is an indicator that
|
||||
the hypothesis is more informative than (i.e not entailed by) the text.
|
||||
|
||||
TO DO: better Named Entity classification
|
||||
TO DO: add lemmatization
|
||||
"""
|
||||
|
||||
from nltk.classify.maxent import MaxentClassifier
|
||||
from nltk.classify.util import accuracy
|
||||
from nltk.tokenize import RegexpTokenizer
|
||||
|
||||
|
||||
class RTEFeatureExtractor:
    """
    This builds a bag of words for both the text and the hypothesis after
    throwing away some stopwords, then calculates overlap and difference.
    """

    def __init__(self, rtepair, stop=True, use_lemmatize=False):
        """
        :param rtepair: a ``RTEPair`` from which features should be extracted
        :param stop: if ``True``, stopwords are thrown away.
        :type stop: bool
        :param use_lemmatize: if ``True``, compare WordNet verb base forms
            instead of raw tokens.
        """
        self.stop = stop
        # Small hand-picked stopword set (not the full corpus stopword list).
        self.stopwords = {
            "a",
            "the",
            "it",
            "they",
            "of",
            "in",
            "to",
            "is",
            "have",
            "are",
            "were",
            "and",
            "very",
            ".",
            ",",
        }

        # Negation markers; presumably consumed by downstream feature code.
        self.negwords = {"no", "not", "never", "failed", "rejected", "denied"}
        # Try to tokenize so that abbreviations, monetary amounts, email
        # addresses, URLs are single tokens.
        tokenizer = RegexpTokenizer(r"[\w.@:/]+|\w+|\$[\d.]+")

        # Get the set of word types for text and hypothesis
        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
        self.text_words = set(self.text_tokens)
        self.hyp_words = set(self.hyp_tokens)

        # Optionally replace both word sets with lemmatized versions.
        if use_lemmatize:
            self.text_words = {self._lemmatize(token) for token in self.text_tokens}
            self.hyp_words = {self._lemmatize(token) for token in self.hyp_tokens}

        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords

        # Word types shared by, or exclusive to, each side of the pair.
        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words

    def overlap(self, toktype, debug=False):
        """
        Compute the overlap between text and hypothesis.

        :param toktype: distinguish Named Entities from ordinary words
        :type toktype: 'ne' or 'word'
        """
        ne_overlap = {token for token in self._overlap if self._ne(token)}
        if toktype == "ne":
            if debug:
                print("ne overlap", ne_overlap)
            return ne_overlap
        elif toktype == "word":
            if debug:
                print("word overlap", self._overlap - ne_overlap)
            return self._overlap - ne_overlap
        else:
            raise ValueError("Type not recognized:'%s'" % toktype)

    def hyp_extra(self, toktype, debug=True):
        """
        Compute the extraneous material in the hypothesis.

        :param toktype: distinguish Named Entities from ordinary words
        :type toktype: 'ne' or 'word'
        """
        # NOTE(review): ``debug`` is accepted but never used in this method,
        # and its default (True) differs from ``overlap``'s (False); kept
        # as-is for API compatibility.
        ne_extra = {token for token in self._hyp_extra if self._ne(token)}
        if toktype == "ne":
            return ne_extra
        elif toktype == "word":
            return self._hyp_extra - ne_extra
        else:
            raise ValueError("Type not recognized: '%s'" % toktype)

    @staticmethod
    def _ne(token):
        """
        This just assumes that words in all caps or titles are
        named entities.

        :type token: str
        """
        if token.istitle() or token.isupper():
            return True
        return False

    @staticmethod
    def _lemmatize(word):
        """
        Use morphy from WordNet to find the base form of verbs.
        """
        from nltk.corpus import wordnet as wn

        # morphy returns None when no base form is found; fall back to the
        # original word in that case.
        lemma = wn.morphy(word, pos=wn.VERB)
        if lemma is not None:
            return lemma
        return word
|
||||
|
||||
|
||||
def rte_features(rtepair):
    """Build the standard RTE feature dictionary for a text/hypothesis pair.

    Features are counts of word/named-entity overlap and hypothesis-only
    material, plus negation-word counts on each side.
    """
    extractor = RTEFeatureExtractor(rtepair)
    return {
        "alwayson": True,
        "word_overlap": len(extractor.overlap("word")),
        "word_hyp_extra": len(extractor.hyp_extra("word")),
        "ne_overlap": len(extractor.overlap("ne")),
        "ne_hyp_extra": len(extractor.hyp_extra("ne")),
        "neg_txt": len(extractor.negwords & extractor.text_words),
        "neg_hyp": len(extractor.negwords & extractor.hyp_words),
    }
|
||||
|
||||
|
||||
def rte_featurize(rte_pairs):
    """Convert RTE pairs into ``(featureset, label)`` training instances."""
    featurized = []
    for pair in rte_pairs:
        featurized.append((rte_features(pair), pair.value))
    return featurized
|
||||
|
||||
|
||||
def rte_classifier(algorithm, sample_N=None):
    """
    Train a maximum-entropy classifier on the RTE 1-3 dev corpora and
    report its accuracy on the matching test corpora.

    :param algorithm: training algorithm name: 'megam', 'GIS' or 'IIS'
    :param sample_N: if given, truncate both the training and the test set
        to their first ``sample_N`` pairs (useful for quick smoke tests)
    :return: the trained classifier
    :raises ValueError: if ``algorithm`` is not one of the supported names
    """
    from nltk.corpus import rte as rte_corpus

    # Validate the algorithm up front so we fail before the (slow) corpus
    # loading and featurization.  The original code had two identical
    # MaxentClassifier.train branches for megam vs. GIS/IIS and raised a
    # bare Exception only after featurizing everything.
    if algorithm not in ("megam", "GIS", "IIS"):
        raise ValueError(
            "RTEClassifier only supports these algorithms:\n "
            "'megam', 'GIS', 'IIS'.\n"
        )

    train_set = rte_corpus.pairs(["rte1_dev.xml", "rte2_dev.xml", "rte3_dev.xml"])
    test_set = rte_corpus.pairs(["rte1_test.xml", "rte2_test.xml", "rte3_test.xml"])

    if sample_N is not None:
        train_set = train_set[:sample_N]
        test_set = test_set[:sample_N]

    featurized_train_set = rte_featurize(train_set)
    featurized_test_set = rte_featurize(test_set)

    # MEGAM-based and GIS/IIS algorithms are all dispatched through the
    # same MaxentClassifier entry point, which selects the trainer by name.
    print("Training classifier...")
    clf = MaxentClassifier.train(featurized_train_set, algorithm)

    print("Testing classifier...")
    acc = accuracy(clf, featurized_test_set)
    print("Accuracy: %6.4f" % acc)
    return clf
|
||||
143
backend/venv/Lib/site-packages/nltk/classify/scikitlearn.py
Normal file
143
backend/venv/Lib/site-packages/nltk/classify/scikitlearn.py
Normal file
@@ -0,0 +1,143 @@
|
||||
# Natural Language Toolkit: Interface to scikit-learn classifiers
|
||||
#
|
||||
# Author: Lars Buitinck <L.J.Buitinck@uva.nl>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
"""
|
||||
scikit-learn (https://scikit-learn.org) is a machine learning library for
|
||||
Python. It supports many classification algorithms, including SVMs,
|
||||
Naive Bayes, logistic regression (MaxEnt) and decision trees.
|
||||
|
||||
This package implements a wrapper around scikit-learn classifiers. To use this
|
||||
wrapper, construct a scikit-learn estimator object, then use that to construct
|
||||
a SklearnClassifier. E.g., to wrap a linear SVM with default settings:
|
||||
|
||||
>>> from sklearn.svm import LinearSVC
|
||||
>>> from nltk.classify.scikitlearn import SklearnClassifier
|
||||
>>> classif = SklearnClassifier(LinearSVC())
|
||||
|
||||
A scikit-learn classifier may include preprocessing steps when it's wrapped
|
||||
in a Pipeline object. The following constructs and wraps a Naive Bayes text
|
||||
classifier with tf-idf weighting and chi-square feature selection to get the
|
||||
best 1000 features:
|
||||
|
||||
>>> from sklearn.feature_extraction.text import TfidfTransformer
|
||||
>>> from sklearn.feature_selection import SelectKBest, chi2
|
||||
>>> from sklearn.naive_bayes import MultinomialNB
|
||||
>>> from sklearn.pipeline import Pipeline
|
||||
>>> pipeline = Pipeline([('tfidf', TfidfTransformer()),
|
||||
... ('chi2', SelectKBest(chi2, k=1000)),
|
||||
... ('nb', MultinomialNB())])
|
||||
>>> classif = SklearnClassifier(pipeline)
|
||||
"""
|
||||
|
||||
from nltk.classify.api import ClassifierI
|
||||
from nltk.probability import DictionaryProbDist
|
||||
|
||||
try:
|
||||
from sklearn.feature_extraction import DictVectorizer
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
__all__ = ["SklearnClassifier"]
|
||||
|
||||
|
||||
class SklearnClassifier(ClassifierI):
    """Wrapper for scikit-learn classifiers."""

    def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)

    def __repr__(self):
        return "<SklearnClassifier(%r)>" % self._clf

    def classify_many(self, featuresets):
        """Classify a batch of samples.

        :param featuresets: An iterable over featuresets, each a dict mapping
            strings to either numbers, booleans or strings.
        :return: The predicted class label for each input sample.
        :rtype: list
        """
        feature_matrix = self._vectorizer.transform(featuresets)
        known_labels = self._encoder.classes_
        # Map the estimator's integer predictions back to the original labels.
        return [known_labels[idx] for idx in self._clf.predict(feature_matrix)]

    def prob_classify_many(self, featuresets):
        """Compute per-class probabilities for a batch of samples.

        :param featuresets: An iterable over featuresets, each a dict mapping
            strings to either numbers, booleans or strings.
        :rtype: list of ``ProbDistI``
        """
        feature_matrix = self._vectorizer.transform(featuresets)
        return [
            self._make_probdist(row)
            for row in self._clf.predict_proba(feature_matrix)
        ]

    def labels(self):
        """The class labels used by this classifier.

        :rtype: list
        """
        return list(self._encoder.classes_)

    def train(self, labeled_featuresets):
        """
        Train (fit) the scikit-learn estimator.

        :param labeled_featuresets: A list of ``(featureset, label)``
            where each ``featureset`` is a dict mapping strings to either
            numbers, booleans or strings.
        """
        featuresets, targets = zip(*labeled_featuresets)
        # Fit the vectorizer/encoder and the underlying estimator in one pass.
        feature_matrix = self._vectorizer.fit_transform(featuresets)
        encoded_targets = self._encoder.fit_transform(targets)
        self._clf.fit(feature_matrix, encoded_targets)

        return self

    def _make_probdist(self, y_proba):
        # Translate one row of predict_proba output into an NLTK prob dist
        # keyed by the original (decoded) labels.
        known_labels = self._encoder.classes_
        return DictionaryProbDist(
            {known_labels[idx]: prob for idx, prob in enumerate(y_proba)}
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Demo: train two wrapped scikit-learn classifiers on the NLTK names
    # corpus and print their accuracy (requires scikit-learn and the
    # names corpus to be installed).
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import BernoulliNB

    from nltk.classify.util import names_demo, names_demo_features

    # Bernoulli Naive Bayes is designed for binary classification. We set the
    # binarize option to False since we know we're passing boolean features.
    print("scikit-learn Naive Bayes:")
    names_demo(
        SklearnClassifier(BernoulliNB(binarize=False)).train,
        features=names_demo_features,
    )

    # The C parameter on logistic regression (MaxEnt) controls regularization.
    # The higher it's set, the less regularized the classifier is.
    print("\n\nscikit-learn logistic regression:")
    names_demo(
        SklearnClassifier(LogisticRegression(C=1000)).train,
        features=names_demo_features,
    )
|
||||
175
backend/venv/Lib/site-packages/nltk/classify/senna.py
Normal file
175
backend/venv/Lib/site-packages/nltk/classify/senna.py
Normal file
@@ -0,0 +1,175 @@
|
||||
# Natural Language Toolkit: Senna Interface
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A general interface to the SENNA pipeline that supports any of the
|
||||
operations specified in SUPPORTED_OPERATIONS.
|
||||
|
||||
Applying multiple operations at once has the speed advantage. For example,
|
||||
Senna will automatically determine POS tags if you are extracting named
|
||||
entities. Applying both of the operations will cost only the time of
|
||||
extracting the named entities.
|
||||
|
||||
The SENNA pipeline has a fixed maximum size of the sentences that it can read.
|
||||
By default it is 1024 token/sentence. If you have larger sentences, changing
|
||||
the MAX_SENTENCE_SIZE value in SENNA_main.c should be considered and your
|
||||
system specific binary should be rebuilt. Otherwise this could introduce
|
||||
misalignment errors.
|
||||
|
||||
The input is:
|
||||
|
||||
- path to the directory that contains SENNA executables. If the path is incorrect,
|
||||
Senna will automatically search for executable file specified in SENNA environment variable
|
||||
- List of the operations needed to be performed.
|
||||
- (optionally) the encoding of the input data (default:utf-8)
|
||||
|
||||
Note: Unit tests for this module can be found in test/unit/test_senna.py
|
||||
|
||||
>>> from nltk.classify import Senna
|
||||
>>> pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner']) # doctest: +SKIP
|
||||
>>> sent = 'Dusseldorf is an international business center'.split()
|
||||
>>> [(token['word'], token['chk'], token['ner'], token['pos']) for token in pipeline.tag(sent)] # doctest: +SKIP
|
||||
[('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'), ('is', 'B-VP', 'O', 'VBZ'), ('an', 'B-NP', 'O', 'DT'),
|
||||
('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')]
|
||||
"""
|
||||
|
||||
from os import environ, path, sep
|
||||
from platform import architecture, system
|
||||
from subprocess import PIPE, Popen
|
||||
|
||||
from nltk.tag.api import TaggerI
|
||||
|
||||
|
||||
class Senna(TaggerI):
    # Pipeline operations this interface can request from the SENNA binary,
    # listed in the column order SENNA emits them (see _map()).
    SUPPORTED_OPERATIONS = ["pos", "chk", "ner"]

    def __init__(self, senna_path, operations, encoding="utf-8"):
        """
        :param senna_path: directory expected to contain the SENNA executables
        :param operations: subset of SUPPORTED_OPERATIONS to apply
        :param encoding: encoding used when communicating with the SENNA
            subprocess (default utf-8)
        """
        self._encoding = encoding
        self._path = path.normpath(senna_path) + sep

        # Verifies the existence of the executable on the self._path first
        # senna_binary_file_1 = self.executable(self._path)
        exe_file_1 = self.executable(self._path)
        if not path.isfile(exe_file_1):
            # Check for the system environment
            if "SENNA" in environ:
                # self._path = path.join(environ['SENNA'],'')
                self._path = path.normpath(environ["SENNA"]) + sep
                exe_file_2 = self.executable(self._path)
                if not path.isfile(exe_file_2):
                    raise LookupError(
                        "Senna executable expected at %s or %s but not found"
                        % (exe_file_1, exe_file_2)
                    )
        # NOTE(review): if the executable is missing AND the SENNA env var is
        # unset, no error is raised here; tag_sents() re-checks and raises a
        # LookupError at call time instead — confirm this is intentional.

        self.operations = operations

    def executable(self, base_path):
        """
        The function that determines the system specific binary that should be
        used in the pipeline. In case, the system is not known the default senna binary will
        be used.
        """
        os_name = system()
        if os_name == "Linux":
            bits = architecture()[0]
            if bits == "64bit":
                return path.join(base_path, "senna-linux64")
            return path.join(base_path, "senna-linux32")
        if os_name == "Windows":
            return path.join(base_path, "senna-win32.exe")
        if os_name == "Darwin":
            return path.join(base_path, "senna-osx")
        # Unknown platform: fall back to a generically named binary.
        return path.join(base_path, "senna")

    def _map(self):
        """
        A method that calculates the order of the columns that SENNA pipeline
        will output the tags into. This depends on the operations being ordered.
        """
        # Column 0 is always the token itself, so tag columns start at 1.
        _map = {}
        i = 1
        for operation in Senna.SUPPORTED_OPERATIONS:
            if operation in self.operations:
                _map[operation] = i
                i += 1
        return _map

    def tag(self, tokens):
        """
        Applies the specified operation(s) on a list of tokens.
        """
        return self.tag_sents([tokens])[0]

    def tag_sents(self, sentences):
        """
        Applies the tag method over a list of sentences. This method will return a
        list of dictionaries. Every dictionary will contain a word with its
        calculated annotations/tags.
        """
        encoding = self._encoding

        # Re-check the binary here so a missing executable fails with a
        # LookupError rather than a cryptic subprocess error.
        if not path.isfile(self.executable(self._path)):
            raise LookupError(
                "Senna executable expected at %s but not found"
                % self.executable(self._path)
            )

        # Build the senna command to run the tagger
        _senna_cmd = [
            self.executable(self._path),
            "-path",
            self._path,
            "-usrtokens",
            "-iobtags",
        ]
        # One flag per requested operation, e.g. -pos -chk -ner.
        _senna_cmd.extend(["-" + op for op in self.operations])

        # Serialize the actual sentences to a temporary string
        # (one sentence per line, tokens separated by single spaces).
        _input = "\n".join(" ".join(x) for x in sentences) + "\n"
        if isinstance(_input, str) and encoding:
            _input = _input.encode(encoding)

        # Run the tagger and get the output
        p = Popen(_senna_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        (stdout, stderr) = p.communicate(input=_input)
        senna_output = stdout

        # Check the return code.
        if p.returncode != 0:
            raise RuntimeError("Senna command failed! Details: %s" % stderr)

        if encoding:
            senna_output = stdout.decode(encoding)

        # Output the tagged sentences.  SENNA separates sentences with a
        # blank line and emits one tab-separated row per token.
        map_ = self._map()
        tagged_sentences = [[]]
        sentence_index = 0
        token_index = 0
        for tagged_word in senna_output.strip().split("\n"):
            if not tagged_word:
                # Blank line: start of the next sentence's output.
                tagged_sentences.append([])
                sentence_index += 1
                token_index = 0
                continue
            tags = tagged_word.split("\t")
            result = {}
            for tag in map_:
                result[tag] = tags[map_[tag]].strip()
            try:
                # Pair SENNA's row with the original input token so the
                # caller gets back exactly the tokens it supplied.
                result["word"] = sentences[sentence_index][token_index]
            except IndexError as e:
                raise IndexError(
                    "Misalignment error occurred at sentence number %d. Possible reason"
                    " is that the sentence size exceeded the maximum size. Check the "
                    "documentation of Senna class for more information."
                    % sentence_index
                ) from e
            tagged_sentences[-1].append(result)
            token_index += 1
        return tagged_sentences
|
||||
17
backend/venv/Lib/site-packages/nltk/classify/svm.py
Normal file
17
backend/venv/Lib/site-packages/nltk/classify/svm.py
Normal file
@@ -0,0 +1,17 @@
|
||||
# Natural Language Toolkit: SVM-based classifier
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Leon Derczynski <leon@dcs.shef.ac.uk>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
"""
|
||||
nltk.classify.svm was deprecated. For classification based
|
||||
on support vector machines SVMs use nltk.classify.scikitlearn
|
||||
(or `scikit-learn <https://scikit-learn.org>`_ directly).
|
||||
"""
|
||||
|
||||
|
||||
class SvmClassifier:
    """Stub for the removed SVM classifier; instantiation always raises."""

    def __init__(self, *args, **kwargs):
        # The module docstring explains the deprecation and points users at
        # nltk.classify.scikitlearn, so it doubles as the error message.
        raise NotImplementedError(__doc__)
|
||||
122
backend/venv/Lib/site-packages/nltk/classify/tadm.py
Normal file
122
backend/venv/Lib/site-packages/nltk/classify/tadm.py
Normal file
@@ -0,0 +1,122 @@
|
||||
# Natural Language Toolkit: Interface to TADM Classifier
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Joseph Frazee <jfrazee@mail.utexas.edu>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
from nltk.internals import find_binary
|
||||
|
||||
try:
|
||||
import numpy
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
_tadm_bin = None
|
||||
|
||||
|
||||
def config_tadm(bin=None):
    """Locate the ``tadm`` executable and cache its path in ``_tadm_bin``.

    :param bin: explicit path to the binary; when None, search PATH and
        the TADM environment variable.
    """
    global _tadm_bin
    _tadm_bin = find_binary(
        "tadm",
        bin,
        env_vars=["TADM"],
        binary_names=["tadm"],
        url="http://tadm.sf.net",
    )
|
||||
|
||||
|
||||
def write_tadm_file(train_toks, encoding, stream):
    """
    Generate an input file for ``tadm`` based on the given corpus of
    classified tokens.

    :type train_toks: list(tuple(dict, str))
    :param train_toks: Training data, represented as a list of
        pairs, the first member of which is a feature dictionary,
        and the second of which is a classification label.
    :type encoding: TadmEventMaxentFeatureEncoding
    :param encoding: A feature encoding, used to convert featuresets
        into feature vectors.
    :type stream: stream
    :param stream: The stream to which the ``tadm`` input file should be
        written.
    """
    # File format description:
    # https://sf.net/forum/forum.php?thread_id=1391502&forum_id=473054
    # https://sf.net/forum/forum.php?thread_id=1675097&forum_id=473054
    all_labels = encoding.labels()
    for featureset, gold_label in train_toks:
        # Each event block starts with the number of candidate labels,
        # followed by one line per candidate: <is-gold> <n-features>
        # <feature-id value> pairs.
        stream.write("%d\n" % len(all_labels))
        for candidate in all_labels:
            encoded = encoding.encode(featureset, candidate)
            stream.write(
                "%d %d %s\n"
                % (
                    int(gold_label == candidate),
                    len(encoded),
                    " ".join("%d %d" % pair for pair in encoded),
                )
            )
|
||||
|
||||
|
||||
def parse_tadm_weights(paramfile):
    """
    Given the stdout output generated by ``tadm`` when training a
    model, return a ``numpy`` array containing the corresponding weight
    vector.
    """
    # One float weight per line; "d" builds a float64 array.
    return numpy.array([float(line.strip()) for line in paramfile], "d")
|
||||
|
||||
|
||||
def call_tadm(args):
    """
    Call the ``tadm`` binary with the given arguments.

    :param args: command-line arguments for ``tadm`` (a list of strings)
    :raises TypeError: if ``args`` is a single string instead of a list
    :raises OSError: if the ``tadm`` process exits with a nonzero status
    """
    if isinstance(args, str):
        raise TypeError("args should be a list of strings")
    if _tadm_bin is None:
        config_tadm()

    # Call tadm via a subprocess; its output is streamed directly to our
    # stdout (it is not captured).
    cmd = [_tadm_bin] + args
    p = subprocess.Popen(cmd, stdout=sys.stdout)
    p.communicate()

    # Check the return code.  BUG FIX: the original printed ``stderr`` here,
    # but stderr was never piped, so it always printed the useless "None".
    # Report the exit code instead (stderr is inherited and already visible
    # on the terminal).
    if p.returncode != 0:
        raise OSError("tadm command failed! (exit code %d)" % p.returncode)
|
||||
|
||||
|
||||
def names_demo():
    """Run the shared names-corpus demo with the TADM maxent trainer."""
    from nltk.classify.maxent import TadmMaxentClassifier
    from nltk.classify.util import names_demo as run_names_demo

    classifier = run_names_demo(TadmMaxentClassifier.train)
|
||||
|
||||
|
||||
def encoding_demo():
    """Show how training events are encoded in the tadm input format."""
    import sys

    from nltk.classify.maxent import TadmEventMaxentFeatureEncoding

    tokens = [
        ({"f0": 1, "f1": 1, "f3": 1}, "A"),
        ({"f0": 1, "f2": 1, "f4": 1}, "B"),
        ({"f0": 2, "f2": 1, "f3": 1, "f4": 1}, "A"),
    ]
    encoding = TadmEventMaxentFeatureEncoding.train(tokens)
    write_tadm_file(tokens, encoding, sys.stdout)
    print()
    # Show which feature/label combination each encoded column stands for.
    for feature_id in range(encoding.length()):
        print("%s --> %d" % (encoding.describe(feature_id), feature_id))
    print()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run both demos: the encoding demo needs only nltk; the names demo
    # additionally requires the external tadm binary to be installed.
    encoding_demo()
    names_demo()
|
||||
193
backend/venv/Lib/site-packages/nltk/classify/textcat.py
Normal file
193
backend/venv/Lib/site-packages/nltk/classify/textcat.py
Normal file
@@ -0,0 +1,193 @@
|
||||
# Natural Language Toolkit: Language ID module using TextCat algorithm
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Avital Pekker <avital.pekker@utoronto.ca>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A module for language identification using the TextCat algorithm.
|
||||
An implementation of the text categorization algorithm
|
||||
presented in Cavnar, W. B. and J. M. Trenkle,
|
||||
"N-Gram-Based Text Categorization".
|
||||
|
||||
The algorithm takes advantage of Zipf's law and uses
|
||||
n-gram frequencies to profile languages and text-yet to
|
||||
be identified-then compares using a distance measure.
|
||||
|
||||
Language n-grams are provided by the "An Crubadan"
|
||||
project. A corpus reader was created separately to read
|
||||
those files.
|
||||
|
||||
For details regarding the algorithm, see:
|
||||
https://www.let.rug.nl/~vannoord/TextCat/textcat.pdf
|
||||
|
||||
For details about An Crubadan, see:
|
||||
https://borel.slu.edu/crubadan/index.html
|
||||
"""
|
||||
|
||||
from sys import maxsize
|
||||
|
||||
from nltk.util import trigrams
|
||||
|
||||
# Note: this is NOT "re" you're likely used to. The regex module
|
||||
# is an alternative to the standard re module that supports
|
||||
# Unicode codepoint properties with the \p{} syntax.
|
||||
# You may have to "pip install regex"
|
||||
try:
|
||||
import regex as re
|
||||
except ImportError:
|
||||
re = None
|
||||
######################################################################
|
||||
## Language identification using TextCat
|
||||
######################################################################
|
||||
|
||||
|
||||
class TextCat:
    # Crubadan corpus reader; bound to the shared reader in __init__.
    _corpus = None
    # NOTE(review): class-level mutable dict shared by all instances, and
    # never written anywhere in this class -- presumably intended as a
    # fingerprint cache; confirm before relying on it.
    fingerprints = {}
    # Boundary markers wrapped around each token so trigrams can capture
    # word-initial and word-final character sequences.
    _START_CHAR = "<"
    _END_CHAR = ">"

    # Per-language distances from the most recent guess_language() call.
    # NOTE(review): declared at class level, so instances share this dict
    # until guess_language() rebinds it on the instance.
    last_distances = {}

    def __init__(self):
        # The third-party ``regex`` module (imported as ``re`` above) is
        # required for the \p{} punctuation stripping; fail early if missing.
        if not re:
            raise OSError(
                "classify.textcat requires the regex module that "
                "supports unicode. Try '$ pip install regex' and "
                "see https://pypi.python.org/pypi/regex for "
                "further details."
            )

        from nltk.corpus import crubadan

        self._corpus = crubadan
        # Load all language ngrams into cache
        for lang in self._corpus.langs():
            self._corpus.lang_freq(lang)

    def remove_punctuation(self, text):
        """Get rid of punctuation except apostrophes"""
        return re.sub(r"[^\P{P}\']+", "", text)

    def profile(self, text):
        """Create FreqDist of trigrams within text"""
        from nltk import FreqDist, word_tokenize

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for t in tokens:
            # Wrap the token in boundary markers before extracting trigrams.
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = ["".join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint

    def calc_dist(self, lang, trigram, text_profile):
        """Calculate the "out-of-place" measure between the
        text and language profile for a single trigram"""

        lang_fd = self._corpus.lang_freq(lang)
        dist = 0

        if trigram in lang_fd:
            # Rank difference between the trigram's position in the language
            # profile and its position in the text profile.
            idx_lang_profile = list(lang_fd.keys()).index(trigram)
            idx_text = list(text_profile.keys()).index(trigram)

            # print(idx_lang_profile, ", ", idx_text)
            dist = abs(idx_lang_profile - idx_text)
        else:
            # Arbitrary but should be larger than
            # any possible trigram file length
            # in terms of total lines
            dist = maxsize

        return dist

    def lang_dists(self, text):
        """Calculate the "out-of-place" measure between
        the text and all languages"""

        distances = {}
        profile = self.profile(text)
        # For all the languages
        for lang in self._corpus._all_lang_freq.keys():
            # Calculate distance metric for every trigram in
            # input text to be identified
            lang_dist = 0
            for trigram in profile:
                lang_dist += self.calc_dist(lang, trigram, profile)

            distances[lang] = lang_dist

        return distances

    def guess_language(self, text):
        """Find the language with the min distance
        to the text and return its ISO 639-3 code"""
        self.last_distances = self.lang_dists(text)

        return min(self.last_distances, key=self.last_distances.get)
|
||||
#################################################')
|
||||
|
||||
|
||||
def demo():
    # Demo: build a text sample for several UDHR languages and check that
    # TextCat identifies each one (requires the udhr and crubadan corpora).
    from nltk.corpus import udhr

    langs = [
        "Kurdish-UTF8",
        "Abkhaz-UTF8",
        "Farsi_Persian-UTF8",
        "Hindi-UTF8",
        "Hawaiian-UTF8",
        "Russian-UTF8",
        "Vietnamese-UTF8",
        "Serbian_Srpski-UTF8",
        "Esperanto-UTF8",
    ]

    # Human-readable names for the ISO 639-3 codes guess_language returns.
    friendly = {
        "kmr": "Northern Kurdish",
        "abk": "Abkhazian",
        "pes": "Iranian Persian",
        "hin": "Hindi",
        "haw": "Hawaiian",
        "rus": "Russian",
        "vie": "Vietnamese",
        "srp": "Serbian",
        "epo": "Esperanto",
    }

    tc = TextCat()

    for cur_lang in langs:
        # Get raw data from UDHR corpus
        raw_sentences = udhr.sents(cur_lang)
        # NOTE(review): the last sentence is skipped (len - 1) -- confirm
        # whether that is intentional or an off-by-one.
        rows = len(raw_sentences) - 1
        cols = list(map(len, raw_sentences))

        sample = ""

        # Generate a sample text of the language
        for i in range(0, rows):
            cur_sent = " " + " ".join([raw_sentences[i][j] for j in range(0, cols[i])])
            sample += cur_sent

        # Try to detect what it is
        print("Language snippet: " + sample[0:140] + "...")
        guess = tc.guess_language(sample)
        print(f"Language detection: {guess} ({friendly[guess]})")
        print("#" * 140)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the language-identification demo on UDHR samples.
    demo()
|
||||
347
backend/venv/Lib/site-packages/nltk/classify/util.py
Normal file
347
backend/venv/Lib/site-packages/nltk/classify/util.py
Normal file
@@ -0,0 +1,347 @@
|
||||
# Natural Language Toolkit: Classifier Utility Functions
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com> (minor additions)
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Utility functions and classes for classifiers.
|
||||
"""
|
||||
|
||||
import math
|
||||
|
||||
# from nltk.util import Deprecated
|
||||
import nltk.classify.util # for accuracy & log_likelihood
|
||||
from nltk.util import LazyMap
|
||||
|
||||
######################################################################
|
||||
# { Helper Functions
|
||||
######################################################################
|
||||
|
||||
|
||||
# alternative name possibility: 'map_featurefunc()'?
|
||||
# alternative name possibility: 'detect_features()'?
|
||||
# alternative name possibility: 'map_featuredetect()'?
|
||||
# or.. just have users use LazyMap directly?
|
||||
def apply_features(feature_func, toks, labeled=None):
    """
    Use the ``LazyMap`` class to construct a lazy list-like
    object that is analogous to ``map(feature_func, toks)``. In
    particular, if ``labeled=False``, then the returned list-like
    object's values are equal to::

        [feature_func(tok) for tok in toks]

    If ``labeled=True``, then the returned list-like object's values
    are equal to::

        [(feature_func(tok), label) for (tok, label) in toks]

    The primary purpose of this function is to avoid the memory
    overhead involved in storing all the featuresets for every token
    in a corpus. Instead, these featuresets are constructed lazily,
    as-needed. The reduction in memory overhead can be especially
    significant when the underlying list of tokens is itself lazy (as
    is the case with many corpus readers).

    :param feature_func: The function that will be applied to each
        token. It should return a featureset -- i.e., a dict
        mapping feature names to feature values.
    :param toks: The list of tokens to which ``feature_func`` should be
        applied. If ``labeled=True``, then the list elements will be
        passed directly to ``feature_func()``. If ``labeled=False``,
        then the list elements should be tuples ``(tok,label)``, and
        ``tok`` will be passed to ``feature_func()``.
    :param labeled: If true, then ``toks`` contains labeled tokens --
        i.e., tuples of the form ``(tok, label)``. (Default:
        auto-detect based on types.)
    """
    if labeled is None:
        # Auto-detect: treat the input as labeled when its first element
        # looks like a (token, label) pair.
        labeled = toks and isinstance(toks[0], (tuple, list))
    if not labeled:
        return LazyMap(feature_func, toks)

    def featurize_labeled(labeled_token):
        # Featurize the token while carrying the label through unchanged.
        return (feature_func(labeled_token[0]), labeled_token[1])

    return LazyMap(featurize_labeled, toks)
|
||||
|
||||
|
||||
def attested_labels(tokens):
    """
    :return: A list of all labels that are attested in the given list
        of tokens.
    :rtype: list of (immutable)
    :param tokens: The list of classified tokens from which to extract
        labels. A classified token has the form ``(token, label)``.
    :type tokens: list
    """
    seen = set()
    for _tok, label in tokens:
        seen.add(label)
    return tuple(seen)
|
||||
|
||||
|
||||
def log_likelihood(classifier, gold):
    """Return the log of the mean probability the classifier assigns to
    the gold labels of ``(featureset, label)`` pairs in ``gold``."""
    featuresets = [fs for (fs, _label) in gold]
    dists = classifier.prob_classify_many(featuresets)
    gold_probs = [dist.prob(label) for ((_fs, label), dist) in zip(gold, dists)]
    return math.log(sum(gold_probs) / len(gold_probs))
|
||||
|
||||
|
||||
def accuracy(classifier, gold):
    """Return the fraction of ``(featureset, label)`` pairs in ``gold``
    that the classifier labels correctly; 0 when ``gold`` is empty."""
    predictions = classifier.classify_many([fs for (fs, _label) in gold])
    matches = [label == pred for ((_fs, label), pred) in zip(gold, predictions)]
    if not matches:
        return 0
    return sum(matches) / len(matches)
|
||||
|
||||
|
||||
class CutoffChecker:
    """
    A helper class that implements cutoff checks based on number of
    iterations and log likelihood.

    Accuracy cutoffs are also implemented, but they're almost never
    a good idea to use.
    """

    def __init__(self, cutoffs):
        self.cutoffs = cutoffs.copy()
        # Normalize signs so comparisons can assume min_ll <= 0 and
        # min_lldelta >= 0.
        # NOTE(review): the copy above is taken BEFORE this normalization,
        # so the caller's dict is mutated while the stored copy keeps the
        # original values — preserved as-is; confirm the intent.
        if "min_ll" in cutoffs:
            cutoffs["min_ll"] = -abs(cutoffs["min_ll"])
        if "min_lldelta" in cutoffs:
            cutoffs["min_lldelta"] = abs(cutoffs["min_lldelta"])
        self.ll = None  # log likelihood at the previous check
        self.acc = None  # accuracy at the previous check
        self.iter = 1  # number of checks performed so far

    def check(self, classifier, train_toks):
        """Return True if any configured cutoff has been reached."""
        cutoffs = self.cutoffs
        self.iter += 1
        if "max_iter" in cutoffs and self.iter >= cutoffs["max_iter"]:
            return True  # iteration cutoff.

        new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
        if math.isnan(new_ll):
            # Training has diverged; stop rather than loop forever.
            return True

        if "min_ll" in cutoffs or "min_lldelta" in cutoffs:
            if "min_ll" in cutoffs and new_ll >= cutoffs["min_ll"]:
                return True  # log likelihood cutoff
            if (
                "min_lldelta" in cutoffs
                and self.ll
                and ((new_ll - self.ll) <= abs(cutoffs["min_lldelta"]))
            ):
                return True  # log likelihood delta cutoff
            self.ll = new_ll

        if "max_acc" in cutoffs or "min_accdelta" in cutoffs:
            # BUG FIX: the original called log_likelihood() here, so the
            # accuracy cutoffs were compared against the wrong quantity.
            new_acc = nltk.classify.util.accuracy(classifier, train_toks)
            if "max_acc" in cutoffs and new_acc >= cutoffs["max_acc"]:
                return True  # accuracy cutoff
            if (
                "min_accdelta" in cutoffs
                and self.acc
                and ((new_acc - self.acc) <= abs(cutoffs["min_accdelta"]))
            ):
                return True  # accuracy delta cutoff
            self.acc = new_acc

        return False  # no cutoff reached.
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Demos
|
||||
######################################################################
|
||||
|
||||
|
||||
def names_demo_features(name):
    """
    Extract the demo feature set for ``name``: an always-on feature,
    the lowercased first and last characters, and per-letter count
    and presence features for a-z.
    """
    features = {
        "alwayson": True,
        "startswith": name[0].lower(),
        "endswith": name[-1].lower(),
    }
    lowered = name.lower()
    for letter in "abcdefghijklmnopqrstuvwxyz":
        features[f"count({letter})"] = lowered.count(letter)
        features[f"has({letter})"] = letter in lowered
    return features
|
||||
|
||||
|
||||
def binary_names_demo_features(name):
    """
    Extract the binary demo feature set for ``name``: vowel start/end
    flags, plus per-letter count, presence, startswith and endswith
    features for a-z.
    """
    first = name[0].lower()
    last = name[-1].lower()
    features = {
        "alwayson": True,
        "startswith(vowel)": first in "aeiouy",
        "endswith(vowel)": last in "aeiouy",
    }
    lowered = name.lower()
    for letter in "abcdefghijklmnopqrstuvwxyz":
        features[f"count({letter})"] = lowered.count(letter)
        features[f"has({letter})"] = letter in lowered
        features[f"startswith({letter})"] = letter == first
        features[f"endswith({letter})"] = letter == last
    return features
|
||||
|
||||
|
||||
def names_demo(trainer, features=names_demo_features):
    """
    Train and evaluate a male/female name classifier on the NLTK
    names corpus, printing accuracy (and, where supported, log
    likelihood and sample probability distributions).

    :param trainer: A function mapping a list of ``(featureset, label)``
        pairs to a trained classifier.
    :param features: Feature-extraction function applied to each name.
    :return: The trained classifier.
    """
    import random

    from nltk.corpus import names

    # Construct a list of classified names, using the names corpus.
    namelist = [(name, "male") for name in names.words("male.txt")] + [
        (name, "female") for name in names.words("female.txt")
    ]

    # Randomly split the names into a test & train set.
    random.seed(123456)  # fixed seed so the split is reproducible
    random.shuffle(namelist)
    train = namelist[:5000]
    test = namelist[5000:5500]

    # Train up a classifier.
    print("Training classifier...")
    classifier = trainer([(features(n), g) for (n, g) in train])

    # Run the classifier on the test data.
    print("Testing classifier...")
    acc = accuracy(classifier, [(features(n), g) for (n, g) in test])
    print("Accuracy: %6.4f" % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, g) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
        print()
        print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
        # Show the first five test items; the '*' column marks the gold label.
        for (name, gender), pdist in list(zip(test, pdists))[:5]:
            if gender == "male":
                fmt = " %-15s *%6.4f %6.4f"
            else:
                fmt = " %-15s %6.4f *%6.4f"
            print(fmt % (name, pdist.prob("male"), pdist.prob("female")))
    except NotImplementedError:
        # Classifier doesn't support probability output -- skip this section.
        pass

    # Return the classifier
    return classifier
|
||||
|
||||
|
||||
def partial_names_demo(trainer, features=names_demo_features):
    """
    Demo for positive/unlabeled (semi-supervised) training: the trainer
    receives featuresets for known-male names plus an unlabeled mix of
    male and female names, and is evaluated on a boolean is-male test set.

    :param trainer: A function of ``(positive_featuresets,
        unlabeled_featuresets)`` returning a trained classifier.
    :param features: Feature-extraction function applied to each name.
    :return: The trained classifier.
    """
    import random

    from nltk.corpus import names

    male_names = names.words("male.txt")
    female_names = names.words("female.txt")

    random.seed(654321)  # fixed seed so the split is reproducible
    random.shuffle(male_names)
    random.shuffle(female_names)

    # Create a list of male names to be used as positive-labeled examples for training
    positive = map(features, male_names[:2000])

    # Create a list of male and female names to be used as unlabeled examples
    unlabeled = map(features, male_names[2000:2500] + female_names[:500])

    # Create a test set with correctly-labeled male and female names
    test = [(name, True) for name in male_names[2500:2750]] + [
        (name, False) for name in female_names[500:750]
    ]

    random.shuffle(test)

    # Train up a classifier.
    print("Training classifier...")
    classifier = trainer(positive, unlabeled)

    # Run the classifier on the test data.
    print("Testing classifier...")
    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
    print("Accuracy: %6.4f" % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, m) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
        print()
        print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
        # Fixed: zip() is not subscriptable in Python 3 -- materialize
        # it before slicing off the first five examples.
        for (name, is_male), pdist in list(zip(test, pdists))[:5]:
            if is_male:  # truth-test instead of "== True"
                fmt = " %-15s *%6.4f %6.4f"
            else:
                fmt = " %-15s %6.4f *%6.4f"
            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
    except NotImplementedError:
        # Classifier doesn't support probability output -- skip this section.
        pass

    # Return the classifier
    return classifier
|
||||
|
||||
|
||||
# Cache of senseval instances keyed by word; populated lazily by wsd_demo().
_inst_cache = {}
|
||||
|
||||
|
||||
def wsd_demo(trainer, word, features, n=1000):
    """
    Train and evaluate a word-sense-disambiguation classifier on the
    senseval corpus instances for ``word``.

    :param trainer: A function mapping a list of ``(featureset, label)``
        pairs to a trained classifier.
    :param word: The senseval lexical item to disambiguate.
    :param features: Feature-extraction function applied to each instance.
    :param n: Maximum number of instances to use (capped at corpus size).
    :return: The trained classifier.
    """
    import random

    from nltk.corpus import senseval

    # Get the instances.
    print("Reading data...")
    global _inst_cache
    if word not in _inst_cache:
        # Label each instance with its first listed sense.
        _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
    instances = _inst_cache[word][:]  # copy, so shuffling won't disturb the cache
    if n > len(instances):
        n = len(instances)
    senses = list({l for (i, l) in instances})
    print(" Senses: " + " ".join(senses))

    # Randomly split the names into a test & train set.
    print("Splitting into test & train...")
    random.seed(123456)  # fixed seed so the split is reproducible
    random.shuffle(instances)
    train = instances[: int(0.8 * n)]  # 80/20 split of the first n instances
    test = instances[int(0.8 * n) : n]

    # Train up a classifier.
    print("Training classifier...")
    classifier = trainer([(features(i), l) for (i, l) in train])

    # Run the classifier on the test data.
    print("Testing classifier...")
    acc = accuracy(classifier, [(features(i), l) for (i, l) in test])
    print("Accuracy: %6.4f" % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(i) for (i, n) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
    except NotImplementedError:
        # Classifier doesn't support probability output -- skip this section.
        pass

    # Return the classifier
    return classifier
|
||||
|
||||
|
||||
def check_megam_config():
    """
    Checks whether the MEGAM binary is configured.

    Raises NameError with configuration instructions when the
    module-level ``_megam_bin`` name has not been set.
    """
    try:
        _megam_bin
    except NameError as e:
        raise NameError(
            "Please configure your megam binary first, e.g.\n"
            ">>> nltk.config_megam('/usr/bin/local/megam')"
        ) from e
|
||||
377
backend/venv/Lib/site-packages/nltk/classify/weka.py
Normal file
377
backend/venv/Lib/site-packages/nltk/classify/weka.py
Normal file
@@ -0,0 +1,377 @@
|
||||
# Natural Language Toolkit: Interface to Weka Classsifiers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Classifiers that make use of the external 'Weka' package.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
import zipfile
|
||||
from sys import stdin
|
||||
|
||||
from nltk.classify.api import ClassifierI
|
||||
from nltk.internals import config_java, java
|
||||
from nltk.probability import DictionaryProbDist
|
||||
|
||||
# Path to weka.jar; set lazily by config_weka() (or explicitly by callers).
_weka_classpath = None
# Directories searched for weka.jar; config_weka() prepends $WEKAHOME if set.
_weka_search = [
    ".",
    "/usr/share/weka",
    "/usr/local/share/weka",
    "/usr/lib/weka",
    "/usr/local/lib/weka",
]
|
||||
|
||||
|
||||
def config_weka(classpath=None):
    """
    Configure NLTK's interface to Weka by locating ``weka.jar``.

    :param classpath: Explicit path to ``weka.jar``.  If not given, the
        ``WEKAHOME`` environment variable and a list of standard install
        directories are searched.
    :raises LookupError: If ``weka.jar`` cannot be found.
    """
    global _weka_classpath

    # Make sure java's configured first.
    config_java()

    if classpath is not None:
        _weka_classpath = classpath

    if _weka_classpath is None:
        # Fixed: copy the module-level search list before mutating it.
        # The old code aliased _weka_search and inserted WEKAHOME into
        # it, so repeated calls kept prepending entries.
        searchpath = list(_weka_search)
        if "WEKAHOME" in os.environ:
            searchpath.insert(0, os.environ["WEKAHOME"])

        for path in searchpath:
            if os.path.exists(os.path.join(path, "weka.jar")):
                _weka_classpath = os.path.join(path, "weka.jar")
                # (Removed a redundant second _check_weka_version call.)
                version = _check_weka_version(_weka_classpath)
                if version:
                    print(f"[Found Weka: {_weka_classpath} (version {version})]")
                else:
                    print("[Found Weka: %s]" % _weka_classpath)

    if _weka_classpath is None:
        raise LookupError(
            "Unable to find weka.jar! Use config_weka() "
            "or set the WEKAHOME environment variable. "
            "For more information about Weka, please see "
            "https://www.cs.waikato.ac.nz/ml/weka/"
        )
|
||||
|
||||
|
||||
def _check_weka_version(jar):
|
||||
try:
|
||||
zf = zipfile.ZipFile(jar)
|
||||
except (SystemExit, KeyboardInterrupt):
|
||||
raise
|
||||
except:
|
||||
return None
|
||||
try:
|
||||
try:
|
||||
return zf.read("weka/core/version.txt")
|
||||
except KeyError:
|
||||
return None
|
||||
finally:
|
||||
zf.close()
|
||||
|
||||
|
||||
class WekaClassifier(ClassifierI):
    """
    A classifier that shells out to an external Weka installation (via
    the ``java`` helper).  The trained model lives in a file on disk;
    test data is exchanged through temporary ARFF files.
    """

    def __init__(self, formatter, model_filename):
        # ARFF_Formatter used to serialize featuresets for Weka.
        self._formatter = formatter
        # Path of the trained Weka model file on disk.
        self._model = model_filename

    def prob_classify_many(self, featuresets):
        """Return a probability distribution for each featureset."""
        return self._classify_many(featuresets, ["-p", "0", "-distribution"])

    def classify_many(self, featuresets):
        """Return the predicted label for each featureset."""
        return self._classify_many(featuresets, ["-p", "0"])

    def _classify_many(self, featuresets, options):
        """
        Write ``featuresets`` to a temporary ARFF file, run Weka on it
        with the given command-line ``options``, and parse the output.
        """
        # Make sure we can find java & weka.
        config_weka()

        temp_dir = tempfile.mkdtemp()
        try:
            # Write the test data file.
            test_filename = os.path.join(temp_dir, "test.arff")
            self._formatter.write(test_filename, featuresets)

            # Call weka to classify the data.
            cmd = [
                "weka.classifiers.bayes.NaiveBayes",
                "-l",
                self._model,
                "-T",
                test_filename,
            ] + options
            (stdout, stderr) = java(
                cmd,
                classpath=_weka_classpath,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )

            # Check if something went wrong:
            if stderr and not stdout:
                if "Illegal options: -distribution" in stderr:
                    raise ValueError(
                        "The installed version of weka does "
                        "not support probability distribution "
                        "output."
                    )
                else:
                    raise ValueError("Weka failed to generate output:\n%s" % stderr)

            # Parse weka's output.
            return self.parse_weka_output(stdout.decode(stdin.encoding).split("\n"))

        finally:
            # Always clean up the temporary directory and its contents.
            for f in os.listdir(temp_dir):
                os.remove(os.path.join(temp_dir, f))
            os.rmdir(temp_dir)

    def parse_weka_distribution(self, s):
        """
        Parse one distribution field from Weka output into a
        DictionaryProbDist over this formatter's labels.
        """
        # NOTE(review): s appears to be a comma-separated probability
        # list with '*' marking the predicted class -- confirm against
        # the Weka output format for the installed version.
        probs = [float(v) for v in re.split("[*,]+", s) if v.strip()]
        probs = dict(zip(self._formatter.labels(), probs))
        return DictionaryProbDist(probs)

    def parse_weka_output(self, lines):
        """
        Parse Weka's prediction output, returning either a list of
        labels or (for distribution output) a list of probability
        distributions, depending on the header format.
        """
        # Strip unwanted text from stdout
        for i, line in enumerate(lines):
            if line.strip().startswith("inst#"):
                lines = lines[i:]
                break

        if lines[0].split() == ["inst#", "actual", "predicted", "error", "prediction"]:
            # Plain prediction format: predicted column is "index:label".
            return [line.split()[2].split(":")[1] for line in lines[1:] if line.strip()]
        elif lines[0].split() == [
            "inst#",
            "actual",
            "predicted",
            "error",
            "distribution",
        ]:
            # Distribution format: last column holds the probabilities.
            return [
                self.parse_weka_distribution(line.split()[-1])
                for line in lines[1:]
                if line.strip()
            ]

        # is this safe:?
        elif re.match(r"^0 \w+ [01]\.[0-9]* \?\s*$", lines[0]):
            # Headerless format: second column is the label.
            return [line.split()[1] for line in lines if line.strip()]

        else:
            # Unrecognized format: show a sample, then fail loudly.
            for line in lines[:10]:
                print(line)
            raise ValueError(
                "Unhandled output format -- your version "
                "of weka may not be supported.\n"
                " Header: %s" % lines[0]
            )

    # [xx] full list of classifiers (some may be abstract?):
    # ADTree, AODE, BayesNet, ComplementNaiveBayes, ConjunctiveRule,
    # DecisionStump, DecisionTable, HyperPipes, IB1, IBk, Id3, J48,
    # JRip, KStar, LBR, LeastMedSq, LinearRegression, LMT, Logistic,
    # LogisticBase, M5Base, MultilayerPerceptron,
    # MultipleClassifiersCombiner, NaiveBayes, NaiveBayesMultinomial,
    # NaiveBayesSimple, NBTree, NNge, OneR, PaceRegression, PART,
    # PreConstructedLinearModel, Prism, RandomForest,
    # RandomizableClassifier, RandomTree, RBFNetwork, REPTree, Ridor,
    # RuleNode, SimpleLinearRegression, SimpleLogistic,
    # SingleClassifierEnhancer, SMO, SMOreg, UserClassifier, VFI,
    # VotedPerceptron, Winnow, ZeroR

    # Mapping from friendly classifier names to Weka Java class names.
    _CLASSIFIER_CLASS = {
        "naivebayes": "weka.classifiers.bayes.NaiveBayes",
        "C4.5": "weka.classifiers.trees.J48",
        "log_regression": "weka.classifiers.functions.Logistic",
        "svm": "weka.classifiers.functions.SMO",
        "kstar": "weka.classifiers.lazy.KStar",
        "ripper": "weka.classifiers.rules.JRip",
    }

    @classmethod
    def train(
        cls,
        model_filename,
        featuresets,
        classifier="naivebayes",
        # NOTE(review): mutable default ``options=[]`` is never mutated
        # here, but a tuple default would be safer.
        options=[],
        quiet=True,
    ):
        """
        Train a Weka classifier and return a ``WekaClassifier`` wrapping
        the saved model.

        :param model_filename: Path where Weka writes the trained model.
        :param featuresets: List of ``(featureset, label)`` training pairs.
        :param classifier: Friendly name from ``_CLASSIFIER_CLASS``, or a
            full Weka Java class name.
        :param options: Extra command-line options passed to Weka.
        :param quiet: If true, suppress Weka's stdout.
        :raises ValueError: If ``classifier`` is unrecognized.
        """
        # Make sure we can find java & weka.
        config_weka()

        # Build an ARFF formatter.
        formatter = ARFF_Formatter.from_train(featuresets)

        temp_dir = tempfile.mkdtemp()
        try:
            # Write the training data file.
            train_filename = os.path.join(temp_dir, "train.arff")
            formatter.write(train_filename, featuresets)

            # Resolve the classifier name to a Weka Java class.
            if classifier in cls._CLASSIFIER_CLASS:
                javaclass = cls._CLASSIFIER_CLASS[classifier]
            elif classifier in cls._CLASSIFIER_CLASS.values():
                javaclass = classifier
            else:
                raise ValueError("Unknown classifier %s" % classifier)

            # Train the weka model.
            cmd = [javaclass, "-d", model_filename, "-t", train_filename]
            cmd += list(options)
            if quiet:
                stdout = subprocess.PIPE  # swallow Weka's console output
            else:
                stdout = None
            java(cmd, classpath=_weka_classpath, stdout=stdout)

            # Return the new classifier.
            return WekaClassifier(formatter, model_filename)

        finally:
            # Always clean up the temporary directory and its contents.
            for f in os.listdir(temp_dir):
                os.remove(os.path.join(temp_dir, f))
            os.rmdir(temp_dir)
|
||||
|
||||
|
||||
class ARFF_Formatter:
    """
    Converts featuresets and labeled featuresets to ARFF-formatted
    strings, appropriate for input into Weka.

    Features and classes can be specified manually in the constructor, or may
    be determined from data using ``from_train``.
    """

    def __init__(self, labels, features):
        """
        :param labels: A list of all class labels that can be generated.
        :param features: A list of feature specifications, where
            each feature specification is a tuple (fname, ftype);
            and ftype is an ARFF type string such as NUMERIC or
            STRING.
        """
        self._labels = labels
        self._features = features

    def format(self, tokens):
        """Returns a string representation of ARFF output for the given data."""
        return self.header_section() + self.data_section(tokens)

    def labels(self):
        """Returns the list of classes."""
        return list(self._labels)

    def write(self, outfile, tokens):
        """
        Writes ARFF data to a file for the given data.

        ``outfile`` may be a path or a writable file object; either way
        the file is closed afterwards (preserving historical behavior
        for caller-supplied file objects).
        """
        if not hasattr(outfile, "write"):
            outfile = open(outfile, "w")
        try:
            outfile.write(self.format(tokens))
        finally:
            # Fixed: the old code leaked the handle if format() raised.
            outfile.close()

    @staticmethod
    def from_train(tokens):
        """
        Constructs an ARFF_Formatter instance with class labels and feature
        types determined from the given data. Handles boolean, numeric and
        string (note: not nominal) types.

        :raises ValueError: If a feature value has an unsupported type,
            or a feature's type is inconsistent across tokens.
        """
        # Find the set of all attested labels.
        labels = {label for (tok, label) in tokens}

        # Determine the types of all features.
        features = {}
        for tok, label in tokens:
            for fname, fval in tok.items():
                if issubclass(type(fval), bool):
                    # Checked before int: bool is a subclass of int.
                    ftype = "{True, False}"
                elif issubclass(type(fval), (int, float, bool)):
                    ftype = "NUMERIC"
                elif issubclass(type(fval), str):
                    ftype = "STRING"
                elif fval is None:
                    continue  # can't tell the type.
                else:
                    # Fixed: previously interpolated ``ftype`` (possibly
                    # unbound or stale) instead of the offending value.
                    raise ValueError("Unsupported value type %r" % fval)

                if features.get(fname, ftype) != ftype:
                    raise ValueError("Inconsistent type for %s" % fname)
                features[fname] = ftype
        features = sorted(features.items())

        return ARFF_Formatter(labels, features)

    def header_section(self):
        """Returns an ARFF header as a string."""
        # Header comment.
        s = (
            "% Weka ARFF file\n"
            + "% Generated automatically by NLTK\n"
            + "%% %s\n\n" % time.ctime()
        )

        # Relation name
        s += "@RELATION rel\n\n"

        # Input attribute specifications
        for fname, ftype in self._features:
            s += "@ATTRIBUTE %-30r %s\n" % (fname, ftype)

        # Label attribute specification
        s += "@ATTRIBUTE %-30r {%s}\n" % ("-label-", ",".join(self._labels))

        return s

    def data_section(self, tokens, labeled=None):
        """
        Returns the ARFF data section for the given data.

        :param tokens: a list of featuresets (dicts) or labelled featuresets
            which are tuples (featureset, label).
        :param labeled: Indicates whether the given tokens are labeled
            or not.  If None, then the tokens will be assumed to be
            labeled if the first token's value is a tuple or list.
        """
        # Check if the tokens are labeled or unlabeled.  If unlabeled,
        # then use 'None'
        if labeled is None:
            labeled = tokens and isinstance(tokens[0], (tuple, list))
        if not labeled:
            tokens = [(tok, None) for tok in tokens]

        # Data section
        s = "\n@DATA\n"
        for tok, label in tokens:
            for fname, ftype in self._features:
                s += "%s," % self._fmt_arff_val(tok.get(fname))
            s += "%s\n" % self._fmt_arff_val(label)

        return s

    def _fmt_arff_val(self, fval):
        """Format a single value for an ARFF data row."""
        if fval is None:
            return "?"  # ARFF missing-value marker
        elif isinstance(fval, (bool, int)):
            return "%s" % fval
        else:
            # Floats and strings both use repr (quotes strings and
            # keeps full float precision); merges two formerly
            # duplicate branches.
            return "%r" % fval
|
||||
|
||||
|
||||
if __name__ == "__main__":
    from nltk.classify.util import binary_names_demo_features, names_demo

    # Demo: train a Weka "C4.5" (J48 per _CLASSIFIER_CLASS) model on the
    # names corpus, saving the model to /tmp/name.model.
    def make_classifier(featuresets):
        return WekaClassifier.train("/tmp/name.model", featuresets, "C4.5")

    classifier = names_demo(make_classifier, binary_names_demo_features)
|
||||
55
backend/venv/Lib/site-packages/nltk/cli.py
Normal file
55
backend/venv/Lib/site-packages/nltk/cli.py
Normal file
@@ -0,0 +1,55 @@
|
||||
# Natural Language Toolkit: NLTK Command-Line Interface
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
|
||||
import click
|
||||
from tqdm import tqdm
|
||||
|
||||
from nltk import word_tokenize
|
||||
from nltk.util import parallelize_preprocess
|
||||
|
||||
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
|
||||
|
||||
|
||||
# Top-level click command group for the NLTK console entry point.
# (No docstring: a docstring here would become user-visible --help text.)
@click.group(context_settings=CONTEXT_SETTINGS)
@click.version_option()
def cli():
    pass
|
||||
|
||||
|
||||
@cli.command("tokenize")
@click.option(
    "--language",
    "-l",
    default="en",
    help="The language for the Punkt sentence tokenization.",
)
# NOTE(review): this option reuses the short flag "-l", which collides
# with --language above; also, neither `language` nor `preserve_line` is
# referenced in the function body below -- confirm intended behavior.
@click.option(
    "--preserve-line",
    "-l",
    default=True,
    is_flag=True,
    help="An option to keep the preserve the sentence and not sentence tokenize it.",
)
@click.option("--processes", "-j", default=1, help="No. of processes.")
@click.option("--encoding", "-e", default="utf8", help="Specify encoding of file.")
@click.option(
    "--delimiter", "-d", default=" ", help="Specify delimiter to join the tokens."
)
def tokenize_file(language, preserve_line, processes, encoding, delimiter):
    """This command tokenizes text stream using nltk.word_tokenize"""
    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # If it's single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                for line in tqdm(fin.readlines()):
                    print(delimiter.join(word_tokenize(line)), end="\n", file=fout)
            else:
                # Fan the lines out across worker processes.
                for outline in parallelize_preprocess(
                    word_tokenize, fin.readlines(), processes, progress_bar=True
                ):
                    print(delimiter.join(outline), end="\n", file=fout)
|
||||
92
backend/venv/Lib/site-packages/nltk/cluster/__init__.py
Normal file
92
backend/venv/Lib/site-packages/nltk/cluster/__init__.py
Normal file
@@ -0,0 +1,92 @@
|
||||
# Natural Language Toolkit: Clusterers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
This module contains a number of basic clustering algorithms. Clustering
|
||||
describes the task of discovering groups of similar items within a large
|
||||
collection. It is also described as unsupervised machine learning, as the data
|
||||
from which it learns is unannotated with class information, as is the case for
|
||||
supervised learning. Annotated data is difficult and expensive to obtain in
|
||||
the quantities required for the majority of supervised learning algorithms.
|
||||
This problem, the knowledge acquisition bottleneck, is common to most natural
|
||||
language processing tasks, thus fueling the need for quality unsupervised
|
||||
approaches.
|
||||
|
||||
This module contains a k-means clusterer, E-M clusterer and a group average
|
||||
agglomerative clusterer (GAAC). All these clusterers involve finding good
|
||||
cluster groupings for a set of vectors in multi-dimensional space.
|
||||
|
||||
The K-means clusterer starts with k arbitrary chosen means then allocates each
|
||||
vector to the cluster with the closest mean. It then recalculates the means of
|
||||
each cluster as the centroid of the vectors in the cluster. This process
|
||||
repeats until the cluster memberships stabilise. This is a hill-climbing
|
||||
algorithm which may converge to a local maximum. Hence the clustering is
|
||||
often repeated with random initial means and the most commonly occurring
|
||||
output means are chosen.
|
||||
|
||||
The GAAC clusterer starts with each of the *N* vectors as singleton clusters.
|
||||
It then iteratively merges pairs of clusters which have the closest centroids.
|
||||
This continues until there is only one cluster. The order of merges gives rise
|
||||
to a dendrogram - a tree with the earlier merges lower than later merges. The
|
||||
membership of a given number of clusters *c*, *1 <= c <= N*, can be found by
|
||||
cutting the dendrogram at depth *c*.
|
||||
|
||||
The Gaussian EM clusterer models the vectors as being produced by a mixture
|
||||
of k Gaussian sources. The parameters of these sources (prior probability,
|
||||
mean and covariance matrix) are then found to maximise the likelihood of the
|
||||
given data. This is done with the expectation maximisation algorithm. It
|
||||
starts with k arbitrarily chosen means, priors and covariance matrices. It
|
||||
then calculates the membership probabilities for each vector in each of the
|
||||
clusters - this is the 'E' step. The cluster parameters are then updated in
|
||||
the 'M' step using the maximum likelihood estimate from the cluster membership
|
||||
probabilities. This process continues until the likelihood of the data does
|
||||
not significantly increase.
|
||||
|
||||
They all extend the ClusterI interface which defines common operations
|
||||
available with each clusterer. These operations include:
|
||||
|
||||
- cluster: clusters a sequence of vectors
|
||||
- classify: assign a vector to a cluster
|
||||
- classification_probdist: give the probability distribution over cluster memberships
|
||||
|
||||
The current existing classifiers also extend cluster.VectorSpace, an
|
||||
abstract class which allows for singular value decomposition (SVD) and vector
|
||||
normalisation. SVD is used to reduce the dimensionality of the vector space in
|
||||
such a manner as to preserve as much of the variation as possible, by
|
||||
reparameterising the axes in order of variability and discarding all bar the
|
||||
first d dimensions. Normalisation ensures that vectors fall in the unit
|
||||
hypersphere.
|
||||
|
||||
Usage example (see also demo())::
|
||||
|
||||
from nltk import cluster
|
||||
from nltk.cluster import euclidean_distance
|
||||
from numpy import array
|
||||
|
||||
vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0]]]
|
||||
|
||||
# initialise the clusterer (will also assign the vectors to clusters)
|
||||
clusterer = cluster.KMeansClusterer(2, euclidean_distance)
|
||||
clusterer.cluster(vectors, True)
|
||||
|
||||
# classify a new vector
|
||||
print(clusterer.classify(array([3, 3])))
|
||||
|
||||
Note that the vectors must use numpy array-like
|
||||
objects. nltk_contrib.unimelb.tacohn.SparseArrays may be used for
|
||||
efficiency when required.
|
||||
"""
|
||||
|
||||
from nltk.cluster.em import EMClusterer
|
||||
from nltk.cluster.gaac import GAAClusterer
|
||||
from nltk.cluster.kmeans import KMeansClusterer
|
||||
from nltk.cluster.util import (
|
||||
Dendrogram,
|
||||
VectorSpaceClusterer,
|
||||
cosine_distance,
|
||||
euclidean_distance,
|
||||
)
|
||||
74
backend/venv/Lib/site-packages/nltk/cluster/api.py
Normal file
74
backend/venv/Lib/site-packages/nltk/cluster/api.py
Normal file
@@ -0,0 +1,74 @@
|
||||
# Natural Language Toolkit: Clusterer Interfaces
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
|
||||
# Porting: Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from abc import ABCMeta, abstractmethod
|
||||
|
||||
from nltk.probability import DictionaryProbDist
|
||||
|
||||
|
||||
class ClusterI(metaclass=ABCMeta):
    """
    Interface covering basic clustering functionality.
    """

    @abstractmethod
    def cluster(self, vectors, assign_clusters=False):
        """
        Assigns the vectors to clusters, learning the clustering parameters
        from the data. Returns a cluster identifier for each vector.
        """

    @abstractmethod
    def classify(self, token):
        """
        Classifies the token into a cluster, setting the token's CLUSTER
        parameter to that cluster identifier.
        """

    def likelihood(self, vector, label):
        """
        Returns the likelihood (a float) of the token having the
        corresponding cluster.
        """
        # Hard assignment: full weight when the predicted cluster
        # matches, none otherwise.
        return 1.0 if self.classify(vector) == label else 0.0

    def classification_probdist(self, vector):
        """
        Classifies the token into a cluster, returning
        a probability distribution over the cluster identifiers.
        """
        names = self.cluster_names()
        # Score every cluster, then normalise to a distribution.
        likelihoods = {name: self.likelihood(vector, name) for name in names}
        total = sum(likelihoods.values())
        for name in names:
            likelihoods[name] /= total
        return DictionaryProbDist(likelihoods)

    @abstractmethod
    def num_clusters(self):
        """
        Returns the number of clusters.
        """

    def cluster_names(self):
        """
        Returns the names of the clusters.
        :rtype: list
        """
        return list(range(self.num_clusters()))

    def cluster_name(self, index):
        """
        Returns the names of the cluster at index.
        """
        return index
|
||||
219
backend/venv/Lib/site-packages/nltk/cluster/em.py
Normal file
219
backend/venv/Lib/site-packages/nltk/cluster/em.py
Normal file
@@ -0,0 +1,219 @@
|
||||
# Natural Language Toolkit: Expectation Maximization Clusterer
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
try:
|
||||
import numpy
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
from nltk.cluster.util import VectorSpaceClusterer
|
||||
|
||||
|
||||
class EMClusterer(VectorSpaceClusterer):
    """
    The Gaussian EM clusterer models the vectors as being produced by
    a mixture of k Gaussian sources. The parameters of these sources
    (prior probability, mean and covariance matrix) are then found to
    maximise the likelihood of the given data. This is done with the
    expectation maximisation algorithm. It starts with k arbitrarily
    chosen means, priors and covariance matrices. It then calculates
    the membership probabilities for each vector in each of the
    clusters; this is the 'E' step. The cluster parameters are then
    updated in the 'M' step using the maximum likelihood estimate from
    the cluster membership probabilities. This process continues until
    the likelihood of the data does not significantly increase.
    """

    def __init__(
        self,
        initial_means,
        priors=None,
        covariance_matrices=None,
        conv_threshold=1e-6,
        bias=0.1,
        normalise=False,
        svd_dimensions=None,
    ):
        """
        Creates an EM clusterer with the given starting parameters,
        convergence threshold and vector mangling parameters.

        :param initial_means: the means of the gaussian cluster centers
        :type initial_means: [seq of] numpy array or seq of SparseArray
        :param priors: the prior probability for each cluster
        :type priors: numpy array or seq of float
        :param covariance_matrices: the covariance matrix for each cluster
        :type covariance_matrices: [seq of] numpy array
        :param conv_threshold: maximum change in likelihood before deemed
            convergent
        :type conv_threshold: int or float
        :param bias: variance bias used to ensure non-singular covariance
            matrices
        :type bias: float
        :param normalise: should vectors be normalised to length 1
        :type normalise: boolean
        :param svd_dimensions: number of dimensions to use in reducing vector
            dimensionsionality with SVD
        :type svd_dimensions: int
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._means = numpy.array(initial_means, numpy.float64)
        self._num_clusters = len(initial_means)
        self._conv_threshold = conv_threshold
        self._covariance_matrices = covariance_matrices
        self._priors = priors
        self._bias = bias

    def num_clusters(self):
        """Returns the number of clusters (one per initial mean)."""
        return self._num_clusters

    def cluster_vectorspace(self, vectors, trace=False):
        """
        Runs EM iterations over ``vectors`` until the log-likelihood
        changes by less than the convergence threshold, updating the
        stored means, priors and covariance matrices in place.
        """
        assert len(vectors) > 0

        # set the parameters to initial values
        dimensions = len(vectors[0])
        means = self._means
        priors = self._priors
        if not priors:
            # default to a uniform prior over the clusters
            priors = self._priors = (
                numpy.ones(self._num_clusters, numpy.float64) / self._num_clusters
            )
        covariances = self._covariance_matrices
        if not covariances:
            # default to identity covariance for every cluster
            covariances = self._covariance_matrices = [
                numpy.identity(dimensions, numpy.float64)
                for i in range(self._num_clusters)
            ]

        # do the E and M steps until the likelihood plateaus
        lastl = self._loglikelihood(vectors, priors, means, covariances)
        converged = False

        while not converged:
            if trace:
                print("iteration; loglikelihood", lastl)
            # E-step, calculate hidden variables, h[i,j]: the (normalised)
            # membership probability of vector i in cluster j
            h = numpy.zeros((len(vectors), self._num_clusters), numpy.float64)
            for i in range(len(vectors)):
                for j in range(self._num_clusters):
                    h[i, j] = priors[j] * self._gaussian(
                        means[j], covariances[j], vectors[i]
                    )
                h[i, :] /= sum(h[i, :])

            # M-step, update parameters - cvm, p, mean
            for j in range(self._num_clusters):
                # NOTE: the original kept an unused ``covariance_before``
                # snapshot here; it was dead code and has been removed
                new_covariance = numpy.zeros((dimensions, dimensions), numpy.float64)
                new_mean = numpy.zeros(dimensions, numpy.float64)
                sum_hj = 0.0
                for i in range(len(vectors)):
                    delta = vectors[i] - means[j]
                    new_covariance += h[i, j] * numpy.multiply.outer(delta, delta)
                    sum_hj += h[i, j]
                    new_mean += h[i, j] * vectors[i]
                covariances[j] = new_covariance / sum_hj
                means[j] = new_mean / sum_hj
                priors[j] = sum_hj / len(vectors)

                # bias term to stop covariance matrix being singular
                covariances[j] += self._bias * numpy.identity(dimensions, numpy.float64)

            # calculate likelihood - FIXME: may be broken
            l = self._loglikelihood(vectors, priors, means, covariances)

            # check for convergence
            if abs(lastl - l) < self._conv_threshold:
                converged = True
            lastl = l

    def classify_vectorspace(self, vector):
        """
        Returns the index of the cluster with the highest posterior
        (prior times Gaussian density) for ``vector``.
        """
        best = None
        for j in range(self._num_clusters):
            p = self._priors[j] * self._gaussian(
                self._means[j], self._covariance_matrices[j], vector
            )
            if not best or p > best[0]:
                best = (p, j)
        return best[1]

    def likelihood_vectorspace(self, vector, cluster):
        """
        Returns the likelihood of ``vector`` under the named ``cluster``.
        """
        # map the cluster name to its index; with the default naming the
        # names are the indices themselves, so ``cid == cluster``.
        # Fixed: ``cid`` was previously computed but never used.
        cid = self.cluster_names().index(cluster)
        return self._priors[cid] * self._gaussian(
            self._means[cid], self._covariance_matrices[cid], vector
        )

    def _gaussian(self, mean, cvm, x):
        """
        Multivariate Gaussian density of ``x`` for the given mean and
        covariance matrix.
        """
        m = len(mean)
        assert cvm.shape == (m, m), "bad sized covariance matrix, %s" % str(cvm.shape)
        try:
            det = numpy.linalg.det(cvm)
            inv = numpy.linalg.inv(cvm)
            a = det**-0.5 * (2 * numpy.pi) ** (-m / 2.0)
            dx = x - mean
            # fixed: removed leftover debug ``print(dx, inv)`` that spammed
            # stdout on every density evaluation
            b = -0.5 * numpy.dot(numpy.dot(dx, inv), dx)
            return a * numpy.exp(b)
        except OverflowError:
            # happens when the exponent is negative infinity - i.e. b = 0
            # i.e. the inverse of cvm is huge (cvm is almost zero)
            return 0

    def _loglikelihood(self, vectors, priors, means, covariances):
        """Total log-likelihood of ``vectors`` under the current mixture."""
        llh = 0.0
        for vector in vectors:
            p = 0
            for j in range(len(priors)):
                p += priors[j] * self._gaussian(means[j], covariances[j], vector)
            llh += numpy.log(p)
        return llh

    def __repr__(self):
        return "<EMClusterer means=%s>" % list(self._means)
|
||||
|
||||
|
||||
def demo():
    """
    Non-interactive demonstration of the clusterers with simple 2-D data.
    """

    from nltk import cluster

    # example from figure 14.10, page 519, Manning and Schutze

    points = [[0.5, 0.5], [1.5, 0.5], [1, 3]]
    vectors = [numpy.array(point) for point in points]
    means = [[4, 2], [4, 2.01]]

    clusterer = cluster.EMClusterer(means, bias=0.1)
    clusters = clusterer.cluster(vectors, True, trace=True)

    print("Clustered:", vectors)
    print("As: ", clusters)
    print()

    for index in range(2):
        print("Cluster:", index)
        print("Prior: ", clusterer._priors[index])
        print("Mean: ", clusterer._means[index])
        print("Covar: ", clusterer._covariance_matrices[index])
        print()

    # classify a new vector
    query = numpy.array([2, 2])
    print("classify(%s):" % query, end=" ")
    print(clusterer.classify(query))

    # show the classification probabilities
    query = numpy.array([2, 2])
    print("classification_probdist(%s):" % query)
    pdist = clusterer.classification_probdist(query)
    for sample in pdist.samples():
        print(f"{sample} => {pdist.prob(sample) * 100:.0f}%")


if __name__ == "__main__":
    demo()
|
||||
170
backend/venv/Lib/site-packages/nltk/cluster/gaac.py
Normal file
170
backend/venv/Lib/site-packages/nltk/cluster/gaac.py
Normal file
@@ -0,0 +1,170 @@
|
||||
# Natural Language Toolkit: Group Average Agglomerative Clusterer
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
try:
|
||||
import numpy
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
from nltk.cluster.util import Dendrogram, VectorSpaceClusterer, cosine_distance
|
||||
|
||||
|
||||
class GAAClusterer(VectorSpaceClusterer):
    """
    The Group Average Agglomerative starts with each of the N vectors as singleton
    clusters. It then iteratively merges pairs of clusters which have the
    closest centroids. This continues until there is only one cluster. The
    order of merges gives rise to a dendrogram: a tree with the earlier merges
    lower than later merges. The membership of a given number of clusters c, 1
    <= c <= N, can be found by cutting the dendrogram at depth c.

    This clusterer uses the cosine similarity metric only, which allows for
    efficient speed-up in the clustering process.
    """

    def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None):
        """
        :param num_clusters: number of clusters to cut the dendrogram into
        :type num_clusters: int
        :param normalise: should vectors be normalised to length 1
        :type normalise: boolean
        :param svd_dimensions: number of dimensions to use in reducing vector
            dimensionality with SVD
        :type svd_dimensions: int
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._num_clusters = num_clusters
        self._dendrogram = None
        # NOTE(review): ``_groups_values`` is set here but never read in this
        # class — presumably a leftover; confirm before removing
        self._groups_values = None

    def cluster(self, vectors, assign_clusters=False, trace=False):
        """
        Builds a fresh dendrogram over copies of ``vectors`` and then
        delegates to the base-class clustering pipeline.
        """
        # stores the merge order
        self._dendrogram = Dendrogram(
            [numpy.array(vector, numpy.float64) for vector in vectors]
        )
        return VectorSpaceClusterer.cluster(self, vectors, assign_clusters, trace)

    def cluster_vectorspace(self, vectors, trace=False):
        """
        Agglomeratively merges the closest pair of clusters (by cosine
        distance between weighted-average similarities) until only
        ``self._num_clusters`` remain, recording each merge in the
        dendrogram.
        """
        # variables describing the initial situation
        N = len(vectors)
        cluster_len = [1] * N
        cluster_count = N
        # index_map[i] is the position of active cluster i within the
        # (conceptually shrinking) dendrogram item list
        index_map = numpy.arange(N)

        # construct the similarity matrix (upper triangle only; the rest
        # stays at +inf so argmin never selects it)
        dims = (N, N)
        dist = numpy.ones(dims, dtype=float) * numpy.inf
        for i in range(N):
            for j in range(i + 1, N):
                dist[i, j] = cosine_distance(vectors[i], vectors[j])

        while cluster_count > max(self._num_clusters, 1):
            i, j = numpy.unravel_index(dist.argmin(), dims)
            if trace:
                print("merging %d and %d" % (i, j))

            # update similarities for merging i and j
            self._merge_similarities(dist, cluster_len, i, j)

            # remove j by masking its row and column with +inf
            dist[:, j] = numpy.inf
            dist[j, :] = numpy.inf

            # merge the clusters
            cluster_len[i] = cluster_len[i] + cluster_len[j]
            self._dendrogram.merge(index_map[i], index_map[j])
            cluster_count -= 1

            # update the index map to reflect the indexes if we
            # had removed j
            index_map[j + 1 :] -= 1
            index_map[j] = N

        self.update_clusters(self._num_clusters)

    def _merge_similarities(self, dist, cluster_len, i, j):
        # the new cluster i merged from i and j adopts the average of
        # i and j's similarity to each other cluster, weighted by the
        # number of points in the clusters i and j
        i_weight = cluster_len[i]
        j_weight = cluster_len[j]
        weight_sum = i_weight + j_weight

        # update for x<i (column i holds dist(x, merged) for x < i)
        dist[:i, i] = dist[:i, i] * i_weight + dist[:i, j] * j_weight
        dist[:i, i] /= weight_sum
        # update for i<x<j; the division by weight_sum for this slice is
        # performed by the combined ``dist[i, i + 1 :] /= weight_sum`` below
        dist[i, i + 1 : j] = (
            dist[i, i + 1 : j] * i_weight + dist[i + 1 : j, j] * j_weight
        )
        # update for i<j<x
        dist[i, j + 1 :] = dist[i, j + 1 :] * i_weight + dist[j, j + 1 :] * j_weight
        dist[i, i + 1 :] /= weight_sum

    def update_clusters(self, num_clusters):
        """
        Cuts the dendrogram into ``num_clusters`` groups and recomputes
        one centroid per group (normalising members first when the
        clusterer was configured to normalise).
        """
        clusters = self._dendrogram.groups(num_clusters)
        self._centroids = []
        for cluster in clusters:
            assert len(cluster) > 0
            if self._should_normalise:
                centroid = self._normalise(cluster[0])
            else:
                centroid = numpy.array(cluster[0])
            for vector in cluster[1:]:
                if self._should_normalise:
                    centroid += self._normalise(vector)
                else:
                    centroid += vector
            centroid /= len(cluster)
            self._centroids.append(centroid)
        self._num_clusters = len(self._centroids)

    def classify_vectorspace(self, vector):
        """
        Returns the index of the centroid closest (by cosine distance)
        to ``vector``.
        """
        best = None
        for i in range(self._num_clusters):
            centroid = self._centroids[i]
            dist = cosine_distance(vector, centroid)
            if not best or dist < best[0]:
                best = (dist, i)
        return best[1]

    def dendrogram(self):
        """
        :return: The dendrogram representing the current clustering
        :rtype: Dendrogram
        """
        return self._dendrogram

    def num_clusters(self):
        """Returns the number of clusters produced by the last cut."""
        return self._num_clusters

    def __repr__(self):
        return "<GroupAverageAgglomerative Clusterer n=%d>" % self._num_clusters
|
||||
|
||||
|
||||
def demo():
    """
    Non-interactive demonstration of the clusterers with simple 2-D data.
    """

    from nltk.cluster import GAAClusterer

    # use a set of tokens with 2D indices
    points = [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]
    vectors = [numpy.array(point) for point in points]

    # test the GAAC clusterer with 4 clusters
    clusterer = GAAClusterer(4)
    clusters = clusterer.cluster(vectors, True)

    print("Clusterer:", clusterer)
    print("Clustered:", vectors)
    print("As:", clusters)
    print()

    # show the dendrogram
    clusterer.dendrogram().show()

    # classify a new vector
    query = numpy.array([3, 3])
    print("classify(%s):" % query, end=" ")
    print(clusterer.classify(query))
    print()


if __name__ == "__main__":
    demo()
|
||||
230
backend/venv/Lib/site-packages/nltk/cluster/kmeans.py
Normal file
230
backend/venv/Lib/site-packages/nltk/cluster/kmeans.py
Normal file
@@ -0,0 +1,230 @@
|
||||
# Natural Language Toolkit: K-Means Clusterer
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import copy
|
||||
import random
|
||||
import sys
|
||||
|
||||
try:
|
||||
import numpy
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
from nltk.cluster.util import VectorSpaceClusterer
|
||||
|
||||
|
||||
class KMeansClusterer(VectorSpaceClusterer):
    """
    The K-means clusterer starts with k arbitrary chosen means then allocates
    each vector to the cluster with the closest mean. It then recalculates the
    means of each cluster as the centroid of the vectors in the cluster. This
    process repeats until the cluster memberships stabilise. This is a
    hill-climbing algorithm which may converge to a local maximum. Hence the
    clustering is often repeated with random initial means and the most
    commonly occurring output means are chosen.
    """

    def __init__(
        self,
        num_means,
        distance,
        repeats=1,
        conv_test=1e-6,
        initial_means=None,
        normalise=False,
        svd_dimensions=None,
        rng=None,
        avoid_empty_clusters=False,
    ):
        """
        :param num_means: the number of means to use (may use fewer)
        :type num_means: int
        :param distance: measure of distance between two vectors
        :type distance: function taking two vectors and returning a float
        :param repeats: number of randomised clustering trials to use
        :type repeats: int
        :param conv_test: maximum variation in mean differences before
            deemed convergent
        :type conv_test: number
        :param initial_means: set of k initial means
        :type initial_means: sequence of vectors
        :param normalise: should vectors be normalised to length 1
        :type normalise: boolean
        :param svd_dimensions: number of dimensions to use in reducing vector
            dimensionsionality with SVD
        :type svd_dimensions: int
        :param rng: random number generator (or None)
        :type rng: Random
        :param avoid_empty_clusters: include current centroid in computation
            of next one; avoids undefined behavior
            when clusters become empty
        :type avoid_empty_clusters: boolean
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._num_means = num_means
        self._distance = distance
        self._max_difference = conv_test
        assert not initial_means or len(initial_means) == num_means
        self._means = initial_means
        assert repeats >= 1
        # supplying fixed initial means makes repeated trials pointless
        assert not (initial_means and repeats > 1)
        self._repeats = repeats
        self._rng = rng if rng else random.Random()
        self._avoid_empty_clusters = avoid_empty_clusters

    def cluster_vectorspace(self, vectors, trace=False):
        """
        Runs ``self._repeats`` k-means trials and keeps the set of means
        that is minimally different from the other trials' results.
        """
        if self._means and self._repeats > 1:
            print("Warning: means will be discarded for subsequent trials")

        meanss = []
        for trial in range(self._repeats):
            if trace:
                print("k-means trial", trial)
            # Fixed off-by-one: the original tested ``trial > 1``, which made
            # trial 1 re-run from trial 0's already-converged means and thus
            # duplicate its result. Every trial after the first must start
            # from fresh random means.
            if not self._means or trial > 0:
                self._means = self._rng.sample(list(vectors), self._num_means)
            self._cluster_vectorspace(vectors, trace)
            meanss.append(self._means)

        if len(meanss) > 1:
            # sort the means first (so that different cluster numbering won't
            # effect the distance comparison)
            for means in meanss:
                means.sort(key=sum)

            # find the set of means that's minimally different from the others
            min_difference = min_means = None
            for i in range(len(meanss)):
                d = 0
                for j in range(len(meanss)):
                    if i != j:
                        d += self._sum_distances(meanss[i], meanss[j])
                if min_difference is None or d < min_difference:
                    min_difference, min_means = d, meanss[i]

            # use the best means
            self._means = min_means

    def _cluster_vectorspace(self, vectors, trace=False):
        """One full k-means run: assign, recompute means, repeat to convergence."""
        if self._num_means < len(vectors):
            # perform k-means clustering
            converged = False
            while not converged:
                # assign the tokens to clusters based on minimum distance to
                # the cluster means
                clusters = [[] for m in range(self._num_means)]
                for vector in vectors:
                    index = self.classify_vectorspace(vector)
                    clusters[index].append(vector)

                if trace:
                    print("iteration")
                # for i in range(self._num_means):
                # print ' mean', i, 'allocated', len(clusters[i]), 'vectors'

                # recalculate cluster means by computing the centroid of each cluster
                new_means = list(map(self._centroid, clusters, self._means))

                # measure the degree of change from the previous step for convergence
                difference = self._sum_distances(self._means, new_means)
                if difference < self._max_difference:
                    converged = True

                # remember the new means
                self._means = new_means

    def classify_vectorspace(self, vector):
        # finds the closest cluster centroid
        # returns that cluster's index
        best_distance = best_index = None
        for index in range(len(self._means)):
            mean = self._means[index]
            dist = self._distance(vector, mean)
            if best_distance is None or dist < best_distance:
                best_index, best_distance = index, dist
        return best_index

    def num_clusters(self):
        """Returns the number of clusters (actual means if available)."""
        if self._means:
            return len(self._means)
        else:
            return self._num_means

    def means(self):
        """
        The means used for clustering.
        """
        return self._means

    def _sum_distances(self, vectors1, vectors2):
        """Sum of pairwise distances between corresponding vectors."""
        difference = 0.0
        for u, v in zip(vectors1, vectors2):
            difference += self._distance(u, v)
        return difference

    def _centroid(self, cluster, mean):
        """
        Centroid of ``cluster``; optionally seeded with the previous
        ``mean`` so empty clusters remain well-defined.
        """
        if self._avoid_empty_clusters:
            centroid = copy.copy(mean)
            for vector in cluster:
                centroid += vector
            return centroid / (1 + len(cluster))
        else:
            if not len(cluster):
                sys.stderr.write("Error: no centroid defined for empty cluster.\n")
                sys.stderr.write(
                    "Try setting argument 'avoid_empty_clusters' to True\n"
                )
                assert False
            centroid = copy.copy(cluster[0])
            for vector in cluster[1:]:
                centroid += vector
            return centroid / len(cluster)

    def __repr__(self):
        return "<KMeansClusterer means=%s repeats=%d>" % (self._means, self._repeats)
|
||||
|
||||
|
||||
#################################################################################
|
||||
|
||||
|
||||
def demo():
    """Non-interactive demonstration of k-means with simple 2-D data."""
    # example from figure 14.9, page 517, Manning and Schutze

    from nltk.cluster import KMeansClusterer, euclidean_distance

    vectors = [numpy.array(point) for point in [[2, 1], [1, 3], [4, 7], [6, 7]]]
    initial = [[4, 3], [5, 5]]

    clusterer = KMeansClusterer(2, euclidean_distance, initial_means=initial)
    clusters = clusterer.cluster(vectors, True, trace=True)

    print("Clustered:", vectors)
    print("As:", clusters)
    print("Means:", clusterer.means())
    print()

    points = [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]
    vectors = [numpy.array(point) for point in points]

    # test k-means using the euclidean distance metric, 2 means and repeat
    # clustering 10 times with random seeds
    clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True)
    print("Clustered:", vectors)
    print("As:", clusters)
    print("Means:", clusterer.means())
    print()

    # classify a new vector
    query = numpy.array([3, 3])
    print("classify(%s):" % query, end=" ")
    print(clusterer.classify(query))
    print()


if __name__ == "__main__":
    demo()
|
||||
300
backend/venv/Lib/site-packages/nltk/cluster/util.py
Normal file
300
backend/venv/Lib/site-packages/nltk/cluster/util.py
Normal file
@@ -0,0 +1,300 @@
|
||||
# Natural Language Toolkit: Clusterer Utilities
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
|
||||
# Contributor: J Richard Snape
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
import copy
|
||||
from abc import abstractmethod
|
||||
from math import sqrt
|
||||
from sys import stdout
|
||||
|
||||
try:
|
||||
import numpy
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
from nltk.cluster.api import ClusterI
|
||||
|
||||
|
||||
class VectorSpaceClusterer(ClusterI):
    """
    Abstract clusterer which takes tokens and maps them into a vector space.
    Optionally performs singular value decomposition to reduce the
    dimensionality.
    """

    def __init__(self, normalise=False, svd_dimensions=None):
        """
        :param normalise: should vectors be normalised to length 1
        :type normalise: boolean
        :param svd_dimensions: number of dimensions to use in reducing vector
            dimensionsionality with SVD
        :type svd_dimensions: int
        """
        # transpose of the SVD projection matrix; stays None until
        # ``cluster`` performs dimensionality reduction
        self._Tt = None
        self._should_normalise = normalise
        self._svd_dimensions = svd_dimensions

    def cluster(self, vectors, assign_clusters=False, trace=False):
        """
        Preprocesses ``vectors`` (normalisation, optional SVD reduction),
        clusters them via ``cluster_vectorspace``, and — only when
        ``assign_clusters`` is true — returns the cluster assignment for
        each vector (otherwise returns None implicitly).
        """
        assert len(vectors) > 0

        # normalise the vectors
        if self._should_normalise:
            vectors = list(map(self._normalise, vectors))

        # use SVD to reduce the dimensionality
        if self._svd_dimensions and self._svd_dimensions < len(vectors[0]):
            [u, d, vt] = numpy.linalg.svd(numpy.transpose(numpy.array(vectors)))
            S = d[: self._svd_dimensions] * numpy.identity(
                self._svd_dimensions, numpy.float64
            )
            T = u[:, : self._svd_dimensions]
            Dt = vt[: self._svd_dimensions, :]
            # vectors are replaced by their reduced-rank representation
            vectors = numpy.transpose(numpy.dot(S, Dt))
            # keep T's transpose so later queries can be projected into the
            # same reduced space (see ``classify`` / ``likelihood``)
            self._Tt = numpy.transpose(T)

        # call abstract method to cluster the vectors
        self.cluster_vectorspace(vectors, trace)

        # assign the vectors to clusters
        if assign_clusters:
            return [self.classify(vector) for vector in vectors]

    @abstractmethod
    def cluster_vectorspace(self, vectors, trace):
        """
        Finds the clusters using the given set of vectors.
        """

    def classify(self, vector):
        """
        Applies the same normalisation/projection used during clustering,
        classifies the vector, and returns the resulting cluster's name.
        """
        if self._should_normalise:
            vector = self._normalise(vector)
        if self._Tt is not None:
            vector = numpy.dot(self._Tt, vector)
        cluster = self.classify_vectorspace(vector)
        return self.cluster_name(cluster)

    @abstractmethod
    def classify_vectorspace(self, vector):
        """
        Returns the index of the appropriate cluster for the vector.
        """

    def likelihood(self, vector, label):
        """
        Likelihood of ``vector`` belonging to the cluster named ``label``,
        after applying the clusterer's preprocessing to the vector.
        """
        if self._should_normalise:
            vector = self._normalise(vector)
        if self._Tt is not None:
            vector = numpy.dot(self._Tt, vector)
        return self.likelihood_vectorspace(vector, label)

    def likelihood_vectorspace(self, vector, cluster):
        """
        Returns the likelihood of the vector belonging to the cluster.

        Default implementation is a hard 1.0/0.0 based on the predicted
        cluster; subclasses may return a soft probability instead.
        """
        predicted = self.classify_vectorspace(vector)
        return 1.0 if cluster == predicted else 0.0

    def vector(self, vector):
        """
        Returns the vector after normalisation and dimensionality reduction
        """
        if self._should_normalise:
            vector = self._normalise(vector)
        if self._Tt is not None:
            vector = numpy.dot(self._Tt, vector)
        return vector

    def _normalise(self, vector):
        """
        Normalises the vector to unit length.
        """
        return vector / sqrt(numpy.dot(vector, vector))
|
||||
|
||||
|
||||
def euclidean_distance(u, v):
    """
    Returns the euclidean distance between vectors u and v. This is
    equivalent to the length of the difference vector ``u - v``.
    """
    displacement = u - v
    squared_length = numpy.dot(displacement, displacement)
    return sqrt(squared_length)
|
||||
|
||||
|
||||
def cosine_distance(u, v):
    """
    Returns 1 minus the cosine of the angle between vectors v and u. This is
    equal to ``1 - (u.v / |u||v|)``.
    """
    dot_product = numpy.dot(u, v)
    norm_product = sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v))
    return 1 - (dot_product / norm_product)
|
||||
|
||||
|
||||
class _DendrogramNode:
|
||||
"""Tree node of a dendrogram."""
|
||||
|
||||
def __init__(self, value, *children):
|
||||
self._value = value
|
||||
self._children = children
|
||||
|
||||
def leaves(self, values=True):
|
||||
if self._children:
|
||||
leaves = []
|
||||
for child in self._children:
|
||||
leaves.extend(child.leaves(values))
|
||||
return leaves
|
||||
elif values:
|
||||
return [self._value]
|
||||
else:
|
||||
return [self]
|
||||
|
||||
def groups(self, n):
|
||||
queue = [(self._value, self)]
|
||||
|
||||
while len(queue) < n:
|
||||
priority, node = queue.pop()
|
||||
if not node._children:
|
||||
queue.push((priority, node))
|
||||
break
|
||||
for child in node._children:
|
||||
if child._children:
|
||||
queue.append((child._value, child))
|
||||
else:
|
||||
queue.append((0, child))
|
||||
# makes the earliest merges at the start, latest at the end
|
||||
queue.sort()
|
||||
|
||||
groups = []
|
||||
for priority, node in queue:
|
||||
groups.append(node.leaves())
|
||||
return groups
|
||||
|
||||
def __lt__(self, comparator):
|
||||
return cosine_distance(self._value, comparator._value) < 0
|
||||
|
||||
|
||||
class Dendrogram:
    """
    Represents a dendrogram, a tree with a specified branching order. This
    must be initialised with the leaf items, then iteratively call merge for
    each branch. This class constructs a tree representing the order of calls
    to the merge function.
    """

    def __init__(self, items=()):
        """
        :param items: the items at the leaves of the dendrogram
        :type items: sequence of (any)
        """
        # note: default changed from a mutable ``[]`` to ``()`` — behaviour
        # is identical (the argument is only iterated) but avoids the shared
        # mutable-default pitfall
        self._items = [_DendrogramNode(item) for item in items]
        self._original_items = copy.copy(self._items)
        self._merge = 1

    def merge(self, *indices):
        """
        Merges nodes at given indices in the dendrogram. The nodes will be
        combined which then replaces the first node specified. All other nodes
        involved in the merge will be removed.

        :param indices: indices of the items to merge (at least two)
        :type indices: seq of int
        """
        assert len(indices) >= 2
        node = _DendrogramNode(self._merge, *(self._items[i] for i in indices))
        self._merge += 1
        self._items[indices[0]] = node
        # fixed: delete in descending index order so earlier deletions do not
        # shift the positions of indices still to be removed (identical to
        # the original behaviour for the common two-index case)
        for i in sorted(indices[1:], reverse=True):
            del self._items[i]

    def groups(self, n):
        """
        Finds the n-groups of items (leaves) reachable from a cut at depth n.
        :param n: number of groups
        :type n: int
        """
        if len(self._items) > 1:
            root = _DendrogramNode(self._merge, *self._items)
        else:
            root = self._items[0]
        return root.groups(n)

    def show(self, leaf_labels=()):
        """
        Print the dendrogram in ASCII art to standard out.

        :param leaf_labels: an optional list of strings to use for labeling the
            leaves
        :type leaf_labels: list
        """

        # ASCII rendering characters
        JOIN, HLINK, VLINK = "+", "-", "|"

        # find the root (or create one)
        if len(self._items) > 1:
            root = _DendrogramNode(self._merge, *self._items)
        else:
            root = self._items[0]
        leaves = self._original_items

        if leaf_labels:
            last_row = leaf_labels
        else:
            last_row = ["%s" % leaf._value for leaf in leaves]

        # find the bottom row and the best cell width
        width = max(map(len, last_row)) + 1
        lhalf = width // 2
        rhalf = int(width - lhalf - 1)

        # display functions (renamed from ``format``/``display(str)`` to stop
        # shadowing the builtins)
        def cell(centre, left=" ", right=" "):
            return f"{lhalf * left}{centre}{right * rhalf}"

        def write(text):
            stdout.write(text)

        # for each merge, top down
        queue = [(root._value, root)]
        verticals = [cell(" ") for leaf in leaves]
        while queue:
            priority, node = queue.pop()
            child_left_leaf = list(map(lambda c: c.leaves(False)[0], node._children))
            indices = list(map(leaves.index, child_left_leaf))
            if child_left_leaf:
                min_idx = min(indices)
                max_idx = max(indices)
                for i in range(len(leaves)):
                    if leaves[i] in child_left_leaf:
                        if i == min_idx:
                            write(cell(JOIN, " ", HLINK))
                        elif i == max_idx:
                            write(cell(JOIN, HLINK, " "))
                        else:
                            write(cell(JOIN, HLINK, HLINK))
                        verticals[i] = cell(VLINK)
                    elif min_idx <= i <= max_idx:
                        write(cell(HLINK, HLINK, HLINK))
                    else:
                        write(verticals[i])
                write("\n")
            for child in node._children:
                if child._children:
                    queue.append((child._value, child))
            queue.sort()

            for vertical in verticals:
                write(vertical)
            write("\n")

        # finally, display the last line
        write("".join(item.center(width) for item in last_row))
        write("\n")

    def __repr__(self):
        if len(self._items) > 1:
            root = _DendrogramNode(self._merge, *self._items)
        else:
            root = self._items[0]
        leaves = root.leaves(False)
        return "<Dendrogram with %d leaves>" % len(leaves)
|
||||
656
backend/venv/Lib/site-packages/nltk/collections.py
Normal file
656
backend/venv/Lib/site-packages/nltk/collections.py
Normal file
@@ -0,0 +1,656 @@
|
||||
# Natural Language Toolkit: Collections
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import bisect
|
||||
from functools import total_ordering
|
||||
from itertools import chain, islice
|
||||
|
||||
from nltk.internals import raise_unorderable_types, slice_bounds
|
||||
|
||||
##########################################################################
|
||||
# Ordered Dictionary
|
||||
##########################################################################
|
||||
|
||||
|
||||
class OrderedDict(dict):
    """A dict subclass that remembers the order in which keys were added.

    Predates the insertion-order guarantee of the built-in ``dict``;
    kept for backward compatibility.  Accepts two optional keyword
    arguments:

    - ``keys``: an explicit key ordering (a list matching ``data``);
    - ``default_factory``: a zero-argument callable used by
      ``__missing__`` to produce values for absent keys.
    """

    def __init__(self, data=None, **kwargs):
        """
        :param data: initial contents -- a dict, another OrderedDict,
            or a list of (key, value) pairs.
        """
        # Normalize to a list so later append()/remove() calls work
        # even when self.keys(data, ...) returned a dict view.
        self._keys = list(self.keys(data, kwargs.get("keys")))
        self._default_factory = kwargs.get("default_factory")
        if data is None:
            dict.__init__(self)
        else:
            dict.__init__(self, data)

    def __delitem__(self, key):
        dict.__delitem__(self, key)
        self._keys.remove(key)

    def __getitem__(self, key):
        try:
            return dict.__getitem__(self, key)
        except KeyError:
            return self.__missing__(key)

    def __iter__(self):
        return (key for key in self.keys())

    def __missing__(self, key):
        """Return ``default_factory()`` for a missing key, or raise
        ``KeyError`` when no factory was configured.
        """
        if self._default_factory is None:
            # Previously raised a bare KeyError() -- and crashed with
            # TypeError (None is not callable) when the key was present
            # in _keys but absent from the dict.  Include the key for
            # easier debugging.
            raise KeyError(key)
        return self._default_factory()

    def __setitem__(self, key, item):
        dict.__setitem__(self, key, item)
        if key not in self._keys:
            self._keys.append(key)

    def clear(self):
        dict.clear(self)
        self._keys.clear()

    def copy(self):
        """Return a shallow copy that is itself an OrderedDict.

        (Previously built the copy via ``dict.copy``, which returns a
        plain dict; assigning ``_keys`` to it raised AttributeError.)
        """
        d = self.__class__(dict(self), keys=list(self._keys))
        d._default_factory = self._default_factory
        return d

    def items(self):
        # Pairs are produced lazily, in insertion order.
        return zip(self.keys(), self.values())

    def keys(self, data=None, keys=None):
        """With no arguments, return this dict's keys in insertion
        order.  The (historical) ``data``/``keys`` arguments are used
        internally by ``__init__`` and ``update`` to derive an initial
        key ordering from ``data``.
        """
        if data:
            if keys:
                assert isinstance(keys, list)
                assert len(data) == len(keys)
                return keys
            else:
                assert (
                    isinstance(data, dict)
                    or isinstance(data, OrderedDict)
                    or isinstance(data, list)
                )
                if isinstance(data, dict) or isinstance(data, OrderedDict):
                    return data.keys()
                elif isinstance(data, list):
                    return [key for (key, value) in data]
        elif "_keys" in self.__dict__:
            return self._keys
        else:
            return []

    def popitem(self):
        """Remove and return the most recently added (key, value) pair."""
        if not self._keys:
            raise KeyError("popitem(): dictionary is empty")

        key = self._keys.pop()
        value = self[key]
        # Bypass our __delitem__: the key is already gone from _keys,
        # so list.remove() there would raise ValueError (the original
        # ``del self[key]`` made popitem always fail).
        dict.__delitem__(self, key)
        return (key, value)

    def setdefault(self, key, failobj=None):
        # NOTE: unlike dict.setdefault, returns None (historical behavior).
        dict.setdefault(self, key, failobj)
        if key not in self._keys:
            self._keys.append(key)

    def update(self, data):
        dict.update(self, data)
        for key in self.keys(data):
            if key not in self._keys:
                self._keys.append(key)

    def values(self):
        # Values are produced lazily, in insertion order.
        return map(self.get, self._keys)
|
||||
|
||||
|
||||
######################################################################
|
||||
# Lazy Sequences
|
||||
######################################################################
|
||||
|
||||
|
||||
@total_ordering
class AbstractLazySequence:
    """
    An abstract base class for read-only sequences whose values are
    computed as needed. Lazy sequences act like tuples -- they can be
    indexed, sliced, and iterated over; but they may not be modified.

    The most common application of lazy sequences in NLTK is for
    corpus view objects, which provide access to the contents of a
    corpus without loading the entire corpus into memory, by loading
    pieces of the corpus from disk as needed.

    The result of modifying a mutable element of a lazy sequence is
    undefined. In particular, the modifications made to the element
    may or may not persist, depending on whether and when the lazy
    sequence caches that element's value or reconstructs it from
    scratch.

    Subclasses are required to define two methods: ``__len__()``
    and ``iterate_from()``.
    """

    def __len__(self):
        """
        Return the number of tokens in the corpus file underlying this
        corpus view.
        """
        raise NotImplementedError("should be implemented by subclass")

    def iterate_from(self, start):
        """
        Return an iterator that generates the tokens in the corpus
        file underlying this corpus view, starting at the token number
        ``start``. If ``start>=len(self)``, then this iterator will
        generate no tokens.
        """
        raise NotImplementedError("should be implemented by subclass")

    def __getitem__(self, i):
        """
        Return the *i* th token in the corpus file underlying this
        corpus view. Negative indices and spans are both supported.

        :raises IndexError: if ``i`` is out of range.
        """
        if isinstance(i, slice):
            # Slices become lazy subsequences (short spans may be
            # materialized eagerly -- see LazySubsequence.__new__).
            start, stop = slice_bounds(self, i)
            return LazySubsequence(self, start, stop)
        else:
            # Handle negative indices
            if i < 0:
                i += len(self)
            if i < 0:
                raise IndexError("index out of range")
            # Use iterate_from to extract it.
            try:
                return next(self.iterate_from(i))
            except StopIteration as e:
                raise IndexError("index out of range") from e

    def __iter__(self):
        """Return an iterator that generates the tokens in the corpus
        file underlying this corpus view."""
        return self.iterate_from(0)

    def count(self, value):
        """Return the number of times this list contains ``value``."""
        # Full scan: forces evaluation of the entire sequence.
        return sum(1 for elt in self if elt == value)

    def index(self, value, start=None, stop=None):
        """Return the index of the first occurrence of ``value`` in this
        list that is greater than or equal to ``start`` and less than
        ``stop``. Negative start and stop values are treated like negative
        slice bounds -- i.e., they count from the end of the list."""
        start, stop = slice_bounds(self, slice(start, stop))
        for i, elt in enumerate(islice(self, start, stop)):
            if elt == value:
                # i is relative to the islice window; re-anchor it.
                return i + start
        raise ValueError("index(x): x not in list")

    def __contains__(self, value):
        """Return true if this list contains ``value``."""
        return bool(self.count(value))

    def __add__(self, other):
        """Return a list concatenating self with other."""
        return LazyConcatenation([self, other])

    def __radd__(self, other):
        """Return a list concatenating other with self."""
        return LazyConcatenation([other, self])

    def __mul__(self, count):
        """Return a list concatenating self with itself ``count`` times."""
        return LazyConcatenation([self] * count)

    def __rmul__(self, count):
        """Return a list concatenating self with itself ``count`` times."""
        return LazyConcatenation([self] * count)

    # Maximum length (in characters) of the list-style repr before
    # truncation kicks in.
    _MAX_REPR_SIZE = 60

    def __repr__(self):
        """
        Return a string representation for this corpus view that is
        similar to a list's representation; but if it would be more
        than 60 characters long, it is truncated.
        """
        pieces = []
        length = 5  # budget for the surrounding brackets / ellipsis
        for elt in self:
            pieces.append(repr(elt))
            length += len(pieces[-1]) + 2  # +2 for the ", " separator
            if length > self._MAX_REPR_SIZE and len(pieces) > 2:
                # Drop the piece that overflowed and truncate.
                return "[%s, ...]" % ", ".join(pieces[:-1])
        return "[%s]" % ", ".join(pieces)

    def __eq__(self, other):
        # Equality requires the same concrete type; forces full
        # evaluation of both sequences.
        return type(self) == type(other) and list(self) == list(other)

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        # total_ordering fills in the remaining comparisons from this
        # plus __eq__.
        if type(other) != type(self):
            raise_unorderable_types("<", self, other)
        return list(self) < list(other)

    def __hash__(self):
        """
        :raise ValueError: Corpus view objects are unhashable.
        """
        raise ValueError("%s objects are unhashable" % self.__class__.__name__)
|
||||
|
||||
|
||||
class LazySubsequence(AbstractLazySequence):
    """
    A subsequence produced by slicing a lazy sequence. This slice
    keeps a reference to its source sequence, and generates its values
    by looking them up in the source sequence.
    """

    MIN_SIZE = 100
    """
    The minimum size for which lazy slices should be created. If
    ``LazySubsequence()`` is called with a subsequence that is
    shorter than ``MIN_SIZE``, then a list of the values is returned
    instead.
    """

    def __new__(cls, source, start, stop):
        """
        Construct a new slice from a given underlying sequence. The
        ``start`` and ``stop`` indices should be absolute indices --
        i.e., they should not be negative (for indexing from the back
        of a list) or greater than the length of ``source``.
        """
        # If the slice is small enough, materialize it eagerly as a
        # plain list instead of building a lazy wrapper.  Returning a
        # non-instance here means __init__ below is skipped entirely.
        if stop - start < cls.MIN_SIZE:
            return list(islice(source.iterate_from(start), stop - start))
        else:
            return object.__new__(cls)

    def __init__(self, source, start, stop):
        # Only reached when __new__ returned a lazy instance.
        self._source = source
        self._start = start  # absolute offset of this slice in source
        self._stop = stop  # absolute (exclusive) end offset in source
        
    def __len__(self):
        return self._stop - self._start

    def iterate_from(self, start):
        # Translate the slice-relative offset into a source offset, and
        # cap iteration at this slice's end (max(0, ...) guards against
        # start past the end).
        return islice(
            self._source.iterate_from(start + self._start), max(0, len(self) - start)
        )
|
||||
|
||||
|
||||
class LazyConcatenation(AbstractLazySequence):
    """
    A lazy sequence formed by concatenating a list of lists. This
    underlying list of lists may itself be lazy. ``LazyConcatenation``
    maintains an index that it uses to keep track of the relationship
    between offsets in the concatenated lists and offsets in the
    sublists.
    """

    def __init__(self, list_of_lists):
        # The underlying (possibly lazy) list of sublists.
        self._list = list_of_lists
        # _offsets[i] is the flattened-sequence index of the first
        # element of sublist i; extended lazily as sublists are visited.
        self._offsets = [0]

    def __len__(self):
        # Force iteration over any not-yet-visited sublists so that
        # _offsets covers the whole sequence; its last entry is then
        # the total length.
        if len(self._offsets) <= len(self._list):
            for _ in self.iterate_from(self._offsets[-1]):
                pass
        return self._offsets[-1]

    def iterate_from(self, start_index):
        # Find the sublist containing start_index: binary search within
        # the offsets already computed, otherwise resume from the last
        # known sublist.
        if start_index < self._offsets[-1]:
            sublist_index = bisect.bisect_right(self._offsets, start_index) - 1
        else:
            sublist_index = len(self._offsets) - 1

        index = self._offsets[sublist_index]

        # Construct an iterator over the sublists.
        if isinstance(self._list, AbstractLazySequence):
            sublist_iter = self._list.iterate_from(sublist_index)
        else:
            sublist_iter = islice(self._list, sublist_index, None)

        for sublist in sublist_iter:
            if sublist_index == (len(self._offsets) - 1):
                # First visit to this sublist: record its end offset.
                assert (
                    index + len(sublist) >= self._offsets[-1]
                ), "offsets not monotonic increasing!"
                self._offsets.append(index + len(sublist))
            else:
                # Revisit: the cached offset must still match, i.e. the
                # sublist must not have changed length.
                assert self._offsets[sublist_index + 1] == index + len(
                    sublist
                ), "inconsistent list value (num elts)"

            # Skip the part of the first sublist before start_index.
            yield from sublist[max(0, start_index - index) :]

            index += len(sublist)
            sublist_index += 1
|
||||
|
||||
|
||||
class LazyMap(AbstractLazySequence):
    """
    A lazy sequence whose elements are formed by applying a given
    function to each element in one or more underlying lists. The
    function is applied lazily -- i.e., when you read a value from the
    list, ``LazyMap`` will calculate that value by applying its
    function to the underlying lists' value(s). ``LazyMap`` is
    essentially a lazy version of the Python primitive function
    ``map``. In particular, the following two expressions are
    equivalent:

        >>> from nltk.collections import LazyMap
        >>> function = str
        >>> sequence = [1,2,3]
        >>> map(function, sequence) # doctest: +SKIP
        ['1', '2', '3']
        >>> list(LazyMap(function, sequence))
        ['1', '2', '3']

    Like the Python ``map`` primitive, if the source lists do not have
    equal size, then the value None will be supplied for the
    'missing' elements.

    Lazy maps can be useful for conserving memory, in cases where
    individual values take up a lot of space. This is especially true
    if the underlying list's values are constructed lazily, as is the
    case with many corpus readers.

    A typical example of a use case for this class is performing
    feature detection on the tokens in a corpus. Since featuresets
    are encoded as dictionaries, which can take up a lot of memory,
    using a ``LazyMap`` can significantly reduce memory usage when
    training and running classifiers.
    """

    def __init__(self, function, *lists, **config):
        """
        :param function: The function that should be applied to
            elements of ``lists``. It should take as many arguments
            as there are ``lists``.
        :param lists: The underlying lists.
        :param cache_size: Determines the size of the cache used
            by this lazy map. (default=5)
        :raises TypeError: if no underlying lists are supplied.
        """
        if not lists:
            raise TypeError("LazyMap requires at least two args")

        self._lists = lists
        self._func = function
        self._cache_size = config.get("cache_size", 5)
        # A cache_size <= 0 disables caching entirely.
        self._cache = {} if self._cache_size > 0 else None

        # True only when *every* source list is lazy.  (bool(sum(...))
        # would be true as soon as a single list were lazy, which is
        # not the intended condition.)
        self._all_lazy = sum(
            isinstance(lst, AbstractLazySequence) for lst in lists
        ) == len(lists)

    def iterate_from(self, index):
        """Yield ``function(...)`` results starting at ``index``,
        padding exhausted source lists with None (as old-style ``map``
        did) until every list is exhausted.
        """
        # Special case: one lazy sublist
        if len(self._lists) == 1 and self._all_lazy:
            for value in self._lists[0].iterate_from(index):
                yield self._func(value)
            return

        # Special case: one non-lazy sublist
        elif len(self._lists) == 1:
            while True:
                try:
                    yield self._func(self._lists[0][index])
                except IndexError:
                    return
                index += 1

        # Special case: n lazy sublists
        elif self._all_lazy:
            iterators = [lst.iterate_from(index) for lst in self._lists]
            while True:
                elements = []
                for iterator in iterators:
                    try:
                        elements.append(next(iterator))
                    except StopIteration:
                        # This list is exhausted: pad with None.  (Was a
                        # bare ``except:``, which also swallowed
                        # unrelated errors raised while computing the
                        # next value.)
                        elements.append(None)
                if elements == [None] * len(self._lists):
                    return
                yield self._func(*elements)
                index += 1

        # general case
        else:
            while True:
                try:
                    elements = [lst[index] for lst in self._lists]
                except IndexError:
                    # At least one list is exhausted; retry per-list so
                    # shorter lists contribute None.
                    elements = [None] * len(self._lists)
                    for i, lst in enumerate(self._lists):
                        try:
                            elements[i] = lst[index]
                        except IndexError:
                            pass
                if elements == [None] * len(self._lists):
                    return
                yield self._func(*elements)
                index += 1

    def __getitem__(self, index):
        """Return element ``index`` (or a new LazyMap for a slice),
        consulting/updating the small LRU-ish cache for scalar lookups.

        :raises IndexError: if ``index`` is out of range.
        """
        if isinstance(index, slice):
            sliced_lists = [lst[index] for lst in self._lists]
            return LazyMap(self._func, *sliced_lists)
        else:
            # Handle negative indices
            if index < 0:
                index += len(self)
            if index < 0:
                raise IndexError("index out of range")
            # Check the cache
            if self._cache is not None and index in self._cache:
                return self._cache[index]
            # Calculate the value
            try:
                val = next(self.iterate_from(index))
            except StopIteration as e:
                raise IndexError("index out of range") from e
            # Update the cache
            if self._cache is not None:
                if len(self._cache) > self._cache_size:
                    self._cache.popitem()  # discard random entry
                self._cache[index] = val
            # Return the value
            return val

    def __len__(self):
        # Like old-style map(): the length of the longest source list.
        return max(len(lst) for lst in self._lists)
|
||||
|
||||
|
||||
class LazyZip(LazyMap):
    """
    A lazy analogue of the built-in ``zip``: element *i* is the tuple
    of the *i*-th elements of the argument sequences, constructed only
    when it is actually read, and the sequence is truncated to the
    length of the shortest argument.

        >>> from nltk.collections import LazyZip
        >>> sequence1, sequence2 = [1, 2, 3], ['a', 'b', 'c']
        >>> list(LazyZip(sequence1, sequence2))
        [(1, 'a'), (2, 'b'), (3, 'c')]
        >>> sequences = [sequence1, sequence2, [6,7,8,9]]
        >>> list(zip(*sequences)) == list(LazyZip(*sequences))
        True

    Building the tuples on demand avoids materializing a second long
    sequence, which is useful e.g. when pairing long gold-standard and
    predicted label sequences to compute accuracy.
    """

    def __init__(self, *lists):
        """
        :param lists: the underlying lists
        :type lists: list(list)
        """
        # A LazyMap whose function simply bundles its arguments.
        super().__init__(lambda *elts: elts, *lists)

    def iterate_from(self, index):
        # Draw tuples from LazyMap's iterator, but stop at the length
        # of the shortest source list rather than the longest.
        source = super().iterate_from(index)
        for _ in range(max(0, len(self) - index)):
            yield next(source)

    def __len__(self):
        # The zip is only as long as its shortest source.
        return min(map(len, self._lists))
|
||||
|
||||
|
||||
class LazyEnumerate(LazyZip):
    """
    A lazy analogue of the built-in ``enumerate``: element *i* is the
    pair ``(i, lst[i])``, constructed only when it is actually read.

        >>> from nltk.collections import LazyEnumerate
        >>> sequence = ['first', 'second', 'third']
        >>> list(LazyEnumerate(sequence))
        [(0, 'first'), (1, 'second'), (2, 'third')]

    Useful for obtaining an indexed view of a long (possibly lazy)
    sequence without materializing an extra list of pairs.
    """

    def __init__(self, lst):
        """
        :param lst: the underlying list
        :type lst: list
        """
        # Zip the sequence against its own index range.
        super().__init__(range(len(lst)), lst)
|
||||
|
||||
|
||||
class LazyIteratorList(AbstractLazySequence):
    """
    Wraps an iterator, loading its elements on demand
    and making them subscriptable.
    __repr__ displays only the first few elements.
    """

    def __init__(self, it, known_len=None):
        # The wrapped iterator; consumed incrementally and never reset.
        self._it = it
        # Total length, if the caller knows it up front; otherwise
        # computed (and cached) on first __len__ call.
        self._len = known_len
        # Elements pulled from the iterator so far, in order.
        self._cache = []

    def __len__(self):
        if self._len:
            # NOTE(review): a known length of 0 is falsy and would be
            # recomputed below; harmless, but ``is not None`` would be
            # stricter.
            return self._len
        # Exhaust the iterator to discover the length.
        for _ in self.iterate_from(len(self._cache)):
            pass
        self._len = len(self._cache)
        return self._len

    def iterate_from(self, start):
        """Create a new iterator over this list starting at the given offset."""
        # Fill the cache up to the requested offset.
        while len(self._cache) < start:
            v = next(self._it)
            self._cache.append(v)
        # First serve already-cached elements (the cache may keep
        # growing concurrently via other iterators over self).
        i = start
        while i < len(self._cache):
            yield self._cache[i]
            i += 1
        # Then pull fresh elements, caching each one.
        try:
            while True:
                v = next(self._it)
                self._cache.append(v)
                yield v
        except StopIteration:
            pass

    def __add__(self, other):
        """Return a list concatenating self with other."""
        return type(self)(chain(self, other))

    def __radd__(self, other):
        """Return a list concatenating other with self."""
        return type(self)(chain(other, self))
|
||||
|
||||
|
||||
######################################################################
|
||||
# Trie Implementation
|
||||
######################################################################
|
||||
class Trie(dict):
    """A trie (prefix tree) over strings, represented as nested dicts.

    Each node is itself a ``Trie``; a key of ``Trie.LEAF`` (mapped to
    None) marks the end of a complete string.
    """

    # Sentinel key marking that a complete string ends at this node.
    LEAF = True

    def __init__(self, strings=None):
        """Create a trie, optionally pre-loaded with ``strings``.

        :param strings: List of strings to insert into the trie
            (Default is ``None``)
        :type strings: list(str)
        """
        super().__init__()
        for s in strings or ():
            self.insert(s)

    def insert(self, string):
        """Inserts ``string`` into the Trie

        :param string: String to insert into the trie
        :type string: str

        :Example:

        >>> from nltk.collections import Trie
        >>> trie = Trie(["abc", "def"])
        >>> expected = {'a': {'b': {'c': {True: None}}}, \
                        'd': {'e': {'f': {True: None}}}}
        >>> trie == expected
        True

        """
        # Walk/extend the tree one character at a time; __missing__
        # creates child nodes on demand.
        node = self
        for ch in string:
            node = node[ch]
        node[Trie.LEAF] = None

    def __missing__(self, key):
        # Absent children spring into existence as empty sub-tries.
        child = Trie()
        self[key] = child
        return child
|
||||
412
backend/venv/Lib/site-packages/nltk/collocations.py
Normal file
412
backend/venv/Lib/site-packages/nltk/collocations.py
Normal file
@@ -0,0 +1,412 @@
|
||||
# Natural Language Toolkit: Collocations and Association Measures
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Joel Nothman <jnothman@student.usyd.edu.au>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
#
|
||||
"""
|
||||
Tools to identify collocations --- words that often appear consecutively
|
||||
--- within corpora. They may also be used to find other associations between
|
||||
word occurrences.
|
||||
See Manning and Schutze ch. 5 at https://nlp.stanford.edu/fsnlp/promo/colloc.pdf
|
||||
and the Text::NSP Perl package at http://ngram.sourceforge.net
|
||||
|
||||
Finding collocations requires first calculating the frequencies of words and
|
||||
their appearance in the context of other words. Often the collection of words
|
||||
will then require filtering to only retain useful content terms. Each ngram
|
||||
of words may then be scored according to some association measure, in order
|
||||
to determine the relative likelihood of each ngram being a collocation.
|
||||
|
||||
The ``BigramCollocationFinder`` and ``TrigramCollocationFinder`` classes provide
|
||||
these functionalities, dependent on being provided a function which scores a
|
||||
ngram given appropriate frequency counts. A number of standard association
|
||||
measures are provided in bigram_measures and trigram_measures.
|
||||
"""
|
||||
|
||||
# Possible TODOs:
|
||||
# - consider the distinction between f(x,_) and f(x) and whether our
|
||||
# approximation is good enough for fragmented data, and mention it
|
||||
# - add a n-gram collocation finder with measures which only utilise n-gram
|
||||
# and unigram counts (raw_freq, pmi, student_t)
|
||||
|
||||
import itertools as _itertools
|
||||
|
||||
# these two unused imports are referenced in collocations.doctest
|
||||
from nltk.metrics import (
|
||||
BigramAssocMeasures,
|
||||
ContingencyMeasures,
|
||||
QuadgramAssocMeasures,
|
||||
TrigramAssocMeasures,
|
||||
)
|
||||
from nltk.metrics.spearman import ranks_from_scores, spearman_correlation
|
||||
from nltk.probability import FreqDist
|
||||
from nltk.util import ngrams
|
||||
|
||||
|
||||
class AbstractCollocationFinder:
    """
    An abstract base class for collocation finders whose purpose is to
    collect collocation candidate frequencies, filter and rank them.

    As a minimum, collocation finders require the frequencies of each
    word in a corpus, and the joint frequency of word tuples. This data
    should be provided through nltk.probability.FreqDist objects or an
    identical interface.
    """

    def __init__(self, word_fd, ngram_fd):
        # Unigram frequencies, total sample size, and ngram frequencies.
        self.word_fd = word_fd
        self.N = word_fd.N()
        self.ngram_fd = ngram_fd

    @classmethod
    def _build_new_documents(
        cls, documents, window_size, pad_left=False, pad_right=False, pad_symbol=None
    ):
        """
        Concatenate ``documents`` into one stream, padding each document
        with ``window_size - 1`` copies of ``pad_symbol`` on the
        requested side so that ngram windows never span a document
        boundary.  Returns a lazy iterator.
        """
        padding = (pad_symbol,) * (window_size - 1)
        if pad_right:
            return _itertools.chain.from_iterable(
                _itertools.chain(doc, padding) for doc in documents
            )
        if pad_left:
            return _itertools.chain.from_iterable(
                _itertools.chain(padding, doc) for doc in documents
            )
        # Previously fell through and returned None when neither padding
        # flag was set; concatenate without padding instead.
        return _itertools.chain.from_iterable(documents)

    @classmethod
    def from_documents(cls, documents):
        """Constructs a collocation finder given a collection of documents,
        each of which is a list (or iterable) of tokens.
        """
        # Right-pad each document so ngrams don't cross document borders.
        return cls.from_words(
            cls._build_new_documents(documents, cls.default_ws, pad_right=True)
        )

    @staticmethod
    def _ngram_freqdist(words, n):
        """Return a FreqDist of all contiguous ``n``-grams in ``words``."""
        # The range formerly stopped at len(words) - 1, which emitted a
        # truncated final tuple whenever n != 2; stop so that every
        # emitted tuple has exactly n items.
        return FreqDist(tuple(words[i : i + n]) for i in range(len(words) - n + 1))

    def _apply_filter(self, fn=lambda ngram, freq: False):
        """Generic filter removes ngrams from the frequency distribution
        if the function returns True when passed an ngram tuple.
        """
        tmp_ngram = FreqDist()
        for ngram, freq in self.ngram_fd.items():
            if not fn(ngram, freq):
                tmp_ngram[ngram] = freq
        self.ngram_fd = tmp_ngram

    def apply_freq_filter(self, min_freq):
        """Removes candidate ngrams which have frequency less than min_freq."""
        self._apply_filter(lambda ng, freq: freq < min_freq)

    def apply_ngram_filter(self, fn):
        """Removes candidate ngrams (w1, w2, ...) where fn(w1, w2, ...)
        evaluates to True.
        """
        self._apply_filter(lambda ng, f: fn(*ng))

    def apply_word_filter(self, fn):
        """Removes candidate ngrams (w1, w2, ...) where any of (fn(w1), fn(w2),
        ...) evaluates to True.
        """
        self._apply_filter(lambda ng, f: any(fn(w) for w in ng))

    def _score_ngrams(self, score_fn):
        """Generates (ngram, score) pairs as determined by the scoring
        function provided.  Ngrams for which score_ngram returns None
        are skipped.
        """
        for tup in self.ngram_fd:
            score = self.score_ngram(score_fn, *tup)
            if score is not None:
                yield tup, score

    def score_ngrams(self, score_fn):
        """Returns a sequence of (ngram, score) pairs ordered from highest to
        lowest score, as determined by the scoring function provided.
        Ties are broken by ngram, ascending.
        """
        return sorted(self._score_ngrams(score_fn), key=lambda t: (-t[1], t[0]))

    def nbest(self, score_fn, n):
        """Returns the top n ngrams when scored by the given function."""
        return [p for p, s in self.score_ngrams(score_fn)[:n]]

    def above_score(self, score_fn, min_score):
        """Returns a sequence of ngrams, ordered by decreasing score, whose
        scores each exceed the given minimum score.
        """
        for ngram, score in self.score_ngrams(score_fn):
            if score > min_score:
                yield ngram
            else:
                # Scores are sorted descending, so we can stop early.
                break
|
||||
|
||||
|
||||
class BigramCollocationFinder(AbstractCollocationFinder):
    """Finds and ranks bigram collocations (or other association
    measures).  Prefer the ``from_words`` constructor over building an
    instance directly.
    """

    # Default window size used by from_documents.
    default_ws = 2

    def __init__(self, word_fd, bigram_fd, window_size=2):
        """Create a finder from unigram and (possibly non-contiguous)
        bigram frequency distributions; ``window_size`` records how the
        bigram counts were collected.
        """
        super().__init__(word_fd, bigram_fd)
        self.window_size = window_size

    @classmethod
    def from_words(cls, words, window_size=2):
        """Build a finder counting all bigrams in ``words``.  With
        ``window_size > 2``, non-contiguous bigrams are counted too, in
        the style of Church and Hanks's (1990) association ratio.
        """
        if window_size < 2:
            raise ValueError("Specify window_size at least 2")

        wfd = FreqDist()
        bfd = FreqDist()
        for window in ngrams(words, window_size, pad_right=True):
            head, *rest = window
            if head is None:
                # Window produced purely by right-padding.
                continue
            wfd[head] += 1
            for partner in rest:
                if partner is None:
                    continue
                bfd[(head, partner)] += 1
        return cls(wfd, bfd, window_size=window_size)

    def score_ngram(self, score_fn, w1, w2):
        """Score the bigram ``(w1, w2)`` with ``score_fn``.  Following
        Church and Hanks (1990), counts are scaled by a factor of
        1/(window_size - 1).  Returns None for unseen bigrams.
        """
        joint = self.ngram_fd[(w1, w2)] / (self.window_size - 1.0)
        if not joint:
            return
        marginals = (self.word_fd[w1], self.word_fd[w2])
        return score_fn(joint, marginals, self.N)
|
||||
|
||||
|
||||
class TrigramCollocationFinder(AbstractCollocationFinder):
    """A tool for the finding and ranking of trigram collocations or other
    association measures. It is often useful to use from_words() rather than
    constructing an instance directly.
    """

    # Default window size used by from_documents.
    default_ws = 3

    def __init__(self, word_fd, bigram_fd, wildcard_fd, trigram_fd):
        """Construct a TrigramCollocationFinder, given FreqDists for
        appearances of words, bigrams, two words with any word between them,
        and trigrams.
        """
        AbstractCollocationFinder.__init__(self, word_fd, trigram_fd)
        # (w1, _, w3) counts -- the two outer words with any word between.
        self.wildcard_fd = wildcard_fd
        # Contiguous (w1, w2) counts.
        self.bigram_fd = bigram_fd

    @classmethod
    def from_words(cls, words, window_size=3):
        """Construct a TrigramCollocationFinder for all trigrams in the given
        sequence.
        """
        if window_size < 3:
            raise ValueError("Specify window_size at least 3")

        wfd = FreqDist()  # unigram counts (window heads)
        wildfd = FreqDist()  # (w1, w3) with a gap in between
        bfd = FreqDist()  # (w1, w2) bigrams
        tfd = FreqDist()  # (w1, w2, w3) trigrams
        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                # Window produced purely by right-padding.
                continue
            for w2, w3 in _itertools.combinations(window[1:], 2):
                # w1 is incremented once per (w2, w3) pair drawn from
                # the window, matching the window-scaling convention of
                # the other counts.
                wfd[w1] += 1
                if w2 is None:
                    continue
                bfd[(w1, w2)] += 1
                if w3 is None:
                    continue
                wildfd[(w1, w3)] += 1
                tfd[(w1, w2, w3)] += 1
        return cls(wfd, bfd, wildfd, tfd)

    def bigram_finder(self):
        """Constructs a bigram collocation finder with the bigram and unigram
        data from this finder. Note that this does not include any filtering
        applied to this finder.
        """
        return BigramCollocationFinder(self.word_fd, self.bigram_fd)

    def score_ngram(self, score_fn, w1, w2, w3):
        """Returns the score for a given trigram using the given scoring
        function.  Returns None for unseen trigrams.
        """
        n_all = self.N
        n_iii = self.ngram_fd[(w1, w2, w3)]
        if not n_iii:
            return
        # Pairwise and single-word marginal counts; 'i' marks a counted
        # position, 'x' any word.
        n_iix = self.bigram_fd[(w1, w2)]
        n_ixi = self.wildcard_fd[(w1, w3)]
        n_xii = self.bigram_fd[(w2, w3)]
        n_ixx = self.word_fd[w1]
        n_xix = self.word_fd[w2]
        n_xxi = self.word_fd[w3]
        return score_fn(n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi), n_all)
|
||||
|
||||
|
||||
class QuadgramCollocationFinder(AbstractCollocationFinder):
    """A tool for the finding and ranking of quadgram collocations or other association measures.
    It is often useful to use from_words() rather than constructing an instance directly.

    Naming convention for the frequency distributions: each "i" is a word
    that is counted and each "x" is a skipped position, read left to right.
    For example ``ixxi`` counts pairs (w1, w4) with two words between them.
    """

    # Default window size used when building a finder from documents.
    default_ws = 4

    def __init__(self, word_fd, quadgram_fd, ii, iii, ixi, ixxi, iixi, ixii):
        """Construct a QuadgramCollocationFinder, given FreqDists for appearances of words,
        bigrams, trigrams, two words with one word and two words between them, three words
        with a word between them in both variations.
        """
        AbstractCollocationFinder.__init__(self, word_fd, quadgram_fd)
        self.iii = iii
        self.ii = ii
        self.ixi = ixi
        self.ixxi = ixxi
        self.iixi = iixi
        self.ixii = ixii

    @classmethod
    def from_words(cls, words, window_size=4):
        """Construct a QuadgramCollocationFinder for all quadgrams in the
        given sequence, counting within a sliding window of ``window_size``
        (must be >= 4).
        """
        if window_size < 4:
            raise ValueError("Specify window_size at least 4")
        ixxx = FreqDist()
        iiii = FreqDist()
        ii = FreqDist()
        iii = FreqDist()
        ixi = FreqDist()
        ixxi = FreqDist()
        iixi = FreqDist()
        ixii = FreqDist()

        # pad_right=True keeps counting partial windows at the end of the
        # sequence; the None padding is skipped positionally below.
        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                continue
            for w2, w3, w4 in _itertools.combinations(window[1:], 3):
                ixxx[w1] += 1
                if w2 is None:
                    continue
                ii[(w1, w2)] += 1
                if w3 is None:
                    continue
                iii[(w1, w2, w3)] += 1
                ixi[(w1, w3)] += 1
                if w4 is None:
                    continue
                iiii[(w1, w2, w3, w4)] += 1
                ixxi[(w1, w4)] += 1
                ixii[(w1, w3, w4)] += 1
                iixi[(w1, w2, w4)] += 1

        return cls(ixxx, iiii, ii, iii, ixi, ixxi, iixi, ixii)

    def score_ngram(self, score_fn, w1, w2, w3, w4):
        """Return the association score for the quadgram ``(w1, w2, w3, w4)``
        computed by ``score_fn``, or ``None`` if the quadgram never occurred.
        """
        n_all = self.N
        n_iiii = self.ngram_fd[(w1, w2, w3, w4)]
        if not n_iiii:
            return
        # Trigram marginals: the contiguous and one-gap triples that
        # contain the quadgram's words.
        n_iiix = self.iii[(w1, w2, w3)]
        n_xiii = self.iii[(w2, w3, w4)]
        n_iixi = self.iixi[(w1, w2, w4)]
        n_ixii = self.ixii[(w1, w3, w4)]

        # Pair marginals; note ixi serves both (w1,w3) and (w2,w4), and
        # ii serves every adjacent pair, since keys are plain word pairs.
        n_iixx = self.ii[(w1, w2)]
        n_xxii = self.ii[(w3, w4)]
        n_xiix = self.ii[(w2, w3)]
        n_ixix = self.ixi[(w1, w3)]
        n_ixxi = self.ixxi[(w1, w4)]
        n_xixi = self.ixi[(w2, w4)]

        # Unigram marginals.
        n_ixxx = self.word_fd[w1]
        n_xixx = self.word_fd[w2]
        n_xxix = self.word_fd[w3]
        n_xxxi = self.word_fd[w4]
        return score_fn(
            n_iiii,
            (n_iiix, n_iixi, n_ixii, n_xiii),
            (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
            (n_ixxx, n_xixx, n_xxix, n_xxxi),
            n_all,
        )
|
||||
|
||||
|
||||
def demo(scorer=None, compare_scorer=None):
    """Finds bigram collocations in the files of the WebText corpus.

    :param scorer: association measure used to rank bigrams
        (default: likelihood ratio).
    :param compare_scorer: measure compared against ``scorer`` via
        Spearman rank correlation (default: raw frequency).
    """
    from nltk.metrics import (
        BigramAssocMeasures,
        ranks_from_scores,
        spearman_correlation,
    )

    if scorer is None:
        scorer = BigramAssocMeasures.likelihood_ratio
    if compare_scorer is None:
        compare_scorer = BigramAssocMeasures.raw_freq

    from nltk.corpus import stopwords, webtext

    ignored_words = stopwords.words("english")

    # PEP 8 (E731): use a def rather than assigning a lambda to a name.
    def word_filter(w):
        """Reject short words and English stopwords."""
        return len(w) < 3 or w.lower() in ignored_words

    for file in webtext.fileids():
        words = [word.lower() for word in webtext.words(file)]

        cf = BigramCollocationFinder.from_words(words)
        # Drop rare bigrams and those containing trivial words before scoring.
        cf.apply_freq_filter(3)
        cf.apply_word_filter(word_filter)

        corr = spearman_correlation(
            ranks_from_scores(cf.score_ngrams(scorer)),
            ranks_from_scores(cf.score_ngrams(compare_scorer)),
        )
        print(file)
        print("\t", [" ".join(tup) for tup in cf.nbest(scorer, 15)])
        print(f"\t Correlation to {compare_scorer.__name__}: {corr:0.4f}")
|
||||
|
||||
|
||||
# Slows down loading too much
|
||||
# bigram_measures = BigramAssocMeasures()
|
||||
# trigram_measures = TrigramAssocMeasures()
|
||||
|
||||
if __name__ == "__main__":
    # Command-line usage:  python collocations.py [scorer] [compare_scorer]
    # where each optional argument names a method of BigramAssocMeasures,
    # e.g. "likelihood_ratio".
    import sys

    from nltk.metrics import BigramAssocMeasures

    try:
        # getattr avoids eval() of a command-line string (arbitrary code
        # execution); sys.argv[1] still raises IndexError when the argument
        # is absent, so the fallback behavior is unchanged.
        scorer = getattr(BigramAssocMeasures, sys.argv[1])
    except IndexError:
        scorer = None
    try:
        compare_scorer = getattr(BigramAssocMeasures, sys.argv[2])
    except IndexError:
        compare_scorer = None

    demo(scorer, compare_scorer)
|
||||
|
||||
# Public API of this module: the three window-size-specific finders.
__all__ = [
    "BigramCollocationFinder",
    "TrigramCollocationFinder",
    "QuadgramCollocationFinder",
]
|
||||
38
backend/venv/Lib/site-packages/nltk/compat.py
Normal file
38
backend/venv/Lib/site-packages/nltk/compat.py
Normal file
@@ -0,0 +1,38 @@
|
||||
# Natural Language Toolkit: Compatibility
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import os
|
||||
from functools import wraps
|
||||
|
||||
# ======= Compatibility for datasets that care about Python versions ========

# The following datasets have a /PY3 subdirectory containing
# a full copy of the data which has been re-encoded or repickled.
# Each entry is a sequence of path components, joined below with the
# platform path separator.  Currently empty: no bundled dataset needs
# a separate Python 3 variant any more.
DATA_UPDATES = []

_PY3_DATA_UPDATES = [os.path.join(*path_list) for path_list in DATA_UPDATES]
|
||||
|
||||
|
||||
def add_py3_data(path):
    """Insert a "/PY3" component into *path* when it refers to a dataset
    listed in ``_PY3_DATA_UPDATES`` and does not already point inside a
    PY3 subdirectory; otherwise return *path* unchanged.
    """
    for entry in _PY3_DATA_UPDATES:
        if entry not in str(path) or "/PY3" in str(path):
            continue
        insert_at = path.index(entry) + len(entry)
        # Keep a trailing ".zip" attached to the archive name, placing
        # "/PY3" after it rather than inside it.
        if path[insert_at : insert_at + 4] == ".zip":
            insert_at += 4
        return path[:insert_at] + "/PY3" + path[insert_at:]
    return path
|
||||
|
||||
|
||||
# for use in adding /PY3 to the second (filename) argument
# of the file pointers in data.py
def py3_data(init_func):
    """Decorator that rewrites the second positional argument (a data
    path) through ``add_py3_data`` before delegating to *init_func*.
    """

    def _wrapped(*args, **kwargs):
        first, path, *rest = args
        return init_func(first, add_py3_data(path), *rest, **kwargs)

    return wraps(init_func)(_wrapped)
|
||||
551
backend/venv/Lib/site-packages/nltk/corpus/__init__.py
Normal file
551
backend/venv/Lib/site-packages/nltk/corpus/__init__.py
Normal file
@@ -0,0 +1,551 @@
|
||||
# Natural Language Toolkit: Corpus Readers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
# TODO this docstring isn't up-to-date!
|
||||
"""
|
||||
NLTK corpus readers. The modules in this package provide functions
|
||||
that can be used to read corpus files in a variety of formats. These
|
||||
functions can be used to read both the corpus files that are
|
||||
distributed in the NLTK corpus package, and corpus files that are part
|
||||
of external corpora.
|
||||
|
||||
Available Corpora
|
||||
=================
|
||||
|
||||
Please see https://www.nltk.org/nltk_data/ for a complete list.
|
||||
Install corpora using nltk.download().
|
||||
|
||||
Corpus Reader Functions
|
||||
=======================
|
||||
Each corpus module defines one or more "corpus reader functions",
|
||||
which can be used to read documents from that corpus. These functions
|
||||
take an argument, ``item``, which is used to indicate which document
|
||||
should be read from the corpus:
|
||||
|
||||
- If ``item`` is one of the unique identifiers listed in the corpus
|
||||
module's ``items`` variable, then the corresponding document will
|
||||
be loaded from the NLTK corpus package.
|
||||
- If ``item`` is a filename, then that file will be read.
|
||||
|
||||
Additionally, corpus reader functions can be given lists of item
|
||||
names; in which case, they will return a concatenation of the
|
||||
corresponding documents.
|
||||
|
||||
Corpus reader functions are named based on the type of information
|
||||
they return. Some common examples, and their return types, are:
|
||||
|
||||
- words(): list of str
|
||||
- sents(): list of (list of str)
|
||||
- paras(): list of (list of (list of str))
|
||||
- tagged_words(): list of (str,str) tuple
|
||||
- tagged_sents(): list of (list of (str,str))
|
||||
- tagged_paras(): list of (list of (list of (str,str)))
|
||||
- chunked_sents(): list of (Tree w/ (str,str) leaves)
|
||||
- parsed_sents(): list of (Tree with str leaves)
|
||||
- parsed_paras(): list of (list of (Tree with str leaves))
|
||||
- xml(): A single xml ElementTree
|
||||
- raw(): unprocessed corpus contents
|
||||
|
||||
For example, to read a list of the words in the Brown Corpus, use
|
||||
``nltk.corpus.brown.words()``:
|
||||
|
||||
>>> from nltk.corpus import brown
|
||||
>>> print(", ".join(brown.words())) # doctest: +ELLIPSIS
|
||||
The, Fulton, County, Grand, Jury, said, ...
|
||||
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from nltk.corpus.reader import *
|
||||
from nltk.corpus.util import LazyCorpusLoader
|
||||
from nltk.tokenize import RegexpTokenizer
|
||||
|
||||
abc: PlaintextCorpusReader = LazyCorpusLoader(
|
||||
"abc",
|
||||
PlaintextCorpusReader,
|
||||
r"(?!\.).*\.txt",
|
||||
encoding=[("science", "latin_1"), ("rural", "utf8")],
|
||||
)
|
||||
alpino: AlpinoCorpusReader = LazyCorpusLoader(
|
||||
"alpino", AlpinoCorpusReader, tagset="alpino"
|
||||
)
|
||||
bcp47: BCP47CorpusReader = LazyCorpusLoader(
|
||||
"bcp47", BCP47CorpusReader, r"(cldr|iana)/*"
|
||||
)
|
||||
brown: CategorizedTaggedCorpusReader = LazyCorpusLoader(
|
||||
"brown",
|
||||
CategorizedTaggedCorpusReader,
|
||||
r"c[a-z]\d\d",
|
||||
cat_file="cats.txt",
|
||||
tagset="brown",
|
||||
encoding="ascii",
|
||||
)
|
||||
cess_cat: BracketParseCorpusReader = LazyCorpusLoader(
|
||||
"cess_cat",
|
||||
BracketParseCorpusReader,
|
||||
r"(?!\.).*\.tbf",
|
||||
tagset="unknown",
|
||||
encoding="ISO-8859-15",
|
||||
)
|
||||
cess_esp: BracketParseCorpusReader = LazyCorpusLoader(
|
||||
"cess_esp",
|
||||
BracketParseCorpusReader,
|
||||
r"(?!\.).*\.tbf",
|
||||
tagset="unknown",
|
||||
encoding="ISO-8859-15",
|
||||
)
|
||||
cmudict: CMUDictCorpusReader = LazyCorpusLoader(
|
||||
"cmudict", CMUDictCorpusReader, ["cmudict"]
|
||||
)
|
||||
comtrans: AlignedCorpusReader = LazyCorpusLoader(
|
||||
"comtrans", AlignedCorpusReader, r"(?!\.).*\.txt"
|
||||
)
|
||||
comparative_sentences: ComparativeSentencesCorpusReader = LazyCorpusLoader(
|
||||
"comparative_sentences",
|
||||
ComparativeSentencesCorpusReader,
|
||||
r"labeledSentences\.txt",
|
||||
encoding="latin-1",
|
||||
)
|
||||
conll2000: ConllChunkCorpusReader = LazyCorpusLoader(
|
||||
"conll2000",
|
||||
ConllChunkCorpusReader,
|
||||
["train.txt", "test.txt"],
|
||||
("NP", "VP", "PP"),
|
||||
tagset="wsj",
|
||||
encoding="ascii",
|
||||
)
|
||||
conll2002: ConllChunkCorpusReader = LazyCorpusLoader(
|
||||
"conll2002",
|
||||
ConllChunkCorpusReader,
|
||||
r".*\.(test|train).*",
|
||||
("LOC", "PER", "ORG", "MISC"),
|
||||
encoding="utf-8",
|
||||
)
|
||||
conll2007: DependencyCorpusReader = LazyCorpusLoader(
|
||||
"conll2007",
|
||||
DependencyCorpusReader,
|
||||
r".*\.(test|train).*",
|
||||
encoding=[("eus", "ISO-8859-2"), ("esp", "utf8")],
|
||||
)
|
||||
crubadan: CrubadanCorpusReader = LazyCorpusLoader(
|
||||
"crubadan", CrubadanCorpusReader, r".*\.txt"
|
||||
)
|
||||
dependency_treebank: DependencyCorpusReader = LazyCorpusLoader(
|
||||
"dependency_treebank", DependencyCorpusReader, r".*\.dp", encoding="ascii"
|
||||
)
|
||||
extended_omw: CorpusReader = LazyCorpusLoader(
|
||||
"extended_omw", CorpusReader, r".*/wn-[a-z\-]*\.tab", encoding="utf8"
|
||||
)
|
||||
floresta: BracketParseCorpusReader = LazyCorpusLoader(
|
||||
"floresta",
|
||||
BracketParseCorpusReader,
|
||||
r"(?!\.).*\.ptb",
|
||||
"#",
|
||||
tagset="unknown",
|
||||
encoding="ISO-8859-15",
|
||||
)
|
||||
framenet15: FramenetCorpusReader = LazyCorpusLoader(
|
||||
"framenet_v15",
|
||||
FramenetCorpusReader,
|
||||
[
|
||||
"frRelation.xml",
|
||||
"frameIndex.xml",
|
||||
"fulltextIndex.xml",
|
||||
"luIndex.xml",
|
||||
"semTypes.xml",
|
||||
],
|
||||
)
|
||||
framenet: FramenetCorpusReader = LazyCorpusLoader(
|
||||
"framenet_v17",
|
||||
FramenetCorpusReader,
|
||||
[
|
||||
"frRelation.xml",
|
||||
"frameIndex.xml",
|
||||
"fulltextIndex.xml",
|
||||
"luIndex.xml",
|
||||
"semTypes.xml",
|
||||
],
|
||||
)
|
||||
gazetteers: WordListCorpusReader = LazyCorpusLoader(
|
||||
"gazetteers", WordListCorpusReader, r"(?!LICENSE|\.).*\.txt", encoding="ISO-8859-2"
|
||||
)
|
||||
genesis: PlaintextCorpusReader = LazyCorpusLoader(
|
||||
"genesis",
|
||||
PlaintextCorpusReader,
|
||||
r"(?!\.).*\.txt",
|
||||
encoding=[
|
||||
("finnish|french|german", "latin_1"),
|
||||
("swedish", "cp865"),
|
||||
(".*", "utf_8"),
|
||||
],
|
||||
)
|
||||
gutenberg: PlaintextCorpusReader = LazyCorpusLoader(
|
||||
"gutenberg", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
|
||||
)
|
||||
ieer: IEERCorpusReader = LazyCorpusLoader("ieer", IEERCorpusReader, r"(?!README|\.).*")
|
||||
inaugural: PlaintextCorpusReader = LazyCorpusLoader(
|
||||
"inaugural", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
|
||||
)
|
||||
# [XX] This should probably just use TaggedCorpusReader:
|
||||
indian: IndianCorpusReader = LazyCorpusLoader(
|
||||
"indian", IndianCorpusReader, r"(?!\.).*\.pos", tagset="unknown", encoding="utf8"
|
||||
)
|
||||
|
||||
jeita: ChasenCorpusReader = LazyCorpusLoader(
|
||||
"jeita", ChasenCorpusReader, r".*\.chasen", encoding="utf-8"
|
||||
)
|
||||
knbc: KNBCorpusReader = LazyCorpusLoader(
|
||||
"knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
|
||||
)
|
||||
lin_thesaurus: LinThesaurusCorpusReader = LazyCorpusLoader(
|
||||
"lin_thesaurus", LinThesaurusCorpusReader, r".*\.lsp"
|
||||
)
|
||||
mac_morpho: MacMorphoCorpusReader = LazyCorpusLoader(
|
||||
"mac_morpho",
|
||||
MacMorphoCorpusReader,
|
||||
r"(?!\.).*\.txt",
|
||||
tagset="unknown",
|
||||
encoding="latin-1",
|
||||
)
|
||||
machado: PortugueseCategorizedPlaintextCorpusReader = LazyCorpusLoader(
|
||||
"machado",
|
||||
PortugueseCategorizedPlaintextCorpusReader,
|
||||
r"(?!\.).*\.txt",
|
||||
cat_pattern=r"([a-z]*)/.*",
|
||||
encoding="latin-1",
|
||||
)
|
||||
masc_tagged: CategorizedTaggedCorpusReader = LazyCorpusLoader(
|
||||
"masc_tagged",
|
||||
CategorizedTaggedCorpusReader,
|
||||
r"(spoken|written)/.*\.txt",
|
||||
cat_file="categories.txt",
|
||||
tagset="wsj",
|
||||
encoding="utf-8",
|
||||
sep="_",
|
||||
)
|
||||
movie_reviews: CategorizedPlaintextCorpusReader = LazyCorpusLoader(
|
||||
"movie_reviews",
|
||||
CategorizedPlaintextCorpusReader,
|
||||
r"(?!\.).*\.txt",
|
||||
cat_pattern=r"(neg|pos)/.*",
|
||||
encoding="ascii",
|
||||
)
|
||||
multext_east: MTECorpusReader = LazyCorpusLoader(
|
||||
"mte_teip5", MTECorpusReader, r"(oana).*\.xml", encoding="utf-8"
|
||||
)
|
||||
names: WordListCorpusReader = LazyCorpusLoader(
|
||||
"names", WordListCorpusReader, r"(?!\.).*\.txt", encoding="ascii"
|
||||
)
|
||||
nps_chat: NPSChatCorpusReader = LazyCorpusLoader(
|
||||
"nps_chat", NPSChatCorpusReader, r"(?!README|\.).*\.xml", tagset="wsj"
|
||||
)
|
||||
opinion_lexicon: OpinionLexiconCorpusReader = LazyCorpusLoader(
|
||||
"opinion_lexicon",
|
||||
OpinionLexiconCorpusReader,
|
||||
r"(\w+)\-words\.txt",
|
||||
encoding="ISO-8859-2",
|
||||
)
|
||||
ppattach: PPAttachmentCorpusReader = LazyCorpusLoader(
|
||||
"ppattach", PPAttachmentCorpusReader, ["training", "test", "devset"]
|
||||
)
|
||||
product_reviews_1: ReviewsCorpusReader = LazyCorpusLoader(
|
||||
"product_reviews_1", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
|
||||
)
|
||||
product_reviews_2: ReviewsCorpusReader = LazyCorpusLoader(
|
||||
"product_reviews_2", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
|
||||
)
|
||||
pros_cons: ProsConsCorpusReader = LazyCorpusLoader(
|
||||
"pros_cons",
|
||||
ProsConsCorpusReader,
|
||||
r"Integrated(Cons|Pros)\.txt",
|
||||
cat_pattern=r"Integrated(Cons|Pros)\.txt",
|
||||
encoding="ISO-8859-2",
|
||||
)
|
||||
ptb: CategorizedBracketParseCorpusReader = (
|
||||
LazyCorpusLoader( # Penn Treebank v3: WSJ and Brown portions
|
||||
"ptb",
|
||||
CategorizedBracketParseCorpusReader,
|
||||
r"(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG",
|
||||
cat_file="allcats.txt",
|
||||
tagset="wsj",
|
||||
)
|
||||
)
|
||||
qc: StringCategoryCorpusReader = LazyCorpusLoader(
|
||||
"qc", StringCategoryCorpusReader, ["train.txt", "test.txt"], encoding="ISO-8859-2"
|
||||
)
|
||||
reuters: CategorizedPlaintextCorpusReader = LazyCorpusLoader(
|
||||
"reuters",
|
||||
CategorizedPlaintextCorpusReader,
|
||||
"(training|test).*",
|
||||
cat_file="cats.txt",
|
||||
encoding="ISO-8859-2",
|
||||
)
|
||||
rte: RTECorpusReader = LazyCorpusLoader("rte", RTECorpusReader, r"(?!\.).*\.xml")
|
||||
senseval: SensevalCorpusReader = LazyCorpusLoader(
|
||||
"senseval", SensevalCorpusReader, r"(?!\.).*\.pos"
|
||||
)
|
||||
sentence_polarity: CategorizedSentencesCorpusReader = LazyCorpusLoader(
|
||||
"sentence_polarity",
|
||||
CategorizedSentencesCorpusReader,
|
||||
r"rt-polarity\.(neg|pos)",
|
||||
cat_pattern=r"rt-polarity\.(neg|pos)",
|
||||
encoding="utf-8",
|
||||
)
|
||||
sentiwordnet: SentiWordNetCorpusReader = LazyCorpusLoader(
|
||||
"sentiwordnet", SentiWordNetCorpusReader, "SentiWordNet_3.0.0.txt", encoding="utf-8"
|
||||
)
|
||||
shakespeare: XMLCorpusReader = LazyCorpusLoader(
|
||||
"shakespeare", XMLCorpusReader, r"(?!\.).*\.xml"
|
||||
)
|
||||
sinica_treebank: SinicaTreebankCorpusReader = LazyCorpusLoader(
|
||||
"sinica_treebank",
|
||||
SinicaTreebankCorpusReader,
|
||||
["parsed"],
|
||||
tagset="unknown",
|
||||
encoding="utf-8",
|
||||
)
|
||||
state_union: PlaintextCorpusReader = LazyCorpusLoader(
|
||||
"state_union", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="ISO-8859-2"
|
||||
)
|
||||
stopwords: WordListCorpusReader = LazyCorpusLoader(
|
||||
"stopwords", WordListCorpusReader, r"(?!README|\.).*", encoding="utf8"
|
||||
)
|
||||
subjectivity: CategorizedSentencesCorpusReader = LazyCorpusLoader(
|
||||
"subjectivity",
|
||||
CategorizedSentencesCorpusReader,
|
||||
r"(quote.tok.gt9|plot.tok.gt9)\.5000",
|
||||
cat_map={"quote.tok.gt9.5000": ["subj"], "plot.tok.gt9.5000": ["obj"]},
|
||||
encoding="latin-1",
|
||||
)
|
||||
swadesh: SwadeshCorpusReader = LazyCorpusLoader(
|
||||
"swadesh", SwadeshCorpusReader, r"(?!README|\.).*", encoding="utf8"
|
||||
)
|
||||
swadesh110: PanlexSwadeshCorpusReader = LazyCorpusLoader(
|
||||
"panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh110/.*\.txt", encoding="utf8"
|
||||
)
|
||||
swadesh207: PanlexSwadeshCorpusReader = LazyCorpusLoader(
|
||||
"panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh207/.*\.txt", encoding="utf8"
|
||||
)
|
||||
switchboard: SwitchboardCorpusReader = LazyCorpusLoader(
|
||||
"switchboard", SwitchboardCorpusReader, tagset="wsj"
|
||||
)
|
||||
timit: TimitCorpusReader = LazyCorpusLoader("timit", TimitCorpusReader)
|
||||
timit_tagged: TimitTaggedCorpusReader = LazyCorpusLoader(
|
||||
"timit", TimitTaggedCorpusReader, r".+\.tags", tagset="wsj", encoding="ascii"
|
||||
)
|
||||
toolbox: ToolboxCorpusReader = LazyCorpusLoader(
|
||||
"toolbox", ToolboxCorpusReader, r"(?!.*(README|\.)).*\.(dic|txt)"
|
||||
)
|
||||
treebank: BracketParseCorpusReader = LazyCorpusLoader(
|
||||
"treebank/combined",
|
||||
BracketParseCorpusReader,
|
||||
r"wsj_.*\.mrg",
|
||||
tagset="wsj",
|
||||
encoding="ascii",
|
||||
)
|
||||
treebank_chunk: ChunkedCorpusReader = LazyCorpusLoader(
|
||||
"treebank/tagged",
|
||||
ChunkedCorpusReader,
|
||||
r"wsj_.*\.pos",
|
||||
sent_tokenizer=RegexpTokenizer(r"(?<=/\.)\s*(?![^\[]*\])", gaps=True),
|
||||
para_block_reader=tagged_treebank_para_block_reader,
|
||||
tagset="wsj",
|
||||
encoding="ascii",
|
||||
)
|
||||
treebank_raw: PlaintextCorpusReader = LazyCorpusLoader(
|
||||
"treebank/raw", PlaintextCorpusReader, r"wsj_.*", encoding="ISO-8859-2"
|
||||
)
|
||||
twitter_samples: TwitterCorpusReader = LazyCorpusLoader(
|
||||
"twitter_samples", TwitterCorpusReader, r".*\.json"
|
||||
)
|
||||
udhr: UdhrCorpusReader = LazyCorpusLoader("udhr", UdhrCorpusReader)
|
||||
udhr2: PlaintextCorpusReader = LazyCorpusLoader(
|
||||
"udhr2", PlaintextCorpusReader, r".*\.txt", encoding="utf8"
|
||||
)
|
||||
universal_treebanks: ConllCorpusReader = LazyCorpusLoader(
|
||||
"universal_treebanks_v20",
|
||||
ConllCorpusReader,
|
||||
r".*\.conll",
|
||||
columntypes=(
|
||||
"ignore",
|
||||
"words",
|
||||
"ignore",
|
||||
"ignore",
|
||||
"pos",
|
||||
"ignore",
|
||||
"ignore",
|
||||
"ignore",
|
||||
"ignore",
|
||||
"ignore",
|
||||
),
|
||||
)
|
||||
verbnet: VerbnetCorpusReader = LazyCorpusLoader(
|
||||
"verbnet", VerbnetCorpusReader, r"(?!\.).*\.xml"
|
||||
)
|
||||
webtext: PlaintextCorpusReader = LazyCorpusLoader(
|
||||
"webtext", PlaintextCorpusReader, r"(?!README|\.).*\.txt", encoding="ISO-8859-2"
|
||||
)
|
||||
wordnet: WordNetCorpusReader = LazyCorpusLoader(
|
||||
"wordnet",
|
||||
WordNetCorpusReader,
|
||||
LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
|
||||
)
|
||||
## Use the following template to add a custom Wordnet package.
|
||||
## Just uncomment, and replace the identifier (my_wordnet) in two places:
|
||||
##
|
||||
# my_wordnet: WordNetCorpusReader = LazyCorpusLoader(
|
||||
# "my_wordnet",
|
||||
# WordNetCorpusReader,
|
||||
# LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
|
||||
# )
|
||||
wordnet31: WordNetCorpusReader = LazyCorpusLoader(
|
||||
"wordnet31",
|
||||
WordNetCorpusReader,
|
||||
LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
|
||||
)
|
||||
wordnet2021: WordNetCorpusReader = LazyCorpusLoader(
|
||||
# Obsolete, use english_wordnet instead.
|
||||
"wordnet2021",
|
||||
WordNetCorpusReader,
|
||||
LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
|
||||
)
|
||||
wordnet2022: WordNetCorpusReader = LazyCorpusLoader(
|
||||
# Obsolete, use english_wordnet instead.
|
||||
"wordnet2022",
|
||||
WordNetCorpusReader,
|
||||
LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
|
||||
)
|
||||
english_wordnet: WordNetCorpusReader = LazyCorpusLoader(
|
||||
# Latest Open English Wordnet
|
||||
"english_wordnet",
|
||||
WordNetCorpusReader,
|
||||
LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
|
||||
)
|
||||
wordnet_ic: WordNetICCorpusReader = LazyCorpusLoader(
|
||||
"wordnet_ic", WordNetICCorpusReader, r".*\.dat"
|
||||
)
|
||||
words: WordListCorpusReader = LazyCorpusLoader(
|
||||
"words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii"
|
||||
)
|
||||
|
||||
# defined after treebank
|
||||
propbank: PropbankCorpusReader = LazyCorpusLoader(
|
||||
"propbank",
|
||||
PropbankCorpusReader,
|
||||
"prop.txt",
|
||||
r"frames/.*\.xml",
|
||||
"verbs.txt",
|
||||
lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
|
||||
treebank,
|
||||
) # Must be defined *after* treebank corpus.
|
||||
nombank: NombankCorpusReader = LazyCorpusLoader(
|
||||
"nombank.1.0",
|
||||
NombankCorpusReader,
|
||||
"nombank.1.0",
|
||||
r"frames/.*\.xml",
|
||||
"nombank.1.0.words",
|
||||
lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
|
||||
treebank,
|
||||
) # Must be defined *after* treebank corpus.
|
||||
propbank_ptb: PropbankCorpusReader = LazyCorpusLoader(
|
||||
"propbank",
|
||||
PropbankCorpusReader,
|
||||
"prop.txt",
|
||||
r"frames/.*\.xml",
|
||||
"verbs.txt",
|
||||
lambda filename: filename.upper(),
|
||||
ptb,
|
||||
) # Must be defined *after* ptb corpus.
|
||||
nombank_ptb: NombankCorpusReader = LazyCorpusLoader(
|
||||
"nombank.1.0",
|
||||
NombankCorpusReader,
|
||||
"nombank.1.0",
|
||||
r"frames/.*\.xml",
|
||||
"nombank.1.0.words",
|
||||
lambda filename: filename.upper(),
|
||||
ptb,
|
||||
) # Must be defined *after* ptb corpus.
|
||||
semcor: SemcorCorpusReader = LazyCorpusLoader(
|
||||
"semcor", SemcorCorpusReader, r"brown./tagfiles/br-.*\.xml", wordnet
|
||||
) # Must be defined *after* wordnet corpus.
|
||||
|
||||
nonbreaking_prefixes: NonbreakingPrefixesCorpusReader = LazyCorpusLoader(
|
||||
"nonbreaking_prefixes",
|
||||
NonbreakingPrefixesCorpusReader,
|
||||
r"(?!README|\.).*",
|
||||
encoding="utf8",
|
||||
)
|
||||
perluniprops: UnicharsCorpusReader = LazyCorpusLoader(
|
||||
"perluniprops",
|
||||
UnicharsCorpusReader,
|
||||
r"(?!README|\.).*",
|
||||
nltk_data_subdir="misc",
|
||||
encoding="utf8",
|
||||
)
|
||||
|
||||
# mwa_ppdb = LazyCorpusLoader(
|
||||
# 'mwa_ppdb', MWAPPDBCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')
|
||||
|
||||
# See https://github.com/nltk/nltk/issues/1579
|
||||
# and https://github.com/nltk/nltk/issues/1716
|
||||
#
|
||||
# pl196x = LazyCorpusLoader(
|
||||
# 'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml',
|
||||
# cat_file='cats.txt', textid_file='textids.txt', encoding='utf8')
|
||||
#
|
||||
# ipipan = LazyCorpusLoader(
|
||||
# 'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml')
|
||||
#
|
||||
# nkjp = LazyCorpusLoader(
|
||||
# 'nkjp', NKJPCorpusReader, r'', encoding='utf8')
|
||||
#
|
||||
# panlex_lite = LazyCorpusLoader(
|
||||
# 'panlex_lite', PanLexLiteCorpusReader)
|
||||
#
|
||||
# ycoe = LazyCorpusLoader(
|
||||
# 'ycoe', YCOECorpusReader)
|
||||
#
|
||||
# corpus not available with NLTK; these lines caused help(nltk.corpus) to break
|
||||
# hebrew_treebank = LazyCorpusLoader(
|
||||
# 'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')
|
||||
|
||||
|
||||
# FIXME: override any imported demo from various corpora, see https://github.com/nltk/nltk/issues/2116
def demo():
    """Run the demo() of each bundled corpus reader in turn.

    NOTE(review): each call loads (and may attempt to download) its corpus;
    this will fail for corpora that are not installed locally.
    """
    # This is out-of-date:
    abc.demo()
    brown.demo()
    # chat80.demo()
    cmudict.demo()
    conll2000.demo()
    conll2002.demo()
    genesis.demo()
    gutenberg.demo()
    ieer.demo()
    inaugural.demo()
    indian.demo()
    names.demo()
    ppattach.demo()
    senseval.demo()
    shakespeare.demo()
    sinica_treebank.demo()
    state_union.demo()
    stopwords.demo()
    timit.demo()
    toolbox.demo()
    treebank.demo()
    udhr.demo()
    webtext.demo()
    words.demo()
|
||||
|
||||
|
||||
# ycoe.demo()

if __name__ == "__main__":
    # Deliberately a no-op: running the full demo() is too slow/fragile
    # to execute on direct invocation of this module.
    # demo()
    pass
|
||||
56
backend/venv/Lib/site-packages/nltk/corpus/europarl_raw.py
Normal file
56
backend/venv/Lib/site-packages/nltk/corpus/europarl_raw.py
Normal file
@@ -0,0 +1,56 @@
|
||||
# Natural Language Toolkit: Europarl Corpus Readers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Nitin Madnani <nmadnani@umiacs.umd.edu>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import re
|
||||
|
||||
from nltk.corpus.reader import *
|
||||
from nltk.corpus.util import LazyCorpusLoader
|
||||
|
||||
# Create a new corpus reader instance for each European language.
# Each loader points at "europarl_raw/<language>" and matches files named
# "ep-*" with that language's two-letter suffix; loading is deferred until
# first attribute access (LazyCorpusLoader).
danish: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/danish", EuroparlCorpusReader, r"ep-.*\.da", encoding="utf-8"
)

dutch: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/dutch", EuroparlCorpusReader, r"ep-.*\.nl", encoding="utf-8"
)

english: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/english", EuroparlCorpusReader, r"ep-.*\.en", encoding="utf-8"
)

finnish: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/finnish", EuroparlCorpusReader, r"ep-.*\.fi", encoding="utf-8"
)

french: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/french", EuroparlCorpusReader, r"ep-.*\.fr", encoding="utf-8"
)

german: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/german", EuroparlCorpusReader, r"ep-.*\.de", encoding="utf-8"
)

greek: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/greek", EuroparlCorpusReader, r"ep-.*\.el", encoding="utf-8"
)

italian: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/italian", EuroparlCorpusReader, r"ep-.*\.it", encoding="utf-8"
)

portuguese: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/portuguese", EuroparlCorpusReader, r"ep-.*\.pt", encoding="utf-8"
)

spanish: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/spanish", EuroparlCorpusReader, r"ep-.*\.es", encoding="utf-8"
)

swedish: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/swedish", EuroparlCorpusReader, r"ep-.*\.sv", encoding="utf-8"
)
|
||||
186
backend/venv/Lib/site-packages/nltk/corpus/reader/__init__.py
Normal file
186
backend/venv/Lib/site-packages/nltk/corpus/reader/__init__.py
Normal file
@@ -0,0 +1,186 @@
|
||||
# Natural Language Toolkit: Corpus Readers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
NLTK corpus readers. The modules in this package provide functions
|
||||
that can be used to read corpus fileids in a variety of formats. These
|
||||
functions can be used to read both the corpus fileids that are
|
||||
distributed in the NLTK corpus package, and corpus fileids that are part
|
||||
of external corpora.
|
||||
|
||||
Corpus Reader Functions
|
||||
=======================
|
||||
Each corpus module defines one or more "corpus reader functions",
|
||||
which can be used to read documents from that corpus. These functions
|
||||
take an argument, ``item``, which is used to indicate which document
|
||||
should be read from the corpus:
|
||||
|
||||
- If ``item`` is one of the unique identifiers listed in the corpus
|
||||
module's ``items`` variable, then the corresponding document will
|
||||
be loaded from the NLTK corpus package.
|
||||
- If ``item`` is a fileid, then that file will be read.
|
||||
|
||||
Additionally, corpus reader functions can be given lists of item
|
||||
names; in which case, they will return a concatenation of the
|
||||
corresponding documents.
|
||||
|
||||
Corpus reader functions are named based on the type of information
|
||||
they return. Some common examples, and their return types, are:
|
||||
|
||||
- words(): list of str
|
||||
- sents(): list of (list of str)
|
||||
- paras(): list of (list of (list of str))
|
||||
- tagged_words(): list of (str,str) tuple
|
||||
- tagged_sents(): list of (list of (str,str))
|
||||
- tagged_paras(): list of (list of (list of (str,str)))
|
||||
- chunked_sents(): list of (Tree w/ (str,str) leaves)
|
||||
- parsed_sents(): list of (Tree with str leaves)
|
||||
- parsed_paras(): list of (list of (Tree with str leaves))
|
||||
- xml(): A single xml ElementTree
|
||||
- raw(): unprocessed corpus contents
|
||||
|
||||
For example, to read a list of the words in the Brown Corpus, use
|
||||
``nltk.corpus.brown.words()``:
|
||||
|
||||
>>> from nltk.corpus import brown
|
||||
>>> print(", ".join(brown.words()[:6])) # only first 6 words
|
||||
The, Fulton, County, Grand, Jury, said
|
||||
|
||||
isort:skip_file
|
||||
"""
|
||||
|
||||
from nltk.corpus.reader.plaintext import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.tagged import *
|
||||
from nltk.corpus.reader.cmudict import *
|
||||
from nltk.corpus.reader.conll import *
|
||||
from nltk.corpus.reader.chunked import *
|
||||
from nltk.corpus.reader.wordlist import *
|
||||
from nltk.corpus.reader.xmldocs import *
|
||||
from nltk.corpus.reader.ppattach import *
|
||||
from nltk.corpus.reader.senseval import *
|
||||
from nltk.corpus.reader.ieer import *
|
||||
from nltk.corpus.reader.sinica_treebank import *
|
||||
from nltk.corpus.reader.bracket_parse import *
|
||||
from nltk.corpus.reader.indian import *
|
||||
from nltk.corpus.reader.toolbox import *
|
||||
from nltk.corpus.reader.timit import *
|
||||
from nltk.corpus.reader.ycoe import *
|
||||
from nltk.corpus.reader.rte import *
|
||||
from nltk.corpus.reader.string_category import *
|
||||
from nltk.corpus.reader.propbank import *
|
||||
from nltk.corpus.reader.verbnet import *
|
||||
from nltk.corpus.reader.bnc import *
|
||||
from nltk.corpus.reader.nps_chat import *
|
||||
from nltk.corpus.reader.wordnet import *
|
||||
from nltk.corpus.reader.switchboard import *
|
||||
from nltk.corpus.reader.dependency import *
|
||||
from nltk.corpus.reader.nombank import *
|
||||
from nltk.corpus.reader.ipipan import *
|
||||
from nltk.corpus.reader.pl196x import *
|
||||
from nltk.corpus.reader.knbc import *
|
||||
from nltk.corpus.reader.chasen import *
|
||||
from nltk.corpus.reader.childes import *
|
||||
from nltk.corpus.reader.aligned import *
|
||||
from nltk.corpus.reader.lin import *
|
||||
from nltk.corpus.reader.semcor import *
|
||||
from nltk.corpus.reader.framenet import *
|
||||
from nltk.corpus.reader.udhr import *
|
||||
from nltk.corpus.reader.bnc import *
|
||||
from nltk.corpus.reader.sentiwordnet import *
|
||||
from nltk.corpus.reader.twitter import *
|
||||
from nltk.corpus.reader.nkjp import *
|
||||
from nltk.corpus.reader.crubadan import *
|
||||
from nltk.corpus.reader.mte import *
|
||||
from nltk.corpus.reader.reviews import *
|
||||
from nltk.corpus.reader.opinion_lexicon import *
|
||||
from nltk.corpus.reader.pros_cons import *
|
||||
from nltk.corpus.reader.categorized_sents import *
|
||||
from nltk.corpus.reader.comparative_sents import *
|
||||
from nltk.corpus.reader.panlex_lite import *
|
||||
from nltk.corpus.reader.panlex_swadesh import *
|
||||
from nltk.corpus.reader.bcp47 import *
|
||||
|
||||
# Make sure that nltk.corpus.reader.bracket_parse gives the module, not
|
||||
# the function bracket_parse() defined in nltk.tree:
|
||||
from nltk.corpus.reader import bracket_parse
|
||||
|
||||
# Public API of the nltk.corpus.reader package.  Each name is a corpus
# reader class (or helper) re-exported from one of the reader submodules.
# NOTE: the duplicate "BNCCorpusReader" entry has been removed.
__all__ = [
    "CorpusReader",
    "CategorizedCorpusReader",
    "PlaintextCorpusReader",
    "find_corpus_fileids",
    "TaggedCorpusReader",
    "CMUDictCorpusReader",
    "ConllChunkCorpusReader",
    "WordListCorpusReader",
    "PPAttachmentCorpusReader",
    "SensevalCorpusReader",
    "IEERCorpusReader",
    "ChunkedCorpusReader",
    "SinicaTreebankCorpusReader",
    "BracketParseCorpusReader",
    "IndianCorpusReader",
    "ToolboxCorpusReader",
    "TimitCorpusReader",
    "YCOECorpusReader",
    "MacMorphoCorpusReader",
    "SyntaxCorpusReader",
    "AlpinoCorpusReader",
    "RTECorpusReader",
    "StringCategoryCorpusReader",
    "EuroparlCorpusReader",
    "CategorizedBracketParseCorpusReader",
    "CategorizedTaggedCorpusReader",
    "CategorizedPlaintextCorpusReader",
    "PortugueseCategorizedPlaintextCorpusReader",
    "tagged_treebank_para_block_reader",
    "PropbankCorpusReader",
    "VerbnetCorpusReader",
    "BNCCorpusReader",
    "ConllCorpusReader",
    "XMLCorpusReader",
    "NPSChatCorpusReader",
    "SwadeshCorpusReader",
    "WordNetCorpusReader",
    "WordNetICCorpusReader",
    "SwitchboardCorpusReader",
    "DependencyCorpusReader",
    "NombankCorpusReader",
    "IPIPANCorpusReader",
    "Pl196xCorpusReader",
    "TEICorpusView",
    "KNBCorpusReader",
    "ChasenCorpusReader",
    "CHILDESCorpusReader",
    "AlignedCorpusReader",
    "TimitTaggedCorpusReader",
    "LinThesaurusCorpusReader",
    "SemcorCorpusReader",
    "FramenetCorpusReader",
    "UdhrCorpusReader",
    "SentiWordNetCorpusReader",
    "SentiSynset",
    "TwitterCorpusReader",
    "NKJPCorpusReader",
    "CrubadanCorpusReader",
    "MTECorpusReader",
    "ReviewsCorpusReader",
    "OpinionLexiconCorpusReader",
    "ProsConsCorpusReader",
    "CategorizedSentencesCorpusReader",
    "ComparativeSentencesCorpusReader",
    "PanLexLiteCorpusReader",
    "NonbreakingPrefixesCorpusReader",
    "UnicharsCorpusReader",
    "MWAPPDBCorpusReader",
    "PanlexSwadeshCorpusReader",
    "BCP47CorpusReader",
]
|
||||
154
backend/venv/Lib/site-packages/nltk/corpus/reader/aligned.py
Normal file
154
backend/venv/Lib/site-packages/nltk/corpus/reader/aligned.py
Normal file
@@ -0,0 +1,154 @@
|
||||
# Natural Language Toolkit: Aligned Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# URL: <https://www.nltk.org/>
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from nltk.corpus.reader.api import CorpusReader
|
||||
from nltk.corpus.reader.util import (
|
||||
StreamBackedCorpusView,
|
||||
concat,
|
||||
read_alignedsent_block,
|
||||
)
|
||||
from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer
|
||||
from nltk.translate import AlignedSent, Alignment
|
||||
|
||||
|
||||
class AlignedCorpusReader(CorpusReader):
    """
    Reader for corpora of word-aligned sentences.  Tokens are assumed
    to be separated by whitespace.  Sentences begin on separate lines.
    """

    def __init__(
        self,
        root,
        fileids,
        sep="/",
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        alignedsent_block_reader=read_alignedsent_block,
        encoding="latin1",
    ):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param sep: Separator character, stored on the reader.
        :param word_tokenizer: Tokenizer used to split a sentence line
            into word tokens.
        :param sent_tokenizer: Tokenizer used to split a record into
            sentence lines (by default, one sentence per line).
        :param alignedsent_block_reader: Function that reads one block
            of aligned-sentence records from a stream.
        :param encoding: The default encoding of the corpus files.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader

    def _views(self, fileids, aligned, group_by_sent):
        # Shared implementation for words()/sents()/aligned_sents():
        # build one AlignedSentCorpusView per requested file and
        # concatenate them into a single lazy sequence.  ``aligned`` and
        # ``group_by_sent`` select the shape of the items the views yield.
        return concat(
            [
                AlignedSentCorpusView(
                    fileid,
                    enc,
                    aligned,
                    group_by_sent,
                    self._word_tokenizer,
                    self._sent_tokenizer,
                    self._alignedsent_block_reader,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._views(fileids, aligned=False, group_by_sent=False)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._views(fileids, aligned=False, group_by_sent=True)

    def aligned_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of AlignedSent objects.
        :rtype: list(AlignedSent)
        """
        return self._views(fileids, aligned=True, group_by_sent=True)
|
||||
|
||||
|
||||
class AlignedSentCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for aligned sentences.
    ``AlignedSentCorpusView`` objects are typically created by
    ``AlignedCorpusReader`` (not directly by nltk users).
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        aligned,
        group_by_sent,
        word_tokenizer,
        sent_tokenizer,
        alignedsent_block_reader,
    ):
        # Stash the configuration before delegating stream handling to
        # the base StreamBackedCorpusView.
        self._alignedsent_block_reader = alignedsent_block_reader
        self._sent_tokenizer = sent_tokenizer
        self._word_tokenizer = word_tokenizer
        self._group_by_sent = group_by_sent
        self._aligned = aligned
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        # Tokenize every sentence line of every aligned-sentence record
        # in the next block of the stream.
        tokenized = []
        for record in self._alignedsent_block_reader(stream):
            for line in self._sent_tokenizer.tokenize(record):
                tokenized.append(self._word_tokenizer.tokenize(line))
        if self._aligned:
            # kludge; we shouldn't have tokenized the alignment string
            tokenized[2] = Alignment.fromstring(" ".join(tokenized[2]))
            return [AlignedSent(*tokenized)]
        if self._group_by_sent:
            return [tokenized[0]]
        return tokenized[0]
|
||||
517
backend/venv/Lib/site-packages/nltk/corpus/reader/api.py
Normal file
517
backend/venv/Lib/site-packages/nltk/corpus/reader/api.py
Normal file
@@ -0,0 +1,517 @@
|
||||
# Natural Language Toolkit: API for Corpus Readers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
API for corpus readers.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from itertools import chain
|
||||
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.data import FileSystemPathPointer, PathPointer, ZipFilePathPointer
|
||||
|
||||
|
||||
class CorpusReader:
    """
    A base class for "corpus reader" classes, each of which can be
    used to read a specific corpus format.  Each individual corpus
    reader instance is used to read a specific corpus, consisting of
    one or more files under a common root directory.  Each file is
    identified by its ``file identifier``, which is the relative path
    to the file from the root directory.

    A separate subclass is defined for each corpus format.  These
    subclasses define one or more methods that provide 'views' on the
    corpus contents, such as ``words()`` (for a list of words) and
    ``parsed_sents()`` (for a list of parsed sentences).  Called with
    no arguments, these methods will return the contents of the entire
    corpus.  For most corpora, these methods define one or more
    selection arguments, such as ``fileids`` or ``categories``, which can
    be used to select which portion of the corpus should be returned.
    """

    def __init__(self, root, fileids, encoding="utf8", tagset=None):
        """
        :type root: PathPointer or str
        :param root: A path pointer identifying the root directory for
            this corpus.  If a string is specified, then it will be
            converted to a ``PathPointer`` automatically.
        :param fileids: A list of the files that make up this corpus.
            This list can either be specified explicitly, as a list of
            strings; or implicitly, as a regular expression over file
            paths.  The absolute path for each file will be constructed
            by joining the reader's root to each file name.
        :param encoding: The default unicode encoding for the files
            that make up the corpus.  The value of ``encoding`` can be any
            of the following:

            - A string: ``encoding`` is the encoding name for all files.
            - A dictionary: ``encoding[file_id]`` is the encoding
              name for the file whose identifier is ``file_id``.  If
              ``file_id`` is not in ``encoding``, then the file
              contents will be processed using non-unicode byte strings.
            - A list: ``encoding`` should be a list of ``(regexp, encoding)``
              tuples.  The encoding for a file whose identifier is ``file_id``
              will be the ``encoding`` value for the first tuple whose
              ``regexp`` matches the ``file_id``.  If no tuple's ``regexp``
              matches the ``file_id``, the file contents will be processed
              using non-unicode byte strings.
            - None: the file contents of all files will be
              processed using non-unicode byte strings.
        :param tagset: The name of the tagset used by this corpus, to be used
            for normalizing or converting the POS tags returned by the
            ``tagged_...()`` methods.
        """
        # Convert the root to a path pointer, if necessary.
        if isinstance(root, str) and not isinstance(root, PathPointer):
            # The trailing "|" alternative makes this regexp match any
            # string, so ``m`` is never None; the first group is set
            # only when the root points inside a .zip file.
            m = re.match(r"(.*\.zip)/?(.*)$|", root)
            zipfile, zipentry = m.groups()
            if zipfile:
                root = ZipFilePathPointer(zipfile, zipentry)
            else:
                root = FileSystemPathPointer(root)
        elif not isinstance(root, PathPointer):
            raise TypeError("CorpusReader: expected a string or a PathPointer")

        # If `fileids` is a regexp, then expand it.
        if isinstance(fileids, str):
            fileids = find_corpus_fileids(root, fileids)

        # A list of the relative paths for the fileids that make up
        # this corpus.
        self._fileids = fileids

        # The root directory for this corpus.
        self._root = root

        # Default locations of the corpus metadata files.
        self._readme = "README"
        self._license = "LICENSE"
        self._citation = "citation.bib"

        # If encoding was specified as a list of regexps, then convert
        # it to a dictionary: the first regexp that matches a fileid
        # wins; unmatched fileids get no entry (processed as bytes).
        if isinstance(encoding, list):
            encoding_dict = {}
            for fileid in self._fileids:
                for regexp, enc in encoding:
                    if re.match(regexp, fileid):
                        encoding_dict[fileid] = enc
                        break
            encoding = encoding_dict

        # The default unicode encoding for the fileids that make up
        # this corpus.  If ``encoding`` is None, then the file
        # contents are processed using byte strings.
        self._encoding = encoding
        self._tagset = tagset

    def __repr__(self):
        if isinstance(self._root, ZipFilePathPointer):
            path = f"{self._root.zipfile.filename}/{self._root.entry}"
        else:
            path = str(self._root.path)
        return f"<{self.__class__.__name__} in {path!r}>"

    def ensure_loaded(self):
        """
        Load this corpus (if it has not already been loaded).  This is
        used by LazyCorpusLoader as a simple method that can be used to
        make sure a corpus is loaded -- e.g., in case a user wants to
        do help(some_corpus).
        """
        pass  # no need to actually do anything.

    def _read_doc(self, fileid):
        # Shared implementation for readme()/license()/citation():
        # read and return the full text of a corpus metadata file.
        with self.open(fileid) as f:
            return f.read()

    def readme(self):
        """
        Return the contents of the corpus README file, if it exists.
        """
        return self._read_doc(self._readme)

    def license(self):
        """
        Return the contents of the corpus LICENSE file, if it exists.
        """
        return self._read_doc(self._license)

    def citation(self):
        """
        Return the contents of the corpus citation.bib file, if it exists.
        """
        return self._read_doc(self._citation)

    def fileids(self):
        """
        Return a list of file identifiers for the fileids that make up
        this corpus.
        """
        return self._fileids

    def abspath(self, fileid):
        """
        Return the absolute path for the given file.

        :type fileid: str
        :param fileid: The file identifier for the file whose path
            should be returned.
        :rtype: PathPointer
        """
        return self._root.join(fileid)

    def abspaths(self, fileids=None, include_encoding=False, include_fileid=False):
        """
        Return a list of the absolute paths for all fileids in this corpus;
        or for the given list of fileids, if specified.

        :type fileids: None or str or list
        :param fileids: Specifies the set of fileids for which paths should
            be returned.  Can be None, for all fileids; a list of
            file identifiers, for a specified set of fileids; or a single
            file identifier, for a single file.  Note that the return
            value is always a list of paths, even if ``fileids`` is a
            single file identifier.

        :param include_encoding: If true, then return a list of
            ``(path_pointer, encoding)`` tuples.

        :param include_fileid: If true, then also include the file
            identifier itself in each returned tuple.

        :rtype: list(PathPointer)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]

        paths = [self._root.join(f) for f in fileids]

        if include_encoding and include_fileid:
            return list(zip(paths, [self.encoding(f) for f in fileids], fileids))
        elif include_fileid:
            return list(zip(paths, fileids))
        elif include_encoding:
            return list(zip(paths, [self.encoding(f) for f in fileids]))
        else:
            return paths

    def raw(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        contents = []
        for f in fileids:
            with self.open(f) as fp:
                contents.append(fp.read())
        return concat(contents)

    def open(self, file):
        """
        Return an open stream that can be used to read the given file.
        If the file's encoding is not None, then the stream will
        automatically decode the file's contents into unicode.

        :param file: The file identifier of the file to read.
        """
        encoding = self.encoding(file)
        stream = self._root.join(file).open(encoding)
        return stream

    def encoding(self, file):
        """
        Return the unicode encoding for the given corpus file, if known.
        If the encoding is unknown, or if the given file should be
        processed using byte strings (str), then return None.
        """
        if isinstance(self._encoding, dict):
            return self._encoding.get(file)
        else:
            return self._encoding

    def _get_root(self):
        return self._root

    root = property(
        _get_root,
        doc="""
        The directory where this corpus is stored.

        :type: PathPointer""",
    )
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Corpora containing categorized items
|
||||
######################################################################
|
||||
|
||||
|
||||
class CategorizedCorpusReader:
    """
    A mixin class used to aid in the implementation of corpus readers
    for categorized corpora.  This class defines the method
    ``categories()``, which returns a list of the categories for the
    corpus or for a specified set of fileids; and overrides ``fileids()``
    to take a ``categories`` argument, restricting the set of fileids to
    be returned.

    Subclasses are expected to:

    - Call ``__init__()`` to set up the mapping.

    - Override all view methods to accept a ``categories`` parameter,
      which can be used *instead* of the ``fileids`` parameter, to
      select which fileids should be included in the returned view.
    """

    def __init__(self, kwargs):
        """
        Initialize this mapping based on keyword arguments, as
        follows:

        - cat_pattern: A regular expression pattern used to find the
          category for each file identifier.  The pattern will be
          applied to each file identifier, and the first matching
          group will be used as the category label for that file.

        - cat_map: A dictionary, mapping from file identifiers to
          category labels.

        - cat_file: The name of a file that contains the mapping
          from file identifiers to categories.  The argument
          ``cat_delimiter`` can be used to specify a delimiter.

        The corresponding argument will be deleted from ``kwargs``.  If
        more than one argument is specified, an exception will be
        raised.
        """
        self._f2c = None  #: file-to-category mapping
        self._c2f = None  #: category-to-file mapping

        self._pattern = None  #: regexp specifying the mapping
        self._map = None  #: dict specifying the mapping
        self._file = None  #: fileid of file containing the mapping
        self._delimiter = None  #: delimiter for ``self._file``

        if "cat_pattern" in kwargs:
            self._pattern = kwargs.pop("cat_pattern")
        elif "cat_map" in kwargs:
            self._map = kwargs.pop("cat_map")
        elif "cat_file" in kwargs:
            self._file = kwargs.pop("cat_file")
            if "cat_delimiter" in kwargs:
                self._delimiter = kwargs.pop("cat_delimiter")
        else:
            raise ValueError(
                "Expected keyword argument cat_pattern or " "cat_map or cat_file."
            )

        # The branch above already removed the argument it consumed, so
        # any of these still present means the caller passed two or more.
        if "cat_pattern" in kwargs or "cat_map" in kwargs or "cat_file" in kwargs:
            raise ValueError(
                "Specify exactly one of: cat_pattern, " "cat_map, cat_file."
            )

    def _init(self):
        # Lazily build the two-way file<->category mapping on first use.
        self._f2c = defaultdict(set)
        self._c2f = defaultdict(set)

        if self._pattern is not None:
            # Category is the first group of the pattern matched against
            # each file identifier.
            for file_id in self._fileids:
                category = re.match(self._pattern, file_id).group(1)
                self._add(file_id, category)

        elif self._map is not None:
            for file_id, categories in self._map.items():
                for category in categories:
                    self._add(file_id, category)

        elif self._file is not None:
            # Each line of the mapping file is:
            #   <file_id><delimiter><category>[<delimiter><category>...]
            with self.open(self._file) as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # Skip blank lines instead of failing on the
                        # two-field unpack below.
                        continue
                    file_id, categories = line.split(self._delimiter, 1)
                    if file_id not in self.fileids():
                        raise ValueError(
                            "In category mapping file %s: %s "
                            "not found" % (self._file, file_id)
                        )
                    for category in categories.split(self._delimiter):
                        self._add(file_id, category)

    def _add(self, file_id, category):
        # Record the pairing in both directions.
        self._f2c[file_id].add(category)
        self._c2f[category].add(file_id)

    def categories(self, fileids=None):
        """
        Return a list of the categories that are defined for this corpus,
        or for the file(s) if it is given.
        """
        if self._f2c is None:
            self._init()
        if fileids is None:
            return sorted(self._c2f)
        if isinstance(fileids, str):
            fileids = [fileids]
        return sorted(set.union(*(self._f2c[d] for d in fileids)))

    def fileids(self, categories=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that make up the given category(s) if specified.
        """
        if categories is None:
            return super().fileids()
        if self._f2c is None:
            self._init()
        if isinstance(categories, str):
            if categories in self._c2f:
                return sorted(self._c2f[categories])
            raise ValueError("Category %s not found" % categories)
        return sorted(set.union(*(self._c2f[c] for c in categories)))

    def _resolve(self, fileids, categories):
        # Exactly one of ``fileids``/``categories`` may be given; map a
        # ``categories`` selection onto the corresponding fileids.
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")
        if categories is not None:
            return self.fileids(categories)
        else:
            return fileids

    def raw(self, fileids=None, categories=None):
        """Return the raw content for the given fileids or categories."""
        return super().raw(self._resolve(fileids, categories))

    def words(self, fileids=None, categories=None):
        """Return the words for the given fileids or categories."""
        return super().words(self._resolve(fileids, categories))

    def sents(self, fileids=None, categories=None):
        """Return the sentences for the given fileids or categories."""
        return super().sents(self._resolve(fileids, categories))

    def paras(self, fileids=None, categories=None):
        """Return the paragraphs for the given fileids or categories."""
        return super().paras(self._resolve(fileids, categories))
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Treebank readers
|
||||
######################################################################
|
||||
|
||||
|
||||
# [xx] is it worth it to factor this out?
|
||||
class SyntaxCorpusReader(CorpusReader):
    """
    An abstract base class for reading corpora consisting of
    syntactically parsed text.  Subclasses should define:

    - ``__init__``, which specifies the location of the corpus
      and a method for detecting the sentence blocks in corpus files.
    - ``_read_block``, which reads a block from the input stream.
    - ``_word``, which takes a block and returns a list of list of words.
    - ``_tag``, which takes a block and returns a list of list of tagged
      words.
    - ``_parse``, which takes a block and returns a list of parsed
      sentences.
    """

    def _parse(self, s):
        raise NotImplementedError()

    def _word(self, s):
        raise NotImplementedError()

    def _tag(self, s):
        raise NotImplementedError()

    def _read_block(self, stream):
        raise NotImplementedError()

    def _views(self, reader, fileids):
        # Shared by all public view methods: build one lazy
        # StreamBackedCorpusView per requested file (with its declared
        # encoding) and concatenate them.
        return concat(
            [
                StreamBackedCorpusView(fileid, reader, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )

    def parsed_sents(self, fileids=None):
        """Return the given file(s) as a list of parsed sentences."""
        return self._views(self._read_parsed_sent_block, fileids)

    def tagged_sents(self, fileids=None, tagset=None):
        """Return the given file(s) as a list of tagged sentences,
        optionally converting tags into ``tagset``."""

        def reader(stream):
            return self._read_tagged_sent_block(stream, tagset)

        return self._views(reader, fileids)

    def sents(self, fileids=None):
        """Return the given file(s) as a list of sentences, each a
        list of word strings."""
        return self._views(self._read_sent_block, fileids)

    def tagged_words(self, fileids=None, tagset=None):
        """Return the given file(s) as a list of tagged words,
        optionally converting tags into ``tagset``."""

        def reader(stream):
            return self._read_tagged_word_block(stream, tagset)

        return self._views(reader, fileids)

    def words(self, fileids=None):
        """Return the given file(s) as a list of words."""
        return self._views(self._read_word_block, fileids)

    # ------------------------------------------------------------
    # { Block Readers

    def _read_word_block(self, stream):
        # Flatten one block of sentences into a flat list of words.
        return list(chain.from_iterable(self._read_sent_block(stream)))

    def _read_tagged_word_block(self, stream, tagset=None):
        return list(chain.from_iterable(self._read_tagged_sent_block(stream, tagset)))

    def _read_sent_block(self, stream):
        # Drop blocks for which ``_word`` returns an empty result.
        return list(filter(None, [self._word(t) for t in self._read_block(stream)]))

    def _read_tagged_sent_block(self, stream, tagset=None):
        return list(
            filter(None, [self._tag(t, tagset) for t in self._read_block(stream)])
        )

    def _read_parsed_sent_block(self, stream):
        return list(filter(None, [self._parse(t) for t in self._read_block(stream)]))

    # } End of Block Readers
    # ------------------------------------------------------------
|
||||
218
backend/venv/Lib/site-packages/nltk/corpus/reader/bcp47.py
Normal file
218
backend/venv/Lib/site-packages/nltk/corpus/reader/bcp47.py
Normal file
@@ -0,0 +1,218 @@
|
||||
# Natural Language Toolkit: BCP-47 language tags
|
||||
#
|
||||
# Copyright (C) 2022-2023 NLTK Project
|
||||
# Author: Eric Kafe <kafe.eric@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import re
|
||||
from warnings import warn
|
||||
from xml.etree import ElementTree as et
|
||||
|
||||
from nltk.corpus.reader import CorpusReader
|
||||
|
||||
|
||||
class BCP47CorpusReader(CorpusReader):
|
||||
"""
|
||||
Parse BCP-47 composite language tags
|
||||
|
||||
Supports all the main subtags, and the 'u-sd' extension:
|
||||
|
||||
>>> from nltk.corpus import bcp47
|
||||
>>> bcp47.name('oc-gascon-u-sd-fr64')
|
||||
'Occitan (post 1500): Gascon: Pyrénées-Atlantiques'
|
||||
|
||||
Can load a conversion table to Wikidata Q-codes:
|
||||
>>> bcp47.load_wiki_q()
|
||||
>>> bcp47.wiki_q['en-GI-spanglis']
|
||||
'Q79388'
|
||||
|
||||
"""
|
||||
|
||||
    def __init__(self, root, fileids):
        """
        Read the BCP-47 database.

        Loads the IANA language-subtag registry and the CLDR
        subdivision names from the corpus files, then precomputes the
        casing/format tables via ``morphology()``.
        """
        super().__init__(root, fileids)
        # Maps language Description strings to their subtags; filled in
        # by data_dict() while parsing the registry.
        self.langcode = {}
        with self.open("iana/language-subtag-registry.txt") as fp:
            # Registry records are separated by "%%" lines.
            self.db = self.data_dict(fp.read().split("%%\n"))
        with self.open("cldr/common-subdivisions-en.xml") as fp:
            # Subdivision code -> English display name.
            self.subdiv = self.subdiv_dict(
                et.parse(fp).iterfind("localeDisplayNames/subdivisions/subdivision")
            )
        self.morphology()
|
||||
|
||||
    def load_wiki_q(self):
        """Load conversion table to Wikidata Q-codes (only if needed)"""
        with self.open("cldr/tools-cldr-rdf-external-entityToCode.tsv") as fp:
            # Drop the TSV header line; wiki_dict() maps each BCP-47 tag
            # to its Wikidata Q-code.
            self.wiki_q = self.wiki_dict(fp.read().strip().split("\n")[1:])
|
||||
|
||||
def wiki_dict(self, lines):
|
||||
"""Convert Wikidata list of Q-codes to a BCP-47 dictionary"""
|
||||
return {
|
||||
pair[1]: pair[0].split("/")[-1]
|
||||
for pair in [line.strip().split("\t") for line in lines]
|
||||
}
|
||||
|
||||
def subdiv_dict(self, subdivs):
|
||||
"""Convert the CLDR subdivisions list to a dictionary"""
|
||||
return {sub.attrib["type"]: sub.text for sub in subdivs}
|
||||
|
||||
    def morphology(self):
        """Precompute the casing and format tables used to classify subtags."""
        # Canonical letter-casing applied to each subtag type before
        # lookup (scripts are title-cased, regions upper-cased, the
        # rest lower-cased).
        self.casing = {
            "language": str.lower,
            "extlang": str.lower,
            "script": str.title,
            "region": str.upper,
            "variant": str.lower,
        }
        # Character-class building blocks for the format regexps below.
        dig = "[0-9]"
        low = "[a-z]"
        up = "[A-Z]"
        alnum = "[a-zA-Z0-9]"
        # Literal shape of each subtag type (used with fullmatch):
        # - language: 2 or 3 lowercase letters (third letter optional)
        # - extlang: exactly 3 lowercase letters
        # - script: 1 uppercase letter followed by 3 lowercase letters
        # - region: 2 uppercase letters or 3 digits
        # - variant: 4 required alphanumerics plus up to 4 optional
        #   ones, i.e. 4-8 alphanumerics
        # - singleton: a single lowercase letter
        self.format = {
            "language": re.compile(f"{low*3}?"),
            "extlang": re.compile(f"{low*3}"),
            "script": re.compile(f"{up}{low*3}"),
            "region": re.compile(f"({up*2})|({dig*3})"),
            "variant": re.compile(f"{alnum*4}{(alnum+'?')*4}"),
            "singleton": re.compile(f"{low}"),
        }
|
||||
|
||||
    def data_dict(self, records):
        """
        Convert the BCP-47 language subtag registry to a dictionary.

        ``records`` is the registry text split on the "%%" record
        separators: the first record holds the File-Date header, and
        each later record is a block of "Key: value" lines describing
        one subtag.  Returns a dict keyed by record type ("language",
        "region", ...); deprecated entries are filed separately under
        the "deprecated" key.  As a side effect, fills
        ``self.langcode`` (Description -> tag) for non-deprecated
        language records.
        """
        # The first record is the registry's File-Date header.
        self.version = records[0].replace("File-Date:", "").strip()
        dic = {}
        dic["deprecated"] = {}
        for label in [
            "language",
            "extlang",
            "script",
            "region",
            "variant",
            "redundant",
            "grandfathered",
        ]:
            dic["deprecated"][label] = {}
        for record in records[1:]:
            # Split each "Key: value" line into a [key, value] pair;
            # continuation lines produce a single-element list.
            fields = [field.split(": ") for field in record.strip().split("\n")]
            # First two lines of every record are "Type:" and
            # "Subtag:"/"Tag:".
            typ = fields[0][1]
            tag = fields[1][1]
            if typ not in dic:
                dic[typ] = {}
            subfields = {}
            for field in fields[2:]:
                if len(field) == 2:
                    [key, val] = field
                    if key not in subfields:
                        subfields[key] = [val]
                    else:  # multiple value
                        subfields[key].append(val)
                else:  # multiline field
                    # Continuation line: ``key`` still holds the name of
                    # the previous field, so append to its last value.
                    subfields[key][-1] += " " + field[0].strip()
                if (
                    "Deprecated" not in record
                    and typ == "language"
                    and key == "Description"
                ):
                    # Remember the (last seen) Description of each live
                    # language so names can be mapped back to subtags.
                    self.langcode[subfields[key][-1]] = tag
            for key in subfields:
                if len(subfields[key]) == 1:  # single value
                    subfields[key] = subfields[key][0]
            if "Deprecated" in record:
                dic["deprecated"][typ][tag] = subfields
            else:
                dic[typ][tag] = subfields
        return dic
|
||||
|
||||
def val2str(self, val):
    """Return only the first value of *val* if it is a list, else *val*.

    Registry fields may carry multiple values; only the first is shown.
    """
    if isinstance(val, list):  # idiomatic check; also accepts list subclasses
        # val = "/".join(val)  # Concatenate all values
        val = val[0]
    return val
|
||||
|
||||
def lang2str(self, lg_record):
    """Concatenate subtag values into a colon-separated display string."""
    parts = [f"{lg_record['language']}"]
    for key in ("extlang", "script", "region", "variant", "extension"):
        if key in lg_record:
            parts.append(f"{lg_record[key]}")
    return ": ".join(parts)
|
||||
|
||||
def parse_tag(self, tag):
    """Convert a BCP-47 tag to a dictionary of labelled subtags.

    :param tag: a hyphen-separated BCP-47 tag, e.g. ``"ca-Latn-ES"``.
    :return: dict mapping subtag labels ("language", "script", ...) to
        their human-readable descriptions; unrecognized trailing subtags
        go under "extension".

    Emits a warning (via ``warn``) for deprecated or invalid subtags.
    """
    subtags = tag.split("-")
    lang = {}
    # "variant" appears twice: a tag may legitimately carry two variants.
    labels = ["language", "extlang", "script", "region", "variant", "variant"]
    while subtags and labels:
        subtag = subtags.pop(0)
        found = False
        while labels:
            label = labels.pop(0)
            # Normalize casing per subtag type before format-checking.
            subtag = self.casing[label](subtag)
            if self.format[label].fullmatch(subtag):
                if subtag in self.db[label]:
                    found = True
                    valstr = self.val2str(self.db[label][subtag]["Description"])
                    if label == "variant" and label in lang:
                        # Second variant: append to the first.
                        lang[label] += ": " + valstr
                    else:
                        lang[label] = valstr
                    break
                elif subtag in self.db["deprecated"][label]:
                    found = True
                    note = f"The {subtag!r} {label} code is deprecated"
                    if "Preferred-Value" in self.db["deprecated"][label][subtag]:
                        prefer = self.db["deprecated"][label][subtag][
                            "Preferred-Value"
                        ]
                        # Fixed mismatched quoting; now consistent with name().
                        note += f", prefer {self.val2str(prefer)!r}"
                    lang[label] = self.val2str(
                        self.db["deprecated"][label][subtag]["Description"]
                    )
                    warn(note)
                    break
        if not found:
            # NOTE(review): subtags[0]/subtags[1] below can raise IndexError
            # on malformed tags such as "en-u"; callers (name) catch this.
            if subtag == "u" and subtags[0] == "sd":  # CLDR regional subdivisions
                sd = subtags[1]
                if sd in self.subdiv:
                    ext = self.subdiv[sd]
                else:
                    # BUG FIX: the original interpolated the not-yet-assigned
                    # name `ext` here, raising NameError; `sd` was intended.
                    ext = f"<Unknown subdivision: {sd}>"
            else:  # other extension subtags are not supported yet
                ext = f"{subtag}{''.join(['-'+ext for ext in subtags])}".lower()
                if not self.format["singleton"].fullmatch(subtag):
                    ext = f"<Invalid extension: {ext}>"
                    warn(ext)
            lang["extension"] = ext
            # An extension consumes the rest of the tag.
            subtags = []
    return lang
|
||||
|
||||
def name(self, tag):
    """
    Convert a BCP-47 tag to a colon-separated string of subtag names.

    >>> from nltk.corpus import bcp47
    >>> bcp47.name('ca-Latn-ES-valencia')
    'Catalan: Latin: Spain: Valencian'

    Returns None (with a warning) when the tag cannot be parsed.
    """
    # Whole-tag entries take precedence over subtag-by-subtag parsing.
    for label in ["redundant", "grandfathered"]:
        val = None
        if tag in self.db[label]:
            val = f"{self.db[label][tag]['Description']}"
            note = f"The {tag!r} code is {label}"
        elif tag in self.db["deprecated"][label]:
            val = f"{self.db['deprecated'][label][tag]['Description']}"
            note = f"The {tag!r} code is {label} and deprecated"
            if "Preferred-Value" in self.db["deprecated"][label][tag]:
                prefer = self.db["deprecated"][label][tag]["Preferred-Value"]
                note += f", prefer {self.val2str(prefer)!r}"
        if val:
            warn(note)
            return val
    try:
        return self.lang2str(self.parse_tag(tag))
    # Narrowed from a bare `except:`, which also swallowed
    # KeyboardInterrupt and SystemExit.
    except Exception:
        warn(f"Tag {tag!r} was not recognized")
        return None
|
||||
265
backend/venv/Lib/site-packages/nltk/corpus/reader/bnc.py
Normal file
265
backend/venv/Lib/site-packages/nltk/corpus/reader/bnc.py
Normal file
@@ -0,0 +1,265 @@
|
||||
# Natural Language Toolkit: Plaintext Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""Corpus reader for the XML version of the British National Corpus."""
|
||||
|
||||
from nltk.corpus.reader.util import concat
|
||||
from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader, XMLCorpusView
|
||||
|
||||
|
||||
class BNCCorpusReader(XMLCorpusReader):
    r"""Corpus reader for the XML version of the British National Corpus.

    For access to the complete XML data structure, use the ``xml()``
    method. For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.

    You can obtain the full version of the BNC corpus at
    https://www.ota.ox.ac.uk/desc/2554

    If you extracted the archive to a directory called `BNC`, then you can
    instantiate the reader as::

        BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')

    """

    def __init__(self, root, fileids, lazy=True):
        # lazy=True: stream tokens on demand via BNCWordView;
        # lazy=False: parse each file eagerly with ElementTree (_words).
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy

    def words(self, fileids=None, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)

        :param strip_space: If true, then strip trailing spaces from
            word tokens. Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        return self._views(fileids, False, None, strip_space, stem)

    def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags. Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens. Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        tag = "c5" if c5 else "pos"
        return self._views(fileids, False, tag, strip_space, stem)

    def sents(self, fileids=None, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))

        :param strip_space: If true, then strip trailing spaces from
            word tokens. Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        return self._views(fileids, True, None, strip_space, stem)

    def tagged_sents(self, fileids=None, c5=False, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags. Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens. Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        tag = "c5" if c5 else "pos"
        return self._views(
            fileids, sent=True, tag=tag, strip_space=strip_space, stem=stem
        )

    def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False):
        """A helper function that instantiates BNCWordViews or the list of words/sentences."""
        # Lazy mode wraps each file in a streaming view; eager mode parses
        # the whole file immediately.  Either way results are concatenated.
        f = BNCWordView if self._lazy else self._words
        return concat(
            [
                f(fileid, sent, tag, strip_space, stem)
                for fileid in self.abspaths(fileids)
            ]
        )

    def _words(self, fileid, bracket_sent, tag, strip_space, stem):
        """
        Helper used to implement the view methods -- returns a list of
        words or a list of sentences, optionally tagged.

        :param fileid: The name of the underlying file.
        :param bracket_sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        result = []

        xmldoc = ElementTree.parse(fileid).getroot()
        for xmlsent in xmldoc.findall(".//s"):
            sent = []
            for xmlword in _all_xmlwords_in(xmlsent):
                word = xmlword.text
                if not word:
                    word = ""  # fixes issue 337?
                if strip_space or stem:
                    word = word.strip()
                if stem:
                    # Prefer the headword ("hw") attribute when stemming.
                    word = xmlword.get("hw", word)
                if tag == "c5":
                    word = (word, xmlword.get("c5"))
                elif tag == "pos":
                    # Fall back to the C5 tag when no POS tag is present.
                    word = (word, xmlword.get("pos", xmlword.get("c5")))
                sent.append(word)
            if bracket_sent:
                result.append(BNCSentence(xmlsent.attrib["n"], sent))
            else:
                result.extend(sent)

        assert None not in result
        return result
|
||||
|
||||
|
||||
def _all_xmlwords_in(elt, result=None):
|
||||
if result is None:
|
||||
result = []
|
||||
for child in elt:
|
||||
if child.tag in ("c", "w"):
|
||||
result.append(child)
|
||||
else:
|
||||
_all_xmlwords_in(child, result)
|
||||
return result
|
||||
|
||||
|
||||
class BNCSentence(list):
    """
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (the ``n`` attribute from the XML).
    """

    def __init__(self, num, items):
        super().__init__(items)
        self.num = num
|
||||
|
||||
|
||||
class BNCWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the BNC corpus.
    """

    tags_to_ignore = {
        "pb",
        "gap",
        "vocal",
        "event",
        "unclear",
        "shift",
        "pause",
        "align",
    }
    """These tags are ignored. For their description refer to the
    technical documentation, for example,
    http://www.natcorp.ox.ac.uk/docs/URG/ref-vocal.html

    """

    def __init__(self, fileid, sent, tag, strip_space, stem):
        """
        :param fileid: The name of the underlying file.
        :param sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        # Choose which XML elements the view iterates over: whole <s>
        # sentence elements, or the <c>/<w> tokens nested inside them.
        if sent:
            tagspec = ".*/s"
        else:
            tagspec = ".*/s/(.*/)?(c|w)"
        self._sent = sent
        self._tag = tag
        self._strip_space = strip_space
        self._stem = stem

        self.title = None  #: Title of the document.
        self.author = None  #: Author of the document.
        self.editor = None  #: Editor
        self.resps = None  #: Statement of responsibility

        XMLCorpusView.__init__(self, fileid, tagspec)

        # Read in a tasty header.
        # The stream is opened just to pull document metadata out of the
        # <teiHeader>, then closed again before normal token reads begin.
        self._open()
        self.read_block(self._stream, ".*/teiHeader$", self.handle_header)
        self.close()

        # Reset tag context.
        self._tag_context = {0: ()}

    def handle_header(self, elt, context):
        # Set up some metadata!
        titles = elt.findall("titleStmt/title")
        if titles:
            self.title = "\n".join(title.text.strip() for title in titles)

        authors = elt.findall("titleStmt/author")
        if authors:
            self.author = "\n".join(author.text.strip() for author in authors)

        editors = elt.findall("titleStmt/editor")
        if editors:
            self.editor = "\n".join(editor.text.strip() for editor in editors)

        resps = elt.findall("titleStmt/respStmt")
        if resps:
            self.resps = "\n\n".join(
                "\n".join(resp_elt.text.strip() for resp_elt in resp) for resp in resps
            )

    def handle_elt(self, elt, context):
        # Dispatch to sentence- or word-level handling, depending on how
        # the view was configured in __init__.
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        word = elt.text
        if not word:
            word = ""  # fixes issue 337?
        if self._strip_space or self._stem:
            word = word.strip()
        if self._stem:
            # Prefer the headword ("hw") attribute when stemming.
            word = elt.get("hw", word)
        if self._tag == "c5":
            word = (word, elt.get("c5"))
        elif self._tag == "pos":
            # Fall back to the C5 tag when no simplified POS tag exists.
            word = (word, elt.get("pos", elt.get("c5")))
        return word

    def handle_sent(self, elt):
        sent = []
        for child in elt:
            # Multiword units / markup wrappers contain the real tokens.
            if child.tag in ("mw", "hi", "corr", "trunc"):
                sent += [self.handle_word(w) for w in child]
            elif child.tag in ("w", "c"):
                sent.append(self.handle_word(child))
            elif child.tag not in self.tags_to_ignore:
                raise ValueError("Unexpected element %s" % child.tag)
        return BNCSentence(elt.attrib["n"], sent)
|
||||
@@ -0,0 +1,237 @@
|
||||
# Natural Language Toolkit: Penn Treebank Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
"""
|
||||
Corpus reader for corpora that consist of parenthesis-delineated parse trees.
|
||||
"""
|
||||
|
||||
import sys
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.tag import map_tag
|
||||
from nltk.tree import Tree
|
||||
|
||||
# we use [^\s()]+ instead of \S+? to avoid matching ()
# (order-index, tag, word) triples produced by
# AlpinoCorpusReader._normalize(ordered=True)
SORTTAGWRD = re.compile(r"\((\d+) ([^\s()]+) ([^\s()]+)\)")
# (tag, word) leaf pairs
TAGWORD = re.compile(r"\(([^\s()]+) ([^\s()]+)\)")
# just the word of a "(tag word)" leaf
WORD = re.compile(r"\([^\s()]+ ([^\s()]+)\)")
# a tree whose top-level node label is empty: "( ("
EMPTY_BRACKETS = re.compile(r"\s*\(\s*\(")
|
||||
|
||||
|
||||
class BracketParseCorpusReader(SyntaxCorpusReader):
    """
    Reader for corpora that consist of parenthesis-delineated parse trees,
    like those found in the "combined" section of the Penn Treebank,
    e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))".

    """

    def __init__(
        self,
        root,
        fileids,
        comment_char=None,
        detect_blocks="unindented_paren",
        encoding="utf8",
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param comment_char: The character which can appear at the start of
            a line to indicate that the rest of the line is a comment.
        :param detect_blocks: The method that is used to find blocks
            in the corpus; can be 'unindented_paren' (every unindented
            parenthesis starts a new parse) or 'sexpr' (brackets are
            matched).
        :param tagset: The name of the tagset used by this corpus, to be used
            for normalizing or converting the POS tags returned by the
            ``tagged_...()`` methods.
        """
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)
        self._comment_char = comment_char
        self._detect_blocks = detect_blocks
        self._tagset = tagset

    def _read_block(self, stream):
        """Read one block of raw parse text, per the detect_blocks mode."""
        if self._detect_blocks == "sexpr":
            return read_sexpr_block(stream, comment_char=self._comment_char)
        elif self._detect_blocks == "blankline":
            return read_blankline_block(stream)
        elif self._detect_blocks == "unindented_paren":
            # Tokens start with unindented left parens.
            toks = read_regexp_block(stream, start_re=r"^\(")
            # Strip any comments out of the tokens.
            if self._comment_char:
                toks = [
                    re.sub("(?m)^%s.*" % re.escape(self._comment_char), "", tok)
                    for tok in toks
                ]
            return toks
        else:
            # Was `assert 0, ...`: asserts disappear under `python -O`,
            # so raise a real exception for a bad configuration instead.
            raise ValueError("bad block type %r" % (self._detect_blocks,))

    def _normalize(self, t):
        # Replace leaves of the form (!), (,), with (! !), (, ,)
        t = re.sub(r"\((.)\)", r"(\1 \1)", t)
        # Replace leaves of the form (tag word root) with (tag word)
        t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
        return t

    def _parse(self, t):
        """Parse one block into a Tree, recovering from malformed input."""
        try:
            tree = Tree.fromstring(self._normalize(t))
            # If there's an empty node at the top, strip it off
            if tree.label() == "" and len(tree) == 1:
                return tree[0]
            else:
                return tree

        except ValueError as e:
            sys.stderr.write("Bad tree detected; trying to recover...\n")
            # Try to recover, if we can:
            if e.args == ("mismatched parens",):
                for n in range(1, 5):
                    try:
                        # BUG FIX: `Tree(string)` raises TypeError in NLTK 3
                        # (string parsing moved to Tree.fromstring), so this
                        # recovery path could never succeed as written.
                        v = Tree.fromstring(self._normalize(t + ")" * n))
                        sys.stderr.write(
                            "  Recovered by adding %d close " "paren(s)\n" % n
                        )
                        return v
                    except ValueError:
                        pass
            # Try something else:
            sys.stderr.write("  Recovered by returning a flat parse.\n")
            # sys.stderr.write(' '.join(t.split())+'\n')
            return Tree("S", self._tag(t))

    def _tag(self, t, tagset=None):
        """Return the block as a list of (word, tag) pairs, optionally
        mapping tags into *tagset* via nltk.tag.map_tag."""
        tagged_sent = [(w, p) for (p, w) in TAGWORD.findall(self._normalize(t))]
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (w, p) in tagged_sent
            ]
        return tagged_sent

    def _word(self, t):
        """Return the block as a flat list of words."""
        return WORD.findall(self._normalize(t))
|
||||
|
||||
|
||||
class CategorizedBracketParseCorpusReader(
    CategorizedCorpusReader, BracketParseCorpusReader
):
    """
    A reader for parsed corpora whose documents are
    divided into categories based on their file identifiers.
    @author: Nathan Schneider <nschneid@cs.cmu.edu>
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader. Categorization arguments
        (C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to
        the L{CategorizedCorpusReader constructor
        <CategorizedCorpusReader.__init__>}. The remaining arguments
        are passed to the L{BracketParseCorpusReader constructor
        <BracketParseCorpusReader.__init__>}.
        """
        # CategorizedCorpusReader consumes its own kwargs first; whatever
        # remains is forwarded to BracketParseCorpusReader.
        CategorizedCorpusReader.__init__(self, kwargs)
        BracketParseCorpusReader.__init__(self, *args, **kwargs)

    # Each accessor maps (fileids, categories) to a concrete fileid list
    # via CategorizedCorpusReader._resolve, then delegates to the parent.
    def tagged_words(self, fileids=None, categories=None, tagset=None):
        return super().tagged_words(self._resolve(fileids, categories), tagset)

    def tagged_sents(self, fileids=None, categories=None, tagset=None):
        return super().tagged_sents(self._resolve(fileids, categories), tagset)

    def tagged_paras(self, fileids=None, categories=None, tagset=None):
        return super().tagged_paras(self._resolve(fileids, categories), tagset)

    def parsed_words(self, fileids=None, categories=None):
        return super().parsed_words(self._resolve(fileids, categories))

    def parsed_sents(self, fileids=None, categories=None):
        return super().parsed_sents(self._resolve(fileids, categories))

    def parsed_paras(self, fileids=None, categories=None):
        return super().parsed_paras(self._resolve(fileids, categories))
|
||||
|
||||
|
||||
class AlpinoCorpusReader(BracketParseCorpusReader):
    """
    Reader for the Alpino Dutch Treebank.
    This corpus has a lexical breakdown structure embedded, as read by `_parse`
    Unfortunately this puts punctuation and some other words out of the sentence
    order in the xml element tree. This is no good for `tag_` and `word_`
    `_tag` and `_word` will be overridden to use a non-default new parameter 'ordered'
    to the overridden _normalize function. The _parse function can then remain
    untouched.
    """

    def __init__(self, root, encoding="ISO-8859-1", tagset=None):
        BracketParseCorpusReader.__init__(
            self,
            root,
            r"alpino\.xml",
            detect_blocks="blankline",
            encoding=encoding,
            tagset=tagset,
        )

    def _normalize(self, t, ordered=False):
        """Normalize the xml sentence element in t.

        The sentence elements <alpino_ds>, although embedded in a few overall
        xml elements, are separated by blank lines. That's how the reader can
        deliver them one at a time.
        Each sentence has a few category subnodes that are of no use to us.
        The remaining word nodes may or may not appear in the proper order.
        Each word node has attributes, among which:

        - begin : the position of the word in the sentence
        - pos : Part of Speech: the Tag
        - word : the actual word

        The return value is a string with all xml elementes replaced by
        clauses: either a cat clause with nested clauses, or a word clause.
        The order of the bracket clauses closely follows the xml.
        If ordered == True, the word clauses include an order sequence number.
        If ordered == False, the word clauses only have pos and word parts.
        """
        if t[:10] != "<alpino_ds":
            return ""
        # convert XML to sexpr notation
        t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t)
        if ordered:
            # BUG FIX: the pattern read `' <node. *?begin='` (space on the
            # wrong side of the dot); normalized to match the unordered
            # branch below: `' <node .*?begin='`.
            t = re.sub(
                r' <node .*?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>',
                r"(\1 \2 \3)",
                t,
            )
        else:
            t = re.sub(r' <node .*?pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2)", t)
        t = re.sub(r" </node>", r")", t)
        t = re.sub(r"<sentence>.*</sentence>", r"", t)
        t = re.sub(r"</?alpino_ds.*>", r"", t)
        return t

    def _tag(self, t, tagset=None):
        """Return (word, tag) pairs in true sentence order, using the
        `begin` attribute to undo the XML's out-of-order word nodes."""
        tagged_sent = [
            (int(o), w, p)
            for (o, p, w) in SORTTAGWRD.findall(self._normalize(t, ordered=True))
        ]
        tagged_sent.sort()
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (o, w, p) in tagged_sent
            ]
        else:
            tagged_sent = [(w, p) for (o, w, p) in tagged_sent]
        return tagged_sent

    def _word(self, t):
        """Return a correctly ordered list of words."""
        tagged_sent = self._tag(t)
        return [w for (w, p) in tagged_sent]
|
||||
@@ -0,0 +1,168 @@
|
||||
# Natural Language Toolkit: Categorized Sentences Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
CorpusReader structured for corpora that contain one instance on each row.
|
||||
This CorpusReader is specifically used for the Subjectivity Dataset and the
|
||||
Sentence Polarity Dataset.
|
||||
|
||||
- Subjectivity Dataset information -
|
||||
|
||||
Authors: Bo Pang and Lillian Lee.
|
||||
Url: https://www.cs.cornell.edu/people/pabo/movie-review-data
|
||||
|
||||
Distributed with permission.
|
||||
|
||||
Related papers:
|
||||
|
||||
- Bo Pang and Lillian Lee. "A Sentimental Education: Sentiment Analysis Using
|
||||
Subjectivity Summarization Based on Minimum Cuts". Proceedings of the ACL,
|
||||
2004.
|
||||
|
||||
- Sentence Polarity Dataset information -
|
||||
|
||||
Authors: Bo Pang and Lillian Lee.
|
||||
Url: https://www.cs.cornell.edu/people/pabo/movie-review-data
|
||||
|
||||
Related papers:
|
||||
|
||||
- Bo Pang and Lillian Lee. "Seeing stars: Exploiting class relationships for
|
||||
sentiment categorization with respect to rating scales". Proceedings of the
|
||||
ACL, 2005.
|
||||
"""
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.tokenize import *
|
||||
|
||||
|
||||
class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    A reader for corpora in which each row represents a single instance, mainly
    a sentence. Instances are divided into categories based on their file identifiers
    (see CategorizedCorpusReader).
    Since many corpora allow rows that contain more than one sentence, it is
    possible to specify a sentence tokenizer to retrieve all sentences instead
    than all rows.

    Examples using the Subjectivity Dataset:

    >>> from nltk.corpus import subjectivity
    >>> subjectivity.sents()[23] # doctest: +NORMALIZE_WHITESPACE
    ['television', 'made', 'him', 'famous', ',', 'but', 'his', 'biggest', 'hits',
    'happened', 'off', 'screen', '.']
    >>> subjectivity.categories()
    ['obj', 'subj']
    >>> subjectivity.words(categories='subj')
    ['smart', 'and', 'alert', ',', 'thirteen', ...]

    Examples using the Sentence Polarity Dataset:

    >>> from nltk.corpus import sentence_polarity
    >>> sentence_polarity.sents() # doctest: +NORMALIZE_WHITESPACE
    [['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish',
    'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find',
    'it', 'funny', '.'], ...]
    >>> sentence_polarity.categories()
    ['neg', 'pos']
    """

    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=None,
        encoding="utf8",
        **kwargs
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param sent_tokenizer: a tokenizer for breaking paragraphs into sentences.
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        """

        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
        # NOTE: the default tokenizer instance is shared across readers;
        # this is safe because tokenizers are stateless.
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer

    def sents(self, fileids=None, categories=None):
        """
        Return all sentences in the corpus or in the specified file(s).

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences have
            to be returned.
        :return: the given file(s) as a list of sentences.
            Each sentence is tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None, categories=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        file(s).

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have to
            be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_sent_block(self, stream):
        """Read up to 20 lines and tokenize them into sentences."""
        sents = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # readline() returns "" only at EOF (a blank line is "\n"),
                # so stop here instead of spinning through the remaining
                # iterations (the original used `continue`).
                break
            if self._sent_tokenizer:
                sents.extend(
                    [
                        self._word_tokenizer.tokenize(sent)
                        for sent in self._sent_tokenizer.tokenize(line)
                    ]
                )
            else:
                sents.append(self._word_tokenizer.tokenize(line))
        return sents

    def _read_word_block(self, stream):
        """Flatten one sentence block into a plain list of words."""
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words
|
||||
154
backend/venv/Lib/site-packages/nltk/corpus/reader/chasen.py
Normal file
154
backend/venv/Lib/site-packages/nltk/corpus/reader/chasen.py
Normal file
@@ -0,0 +1,154 @@
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Masato Hagiwara <hagisan@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import sys
|
||||
|
||||
from nltk.corpus.reader import util
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
|
||||
|
||||
class ChasenCorpusReader(CorpusReader):
    """Corpus reader for ChaSen-format (tab-separated morphological
    analysis) Japanese corpora, e.g. the JEITA corpus.

    :param sent_splitter: optional predicate on a (word, info) pair that
        marks additional sentence boundaries besides explicit "EOS" lines.
    """

    def __init__(self, root, fileids, encoding="utf8", sent_splitter=None):
        self._sent_splitter = sent_splitter
        CorpusReader.__init__(self, root, fileids, encoding)

    def _view_concat(self, fileids, tagged, group_by_sent, group_by_para):
        # Shared implementation for the six public accessors below (the
        # original duplicated this body verbatim in each of them): build
        # one ChasenCorpusView per file and concatenate them lazily.
        return concat(
            [
                ChasenCorpusView(
                    fileid,
                    enc,
                    tagged,
                    group_by_sent,
                    group_by_para,
                    self._sent_splitter,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """Return a flat list of words."""
        return self._view_concat(fileids, False, False, False)

    def tagged_words(self, fileids=None):
        """Return a flat list of (word, tag-info) tuples."""
        return self._view_concat(fileids, True, False, False)

    def sents(self, fileids=None):
        """Return a list of sentences, each a list of words."""
        return self._view_concat(fileids, False, True, False)

    def tagged_sents(self, fileids=None):
        """Return a list of sentences of (word, tag-info) tuples."""
        return self._view_concat(fileids, True, True, False)

    def paras(self, fileids=None):
        """Return a list of paragraphs of sentences of words."""
        return self._view_concat(fileids, False, True, True)

    def tagged_paras(self, fileids=None):
        """Return a list of paragraphs of sentences of tagged words."""
        return self._view_concat(fileids, True, True, True)
|
||||
|
||||
|
||||
class ChasenCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for ChasenReader. Similar to ``TaggedCorpusView``,
    but this'll use fixed sets of word and sentence tokenizer.
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        sent_splitter=None,
    ):
        # Flags that control the shape of what read_block() yields:
        # tagged -> (word, tag-info) tuples vs. bare words;
        # group_by_sent / group_by_para -> nesting level of the result.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        # Optional predicate on a (word, info) pair marking a sentence
        # boundary in addition to explicit "EOS" lines.
        self._sent_splitter = sent_splitter
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Reads one paragraph at a time."""
        block = []
        # A paragraph is a run of non-empty lines terminated by "EOS".
        for para_str in read_regexp_block(stream, r".", r"^EOS\n"):
            para = []

            sent = []
            for line in para_str.splitlines():
                _eos = line.strip() == "EOS"
                # ChaSen line format: surface form, TAB, analysis columns.
                _cells = line.split("\t")
                w = (_cells[0], "\t".join(_cells[1:]))
                if not _eos:
                    sent.append(w)

                # Close the current sentence on "EOS", or when the caller's
                # splitter says this token ends a sentence.
                if _eos or (self._sent_splitter and self._sent_splitter(w)):
                    if not self._tagged:
                        sent = [w for (w, t) in sent]
                    if self._group_by_sent:
                        para.append(sent)
                    else:
                        para.extend(sent)
                    sent = []

            # Flush a trailing sentence that had no explicit terminator.
            if len(sent) > 0:
                if not self._tagged:
                    sent = [w for (w, t) in sent]

                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)

            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)

        return block
|
||||
|
||||
|
||||
def demo():
    """Print a small sample of words and tagged sentences from the JEITA corpus."""
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
    print("/".join(jeita.words()[22100:22140]))

    # Render each sentence as "surface/POS" lines, separated by EOS markers.
    sample = jeita.tagged_sents()[2170:2173]
    rendered = (
        "\n".join("{}/{}".format(w[0], w[1].split("\t")[2]) for w in sent)
        for sent in sample
    )
    print("\nEOS\n".join(rendered))
|
||||
|
||||
|
||||
def test():
    """Sanity check: tags produced by the JEITA reader are plain strings."""
    from nltk.corpus.util import LazyCorpusLoader

    reader = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
    first_tag = reader.tagged_words()[0][1]
    assert isinstance(first_tag, str)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the interactive demo first, then the lightweight sanity check.
    demo()
    test()
|
||||
630
backend/venv/Lib/site-packages/nltk/corpus/reader/childes.py
Normal file
630
backend/venv/Lib/site-packages/nltk/corpus/reader/childes.py
Normal file
@@ -0,0 +1,630 @@
|
||||
# CHILDES XML Corpus Reader
|
||||
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Tomonori Nagano <tnagano@gc.cuny.edu>
|
||||
# Alexis Dimitriadis <A.Dimitriadis@uu.nl>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Corpus reader for the XML version of the CHILDES corpus.
|
||||
"""
|
||||
|
||||
__docformat__ = "epytext en"
|
||||
|
||||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
from nltk.corpus.reader.util import concat
|
||||
from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader
|
||||
from nltk.util import LazyConcatenation, LazyMap, flatten
|
||||
|
||||
# to resolve the namespace issue
|
||||
NS = "http://www.talkbank.org/ns/talkbank"
|
||||
|
||||
|
||||
class CHILDESCorpusReader(XMLCorpusReader):
|
||||
"""
|
||||
Corpus reader for the XML version of the CHILDES corpus.
|
||||
The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML
|
||||
version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``.
|
||||
Copy the needed parts of the CHILDES XML corpus into the NLTK data directory
|
||||
(``nltk_data/corpora/CHILDES/``).
|
||||
|
||||
For access to the file text use the usual nltk functions,
|
||||
``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``.
|
||||
"""
|
||||
|
||||
    def __init__(self, root, fileids, lazy=True):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param lazy: If true (default), accessor methods return lazy
            sequences that parse each XML file on demand; otherwise they
            parse every file eagerly and return concrete lists.
        """
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy
|
||||
|
||||
def words(
|
||||
self,
|
||||
fileids=None,
|
||||
speaker="ALL",
|
||||
stem=False,
|
||||
relation=False,
|
||||
strip_space=True,
|
||||
replace=False,
|
||||
):
|
||||
"""
|
||||
:return: the given file(s) as a list of words
|
||||
:rtype: list(str)
|
||||
|
||||
:param speaker: If specified, select specific speaker(s) defined
|
||||
in the corpus. Default is 'ALL' (all participants). Common choices
|
||||
are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
|
||||
researchers)
|
||||
:param stem: If true, then use word stems instead of word strings.
|
||||
:param relation: If true, then return tuples of (stem, index,
|
||||
dependent_index)
|
||||
:param strip_space: If true, then strip trailing spaces from word
|
||||
tokens. Otherwise, leave the spaces on the tokens.
|
||||
:param replace: If true, then use the replaced (intended) word instead
|
||||
of the original word (e.g., 'wat' will be replaced with 'watch')
|
||||
"""
|
||||
sent = None
|
||||
pos = False
|
||||
if not self._lazy:
|
||||
return [
|
||||
self._get_words(
|
||||
fileid, speaker, sent, stem, relation, pos, strip_space, replace
|
||||
)
|
||||
for fileid in self.abspaths(fileids)
|
||||
]
|
||||
|
||||
get_words = lambda fileid: self._get_words(
|
||||
fileid, speaker, sent, stem, relation, pos, strip_space, replace
|
||||
)
|
||||
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
|
||||
|
||||
def tagged_words(
|
||||
self,
|
||||
fileids=None,
|
||||
speaker="ALL",
|
||||
stem=False,
|
||||
relation=False,
|
||||
strip_space=True,
|
||||
replace=False,
|
||||
):
|
||||
"""
|
||||
:return: the given file(s) as a list of tagged
|
||||
words and punctuation symbols, encoded as tuples
|
||||
``(word,tag)``.
|
||||
:rtype: list(tuple(str,str))
|
||||
|
||||
:param speaker: If specified, select specific speaker(s) defined
|
||||
in the corpus. Default is 'ALL' (all participants). Common choices
|
||||
are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
|
||||
researchers)
|
||||
:param stem: If true, then use word stems instead of word strings.
|
||||
:param relation: If true, then return tuples of (stem, index,
|
||||
dependent_index)
|
||||
:param strip_space: If true, then strip trailing spaces from word
|
||||
tokens. Otherwise, leave the spaces on the tokens.
|
||||
:param replace: If true, then use the replaced (intended) word instead
|
||||
of the original word (e.g., 'wat' will be replaced with 'watch')
|
||||
"""
|
||||
sent = None
|
||||
pos = True
|
||||
if not self._lazy:
|
||||
return [
|
||||
self._get_words(
|
||||
fileid, speaker, sent, stem, relation, pos, strip_space, replace
|
||||
)
|
||||
for fileid in self.abspaths(fileids)
|
||||
]
|
||||
|
||||
get_words = lambda fileid: self._get_words(
|
||||
fileid, speaker, sent, stem, relation, pos, strip_space, replace
|
||||
)
|
||||
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
|
||||
|
||||
def sents(
|
||||
self,
|
||||
fileids=None,
|
||||
speaker="ALL",
|
||||
stem=False,
|
||||
relation=None,
|
||||
strip_space=True,
|
||||
replace=False,
|
||||
):
|
||||
"""
|
||||
:return: the given file(s) as a list of sentences or utterances, each
|
||||
encoded as a list of word strings.
|
||||
:rtype: list(list(str))
|
||||
|
||||
:param speaker: If specified, select specific speaker(s) defined
|
||||
in the corpus. Default is 'ALL' (all participants). Common choices
|
||||
are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
|
||||
researchers)
|
||||
:param stem: If true, then use word stems instead of word strings.
|
||||
:param relation: If true, then return tuples of ``(str,pos,relation_list)``.
|
||||
If there is manually-annotated relation info, it will return
|
||||
tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
|
||||
:param strip_space: If true, then strip trailing spaces from word
|
||||
tokens. Otherwise, leave the spaces on the tokens.
|
||||
:param replace: If true, then use the replaced (intended) word instead
|
||||
of the original word (e.g., 'wat' will be replaced with 'watch')
|
||||
"""
|
||||
sent = True
|
||||
pos = False
|
||||
if not self._lazy:
|
||||
return [
|
||||
self._get_words(
|
||||
fileid, speaker, sent, stem, relation, pos, strip_space, replace
|
||||
)
|
||||
for fileid in self.abspaths(fileids)
|
||||
]
|
||||
|
||||
get_words = lambda fileid: self._get_words(
|
||||
fileid, speaker, sent, stem, relation, pos, strip_space, replace
|
||||
)
|
||||
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
|
||||
|
||||
def tagged_sents(
|
||||
self,
|
||||
fileids=None,
|
||||
speaker="ALL",
|
||||
stem=False,
|
||||
relation=None,
|
||||
strip_space=True,
|
||||
replace=False,
|
||||
):
|
||||
"""
|
||||
:return: the given file(s) as a list of
|
||||
sentences, each encoded as a list of ``(word,tag)`` tuples.
|
||||
:rtype: list(list(tuple(str,str)))
|
||||
|
||||
:param speaker: If specified, select specific speaker(s) defined
|
||||
in the corpus. Default is 'ALL' (all participants). Common choices
|
||||
are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
|
||||
researchers)
|
||||
:param stem: If true, then use word stems instead of word strings.
|
||||
:param relation: If true, then return tuples of ``(str,pos,relation_list)``.
|
||||
If there is manually-annotated relation info, it will return
|
||||
tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
|
||||
:param strip_space: If true, then strip trailing spaces from word
|
||||
tokens. Otherwise, leave the spaces on the tokens.
|
||||
:param replace: If true, then use the replaced (intended) word instead
|
||||
of the original word (e.g., 'wat' will be replaced with 'watch')
|
||||
"""
|
||||
sent = True
|
||||
pos = True
|
||||
if not self._lazy:
|
||||
return [
|
||||
self._get_words(
|
||||
fileid, speaker, sent, stem, relation, pos, strip_space, replace
|
||||
)
|
||||
for fileid in self.abspaths(fileids)
|
||||
]
|
||||
|
||||
get_words = lambda fileid: self._get_words(
|
||||
fileid, speaker, sent, stem, relation, pos, strip_space, replace
|
||||
)
|
||||
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
|
||||
|
||||
    def corpus(self, fileids=None):
        """
        :return: the given file(s) as a dict of ``(corpus_property_key, value)``
        :rtype: list(dict)
        """
        if not self._lazy:
            # Eager mode: parse every file now and return a concrete list.
            return [self._get_corpus(fileid) for fileid in self.abspaths(fileids)]
        # Lazy mode: each file is parsed only when its entry is accessed.
        return LazyMap(self._get_corpus, self.abspaths(fileids))
|
||||
|
||||
def _get_corpus(self, fileid):
|
||||
results = dict()
|
||||
xmldoc = ElementTree.parse(fileid).getroot()
|
||||
for key, value in xmldoc.items():
|
||||
results[key] = value
|
||||
return results
|
||||
|
||||
    def participants(self, fileids=None):
        """
        :return: the given file(s) as a dict of
            ``(participant_property_key, value)``
        :rtype: list(dict)
        """
        if not self._lazy:
            # Eager mode: parse every file now and return a concrete list.
            return [self._get_participants(fileid) for fileid in self.abspaths(fileids)]
        # Lazy mode: each file is parsed only when its entry is accessed.
        return LazyMap(self._get_participants, self.abspaths(fileids))
|
||||
|
||||
    def _get_participants(self, fileid):
        """Parse *fileid* and return participant attributes as a nested
        mapping keyed by participant id, e.g. ``result['CHI']['role']``."""

        # multidimensional dicts: missing keys auto-create nested defaultdicts
        def dictOfDicts():
            return defaultdict(dictOfDicts)

        xmldoc = ElementTree.parse(fileid).getroot()
        # getting participants' data
        pat = dictOfDicts()
        for participant in xmldoc.findall(
            f".//{{{NS}}}Participants/{{{NS}}}participant"
        ):
            for key, value in participant.items():
                pat[participant.get("id")][key] = value
        return pat
|
||||
|
||||
def age(self, fileids=None, speaker="CHI", month=False):
|
||||
"""
|
||||
:return: the given file(s) as string or int
|
||||
:rtype: list or int
|
||||
|
||||
:param month: If true, return months instead of year-month-date
|
||||
"""
|
||||
if not self._lazy:
|
||||
return [
|
||||
self._get_age(fileid, speaker, month)
|
||||
for fileid in self.abspaths(fileids)
|
||||
]
|
||||
get_age = lambda fileid: self._get_age(fileid, speaker, month)
|
||||
return LazyMap(get_age, self.abspaths(fileids))
|
||||
|
||||
    def _get_age(self, fileid, speaker, month):
        """Parse *fileid* and return *speaker*'s ``age`` attribute, converted
        to months when *month* is true. Returns None implicitly when the
        speaker is not found, or explicitly when the age data is malformed."""
        xmldoc = ElementTree.parse(fileid).getroot()
        for pat in xmldoc.findall(f".//{{{NS}}}Participants/{{{NS}}}participant"):
            try:
                if pat.get("id") == speaker:
                    age = pat.get("age")
                    if month:
                        age = self.convert_age(age)
                    return age
            # some files don't have age data
            except (TypeError, AttributeError) as e:
                return None
|
||||
|
||||
def convert_age(self, age_year):
|
||||
"Caclculate age in months from a string in CHILDES format"
|
||||
m = re.match(r"P(\d+)Y(\d+)M?(\d?\d?)D?", age_year)
|
||||
age_month = int(m.group(1)) * 12 + int(m.group(2))
|
||||
try:
|
||||
if int(m.group(3)) > 15:
|
||||
age_month += 1
|
||||
# some corpora don't have age information?
|
||||
except ValueError as e:
|
||||
pass
|
||||
return age_month
|
||||
|
||||
def MLU(self, fileids=None, speaker="CHI"):
|
||||
"""
|
||||
:return: the given file(s) as a floating number
|
||||
:rtype: list(float)
|
||||
"""
|
||||
if not self._lazy:
|
||||
return [
|
||||
self._getMLU(fileid, speaker=speaker)
|
||||
for fileid in self.abspaths(fileids)
|
||||
]
|
||||
get_MLU = lambda fileid: self._getMLU(fileid, speaker=speaker)
|
||||
return LazyMap(get_MLU, self.abspaths(fileids))
|
||||
|
||||
    def _getMLU(self, fileid, speaker):
        """Compute the mean length of utterance (in morphemes) for *speaker*
        in *fileid*, excluding unintelligible, empty, and repeated sentences,
        and discounting filler tokens."""
        sents = self._get_words(
            fileid,
            speaker=speaker,
            sent=True,
            stem=True,
            relation=False,
            pos=True,
            strip_space=True,
            replace=True,
        )
        results = []
        lastSent = []
        numFillers = 0
        sentDiscount = 0
        for sent in sents:
            posList = [pos for (word, pos) in sent]
            # if any part of the sentence is unintelligible
            if any(pos == "unk" for pos in posList):
                continue
            # if the sentence is null
            elif sent == []:
                continue
            # if the sentence is the same as the last sent
            elif sent == lastSent:
                continue
            else:
                results.append([word for (word, pos) in sent])
                # count number of fillers ('co' = communicator, or untagged)
                if len({"co", None}.intersection(posList)) > 0:
                    numFillers += posList.count("co")
                    numFillers += posList.count(None)
                    # a sentence containing fillers is discounted from the count
                    sentDiscount += 1
            lastSent = sent
        try:
            thisWordList = flatten(results)
            # count number of morphemes
            # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
            numWords = (
                len(flatten([word.split("-") for word in thisWordList])) - numFillers
            )
            numSents = len(results) - sentDiscount
            mlu = numWords / numSents
        except ZeroDivisionError:
            # no countable sentences
            mlu = 0
        # return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents}
        return mlu
|
||||
|
||||
    def _get_words(
        self, fileid, speaker, sent, stem, relation, pos, strip_space, replace
    ):
        """Parse one XML file and extract its word tokens.

        Shared worker behind ``words``/``sents``/``tagged_words``/
        ``tagged_sents``: the boolean flags select between flat word lists
        and sentence-grouped lists, bare strings and (word, tag) tuples,
        and optionally attach dependency-relation info.
        """
        if (
            isinstance(speaker, str) and speaker != "ALL"
        ):  # ensure we have a list of speakers
            speaker = [speaker]
        xmldoc = ElementTree.parse(fileid).getroot()
        # processing each xml doc
        results = []
        for xmlsent in xmldoc.findall(".//{%s}u" % NS):
            sents = []
            # select speakers
            if speaker == "ALL" or xmlsent.get("who") in speaker:
                for xmlword in xmlsent.findall(".//{%s}w" % NS):
                    infl = None  # NOTE(review): assigned but never read
                    suffixStem = None
                    suffixTag = None
                    # getting replaced words
                    # NOTE(review): the find() is on the whole utterance, so
                    # every word in a sentence with a replacement appears to
                    # be substituted by the first replacement — verify intent.
                    if replace and xmlsent.find(f".//{{{NS}}}w/{{{NS}}}replacement"):
                        xmlword = xmlsent.find(
                            f".//{{{NS}}}w/{{{NS}}}replacement/{{{NS}}}w"
                        )
                    elif replace and xmlsent.find(f".//{{{NS}}}w/{{{NS}}}wk"):
                        xmlword = xmlsent.find(f".//{{{NS}}}w/{{{NS}}}wk")
                    # get text
                    if xmlword.text:
                        word = xmlword.text
                    else:
                        word = ""
                    # strip tailing space
                    if strip_space:
                        word = word.strip()
                    # stem
                    if relation or stem:
                        try:
                            xmlstem = xmlword.find(".//{%s}stem" % NS)
                            word = xmlstem.text
                        except AttributeError as e:
                            # no <stem> element: keep the surface form
                            pass
                        # if there is an inflection
                        try:
                            xmlinfl = xmlword.find(
                                f".//{{{NS}}}mor/{{{NS}}}mw/{{{NS}}}mk"
                            )
                            word += "-" + xmlinfl.text
                        except:  # NOTE(review): bare except silently skips
                            pass
                        # if there is a suffix
                        try:
                            xmlsuffix = xmlword.find(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem"
                                % (NS, NS, NS, NS)
                            )
                            suffixStem = xmlsuffix.text
                        except AttributeError:
                            suffixStem = ""
                        if suffixStem:
                            word += "~" + suffixStem
                    # pos
                    if relation or pos:
                        try:
                            xmlpos = xmlword.findall(".//{%s}c" % NS)
                            xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                            if xmlpos2 != []:
                                tag = xmlpos[0].text + ":" + xmlpos2[0].text
                            else:
                                tag = xmlpos[0].text
                        except (AttributeError, IndexError) as e:
                            tag = ""
                        try:
                            xmlsuffixpos = xmlword.findall(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c"
                                % (NS, NS, NS, NS, NS)
                            )
                            xmlsuffixpos2 = xmlword.findall(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s"
                                % (NS, NS, NS, NS, NS)
                            )
                            if xmlsuffixpos2:
                                suffixTag = (
                                    xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text
                                )
                            else:
                                suffixTag = xmlsuffixpos[0].text
                        except:  # NOTE(review): bare except silently skips
                            pass
                        if suffixTag:
                            tag += "~" + suffixTag
                        word = (word, tag)
                    # relational
                    # the gold standard is stored in
                    # <mor></mor><mor type="trn"><gra type="grt">
                    if relation == True:
                        for xmlstem_rel in xmlword.findall(
                            f".//{{{NS}}}mor/{{{NS}}}gra"
                        ):
                            if not xmlstem_rel.get("type") == "grt":
                                # automatic annotation: (word, tag, "idx|head|rel")
                                word = (
                                    word[0],
                                    word[1],
                                    xmlstem_rel.get("index")
                                    + "|"
                                    + xmlstem_rel.get("head")
                                    + "|"
                                    + xmlstem_rel.get("relation"),
                                )
                            else:
                                # gold annotation appended after the test one
                                word = (
                                    word[0],
                                    word[1],
                                    word[2],
                                    word[0],
                                    word[1],
                                    xmlstem_rel.get("index")
                                    + "|"
                                    + xmlstem_rel.get("head")
                                    + "|"
                                    + xmlstem_rel.get("relation"),
                                )
                        try:
                            for xmlpost_rel in xmlword.findall(
                                f".//{{{NS}}}mor/{{{NS}}}mor-post/{{{NS}}}gra"
                            ):
                                if not xmlpost_rel.get("type") == "grt":
                                    suffixStem = (
                                        suffixStem[0],
                                        suffixStem[1],
                                        xmlpost_rel.get("index")
                                        + "|"
                                        + xmlpost_rel.get("head")
                                        + "|"
                                        + xmlpost_rel.get("relation"),
                                    )
                                else:
                                    suffixStem = (
                                        suffixStem[0],
                                        suffixStem[1],
                                        suffixStem[2],
                                        suffixStem[0],
                                        suffixStem[1],
                                        xmlpost_rel.get("index")
                                        + "|"
                                        + xmlpost_rel.get("head")
                                        + "|"
                                        + xmlpost_rel.get("relation"),
                                    )
                        except:  # NOTE(review): bare except silently skips
                            pass
                    sents.append(word)
                if sent or relation:
                    results.append(sents)
                else:
                    results.extend(sents)
        return LazyMap(lambda x: x, results)
|
||||
|
||||
    # Ready-to-use browser opener

    # NOTE(review): this bare string is a no-op expression at class-definition
    # time; it documents the attribute below.
    """
    The base URL for viewing files on the childes website. This
    shouldn't need to be changed, unless CHILDES changes the configuration
    of their server or unless the user sets up their own corpus webserver.
    """
    childes_url_base = r"https://childes.talkbank.org/browser/index.php?url="
|
||||
|
||||
def webview_file(self, fileid, urlbase=None):
|
||||
"""Map a corpus file to its web version on the CHILDES website,
|
||||
and open it in a web browser.
|
||||
|
||||
The complete URL to be used is:
|
||||
childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha')
|
||||
|
||||
If no urlbase is passed, we try to calculate it. This
|
||||
requires that the childes corpus was set up to mirror the
|
||||
folder hierarchy under childes.psy.cmu.edu/data-xml/, e.g.:
|
||||
nltk_data/corpora/childes/Eng-USA/Cornell/??? or
|
||||
nltk_data/corpora/childes/Romance/Spanish/Aguirre/???
|
||||
|
||||
The function first looks (as a special case) if "Eng-USA" is
|
||||
on the path consisting of <corpus root>+fileid; then if
|
||||
"childes", possibly followed by "data-xml", appears. If neither
|
||||
one is found, we use the unmodified fileid and hope for the best.
|
||||
If this is not right, specify urlbase explicitly, e.g., if the
|
||||
corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'.
|
||||
"""
|
||||
|
||||
import webbrowser
|
||||
|
||||
if urlbase:
|
||||
path = urlbase + "/" + fileid
|
||||
else:
|
||||
full = self.root + "/" + fileid
|
||||
full = re.sub(r"\\", "/", full)
|
||||
if "/childes/" in full.lower():
|
||||
# Discard /data-xml/ if present
|
||||
path = re.findall(r"(?i)/childes(?:/data-xml)?/(.*)\.xml", full)[0]
|
||||
elif "eng-usa" in full.lower():
|
||||
path = "Eng-USA/" + re.findall(r"/(?i)Eng-USA/(.*)\.xml", full)[0]
|
||||
else:
|
||||
path = fileid
|
||||
|
||||
# Strip ".xml" and add ".cha", as necessary:
|
||||
if path.endswith(".xml"):
|
||||
path = path[:-4]
|
||||
|
||||
if not path.endswith(".cha"):
|
||||
path = path + ".cha"
|
||||
|
||||
url = self.childes_url_base + path
|
||||
|
||||
webbrowser.open_new_tab(url)
|
||||
print("Opening in browser:", url)
|
||||
# Pausing is a good idea, but it's up to the user...
|
||||
# raw_input("Hit Return to continue")
|
||||
|
||||
|
||||
def demo(corpus_root=None):
    """
    The CHILDES corpus should be manually downloaded and saved
    to ``[NLTK_Data_Dir]/corpora/childes/``
    """
    if not corpus_root:
        from nltk.data import find

        corpus_root = find("corpora/childes/data-xml/Eng-USA/")

    try:
        childes = CHILDESCorpusReader(corpus_root, ".*.xml")
        # describe all corpus
        for file in childes.fileids()[:5]:
            corpus = ""
            corpus_id = ""
            # Pull the Corpus/Id properties from the file's root attributes.
            for key, value in childes.corpus(file)[0].items():
                if key == "Corpus":
                    corpus = value
                if key == "Id":
                    corpus_id = value
            print("Reading", corpus, corpus_id, " .....")
            print("words:", childes.words(file)[:7], "...")
            print(
                "words with replaced words:",
                childes.words(file, replace=True)[:7],
                " ...",
            )
            print("words with pos tags:", childes.tagged_words(file)[:7], " ...")
            print("words (only MOT):", childes.words(file, speaker="MOT")[:7], "...")
            print("words (only CHI):", childes.words(file, speaker="CHI")[:7], "...")
            print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
            print(
                "words with relations and pos-tag:",
                childes.words(file, relation=True)[:5],
                " ...",
            )
            print("sentence:", childes.sents(file)[:2], " ...")
            for participant, values in childes.participants(file)[0].items():
                for key, value in values.items():
                    print("\tparticipant", participant, key, ":", value)
            print("num of sent:", len(childes.sents(file)))
            print("num of morphemes:", len(childes.words(file, stem=True)))
            print("age:", childes.age(file))
            print("age in month:", childes.age(file, month=True))
            print("MLU:", childes.MLU(file))
            print()

    except LookupError as e:
        # The corpus is not auto-downloadable; tell the user how to get it.
        print(
            """The CHILDES corpus, or the parts you need, should be manually
        downloaded from https://childes.talkbank.org/data-xml/ and saved at
        [NLTK_Data_Dir]/corpora/childes/
        Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
        demo('/path/to/childes/data-xml/Eng-USA/")
        """
        )
        # corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
        # corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
        ##this fails
        # childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the demo against a locally-installed copy of the CHILDES corpus.
    demo()
|
||||
273
backend/venv/Lib/site-packages/nltk/corpus/reader/chunked.py
Normal file
273
backend/venv/Lib/site-packages/nltk/corpus/reader/chunked.py
Normal file
@@ -0,0 +1,273 @@
|
||||
# Natural Language Toolkit: Chunked Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A reader for corpora that contain chunked (and optionally tagged)
|
||||
documents.
|
||||
"""
|
||||
|
||||
import codecs
|
||||
import os.path
|
||||
|
||||
import nltk
|
||||
from nltk.chunk import tagstr2tree
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.tokenize import *
|
||||
from nltk.tree import Tree
|
||||
|
||||
|
||||
class ChunkedCorpusReader(CorpusReader):
    """
    Reader for chunked (and optionally tagged) corpora. Paragraphs
    are split using a block reader. They are then tokenized into
    sentences using a sentence tokenizer. Finally, these sentences
    are parsed into chunk trees using a string-to-chunktree conversion
    function. Each of these steps can be performed using a default
    function or a custom function. By default, paragraphs are split
    on blank lines; sentences are listed one per line; and sentences
    are parsed into chunk trees using ``nltk.chunk.tagstr2tree``.
    """

    def __init__(
        self,
        root,
        fileids,
        extension="",  # NOTE(review): appears unused; kept for compatibility
        str2chunktree=tagstr2tree,
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
        encoding="utf8",
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader, tagset)
        """Arguments for corpus views generated by this corpus: a tuple
        (str2chunktree, sent_tokenizer, para_block_tokenizer)"""

    # The positional flags passed to ChunkedCorpusView below are, in order:
    # tagged, group_by_sent, group_by_para, chunked.

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                ChunkedCorpusView(f, enc, 0, 0, 0, 0, *self._cv_args)
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return concat(
            [
                ChunkedCorpusView(f, enc, 0, 1, 0, 0, *self._cv_args)
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        return concat(
            [
                ChunkedCorpusView(f, enc, 0, 1, 1, 0, *self._cv_args)
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def tagged_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))
        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 0, 0, 0, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def tagged_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.

        :rtype: list(list(tuple(str,str)))
        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 1, 0, 0, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def tagged_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of ``(word,tag)`` tuples.
        :rtype: list(list(list(tuple(str,str))))
        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 1, 1, 0, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def chunked_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and chunks. Words are encoded as ``(word, tag)``
            tuples (if the corpus has tags) or word strings (if the
            corpus has no tags). Chunks are encoded as depth-one
            trees over ``(word,tag)`` tuples or word strings.
        :rtype: list(tuple(str,str) and Tree)
        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 0, 0, 1, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def chunked_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a shallow Tree. The leaves
            of these trees are encoded as ``(word, tag)`` tuples (if
            the corpus has tags) or word strings (if the corpus has no
            tags).
        :rtype: list(Tree)
        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 1, 0, 1, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def chunked_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as a shallow Tree. The leaves of these
            trees are encoded as ``(word, tag)`` tuples (if the corpus
            has tags) or word strings (if the corpus has no tags).
        :rtype: list(list(Tree))
        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 1, 1, 1, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def _read_block(self, stream):
        # Fallback block reader: one chunk tree per blank-line-separated block.
        return [tagstr2tree(t) for t in read_blankline_block(stream)]
|
||||
|
||||
|
||||
class ChunkedCorpusView(StreamBackedCorpusView):
    """A stream-backed view of a chunked corpus file. Reads paragraph
    blocks, tokenizes them into sentences, parses each sentence into a
    chunk tree, and then flattens tags, chunks, sentence grouping, or
    paragraph grouping according to the constructor flags."""

    def __init__(
        self,
        fileid,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        chunked,
        str2chunktree,
        sent_tokenizer,
        para_block_reader,
        source_tagset=None,
        target_tagset=None,
    ):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
        self._tagged = tagged  # keep (word, tag) tuples?
        self._group_by_sent = group_by_sent  # group tokens into sentences?
        self._group_by_para = group_by_para  # group sentences into paragraphs?
        self._chunked = chunked  # keep chunk Tree structure?
        self._str2chunktree = str2chunktree
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._source_tagset = source_tagset
        self._target_tagset = target_tagset

    def read_block(self, stream):
        """Read one paragraph block and convert it per the view's flags."""
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                sent = self._str2chunktree(
                    sent_str,
                    source_tagset=self._source_tagset,
                    target_tagset=self._target_tagset,
                )

                # If requested, throw away the tags.
                if not self._tagged:
                    sent = self._untag(sent)

                # If requested, throw away the chunks.
                if not self._chunked:
                    sent = sent.leaves()

                # Add the sentence to `para`.
                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)

            # Add the paragraph to `block`.
            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)

        # Return the block
        return block

    def _untag(self, tree):
        """Recursively replace each (word, tag) leaf of *tree* with the
        bare word, mutating the tree in place."""
        for i, child in enumerate(tree):
            if isinstance(child, Tree):
                self._untag(child)
            elif isinstance(child, tuple):
                tree[i] = child[0]
            else:
                raise ValueError("expected child to be Tree or tuple")
        return tree
|
||||
88
backend/venv/Lib/site-packages/nltk/corpus/reader/cmudict.py
Normal file
88
backend/venv/Lib/site-packages/nltk/corpus/reader/cmudict.py
Normal file
@@ -0,0 +1,88 @@
|
||||
# Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6]
|
||||
ftp://ftp.cs.cmu.edu/project/speech/dict/
|
||||
Copyright 1998 Carnegie Mellon University
|
||||
|
||||
File Format: Each line consists of an uppercased word, a counter
|
||||
(for alternative pronunciations), and a transcription. Vowels are
|
||||
marked for stress (1=primary, 2=secondary, 0=no stress). E.g.:
|
||||
NATURAL 1 N AE1 CH ER0 AH0 L
|
||||
|
||||
The dictionary contains 127069 entries. Of these, 119400 words are assigned
|
||||
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
|
||||
three or more pronunciations. Many of these are fast-speech variants.
|
||||
|
||||
Phonemes: There are 39 phonemes, as shown below:
|
||||
|
||||
Phoneme Example Translation Phoneme Example Translation
|
||||
------- ------- ----------- ------- ------- -----------
|
||||
AA odd AA D AE at AE T
|
||||
AH hut HH AH T AO ought AO T
|
||||
AW cow K AW AY hide HH AY D
|
||||
B be B IY CH cheese CH IY Z
|
||||
D dee D IY DH thee DH IY
|
||||
EH Ed EH D ER hurt HH ER T
|
||||
EY ate EY T F fee F IY
|
||||
G green G R IY N HH he HH IY
|
||||
IH it IH T IY eat IY T
|
||||
JH gee JH IY K key K IY
|
||||
L lee L IY M me M IY
|
||||
N knee N IY NG ping P IH NG
|
||||
OW oat OW T OY toy T OY
|
||||
P pee P IY R read R IY D
|
||||
S sea S IY SH she SH IY
|
||||
T tea T IY TH theta TH EY T AH
|
||||
UH hood HH UH D UW two T UW
|
||||
V vee V IY W we W IY
|
||||
Y yield Y IY L D Z zee Z IY
|
||||
ZH seizure S IY ZH ER
|
||||
"""
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.util import Index
|
||||
|
||||
|
||||
class CMUDictCorpusReader(CorpusReader):
    """Corpus reader for the CMU Pronouncing Dictionary (cmudict)."""

    def entries(self):
        """
        :return: the cmudict lexicon as a list of entries
            containing (word, transcriptions) tuples.
        """
        views = [
            StreamBackedCorpusView(path, read_cmudict_block, encoding=enc)
            for path, enc in self.abspaths(None, True)
        ]
        return concat(views)

    def words(self):
        """
        :return: a list of all words defined in the cmudict lexicon.
        """
        return [entry[0].lower() for entry in self.entries()]

    def dict(self):
        """
        :return: the cmudict lexicon as a dictionary, whose keys are
            lowercase words and whose values are lists of pronunciations.
        """
        # Index groups the (word, pron) pairs by word; convert the
        # resulting mapping into a plain builtin dict.
        indexed = Index(self.entries())
        return {word: prons for word, prons in indexed.items()}
|
||||
|
||||
|
||||
def read_cmudict_block(stream):
    """Read up to 100 cmudict entries from *stream*.

    Each line has the form ``WORD COUNTER PHONEME...``; the word is
    lowercased and the counter is discarded, yielding
    ``(word, transcription)`` tuples. Returns a short (or empty) list
    at end of file.
    """
    block = []
    for _ in range(100):  # Read at most 100 entries per block.
        line = stream.readline()
        if line == "":
            break  # end of file.
        fields = line.split()
        block.append((fields[0].lower(), fields[2:]))
    return block
|
||||
@@ -0,0 +1,309 @@
|
||||
# Natural Language Toolkit: Comparative Sentence Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
CorpusReader for the Comparative Sentence Dataset.
|
||||
|
||||
- Comparative Sentence Dataset information -
|
||||
|
||||
Annotated by: Nitin Jindal and Bing Liu, 2006.
|
||||
Department of Computer Science
|
||||
University of Illinois at Chicago
|
||||
|
||||
Contact: Nitin Jindal, njindal@cs.uic.edu
|
||||
Bing Liu, liub@cs.uic.edu (https://www.cs.uic.edu/~liub)
|
||||
|
||||
Distributed with permission.
|
||||
|
||||
Related papers:
|
||||
|
||||
- Nitin Jindal and Bing Liu. "Identifying Comparative Sentences in Text Documents".
|
||||
Proceedings of the ACM SIGIR International Conference on Information Retrieval
|
||||
(SIGIR-06), 2006.
|
||||
|
||||
- Nitin Jindal and Bing Liu. "Mining Comparative Sentences and Relations".
|
||||
Proceedings of Twenty First National Conference on Artificial Intelligence
|
||||
(AAAI-2006), 2006.
|
||||
|
||||
- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
|
||||
Proceedings of the 22nd International Conference on Computational Linguistics
|
||||
(Coling-2008), Manchester, 18-22 August, 2008.
|
||||
"""
|
||||
import re
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.tokenize import *
|
||||
|
||||
# Regular expressions for dataset components
|
||||
STARS = re.compile(r"^\*+$")
|
||||
COMPARISON = re.compile(r"<cs-[1234]>")
|
||||
CLOSE_COMPARISON = re.compile(r"</cs-[1234]>")
|
||||
GRAD_COMPARISON = re.compile(r"<cs-[123]>")
|
||||
NON_GRAD_COMPARISON = re.compile(r"<cs-4>")
|
||||
ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)")
|
||||
KEYWORD = re.compile(r"\(([^\(]*)\)$")
|
||||
|
||||
|
||||
class Comparison:
    """
    A Comparison represents a comparative sentence and its constituents.
    """

    def __init__(
        self,
        text=None,
        comp_type=None,
        entity_1=None,
        entity_2=None,
        feature=None,
        keyword=None,
    ):
        """
        :param text: a string (optionally tokenized) containing a comparison.
        :param comp_type: an integer defining the type of comparison expressed.
            Values can be: 1 (Non-equal gradable), 2 (Equative), 3 (Superlative),
            4 (Non-gradable).
        :param entity_1: the first entity considered in the comparison relation.
        :param entity_2: the second entity considered in the comparison relation.
        :param feature: the feature considered in the comparison relation.
        :param keyword: the word or phrase which is used for that comparative relation.
        """
        # Store every constituent of the comparison on the instance.
        for name, value in zip(
            ("text", "comp_type", "entity_1", "entity_2", "feature", "keyword"),
            (text, comp_type, entity_1, entity_2, feature, keyword),
        ):
            setattr(self, name, value)

    def __repr__(self):
        return (
            f'Comparison(text="{self.text}", comp_type={self.comp_type}, '
            f'entity_1="{self.entity_1}", entity_2="{self.entity_2}", '
            f'feature="{self.feature}", keyword="{self.keyword}")'
        )
|
||||
|
||||
|
||||
class ComparativeSentencesCorpusReader(CorpusReader):
    """
    Reader for the Comparative Sentence Dataset by Jindal and Liu (2006).

    >>> from nltk.corpus import comparative_sentences
    >>> comparison = comparative_sentences.comparisons()[0]
    >>> comparison.text # doctest: +NORMALIZE_WHITESPACE
    ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly',
    'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve",
    'had', '.']
    >>> comparison.entity_2
    'models'
    >>> (comparison.feature, comparison.keyword)
    ('rewind', 'more')
    >>> len(comparative_sentences.comparisons())
    853
    """

    # Corpus view class used by this reader; streams file contents lazily.
    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: a list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param sent_tokenizer: tokenizer for breaking paragraphs into sentences.
        :param encoding: the encoding that should be used to read the corpus.
        """

        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._readme = "README.txt"

    def comparisons(self, fileids=None):
        """
        Return all comparisons in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            comparisons have to be returned.
        :return: the given file(s) as a list of Comparison objects.
        :rtype: list(Comparison)
        """
        # Normalize fileids: default to every file, accept a single string.
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_comparison_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def keywords(self, fileids=None):
        """
        Return a set of all keywords used in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            keywords have to be returned.
        :return: the set of keywords and comparative phrases used in the corpus.
        :rtype: set(str)
        """
        all_keywords = concat(
            [
                self.CorpusView(path, self._read_keyword_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

        # Non-gradable comparisons have keyword=None; drop them here.
        keywords_set = {keyword.lower() for keyword in all_keywords if keyword}
        return keywords_set

    def keywords_readme(self):
        """
        Return the list of words and constituents considered as clues of a
        comparison (from listOfkeywords.txt).
        """
        keywords = []
        with self.open("listOfkeywords.txt") as fp:
            raw_text = fp.read()
        # Skip blank lines and '//' comment lines in the keyword file.
        for line in raw_text.split("\n"):
            if not line or line.startswith("//"):
                continue
            keywords.append(line.strip())
        return keywords

    def sents(self, fileids=None):
        """
        Return all sentences in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: all sentences of the corpus as lists of tokens (or as plain
            strings, if no word tokenizer is specified).
        :rtype: list(list(str)) or list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None):
        """
        Return all words and punctuation symbols in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_comparison_block(self, stream):
        """Read one annotated comparison (three+ consecutive lines) from
        *stream* and return it as a list of Comparison objects.

        NOTE: this reader is stateful — it consumes the tag line, the
        sentence line, the closing-tag line and (for gradable
        comparisons) one relation line per tag, in that exact order.
        """
        while True:
            line = stream.readline()
            if not line:
                return []  # end of file.
            comparison_tags = re.findall(COMPARISON, line)
            if comparison_tags:
                grad_comparisons = re.findall(GRAD_COMPARISON, line)
                non_grad_comparisons = re.findall(NON_GRAD_COMPARISON, line)
                # Advance to the next line (it contains the comparative sentence)
                comparison_text = stream.readline().strip()
                if self._word_tokenizer:
                    comparison_text = self._word_tokenizer.tokenize(comparison_text)
                # Skip the next line (it contains closing comparison tags)
                stream.readline()
                # If gradable comparisons are found, create Comparison instances
                # and populate their fields
                comparison_bundle = []
                if grad_comparisons:
                    # Each comparison tag has its own relations on a separate line
                    for comp in grad_comparisons:
                        comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
                        comparison = Comparison(
                            text=comparison_text, comp_type=comp_type
                        )
                        line = stream.readline()
                        entities_feats = ENTITIES_FEATS.findall(line)
                        if entities_feats:
                            # Codes: 1/2 = compared entities, 3 = feature.
                            for code, entity_feat in entities_feats:
                                if code == "1":
                                    comparison.entity_1 = entity_feat.strip()
                                elif code == "2":
                                    comparison.entity_2 = entity_feat.strip()
                                elif code == "3":
                                    comparison.feature = entity_feat.strip()
                        keyword = KEYWORD.findall(line)
                        if keyword:
                            comparison.keyword = keyword[0]
                        comparison_bundle.append(comparison)
                # If non-gradable comparisons are found, create a simple Comparison
                # instance for each one
                if non_grad_comparisons:
                    for comp in non_grad_comparisons:
                        # comp_type in this case should always be 4.
                        comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
                        comparison = Comparison(
                            text=comparison_text, comp_type=comp_type
                        )
                        comparison_bundle.append(comparison)
                # Flatten the list of comparisons before returning them
                # return concat([comparison_bundle])
                return comparison_bundle

    def _read_keyword_block(self, stream):
        """Return the keywords of the next comparison block (may include
        ``None`` for non-gradable comparisons; filtered by keywords())."""
        keywords = []
        for comparison in self._read_comparison_block(stream):
            keywords.append(comparison.keyword)
        return keywords

    def _read_sent_block(self, stream):
        """Read and tokenize the next plain-sentence line from *stream*,
        skipping star-delimited header sections and annotation lines."""
        while True:
            line = stream.readline()
            # Skip sections delimited by rows of stars.
            if re.match(STARS, line):
                while True:
                    line = stream.readline()
                    if re.match(STARS, line):
                        break
                continue
            # Only plain text lines (no tags, relations or closing tags)
            # count as sentences.
            if (
                not re.findall(COMPARISON, line)
                and not ENTITIES_FEATS.findall(line)
                and not re.findall(CLOSE_COMPARISON, line)
            ):
                if self._sent_tokenizer:
                    return [
                        self._word_tokenizer.tokenize(sent)
                        for sent in self._sent_tokenizer.tokenize(line)
                    ]
                else:
                    return [self._word_tokenizer.tokenize(line)]

    def _read_word_block(self, stream):
        """Flatten the next sentence block into a list of tokens."""
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words
|
||||
579
backend/venv/Lib/site-packages/nltk/corpus/reader/conll.py
Normal file
579
backend/venv/Lib/site-packages/nltk/corpus/reader/conll.py
Normal file
@@ -0,0 +1,579 @@
|
||||
# Natural Language Toolkit: CONLL Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Read CoNLL-style chunk fileids.
|
||||
"""
|
||||
|
||||
import textwrap
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.tag import map_tag
|
||||
from nltk.tree import Tree
|
||||
from nltk.util import LazyConcatenation, LazyMap
|
||||
|
||||
|
||||
class ConllCorpusReader(CorpusReader):
    """
    A corpus reader for CoNLL-style files. These files consist of a
    series of sentences, separated by blank lines. Each sentence is
    encoded using a table (or "grid") of values, where each line
    corresponds to a single word, and each column corresponds to an
    annotation type. The set of columns used by CoNLL-style files can
    vary from corpus to corpus; the ``ConllCorpusReader`` constructor
    therefore takes an argument, ``columntypes``, which is used to
    specify the columns that are used by a given corpus. By default
    columns are split by consecutive whitespaces, with the
    ``separator`` argument you can set a string to split by (e.g.
    ``\'\t\'``).


    @todo: Add support for reading from corpora where different
        parallel files contain different columns.
    @todo: Possibly add caching of the grid corpus view? This would
        allow the same grid view to be used by different data access
        methods (eg words() and parsed_sents() could both share the
        same grid corpus view object).
    @todo: Better support for -DOCSTART-. Currently, we just ignore
        it, but it could be used to define methods that retrieve a
        document at a time (eg parsed_documents()).
    """

    # /////////////////////////////////////////////////////////////////
    # Column Types
    # /////////////////////////////////////////////////////////////////

    WORDS = "words"  #: column type for words
    POS = "pos"  #: column type for part-of-speech tags
    TREE = "tree"  #: column type for parse trees
    CHUNK = "chunk"  #: column type for chunk structures
    NE = "ne"  #: column type for named entities
    SRL = "srl"  #: column type for semantic role labels
    IGNORE = "ignore"  #: column type for column that should be ignored

    #: A list of all column types supported by the conll corpus reader.
    COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE)

    # /////////////////////////////////////////////////////////////////
    # Constructor
    # /////////////////////////////////////////////////////////////////

    def __init__(
        self,
        root,
        fileids,
        columntypes,
        chunk_types=None,
        root_label="S",
        pos_in_tree=False,
        srl_includes_roleset=True,
        encoding="utf8",
        tree_class=Tree,
        tagset=None,
        separator=None,
    ):
        """
        :param root: the root directory for this corpus.
        :param fileids: a list or regexp specifying the fileids in this corpus.
        :param columntypes: an ordered list of column types (drawn from
            ``COLUMN_TYPES``) describing each file's columns.
        :param chunk_types: the chunk types to keep (a string or list);
            other chunk types are treated as O.
        :param root_label: the node label for the root of chunk trees.
        :param pos_in_tree: whether POS tags are kept as subtrees in
            parsed sentences.
        :param srl_includes_roleset: whether the SRL columns include a
            roleset column before the predicate column.
        :param encoding: the encoding used to read the corpus files.
        :param tree_class: the class used to build parse trees.
        :param tagset: the native tagset of the corpus (for tag mapping).
        :param separator: the column separator (default: any whitespace).
        :raises ValueError: if a column type is not in ``COLUMN_TYPES``.
        """
        for columntype in columntypes:
            if columntype not in self.COLUMN_TYPES:
                raise ValueError("Bad column type %r" % columntype)
        if isinstance(chunk_types, str):
            chunk_types = [chunk_types]
        self._chunk_types = chunk_types
        # Map each column type to its column index.
        self._colmap = {c: i for (i, c) in enumerate(columntypes)}
        self._pos_in_tree = pos_in_tree
        self._root_label = root_label  # for chunks
        self._srl_includes_roleset = srl_includes_roleset
        self._tree_class = tree_class
        CorpusReader.__init__(self, root, fileids, encoding)
        self._tagset = tagset
        self.sep = separator

    # /////////////////////////////////////////////////////////////////
    # Data Access Methods
    # /////////////////////////////////////////////////////////////////

    def words(self, fileids=None):
        """Return the corpus as a flat list of word tokens."""
        self._require(self.WORDS)
        return LazyConcatenation(LazyMap(self._get_words, self._grids(fileids)))

    def sents(self, fileids=None):
        """Return the corpus as a list of sentences (lists of words)."""
        self._require(self.WORDS)
        return LazyMap(self._get_words, self._grids(fileids))

    def tagged_words(self, fileids=None, tagset=None):
        """Return the corpus as a flat list of (word, tag) tuples."""
        self._require(self.WORDS, self.POS)

        def get_tagged_words(grid):
            return self._get_tagged_words(grid, tagset)

        return LazyConcatenation(LazyMap(get_tagged_words, self._grids(fileids)))

    def tagged_sents(self, fileids=None, tagset=None):
        """Return the corpus as a list of sentences of (word, tag) tuples."""
        self._require(self.WORDS, self.POS)

        def get_tagged_words(grid):
            return self._get_tagged_words(grid, tagset)

        return LazyMap(get_tagged_words, self._grids(fileids))

    def chunked_words(self, fileids=None, chunk_types=None, tagset=None):
        """Return the corpus as a flat list of chunked word tokens."""
        self._require(self.WORDS, self.POS, self.CHUNK)
        if chunk_types is None:
            chunk_types = self._chunk_types

        def get_chunked_words(grid):  # capture chunk_types as local var
            return self._get_chunked_words(grid, chunk_types, tagset)

        return LazyConcatenation(LazyMap(get_chunked_words, self._grids(fileids)))

    def chunked_sents(self, fileids=None, chunk_types=None, tagset=None):
        """Return the corpus as a list of chunk trees, one per sentence."""
        self._require(self.WORDS, self.POS, self.CHUNK)
        if chunk_types is None:
            chunk_types = self._chunk_types

        def get_chunked_words(grid):  # capture chunk_types as local var
            return self._get_chunked_words(grid, chunk_types, tagset)

        return LazyMap(get_chunked_words, self._grids(fileids))

    def parsed_sents(self, fileids=None, pos_in_tree=None, tagset=None):
        """Return the corpus as a list of parse trees, one per sentence."""
        self._require(self.WORDS, self.POS, self.TREE)
        if pos_in_tree is None:
            pos_in_tree = self._pos_in_tree

        def get_parsed_sent(grid):  # capture pos_in_tree as local var
            return self._get_parsed_sent(grid, pos_in_tree, tagset)

        return LazyMap(get_parsed_sent, self._grids(fileids))

    def srl_spans(self, fileids=None):
        """Return the SRL span lists, one list of spanlists per sentence."""
        self._require(self.SRL)
        return LazyMap(self._get_srl_spans, self._grids(fileids))

    def srl_instances(self, fileids=None, pos_in_tree=None, flatten=True):
        """Return SRL instances, flattened across sentences by default."""
        self._require(self.WORDS, self.POS, self.TREE, self.SRL)
        if pos_in_tree is None:
            pos_in_tree = self._pos_in_tree

        def get_srl_instances(grid):  # capture pos_in_tree as local var
            return self._get_srl_instances(grid, pos_in_tree)

        result = LazyMap(get_srl_instances, self._grids(fileids))
        if flatten:
            result = LazyConcatenation(result)
        return result

    def iob_words(self, fileids=None, tagset=None):
        """
        :return: a list of word/tag/IOB tuples
        :rtype: list(tuple)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        """
        self._require(self.WORDS, self.POS, self.CHUNK)

        def get_iob_words(grid):
            return self._get_iob_words(grid, tagset)

        return LazyConcatenation(LazyMap(get_iob_words, self._grids(fileids)))

    def iob_sents(self, fileids=None, tagset=None):
        """
        :return: a list of lists of word/tag/IOB tuples
        :rtype: list(list)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        """
        self._require(self.WORDS, self.POS, self.CHUNK)

        def get_iob_words(grid):
            return self._get_iob_words(grid, tagset)

        return LazyMap(get_iob_words, self._grids(fileids))

    # /////////////////////////////////////////////////////////////////
    # Grid Reading
    # /////////////////////////////////////////////////////////////////

    def _grids(self, fileids=None):
        """Return a corpus view over the grids (one grid per sentence)."""
        # n.b.: we could cache the object returned here (keyed on
        # fileids), which would let us reuse the same corpus view for
        # different things (eg srl and parse trees).
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_grid_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def _read_grid_block(self, stream):
        """Read one blank-line-delimited block of grids from *stream*.

        :raises ValueError: if any grid row has an inconsistent number
            of columns.
        """
        grids = []
        for block in read_blankline_block(stream):
            block = block.strip()
            if not block:
                continue

            grid = [line.split(self.sep) for line in block.split("\n")]

            # If there's a docstart row, then discard. ([xx] eventually it
            # would be good to actually use it)
            if grid[0][self._colmap.get("words", 0)] == "-DOCSTART-":
                del grid[0]

            # Check that the grid is consistent.
            for row in grid:
                if len(row) != len(grid[0]):
                    raise ValueError("Inconsistent number of columns:\n%s" % block)
            grids.append(grid)
        return grids

    # /////////////////////////////////////////////////////////////////
    # Transforms
    # /////////////////////////////////////////////////////////////////
    # given a grid, transform it into some representation (e.g.,
    # a list of words or a parse tree).

    def _get_words(self, grid):
        """Return the word column of *grid* as a list."""
        return self._get_column(grid, self._colmap["words"])

    def _get_tagged_words(self, grid, tagset=None):
        """Return (word, tag) pairs, mapping tags to *tagset* if given."""
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        return list(zip(self._get_column(grid, self._colmap["words"]), pos_tags))

    def _get_iob_words(self, grid, tagset=None):
        """Return (word, tag, iob-chunk-tag) triples for *grid*."""
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        return list(
            zip(
                self._get_column(grid, self._colmap["words"]),
                pos_tags,
                self._get_column(grid, self._colmap["chunk"]),
            )
        )

    def _get_chunked_words(self, grid, chunk_types, tagset=None):
        """Build a depth-one chunk tree from the grid's IOB chunk column."""
        # n.b.: this method is very similar to conllstr2tree.
        words = self._get_column(grid, self._colmap["words"])
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        chunk_tags = self._get_column(grid, self._colmap["chunk"])

        # stack[0] is the root; stack[1] (if any) is the open chunk.
        stack = [Tree(self._root_label, [])]

        for word, pos_tag, chunk_tag in zip(words, pos_tags, chunk_tags):
            if chunk_tag == "O":
                state, chunk_type = "O", ""
            else:
                (state, chunk_type) = chunk_tag.split("-")
            # If it's a chunk we don't care about, treat it as O.
            if chunk_types is not None and chunk_type not in chunk_types:
                state = "O"
            # Treat a mismatching I like a B.
            if state == "I" and chunk_type != stack[-1].label():
                state = "B"
            # For B or I: close any open chunks
            if state in "BO" and len(stack) == 2:
                stack.pop()
            # For B: start a new chunk.
            if state == "B":
                new_chunk = Tree(chunk_type, [])
                stack[-1].append(new_chunk)
                stack.append(new_chunk)
            # Add the word token.
            stack[-1].append((word, pos_tag))

        return stack[0]

    def _get_parsed_sent(self, grid, pos_in_tree, tagset=None):
        """Reassemble the grid's bracketing column into a parse tree."""
        words = self._get_column(grid, self._colmap["words"])
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        parse_tags = self._get_column(grid, self._colmap["tree"])

        treestr = ""
        for word, pos_tag, parse_tag in zip(words, pos_tags, parse_tags):
            # Escape literal parentheses so they don't break the
            # bracketed tree string.
            if word == "(":
                word = "-LRB-"
            if word == ")":
                word = "-RRB-"
            if pos_tag == "(":
                pos_tag = "-LRB-"
            if pos_tag == ")":
                pos_tag = "-RRB-"
            (left, right) = parse_tag.split("*")
            right = right.count(")") * ")"  # only keep ')'.
            treestr += f"{left} ({pos_tag} {word}) {right}"
        try:
            tree = self._tree_class.fromstring(treestr)
        except (ValueError, IndexError):
            # Fall back to wrapping the whole string in a root node.
            tree = self._tree_class.fromstring(f"({self._root_label} {treestr})")

        if not pos_in_tree:
            # Replace (TAG word) subtrees with (word, TAG) leaf tuples.
            for subtree in tree.subtrees():
                for i, child in enumerate(subtree):
                    if (
                        isinstance(child, Tree)
                        and len(child) == 1
                        and isinstance(child[0], str)
                    ):
                        subtree[i] = (child[0], child.label())

        return tree

    def _get_srl_spans(self, grid):
        """
        list of list of ((start, end), tag) tuples
        """
        if self._srl_includes_roleset:
            predicates = self._get_column(grid, self._colmap["srl"] + 1)
            start_col = self._colmap["srl"] + 2
        else:
            predicates = self._get_column(grid, self._colmap["srl"])
            start_col = self._colmap["srl"] + 1

        # Count how many predicates there are. This tells us how many
        # columns to expect for SRL data.
        num_preds = len([p for p in predicates if p != "-"])

        spanlists = []
        for i in range(num_preds):
            col = self._get_column(grid, start_col + i)
            spanlist = []
            stack = []
            for wordnum, srl_tag in enumerate(col):
                (left, right) = srl_tag.split("*")
                for tag in left.split("("):
                    if tag:
                        stack.append((tag, wordnum))
                # NOTE(review): this inner loop reuses the name `i`;
                # harmless because the outer `for` reassigns it, but
                # worth renaming on a future cleanup.
                for i in range(right.count(")")):
                    (tag, start) = stack.pop()
                    spanlist.append(((start, wordnum + 1), tag))
            spanlists.append(spanlist)

        return spanlists

    def _get_srl_instances(self, grid, pos_in_tree):
        """Pair each predicate in the grid with its span list, returning a
        ConllSRLInstanceList for the sentence.

        :raises ValueError: if no span list contains a V/C-V tag for a
            predicate.
        """
        tree = self._get_parsed_sent(grid, pos_in_tree)
        spanlists = self._get_srl_spans(grid)
        if self._srl_includes_roleset:
            predicates = self._get_column(grid, self._colmap["srl"] + 1)
            rolesets = self._get_column(grid, self._colmap["srl"])
        else:
            predicates = self._get_column(grid, self._colmap["srl"])
            rolesets = [None] * len(predicates)

        instances = ConllSRLInstanceList(tree)
        for wordnum, predicate in enumerate(predicates):
            if predicate == "-":
                continue
            # Decide which spanlist to use. Don't assume that they're
            # sorted in the same order as the predicates (even though
            # they usually are).
            for spanlist in spanlists:
                for (start, end), tag in spanlist:
                    if wordnum in range(start, end) and tag in ("V", "C-V"):
                        break
                else:
                    continue
                break
            else:
                raise ValueError("No srl column found for %r" % predicate)
            instances.append(
                ConllSRLInstance(tree, wordnum, predicate, rolesets[wordnum], spanlist)
            )

        return instances

    # /////////////////////////////////////////////////////////////////
    # Helper Methods
    # /////////////////////////////////////////////////////////////////

    def _require(self, *columntypes):
        """Raise ValueError unless every given column type is present."""
        for columntype in columntypes:
            if columntype not in self._colmap:
                raise ValueError(
                    "This corpus does not contain a %s " "column." % columntype
                )

    @staticmethod
    def _get_column(grid, column_index):
        """Return the values of column *column_index*, one per row."""
        return [grid[i][column_index] for i in range(len(grid))]
|
||||
|
||||
|
||||
class ConllSRLInstance:
    """
    An SRL instance from a CoNLL corpus, which identifies and
    provides labels for the arguments of a single verb.
    """

    # [xx] add inst.core_arguments, inst.argm_arguments?

    def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans):
        """
        :param tree: the parse tree of the sentence containing this instance.
        :param verb_head: word index of the verb's head word.
        :param verb_stem: the stem of the verb.
        :param roleset: the roleset identifier (or None).
        :param tagged_spans: list of ((start, end), tag) span annotations.
        """
        self.verb = []
        """A list of the word indices of the words that compose the
        verb whose arguments are identified by this instance.
        This will contain multiple word indices when multi-word
        verbs are used (e.g. 'turn on')."""

        self.verb_head = verb_head
        """The word index of the head word of the verb whose arguments
        are identified by this instance. E.g., for a sentence that
        uses the verb 'turn on,' ``verb_head`` will be the word index
        of the word 'turn'."""

        self.verb_stem = verb_stem

        self.roleset = roleset

        self.arguments = []
        """A list of ``(argspan, argid)`` tuples, specifying the location
        and type for each of the arguments identified by this
        instance. ``argspan`` is a tuple ``start, end``, indicating
        that the argument consists of the ``words[start:end]``."""

        self.tagged_spans = tagged_spans
        """A list of ``(span, id)`` tuples, specifying the location and
        type for each of the arguments, as well as the verb pieces,
        that make up this instance."""

        self.tree = tree
        """The parse tree for the sentence containing this instance."""

        self.words = tree.leaves()
        """A list of the words in the sentence containing this
        instance."""

        # Fill in the self.verb and self.arguments values.
        # V/C-V spans are verb pieces; everything else is an argument.
        for (start, end), tag in tagged_spans:
            if tag in ("V", "C-V"):
                self.verb += list(range(start, end))
            else:
                self.arguments.append(((start, end), tag))

    def __repr__(self):
        # Originally, its:
        ##plural = 's' if len(self.arguments) != 1 else ''
        plural = "s" if len(self.arguments) != 1 else ""
        return "<ConllSRLInstance for %r with %d argument%s>" % (
            (self.verb_stem, len(self.arguments), plural)
        )

    def pprint(self):
        """Return a human-readable rendering of this instance, with
        arguments bracketed and the verb marked ``<<like this>>``."""
        verbstr = " ".join(self.words[i][0] for i in self.verb)
        hdr = f"SRL for {verbstr!r} (stem={self.verb_stem!r}):\n"
        s = ""
        for i, word in enumerate(self.words):
            # Leaves may be (word, tag) tuples; keep only the word.
            if isinstance(word, tuple):
                word = word[0]
            for (start, end), argid in self.arguments:
                if i == start:
                    s += "[%s " % argid
                if i == end:
                    s += "] "
            if i in self.verb:
                word = "<<%s>>" % word
            s += word + " "
        return hdr + textwrap.fill(
            s.replace(" ]", "]"), initial_indent=" ", subsequent_indent=" "
        )
|
||||
|
||||
|
||||
class ConllSRLInstanceList(list):
    """
    Set of instances for a single sentence
    """

    def __init__(self, tree, instances=()):
        self.tree = tree
        list.__init__(self, instances)

    def __str__(self):
        return self.pprint()

    def pprint(self, include_tree=False):
        """Render the instances in CoNLL column format.

        :param include_tree: if true, prepend word, POS and syntax
            columns recovered from ``self.tree``.
        """
        # Sanity check: trees should be the same
        for inst in self:
            if inst.tree != self.tree:
                raise ValueError("Tree mismatch!")

        # The word list drives the main loop regardless of whether the
        # tree columns were requested.  (Previously it was only bound
        # inside the ``include_tree`` branch, so
        # ``pprint(include_tree=False)`` failed with a NameError.)
        words = self.tree.leaves()

        # If desired, add trees:
        if include_tree:
            pos = [None] * len(words)
            synt = ["*"] * len(words)
            self._tree2conll(self.tree, 0, words, pos, synt)

        s = ""
        for i in range(len(words)):
            # optional tree columns
            if include_tree:
                s += "%-20s " % words[i]
                s += "%-8s " % pos[i]
                s += "%15s*%-8s " % tuple(synt[i].split("*"))

            # verb head column: the stem of the instance headed here,
            # or "-" when no instance's verb head is at this word.
            for inst in self:
                if i == inst.verb_head:
                    s += "%-20s " % inst.verb_stem
                    break
            else:
                s += "%-20s " % "-"
            # Remaining columns: one bracketed-span column per instance.
            for inst in self:
                argstr = "*"
                for (start, end), argid in inst.tagged_spans:
                    if i == start:
                        argstr = f"({argid}{argstr}"
                    if i == (end - 1):
                        argstr += ")"
                s += "%-12s " % argstr
            s += "\n"
        return s

    def _tree2conll(self, tree, wordnum, words, pos, synt):
        """Walk ``tree``, filling ``pos`` and ``synt`` in place for the
        leaves starting at index ``wordnum``; return the index just past
        the last leaf visited."""
        assert isinstance(tree, Tree)
        if len(tree) == 1 and isinstance(tree[0], str):
            pos[wordnum] = tree.label()
            assert words[wordnum] == tree[0]
            return wordnum + 1
        elif len(tree) == 1 and isinstance(tree[0], tuple):
            assert len(tree[0]) == 2
            # The leaf is a (word, tag) pair; record its tag.  (This
            # used to read ``pos[wordnum], pos[wordnum] = tree[0]``,
            # which produced the right value only by accident of
            # assignment order.)
            pos[wordnum] = tree[0][1]
            return wordnum + 1
        else:
            synt[wordnum] = f"({tree.label()}{synt[wordnum]}"
            for child in tree:
                wordnum = self._tree2conll(child, wordnum, words, pos, synt)
            synt[wordnum - 1] += ")"
            return wordnum
|
||||
|
||||
|
||||
class ConllChunkCorpusReader(ConllCorpusReader):
    """
    A ConllCorpusReader whose data file contains three columns: words,
    pos, and chunk.
    """

    def __init__(
        self, root, fileids, chunk_types, encoding="utf8", tagset=None, separator=None
    ):
        # Delegate to the generic reader, pinning the column layout to
        # the three-column word/POS/chunk format.
        columns = ("words", "pos", "chunk")
        super().__init__(
            root,
            fileids,
            columns,
            chunk_types=chunk_types,
            encoding=encoding,
            tagset=tagset,
            separator=separator,
        )
|
||||
106
backend/venv/Lib/site-packages/nltk/corpus/reader/crubadan.py
Normal file
106
backend/venv/Lib/site-packages/nltk/corpus/reader/crubadan.py
Normal file
@@ -0,0 +1,106 @@
|
||||
# Natural Language Toolkit: An Crubadan N-grams Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Avital Pekker <avital.pekker@utoronto.ca>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
An NLTK interface for the n-gram statistics gathered from
|
||||
the corpora for each language using An Crubadan.
|
||||
|
||||
There are multiple potential applications for the data but
|
||||
this reader was created with the goal of using it in the
|
||||
context of language identification.
|
||||
|
||||
For details about An Crubadan, this data, and its potential uses, see:
|
||||
http://borel.slu.edu/crubadan/index.html
|
||||
"""
|
||||
|
||||
import re
|
||||
from os import path
|
||||
|
||||
from nltk.corpus.reader import CorpusReader
|
||||
from nltk.data import ZipFilePathPointer
|
||||
from nltk.probability import FreqDist
|
||||
|
||||
|
||||
class CrubadanCorpusReader(CorpusReader):
    """
    A corpus reader used to access language An Crubadan n-gram files.
    """

    # File mapping Crubadan's internal language codes to ISO 639-3 codes.
    _LANG_MAPPER_FILE = "table.txt"
    # Class-level default kept for backward compatibility; each instance
    # now gets its own cache in __init__ so separate readers no longer
    # share (and mutate) a single class-wide dict.
    _all_lang_freq = {}

    def __init__(self, root, fileids, encoding="utf8", tagset=None):
        """
        :param root: corpus root directory
        :param fileids: fileid pattern or list of fileids
        :param encoding: encoding of the corpus files (previously this
            argument was accepted but ignored — "utf8" was hard-coded;
            it is now passed through)
        :param tagset: unused; accepted for API compatibility
        """
        super().__init__(root, fileids, encoding=encoding)
        # Per-instance cache: ISO 639-3 code -> FreqDist of 3-grams.
        self._all_lang_freq = {}
        self._lang_mapping_data = []
        self._load_lang_mapping_data()

    def lang_freq(self, lang):
        """Return n-gram FreqDist for a specific language
        given ISO 639-3 language code; results are cached."""
        if lang not in self._all_lang_freq:
            self._all_lang_freq[lang] = self._load_lang_ngrams(lang)

        return self._all_lang_freq[lang]

    def langs(self):
        """Return a list of supported languages as ISO 639-3 codes"""
        return [row[1] for row in self._lang_mapping_data]

    def iso_to_crubadan(self, lang):
        """Return internal Crubadan code based on ISO 639-3 code
        (None when the language is not in the mapping table)."""
        for row in self._lang_mapping_data:
            if row[1].lower() == lang.lower():
                return row[0]

    def crubadan_to_iso(self, lang):
        """Return ISO 639-3 code given internal Crubadan code
        (None when the code is not in the mapping table)."""
        for row in self._lang_mapping_data:
            if row[0].lower() == lang.lower():
                return row[1]

    def _load_lang_mapping_data(self):
        """Load language mappings between codes and description from table.txt"""
        if isinstance(self.root, ZipFilePathPointer):
            raise RuntimeError(
                "Please install the 'crubadan' corpus first, use nltk.download()"
            )

        mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
        if self._LANG_MAPPER_FILE not in self.fileids():
            raise RuntimeError("Could not find language mapper file: " + mapper_file)

        with open(mapper_file, encoding="utf-8") as raw:
            strip_raw = raw.read().strip()

        # Each row is tab-separated: Crubadan code, ISO 639-3 code, ...
        self._lang_mapping_data = [row.split("\t") for row in strip_raw.split("\n")]

    def _load_lang_ngrams(self, lang):
        """Load single n-gram language file given the ISO 639-3 language code
        and return its FreqDist"""
        if lang not in self.langs():
            raise RuntimeError("Unsupported language.")

        crubadan_code = self.iso_to_crubadan(lang)
        ngram_file = path.join(self.root, crubadan_code + "-3grams.txt")

        if not path.isfile(ngram_file):
            raise RuntimeError("No N-gram file found for requested language.")

        counts = FreqDist()
        with open(ngram_file, encoding="utf-8") as f:
            # Each line: "<frequency> <ngram>".
            for line in f:
                data = line.split(" ")

                ngram = data[1].strip("\n")
                freq = int(data[0])

                counts[ngram] = freq

        return counts
|
||||
115
backend/venv/Lib/site-packages/nltk/corpus/reader/dependency.py
Normal file
115
backend/venv/Lib/site-packages/nltk/corpus/reader/dependency.py
Normal file
@@ -0,0 +1,115 @@
|
||||
# Natural Language Toolkit: Dependency Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Kepa Sarasola <kepa.sarasola@ehu.es>
|
||||
# Iker Manterola <returntothehangar@hotmail.com>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.parse import DependencyGraph
|
||||
from nltk.tokenize import *
|
||||
|
||||
|
||||
class DependencyCorpusReader(SyntaxCorpusReader):
    def __init__(
        self,
        root,
        fileids,
        encoding="utf8",
        word_tokenizer=TabTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
    ):
        # NOTE: the tokenizer arguments are accepted for API
        # compatibility but are not used by this reader.
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)

    #########################################################

    def _views(self, fileids, tagged, group_by_sent, dependencies):
        # Build one corpus view per file, propagating each file's own
        # encoding; the three flags select the view's output shape.
        return concat(
            [
                DependencyCorpusView(
                    fileid, tagged, group_by_sent, dependencies, encoding=enc
                )
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def words(self, fileids=None):
        return self._views(fileids, False, False, False)

    def tagged_words(self, fileids=None):
        return self._views(fileids, True, False, False)

    def sents(self, fileids=None):
        return self._views(fileids, False, True, False)

    def tagged_sents(self, fileids=None):
        return self._views(fileids, True, True, False)

    def parsed_sents(self, fileids=None):
        sents = self._views(fileids, False, True, True)
        return [DependencyGraph(sent) for sent in sents]
|
||||
|
||||
|
||||
class DependencyCorpusView(StreamBackedCorpusView):
    """Corpus view over one dependency-format file; yields words, tagged
    words, sentences, or raw dependency blocks depending on the
    constructor flags."""

    _DOCSTART = "-DOCSTART- -DOCSTART- O\n"  # defines the start of a document

    def __init__(
        self,
        corpus_file,
        tagged,
        group_by_sent,
        dependencies,
        chunk_types=None,
        encoding="utf8",
    ):
        # Flags controlling how read_block() post-processes a sentence.
        self._tagged = tagged
        self._dependencies = dependencies
        self._group_by_sent = group_by_sent
        self._chunk_types = chunk_types  # accepted but not used here
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Read one blank-line-delimited sentence and post-process it
        according to the constructor flags.

        :raises ValueError: if a line has an unexpected column count.
        """
        # Read the next sentence.
        sent = read_blankline_block(stream)[0].strip()
        # Strip off the docstart marker, if present.
        if sent.startswith(self._DOCSTART):
            sent = sent[len(self._DOCSTART) :].lstrip()

        # extract word and tag from any of the formats
        if not self._dependencies:
            lines = [line.split("\t") for line in sent.split("\n")]
            # 3/4 columns: word then tag lead the row; 10 columns:
            # CoNLL-style rows with word in field 1 and POS in field 4.
            if len(lines[0]) == 3 or len(lines[0]) == 4:
                sent = [(line[0], line[1]) for line in lines]
            elif len(lines[0]) == 10:
                sent = [(line[1], line[4]) for line in lines]
            else:
                raise ValueError("Unexpected number of fields in dependency tree file")

            # discard tags if they weren't requested
            if not self._tagged:
                sent = [word for (word, tag) in sent]

        # Return the result.  When dependencies were requested, ``sent``
        # is still the raw block string; otherwise it is a token list.
        if self._group_by_sent:
            return [sent]
        else:
            return list(sent)
|
||||
3428
backend/venv/Lib/site-packages/nltk/corpus/reader/framenet.py
Normal file
3428
backend/venv/Lib/site-packages/nltk/corpus/reader/framenet.py
Normal file
File diff suppressed because it is too large
Load Diff
116
backend/venv/Lib/site-packages/nltk/corpus/reader/ieer.py
Normal file
116
backend/venv/Lib/site-packages/nltk/corpus/reader/ieer.py
Normal file
@@ -0,0 +1,116 @@
|
||||
# Natural Language Toolkit: IEER Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Corpus reader for the Information Extraction and Entity Recognition Corpus.
|
||||
|
||||
NIST 1999 Information Extraction: Entity Recognition Evaluation
|
||||
https://www.itl.nist.gov/iad/894.01/tests/ie-er/er_99/er_99.htm
|
||||
|
||||
This corpus contains the NEWSWIRE development test data for the
|
||||
NIST 1999 IE-ER Evaluation. The files were taken from the
|
||||
subdirectory: ``/ie_er_99/english/devtest/newswire/*.ref.nwt``
|
||||
and filenames were shortened.
|
||||
|
||||
The corpus contains the following files: APW_19980314, APW_19980424,
|
||||
APW_19980429, NYT_19980315, NYT_19980403, and NYT_19980407.
|
||||
"""
|
||||
|
||||
import nltk
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
#: A dictionary whose keys are the names of documents in this corpus;
#: and whose values are descriptions of those documents' contents.
titles = {
    "APW_19980314": "Associated Press Weekly, 14 March 1998",
    "APW_19980424": "Associated Press Weekly, 24 April 1998",
    "APW_19980429": "Associated Press Weekly, 29 April 1998",
    "NYT_19980315": "New York Times, 15 March 1998",
    "NYT_19980403": "New York Times, 3 April 1998",
    "NYT_19980407": "New York Times, 7 April 1998",
}

#: A list of all documents in this corpus.
#: (Sorting a dict iterates its keys, so this is the sorted fileid list.)
documents = sorted(titles)
|
||||
|
||||
|
||||
class IEERDocument:
    """A single IEER document: its (chunked) text plus header metadata."""

    def __init__(self, text, docno=None, doctype=None, date_time=None, headline=""):
        self.text = text
        self.docno = docno
        self.doctype = doctype
        self.date_time = date_time
        self.headline = headline

    def __repr__(self):
        if self.headline:
            summary = " ".join(self.headline.leaves())
        else:
            # No headline: preview the first dozen non-markup words.
            body_words = [w for w in self.text.leaves() if w[:1] != "<"]
            summary = " ".join(body_words[:12]) + "..."
        if self.docno is None:
            return "<IEERDocument: %r>" % summary
        return f"<IEERDocument {self.docno}: {summary!r}>"
|
||||
|
||||
|
||||
class IEERCorpusReader(CorpusReader):
    """Corpus reader for the IEER corpus: one ``<DOC>...</DOC>`` block
    per document."""

    def docs(self, fileids=None):
        """Return a corpus view of the raw text of each document."""
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def parsed_docs(self, fileids=None):
        """Return a corpus view of IEERDocument objects with chunked text."""
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_parsed_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def _read_parsed_block(self, stream):
        # TODO: figure out why empty documents are being returned
        # Parse each raw block exactly once.  (The previous version
        # called self._parse(doc) twice per document: once for the
        # docno filter and again for the returned value.)
        parsed = (self._parse(doc) for doc in self._read_block(stream))
        return [doc for doc in parsed if doc.docno is not None]

    def _parse(self, doc):
        """Chunk one raw document and wrap the result in an IEERDocument."""
        val = nltk.chunk.ieerstr2tree(doc, root_label="DOCUMENT")
        if isinstance(val, dict):
            return IEERDocument(**val)
        else:
            return IEERDocument(val)

    def _read_block(self, stream):
        """Read one document (any preamble plus ``<DOC>``...``</DOC>``)
        and return it as a one-element list of strings."""
        out = []
        # Skip any preamble.
        while True:
            line = stream.readline()
            if not line:
                break
            if line.strip() == "<DOC>":
                break
            out.append(line)
        # Read the document
        while True:
            line = stream.readline()
            if not line:
                break
            out.append(line)
            if line.strip() == "</DOC>":
                break
        # Return the document
        return ["\n".join(out)]
|
||||
93
backend/venv/Lib/site-packages/nltk/corpus/reader/indian.py
Normal file
93
backend/venv/Lib/site-packages/nltk/corpus/reader/indian.py
Normal file
@@ -0,0 +1,93 @@
|
||||
# Natural Language Toolkit: Indian Language POS-Tagged Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Indian Language POS-Tagged Corpus
|
||||
Collected by A Kumaran, Microsoft Research, India
|
||||
Distributed with permission
|
||||
|
||||
Contents:
|
||||
- Bangla: IIT Kharagpur
|
||||
- Hindi: Microsoft Research India
|
||||
- Marathi: IIT Bombay
|
||||
- Telugu: IIIT Hyderabad
|
||||
"""
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.tag import map_tag, str2tuple
|
||||
|
||||
|
||||
class IndianCorpusReader(CorpusReader):
    """
    List of words, one per line. Blank lines are ignored.
    """

    def _tag_mapper(self, tagset):
        # Build a tag-conversion callable, or None when no conversion
        # between tagsets is needed.
        if tagset and tagset != self._tagset:
            return lambda t: map_tag(self._tagset, tagset, t)
        return None

    def _views(self, fileids, tagged, group_by_sent, tag_mapping_function=None):
        # One view per file, each with that file's own encoding.
        return concat(
            [
                IndianCorpusView(
                    fileid, enc, tagged, group_by_sent, tag_mapping_function
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        return self._views(fileids, False, False)

    def tagged_words(self, fileids=None, tagset=None):
        return self._views(fileids, True, False, self._tag_mapper(tagset))

    def sents(self, fileids=None):
        return self._views(fileids, False, True)

    def tagged_sents(self, fileids=None, tagset=None):
        return self._views(fileids, True, True, self._tag_mapper(tagset))
|
||||
|
||||
|
||||
class IndianCorpusView(StreamBackedCorpusView):
    """Corpus view over one Indian-language POS-tagged file; each
    non-markup line holds one sentence of ``word_TAG`` tokens."""

    def __init__(
        self, corpus_file, encoding, tagged, group_by_sent, tag_mapping_function=None
    ):
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._tag_mapping_function = tag_mapping_function
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        line = stream.readline()
        # Lines starting with '<' are markup, not corpus data.
        if line.startswith("<"):
            return []
        sent = [str2tuple(tok, sep="_") for tok in line.split()]
        mapper = self._tag_mapping_function
        if mapper:
            sent = [(word, mapper(tag)) for (word, tag) in sent]
        if not self._tagged:
            sent = [word for (word, _tag) in sent]
        return [sent] if self._group_by_sent else sent
|
||||
354
backend/venv/Lib/site-packages/nltk/corpus/reader/ipipan.py
Normal file
354
backend/venv/Lib/site-packages/nltk/corpus/reader/ipipan.py
Normal file
@@ -0,0 +1,354 @@
|
||||
# Natural Language Toolkit: IPI PAN Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Konrad Goluchowski <kodie@mimuw.edu.pl>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import functools
|
||||
|
||||
from nltk.corpus.reader.api import CorpusReader
|
||||
from nltk.corpus.reader.util import StreamBackedCorpusView, concat
|
||||
|
||||
|
||||
def _parse_args(fun):
|
||||
@functools.wraps(fun)
|
||||
def decorator(self, fileids=None, **kwargs):
|
||||
kwargs.pop("tags", None)
|
||||
if not fileids:
|
||||
fileids = self.fileids()
|
||||
return fun(self, fileids, **kwargs)
|
||||
|
||||
return decorator
|
||||
|
||||
|
||||
class IPIPANCorpusReader(CorpusReader):
    """
    Corpus reader designed to work with corpus created by IPI PAN.
    See http://korpus.pl/en/ for more details about IPI PAN corpus.

    The corpus includes information about text domain, channel and categories.
    You can access possible values using ``domains()``, ``channels()`` and
    ``categories()``. You can use also this metadata to filter files, e.g.:
    ``fileids(channel='prasa')``, ``fileids(categories='publicystyczny')``.

    The reader supports methods: words, sents, paras and their tagged versions.
    You can get part of speech instead of full tag by giving "simplify_tags=True"
    parameter, e.g.: ``tagged_sents(simplify_tags=True)``.

    Also you can get all tags disambiguated tags specifying parameter
    "one_tag=False", e.g.: ``tagged_paras(one_tag=False)``.

    You can get all tags that were assigned by a morphological analyzer specifying
    parameter "disamb_only=False", e.g. ``tagged_words(disamb_only=False)``.

    The IPIPAN Corpus contains tags indicating if there is a space between two
    tokens. To add special "no space" markers, you should specify parameter
    "append_no_space=True", e.g. ``tagged_words(append_no_space=True)``.
    As a result in place where there should be no space between two tokens new
    pair ('', 'no-space') will be inserted (for tagged data) and just '' for
    methods without tags.

    The corpus reader can also try to append spaces between words. To enable this
    option, specify parameter "append_space=True", e.g. ``words(append_space=True)``.
    As a result either ' ' or (' ', 'space') will be inserted between tokens.

    By default, xml entities like &quot; and &amp; are replaced by corresponding
    characters. You can turn off this feature, specifying parameter
    "replace_xmlentities=False", e.g. ``words(replace_xmlentities=False)``.
    """

    def __init__(self, root, fileids):
        CorpusReader.__init__(self, root, fileids, None, None)

    def channels(self, fileids=None):
        """Return all channel values found in the given files' headers."""
        if not fileids:
            fileids = self.fileids()
        return self._parse_header(fileids, "channel")

    def domains(self, fileids=None):
        """Return all domain values found in the given files' headers."""
        if not fileids:
            fileids = self.fileids()
        return self._parse_header(fileids, "domain")

    def categories(self, fileids=None):
        """Return all (normalized) category values found in the headers."""
        if not fileids:
            fileids = self.fileids()
        return [
            self._map_category(cat) for cat in self._parse_header(fileids, "keyTerm")
        ]

    def fileids(self, channels=None, domains=None, categories=None):
        """Return corpus fileids, optionally filtered by exactly one of
        ``channels``, ``domains`` or ``categories``.

        :raises ValueError: if more than one filter is supplied.
        """
        # Count how many filters were actually supplied.  (The previous
        # check combined the three with ``and``, so it only raised when
        # ALL THREE were given, and silently ignored the extra filter
        # when exactly two were — contradicting the error message.)
        given = [f for f in (channels, domains, categories) if f is not None]
        if len(given) > 1:
            raise ValueError(
                "You can specify only one of channels, domains "
                "and categories parameter at once"
            )
        if not given:
            return CorpusReader.fileids(self)
        if isinstance(channels, str):
            channels = [channels]
        if isinstance(domains, str):
            domains = [domains]
        if isinstance(categories, str):
            categories = [categories]
        if channels:
            return self._list_morph_files_by("channel", channels)
        elif domains:
            return self._list_morph_files_by("domain", domains)
        else:
            return self._list_morph_files_by(
                "keyTerm", categories, map=self._map_category
            )

    @_parse_args
    def sents(self, fileids=None, **kwargs):
        """Return a corpus view of untagged sentences."""
        return concat(
            [
                self._view(
                    fileid, mode=IPIPANCorpusView.SENTS_MODE, tags=False, **kwargs
                )
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def paras(self, fileids=None, **kwargs):
        """Return a corpus view of untagged paragraphs."""
        return concat(
            [
                self._view(
                    fileid, mode=IPIPANCorpusView.PARAS_MODE, tags=False, **kwargs
                )
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def words(self, fileids=None, **kwargs):
        """Return a corpus view of untagged words."""
        return concat(
            [
                self._view(fileid, tags=False, **kwargs)
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def tagged_sents(self, fileids=None, **kwargs):
        """Return a corpus view of tagged sentences."""
        return concat(
            [
                self._view(fileid, mode=IPIPANCorpusView.SENTS_MODE, **kwargs)
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def tagged_paras(self, fileids=None, **kwargs):
        """Return a corpus view of tagged paragraphs."""
        return concat(
            [
                self._view(fileid, mode=IPIPANCorpusView.PARAS_MODE, **kwargs)
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def tagged_words(self, fileids=None, **kwargs):
        """Return a corpus view of tagged words."""
        return concat(
            [self._view(fileid, **kwargs) for fileid in self._list_morph_files(fileids)]
        )

    def _list_morph_files(self, fileids):
        # Absolute paths of the morph.xml data files.
        return [f for f in self.abspaths(fileids)]

    def _list_header_files(self, fileids):
        # Each morph.xml file has a sibling header.xml with the metadata.
        return [
            f.replace("morph.xml", "header.xml")
            for f in self._list_morph_files(fileids)
        ]

    def _parse_header(self, fileids, tag):
        """Collect the distinct values of ``tag`` across all headers."""
        values = set()
        for f in self._list_header_files(fileids):
            values_list = self._get_tag(f, tag)
            for v in values_list:
                values.add(v)
        return list(values)

    def _list_morph_files_by(self, tag, values, map=None):
        """Return fileids whose header value for ``tag`` is in ``values``;
        ``map`` optionally normalizes each header value first."""
        fileids = self.fileids()
        ret_fileids = set()
        for f in fileids:
            fp = self.abspath(f).replace("morph.xml", "header.xml")
            values_list = self._get_tag(fp, tag)
            for value in values_list:
                if map is not None:
                    value = map(value)
                if value in values:
                    ret_fileids.add(f)
        return list(ret_fileids)

    def _get_tag(self, f, tag):
        """Extract every ``<tag>...</tag>`` text span from header file ``f``."""
        tags = []
        # NOTE(review): opened with the platform default encoding —
        # presumably the headers are UTF-8; confirm before making it explicit.
        with open(f) as infile:
            header = infile.read()
        tag_end = 0
        while True:
            tag_pos = header.find("<" + tag, tag_end)
            if tag_pos < 0:
                return tags
            tag_end = header.find("</" + tag + ">", tag_pos)
            tags.append(header[tag_pos + len(tag) + 2 : tag_end])

    def _map_category(self, cat):
        # Strip a leading "...>" prefix from a keyTerm value, if present.
        pos = cat.find(">")
        if pos == -1:
            return cat
        else:
            return cat[pos + 1 :]

    def _view(self, filename, **kwargs):
        """Validate keyword options and build an IPIPANCorpusView."""
        tags = kwargs.pop("tags", True)
        mode = kwargs.pop("mode", 0)
        simplify_tags = kwargs.pop("simplify_tags", False)
        one_tag = kwargs.pop("one_tag", True)
        disamb_only = kwargs.pop("disamb_only", True)
        append_no_space = kwargs.pop("append_no_space", False)
        append_space = kwargs.pop("append_space", False)
        replace_xmlentities = kwargs.pop("replace_xmlentities", True)

        if len(kwargs) > 0:
            raise ValueError("Unexpected arguments: %s" % kwargs.keys())
        if not one_tag and not disamb_only:
            raise ValueError(
                "You cannot specify both one_tag=False and disamb_only=False"
            )
        if not tags and (simplify_tags or not one_tag or not disamb_only):
            raise ValueError(
                "You cannot specify simplify_tags, one_tag or "
                "disamb_only with functions other than tagged_*"
            )

        return IPIPANCorpusView(
            filename,
            tags=tags,
            mode=mode,
            simplify_tags=simplify_tags,
            one_tag=one_tag,
            disamb_only=disamb_only,
            append_no_space=append_no_space,
            append_space=append_space,
            replace_xmlentities=replace_xmlentities,
        )
|
||||
|
||||
|
||||
class IPIPANCorpusView(StreamBackedCorpusView):
    """Stream-backed view over one IPI PAN morph.xml file; ``mode``
    selects whether read_block() yields words, sentences or paragraphs."""

    # Iteration granularities for read_block().
    WORDS_MODE = 0
    SENTS_MODE = 1
    PARAS_MODE = 2

    def __init__(self, filename, startpos=0, **kwargs):
        """Accepts the keyword options validated by
        IPIPANCorpusReader._view(); see that class for their meaning."""
        StreamBackedCorpusView.__init__(self, filename, None, startpos, None)
        # True while inside a <chunk type="s"> (sentence) element.
        self.in_sentence = False
        # Stream offset of the first unconsumed byte; _seek() rewinds
        # the stream here at block boundaries.
        self.position = 0

        self.show_tags = kwargs.pop("tags", True)
        self.disamb_only = kwargs.pop("disamb_only", True)
        self.mode = kwargs.pop("mode", IPIPANCorpusView.WORDS_MODE)
        self.simplify_tags = kwargs.pop("simplify_tags", False)
        self.one_tag = kwargs.pop("one_tag", True)
        self.append_no_space = kwargs.pop("append_no_space", False)
        self.append_space = kwargs.pop("append_space", False)
        self.replace_xmlentities = kwargs.pop("replace_xmlentities", True)

    def read_block(self, stream):
        """Read one block (word list, sentence or paragraph, depending
        on ``self.mode``) from the XML stream."""
        sentence = []
        sentences = []
        space = False
        no_space = False

        tags = set()

        lines = self._read_data(stream)

        while True:
            # we may have only part of last line
            if len(lines) <= 1:
                self._seek(stream)
                lines = self._read_data(stream)

            # An empty read means end of file.
            if lines == [""]:
                assert not sentences
                return []

            # Lines were reversed by _read_data, so pop() yields them in
            # file order.
            line = lines.pop()
            self.position += len(line) + 1

            if line.startswith('<chunk type="s"'):
                self.in_sentence = True
            elif line.startswith('<chunk type="p"'):
                pass
            elif line.startswith("<tok"):
                # Start of a token: optionally emit a separator first.
                if self.append_space and space and not no_space:
                    self._append_space(sentence)
                space = True
                no_space = False
                orth = ""
                tags = set()
            elif line.startswith("</chunk"):
                if self.in_sentence:
                    # Sentence chunk closed.
                    self.in_sentence = False
                    self._seek(stream)
                    if self.mode == self.SENTS_MODE:
                        return [sentence]
                    elif self.mode == self.WORDS_MODE:
                        if self.append_space:
                            self._append_space(sentence)
                        return sentence
                    else:
                        sentences.append(sentence)
                elif self.mode == self.PARAS_MODE:
                    # Paragraph chunk closed.
                    self._seek(stream)
                    return [sentences]
            elif line.startswith("<orth"):
                # Surface form: strip the "<orth>" / "</orth>" markup.
                orth = line[6:-7]
                if self.replace_xmlentities:
                    orth = orth.replace("&quot;", '"').replace("&amp;", "&")
            elif line.startswith("<lex"):
                # Morphological analysis; unless disamb_only is off,
                # keep only the disambiguated readings.
                if not self.disamb_only or line.find("disamb=") != -1:
                    tag = line[line.index("<ctag") + 6 : line.index("</ctag")]
                    tags.add(tag)
            elif line.startswith("</tok"):
                # Token finished: emit it with or without its tag(s).
                if self.show_tags:
                    if self.simplify_tags:
                        tags = [t.split(":")[0] for t in tags]
                    if not self.one_tag or not self.disamb_only:
                        sentence.append((orth, tuple(tags)))
                    else:
                        sentence.append((orth, tags.pop()))
                else:
                    sentence.append(orth)
            elif line.startswith("<ns/>"):
                # "No space" marker between the surrounding tokens.
                if self.append_space:
                    no_space = True
                if self.append_no_space:
                    if self.show_tags:
                        sentence.append(("", "no-space"))
                    else:
                        sentence.append("")
            elif line.startswith("</cesAna"):
                # End of the document; nothing to emit.
                pass

    def _read_data(self, stream):
        # Remember where this chunk started (for _seek), then return its
        # lines in reverse order so pop() yields them in file order.
        self.position = stream.tell()
        buff = stream.read(4096)
        lines = buff.split("\n")
        lines.reverse()
        return lines

    def _seek(self, stream):
        # Rewind the stream to the first unconsumed byte.
        stream.seek(self.position)

    def _append_space(self, sentence):
        # Emit an explicit inter-token space marker.
        if self.show_tags:
            sentence.append((" ", "space"))
        else:
            sentence.append(" ")
|
||||
186
backend/venv/Lib/site-packages/nltk/corpus/reader/knbc.py
Normal file
186
backend/venv/Lib/site-packages/nltk/corpus/reader/knbc.py
Normal file
@@ -0,0 +1,186 @@
|
||||
#! /usr/bin/env python
|
||||
# KNB Corpus reader
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Masato Hagiwara <hagisan@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
|
||||
|
||||
import re
|
||||
|
||||
from nltk.corpus.reader.api import CorpusReader, SyntaxCorpusReader
|
||||
from nltk.corpus.reader.util import (
|
||||
FileSystemPathPointer,
|
||||
find_corpus_fileids,
|
||||
read_blankline_block,
|
||||
)
|
||||
from nltk.parse import DependencyGraph
|
||||
|
||||
# default function to convert morphlist to str for tree representation
|
||||
_morphs2str_default = lambda morphs: "/".join(m[0] for m in morphs if m[0] != "EOS")
|
||||
|
||||
|
||||
class KNBCorpusReader(SyntaxCorpusReader):
    """
    This class implements:
      - ``__init__``, which specifies the location of the corpus
        and a method for detecting the sentence blocks in corpus files.
      - ``_read_block``, which reads a block from the input stream.
      - ``_word``, which takes a block and returns a list of list of words.
      - ``_tag``, which takes a block and returns a list of list of tagged
        words.
      - ``_parse``, which takes a block and returns a list of parsed
        sentences.

    The structure of tagged words:
        tagged_word = (word(str), tags(tuple))
        tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...)

    Usage example

        >>> from nltk.corpus.util import LazyCorpusLoader
        >>> knbc = LazyCorpusLoader(
        ...     'knbc/corpus1',
        ...     KNBCorpusReader,
        ...     r'.*/KN.*',
        ...     encoding='euc-jp',
        ... )

        >>> len(knbc.sents()[0])
        9

    """

    def __init__(self, root, fileids, encoding="utf8", morphs2str=_morphs2str_default):
        """
        Initialize KNBCorpusReader
        morphs2str is a function to convert morphlist to str for tree representation
        for _parse()
        """
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)
        self.morphs2str = morphs2str

    def _read_block(self, stream):
        # blocks are split by blankline (or EOF) - default
        return read_blankline_block(stream)

    def _word(self, t):
        """Return the plain word (first cell) of every morph line in block *t*."""
        res = []
        for line in t.splitlines():
            # ignore the Bunsets headers
            if not re.match(r"EOS|\*|\#|\+", line):
                cells = line.strip().split(" ")
                # The first cell is the surface form of the morph.
                res.append(cells[0])

        return res

    # ignores tagset argument
    def _tag(self, t, tagset=None):
        """Return (word, tag-string) pairs for every morph line in block *t*.

        The tag string is the remainder of the line after the surface form,
        joined back together with spaces.
        """
        res = []
        for line in t.splitlines():
            # ignore the Bunsets headers
            if not re.match(r"EOS|\*|\#|\+", line):
                cells = line.strip().split(" ")
                # convert cells to morph tuples
                res.append((cells[0], " ".join(cells[1:])))

        return res

    def _parse(self, t):
        """Build a DependencyGraph from block *t* and return it as a tree.

        Lines starting with '*' or '+' open a new bunsetsu/tag node whose
        second cell encodes "<parent-index><relation-letter>" (e.g. "-1D");
        other non-comment lines are morphs appended to the current node.
        """
        dg = DependencyGraph()
        i = 0  # address of the next bunsetsu/tag node
        for line in t.splitlines():
            if line[0] in "*+":
                # start of bunsetsu or tag

                cells = line.strip().split(" ", 3)
                # e.g. "-1D" -> parent index -1, relation "D"
                m = re.match(r"([\-0-9]*)([ADIP])", cells[1])

                assert m is not None

                node = dg.nodes[i]
                node.update({"address": i, "rel": m.group(2), "word": []})

                dep_parent = int(m.group(1))

                # Parent -1 marks the root of the dependency tree.
                if dep_parent == -1:
                    dg.root = node
                else:
                    dg.nodes[dep_parent]["deps"].append(i)

                i += 1
            elif line[0] != "#":
                # normal morph
                cells = line.strip().split(" ")
                # convert cells to morph tuples
                morph = cells[0], " ".join(cells[1:])
                # Attach the morph to the most recently opened node.
                dg.nodes[i - 1]["word"].append(morph)

        # Optionally collapse each node's morph list into a display string.
        if self.morphs2str:
            for node in dg.nodes.values():
                node["word"] = self.morphs2str(node["word"])

        return dg.tree()
|
||||
|
||||
|
||||
######################################################################
|
||||
# Demo
|
||||
######################################################################
|
||||
|
||||
|
||||
def demo():
    """Print a short tour of the KNB corpus: fileids, words, parses, tags."""
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find("corpora/knbc/corpus1")
    fileids = [
        fileid
        for fileid in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", fileid)
    ]

    def _sort_key(fileid):
        # Order fileids numerically by their dash-separated components.
        parts = fileid.split("-")
        return (parts[0], int(parts[1]), int(parts[2]), int(parts[3]))

    knbc = LazyCorpusLoader(
        "knbc/corpus1",
        KNBCorpusReader,
        sorted(fileids, key=_sort_key),
        encoding="euc-jp",
    )

    print(knbc.fileids()[:10])
    print("".join(knbc.words()[:100]))

    print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2]))

    # Swap in a morph formatter that also shows the POS field.
    knbc.morphs2str = lambda morphs: "/".join(
        "{}({})".format(m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
    ).encode("utf-8")

    print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))

    print(
        "\n".join(
            " ".join("{}/{}".format(w[0], w[1].split(" ")[2]) for w in sent)
            for sent in knbc.tagged_sents()[0:2]
        )
    )
|
||||
|
||||
|
||||
def test():
    """Smoke-test the reader's word / sentence / tagged-token accessors."""
    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader(
        "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
    )
    first_word = knbc.words()[0]
    assert isinstance(first_word, str)
    assert isinstance(knbc.sents()[0][0], str)
    assert isinstance(knbc.tagged_words()[0], tuple)
    assert isinstance(knbc.tagged_sents()[0][0], tuple)
|
||||
|
||||
|
||||
# Run the demonstration when this module is executed as a script.
if __name__ == "__main__":
    demo()
|
||||
183
backend/venv/Lib/site-packages/nltk/corpus/reader/lin.py
Normal file
183
backend/venv/Lib/site-packages/nltk/corpus/reader/lin.py
Normal file
@@ -0,0 +1,183 @@
|
||||
# Natural Language Toolkit: Lin's Thesaurus
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Dan Blanchard <dblanchard@ets.org>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.txt
|
||||
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from functools import reduce
|
||||
|
||||
from nltk.corpus.reader import CorpusReader
|
||||
|
||||
|
||||
class LinThesaurusCorpusReader(CorpusReader):
    """Wrapper for the LISP-formatted thesauruses distributed by Dekang Lin."""

    # Compiled regular expression for extracting the key from the first line of each
    # thesaurus entry
    _key_re = re.compile(r'\("?([^"]+)"? \(desc [0-9.]+\).+')

    @staticmethod
    def __defaultdict_factory():
        """Factory for creating defaultdict of defaultdict(dict)s"""
        return defaultdict(dict)

    def __init__(self, root, badscore=0.0):
        """
        Initialize the thesaurus.

        :param root: root directory containing thesaurus LISP files
        :type root: C{string}
        :param badscore: the score to give to words which do not appear in each other's sets of synonyms
        :type badscore: C{float}
        """

        super().__init__(root, r"sim[A-Z]\.lsp")
        # Nested mapping: fileid -> entry key -> ngram -> similarity score.
        self._thesaurus = defaultdict(LinThesaurusCorpusReader.__defaultdict_factory)
        self._badscore = badscore
        # Parse each LISP file line by line; entries start with a key line
        # and end with a line consisting of "))".
        # NOTE(review): opens with the platform default encoding even though
        # abspaths() reports one per file — verify against the corpus files.
        for path, encoding, fileid in self.abspaths(
            include_encoding=True, include_fileid=True
        ):
            with open(path) as lin_file:
                first = True
                for line in lin_file:
                    line = line.strip()
                    # Start of entry
                    if first:
                        key = LinThesaurusCorpusReader._key_re.sub(r"\1", line)
                        first = False
                    # End of entry
                    elif line == "))":
                        first = True
                    # Lines with pairs of ngrams and scores
                    else:
                        split_line = line.split("\t")
                        if len(split_line) == 2:
                            ngram, score = split_line
                            self._thesaurus[fileid][key][ngram.strip('"')] = float(
                                score
                            )

    def similarity(self, ngram1, ngram2, fileid=None):
        """
        Returns the similarity score for two ngrams.

        :param ngram1: first ngram to compare
        :type ngram1: C{string}
        :param ngram2: second ngram to compare
        :type ngram2: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, just the score for the two ngrams; otherwise,
                 list of tuples of fileids and scores.
        """
        # Entries don't contain themselves, so make sure similarity between item and itself is 1.0
        if ngram1 == ngram2:
            if fileid:
                return 1.0
            else:
                return [(fid, 1.0) for fid in self._fileids]
        else:
            if fileid:
                # Missing pairs fall back to the configured bad score.
                return (
                    self._thesaurus[fileid][ngram1][ngram2]
                    if ngram2 in self._thesaurus[fileid][ngram1]
                    else self._badscore
                )
            else:
                return [
                    (
                        fid,
                        (
                            self._thesaurus[fid][ngram1][ngram2]
                            if ngram2 in self._thesaurus[fid][ngram1]
                            else self._badscore
                        ),
                    )
                    for fid in self._fileids
                ]

    def scored_synonyms(self, ngram, fileid=None):
        """
        Returns a list of scored synonyms (tuples of synonyms and scores) for the current ngram

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, list of tuples of scores and synonyms; otherwise,
                 list of tuples of fileids and lists, where inner lists consist of tuples of
                 scores and synonyms.
        """
        if fileid:
            return self._thesaurus[fileid][ngram].items()
        else:
            return [
                (fileid, self._thesaurus[fileid][ngram].items())
                for fileid in self._fileids
            ]

    def synonyms(self, ngram, fileid=None):
        """
        Returns a list of synonyms for the current ngram.

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, list of synonyms; otherwise, list of tuples of fileids and
                 lists, where inner lists contain synonyms.
        """
        if fileid:
            return self._thesaurus[fileid][ngram].keys()
        else:
            return [
                (fileid, self._thesaurus[fileid][ngram].keys())
                for fileid in self._fileids
            ]

    def __contains__(self, ngram):
        """
        Determines whether or not the given ngram is in the thesaurus.

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :return: whether the given ngram is in the thesaurus.
        """
        # True as soon as any fileid's sub-thesaurus contains the ngram.
        return reduce(
            lambda accum, fileid: accum or (ngram in self._thesaurus[fileid]),
            self._fileids,
            False,
        )
|
||||
|
||||
|
||||
######################################################################
|
||||
# Demo
|
||||
######################################################################
|
||||
|
||||
|
||||
def demo():
    """Demonstrate synonym lookup and similarity scoring on Lin's thesaurus.

    Fix: the original printed the simN.lsp synonym lookup twice — an exact
    copy-pasted duplicate of the print pair; the duplicate is removed.
    """
    from nltk.corpus import lin_thesaurus as thes

    word1 = "business"
    word2 = "enterprise"
    print("Getting synonyms for " + word1)
    print(thes.synonyms(word1))

    print("Getting scored synonyms for " + word1)
    print(thes.scored_synonyms(word1))

    # simN.lsp holds the noun-only subsection of the thesaurus.
    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))

    print(f"Similarity score for {word1} and {word2}:")
    print(thes.similarity(word1, word2))
|
||||
|
||||
|
||||
# Run the demonstration when this module is executed as a script.
if __name__ == "__main__":
    demo()
|
||||
344
backend/venv/Lib/site-packages/nltk/corpus/reader/markdown.py
Normal file
344
backend/venv/Lib/site-packages/nltk/corpus/reader/markdown.py
Normal file
@@ -0,0 +1,344 @@
|
||||
from collections import namedtuple
|
||||
from functools import partial, wraps
|
||||
|
||||
from nltk.corpus.reader.api import CategorizedCorpusReader
|
||||
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
|
||||
from nltk.corpus.reader.util import concat, read_blankline_block
|
||||
from nltk.tokenize import blankline_tokenize, sent_tokenize, word_tokenize
|
||||
|
||||
|
||||
def comma_separated_string_args(func):
    """
    A decorator that allows a function to be called with
    a single string of comma-separated values which become
    individual function arguments.

    String arguments become sets of stripped parts; list arguments
    become sets; everything else is passed through unchanged. String
    keyword values are converted the same way.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        converted = []
        for value in args:
            if isinstance(value, str):
                converted.append({piece.strip() for piece in value.split(",")})
            elif isinstance(value, list):
                converted.append(set(value))
            else:
                converted.append(value)
        for key, value in kwargs.items():
            if isinstance(value, str):
                kwargs[key] = {piece.strip() for piece in value.split(",")}
        return func(*converted, **kwargs)

    return wrapper
|
||||
|
||||
|
||||
def read_parse_blankline_block(stream, parser):
    """Read one blank-line-delimited block from *stream* and render it
    through *parser* (markup stripped); return the raw (empty) result
    when there is nothing left to read."""
    raw = read_blankline_block(stream)
    if not raw:
        return raw
    return [parser.render(raw[0])]
|
||||
|
||||
|
||||
class MarkdownBlock:
    """Plain-text view of one markdown block, with tokenized accessors."""

    def __init__(self, content):
        self.content = content
        self.truncate_at = 16  # max characters shown by __str__

    def __repr__(self):
        return f"{self.__class__.__name__}(content={repr(str(self))})"

    def __str__(self):
        limit = self.truncate_at
        suffix = "..." if len(self.content) > limit else ""
        return f"{self.content[:limit]}{suffix}"

    @property
    def raw(self):
        """The full, untruncated block text."""
        return self.content

    @property
    def words(self):
        """Word tokens of the whole block."""
        return word_tokenize(self.content)

    @property
    def sents(self):
        """Sentences as lists of word tokens."""
        return [word_tokenize(sentence) for sentence in sent_tokenize(self.content)]

    @property
    def paras(self):
        """Paragraphs as lists of tokenized sentences."""
        return [
            [word_tokenize(sentence) for sentence in sent_tokenize(paragraph)]
            for paragraph in blankline_tokenize(self.content)
        ]
|
||||
|
||||
|
||||
class CodeBlock(MarkdownBlock):
    """A fenced or indented code block; tokenization is line-oriented."""

    def __init__(self, language, *args):
        # Info string of the fence (e.g. "python"); may be empty.
        self.language = language
        super().__init__(*args)

    @property
    def sents(self):
        """Each source line, word-tokenized, stands in for a sentence."""
        return [word_tokenize(text_line) for text_line in self.content.splitlines()]

    @property
    def lines(self):
        """The raw source lines of the block."""
        return self.content.splitlines()

    @property
    def paras(self):
        """Blank-line-separated chunks as lists of tokenized lines."""
        return [
            [word_tokenize(text_line) for text_line in chunk.splitlines()]
            for chunk in blankline_tokenize(self.content)
        ]
|
||||
|
||||
|
||||
class MarkdownSection(MarkdownBlock):
    """A heading together with the content that follows it."""

    def __init__(self, heading, level, *args):
        self.heading = heading  # heading text
        self.level = level  # heading depth (number of '#' characters)
        super().__init__(*args)
|
||||
|
||||
|
||||
# Lightweight records returned by the markdown-element accessors below.
Image = namedtuple("Image", "label, src, title")
Link = namedtuple("Link", "label, href, title")
List = namedtuple("List", "is_ordered, items")
|
||||
|
||||
|
||||
class MarkdownCorpusReader(PlaintextCorpusReader):
    """Plaintext corpus reader whose paragraph blocks are rendered from
    markdown, so markup is stripped before tokenization."""

    def __init__(self, *args, parser=None, **kwargs):
        from markdown_it import MarkdownIt
        from mdit_plain.renderer import RendererPlain
        from mdit_py_plugins.front_matter import front_matter_plugin

        self.parser = parser
        if self.parser is None:
            # Default: render CommonMark to plain text, with front-matter support.
            self.parser = MarkdownIt("commonmark", renderer_cls=RendererPlain)
            self.parser.use(front_matter_plugin)

        kwargs.setdefault(
            "para_block_reader", partial(read_parse_blankline_block, parser=self.parser)
        )
        super().__init__(*args, **kwargs)

    # This override takes care of removing markup.
    def _read_word_block(self, stream):
        tokens = []
        for rendered in self._para_block_reader(stream):
            tokens.extend(self._word_tokenizer.tokenize(rendered))
        return tokens
|
||||
|
||||
|
||||
class CategorizedMarkdownCorpusReader(CategorizedCorpusReader, MarkdownCorpusReader):
    """
    A reader for markdown corpora whose documents are divided into
    categories based on their file identifiers.

    Based on nltk.corpus.reader.plaintext.CategorizedPlaintextCorpusReader:
    https://www.nltk.org/_modules/nltk/corpus/reader/api.html#CategorizedCorpusReader
    """

    def __init__(self, *args, cat_field="tags", **kwargs):
        """
        Initialize the corpus reader. Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor. The remaining arguments
        are passed to the ``MarkdownCorpusReader`` constructor.

        :param cat_field: front-matter key read when building categories
            from document metadata.
        """
        cat_args = ["cat_pattern", "cat_map", "cat_file"]
        if not any(arg in kwargs for arg in cat_args):
            # Initialize with a blank map now,
            # and try to build categories from document metadata later.
            kwargs["cat_map"] = dict()
        CategorizedCorpusReader.__init__(self, kwargs)
        MarkdownCorpusReader.__init__(self, *args, **kwargs)

        # Map file IDs to categories if self._map exists but is still empty:
        if self._map is not None and not self._map:
            for file_id in self._fileids:
                metadata = self.metadata(file_id)
                if metadata:
                    self._map[file_id] = metadata[0].get(cat_field, [])

    ### Begin CategorizedCorpusReader Overrides
    @comma_separated_string_args
    def categories(self, fileids=None):
        return super().categories(fileids)

    @comma_separated_string_args
    def fileids(self, categories=None):
        if categories is None:
            return self._fileids
        return super().fileids(categories)

    ### End CategorizedCorpusReader Overrides

    ### Begin MarkdownCorpusReader Overrides
    @comma_separated_string_args
    def raw(self, fileids=None, categories=None):
        return super().raw(self._resolve(fileids, categories))

    @comma_separated_string_args
    def words(self, fileids=None, categories=None):
        return super().words(self._resolve(fileids, categories))

    @comma_separated_string_args
    def sents(self, fileids=None, categories=None):
        return super().sents(self._resolve(fileids, categories))

    @comma_separated_string_args
    def paras(self, fileids=None, categories=None):
        return super().paras(self._resolve(fileids, categories))

    ### End MarkdownCorpusReader Overrides

    def concatenated_view(self, reader, fileids, categories):
        """Concatenate one CorpusView per selected file, each driven by *reader*."""
        return concat(
            [
                self.CorpusView(path, reader, encoding=enc)
                for (path, enc) in self.abspaths(
                    self._resolve(fileids, categories), include_encoding=True
                )
            ]
        )

    def metadata_reader(self, stream):
        """Parse the YAML front-matter tokens of a document into dicts."""
        from yaml import safe_load

        return [
            safe_load(t.content)
            for t in self.parser.parse(stream.read())
            if t.type == "front_matter"
        ]

    @comma_separated_string_args
    def metadata(self, fileids=None, categories=None):
        return self.concatenated_view(self.metadata_reader, fileids, categories)

    def blockquote_reader(self, stream):
        """Collect top-level blockquotes as MarkdownBlock objects.

        Opening and closing tokens are paired positionally; the span
        between each pair is re-rendered to plain text.
        """
        tokens = self.parser.parse(stream.read())
        opening_tokens = filter(
            lambda t: t.level == 0 and t.type == "blockquote_open", tokens
        )
        closing_tokens = filter(
            lambda t: t.level == 0 and t.type == "blockquote_close", tokens
        )
        blockquotes = list()
        for o, c in zip(opening_tokens, closing_tokens):
            opening_index = tokens.index(o)
            closing_index = tokens.index(c, opening_index)
            blockquotes.append(tokens[opening_index : closing_index + 1])
        return [
            MarkdownBlock(
                self.parser.renderer.render(block, self.parser.options, env=None)
            )
            for block in blockquotes
        ]

    @comma_separated_string_args
    def blockquotes(self, fileids=None, categories=None):
        return self.concatenated_view(self.blockquote_reader, fileids, categories)

    def code_block_reader(self, stream):
        """Collect top-level fenced and indented code blocks."""
        return [
            CodeBlock(
                t.info,
                t.content,
            )
            for t in self.parser.parse(stream.read())
            if t.level == 0 and t.type in ("fence", "code_block")
        ]

    @comma_separated_string_args
    def code_blocks(self, fileids=None, categories=None):
        return self.concatenated_view(self.code_block_reader, fileids, categories)

    def image_reader(self, stream):
        """Collect images as (label, src, title) records."""
        return [
            Image(
                child_token.content,
                child_token.attrGet("src"),
                child_token.attrGet("title"),
            )
            for inline_token in filter(
                lambda t: t.type == "inline", self.parser.parse(stream.read())
            )
            for child_token in inline_token.children
            if child_token.type == "image"
        ]

    @comma_separated_string_args
    def images(self, fileids=None, categories=None):
        return self.concatenated_view(self.image_reader, fileids, categories)

    def link_reader(self, stream):
        """Collect links as (label, href, title) records.

        The label is taken from the token immediately after link_open.
        """
        return [
            Link(
                inline_token.children[i + 1].content,
                child_token.attrGet("href"),
                child_token.attrGet("title"),
            )
            for inline_token in filter(
                lambda t: t.type == "inline", self.parser.parse(stream.read())
            )
            for i, child_token in enumerate(inline_token.children)
            if child_token.type == "link_open"
        ]

    @comma_separated_string_args
    def links(self, fileids=None, categories=None):
        return self.concatenated_view(self.link_reader, fileids, categories)

    def list_reader(self, stream):
        """Collect top-level bullet/ordered lists as List records."""
        tokens = self.parser.parse(stream.read())
        opening_types = ("bullet_list_open", "ordered_list_open")
        opening_tokens = filter(
            lambda t: t.level == 0 and t.type in opening_types, tokens
        )
        closing_types = ("bullet_list_close", "ordered_list_close")
        closing_tokens = filter(
            lambda t: t.level == 0 and t.type in closing_types, tokens
        )
        list_blocks = list()
        for o, c in zip(opening_tokens, closing_tokens):
            opening_index = tokens.index(o)
            closing_index = tokens.index(c, opening_index)
            list_blocks.append(tokens[opening_index : closing_index + 1])
        return [
            List(
                tokens[0].type == "ordered_list_open",
                [t.content for t in tokens if t.content],
            )
            for tokens in list_blocks
        ]

    @comma_separated_string_args
    def lists(self, fileids=None, categories=None):
        return self.concatenated_view(self.list_reader, fileids, categories)

    def section_reader(self, stream):
        """Split a document at its top-level headings into MarkdownSections."""
        section_blocks, block = list(), list()
        for t in self.parser.parse(stream.read()):
            if t.level == 0 and t.type == "heading_open":
                if not block:
                    block.append(t)
                else:
                    # A new heading closes the previous section.
                    section_blocks.append(block)
                    block = [t]
            elif block:
                block.append(t)
        if block:
            section_blocks.append(block)
        return [
            MarkdownSection(
                block[1].content,
                block[0].markup.count("#"),
                self.parser.renderer.render(block, self.parser.options, env=None),
            )
            for block in section_blocks
        ]

    @comma_separated_string_args
    def sections(self, fileids=None, categories=None):
        return self.concatenated_view(self.section_reader, fileids, categories)
|
||||
398
backend/venv/Lib/site-packages/nltk/corpus/reader/mte.py
Normal file
398
backend/venv/Lib/site-packages/nltk/corpus/reader/mte.py
Normal file
@@ -0,0 +1,398 @@
|
||||
"""
|
||||
A reader for corpora whose documents are in MTE format.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
from functools import reduce
|
||||
|
||||
from nltk.corpus.reader import TaggedCorpusReader, concat
|
||||
from nltk.corpus.reader.xmldocs import XMLCorpusView
|
||||
|
||||
|
||||
def xpath(root, path, ns):
    """Namespace-aware shorthand for ``root.findall(path, ns)``."""
    return root.findall(path, ns)
|
||||
|
||||
|
||||
class MTECorpusView(XMLCorpusView):
    """
    Class for lazy viewing the MTE Corpus.

    Identical to ``XMLCorpusView`` except that ``None`` entries produced
    by the element handler are dropped from each block.
    """

    def __init__(self, fileid, tagspec, elt_handler=None):
        XMLCorpusView.__init__(self, fileid, tagspec, elt_handler)

    def read_block(self, stream, tagspec=None, elt_handler=None):
        entries = XMLCorpusView.read_block(self, stream, tagspec, elt_handler)
        return [entry for entry in entries if entry is not None]
|
||||
|
||||
|
||||
class MTEFileReader:
    """
    Class for loading the content of the multext-east corpus. It
    parses the xml files and does some tag-filtering depending on the
    given method parameters.
    """

    # NOTE(review): XML namespace URIs are opaque identifiers; if the corpus
    # documents declare http:// URIs, these https:// strings will not match —
    # verify against the actual corpus files.
    ns = {
        "tei": "https://www.tei-c.org/ns/1.0",
        "xml": "https://www.w3.org/XML/1998/namespace",
    }
    tag_ns = "{https://www.tei-c.org/ns/1.0}"
    xml_ns = "{https://www.w3.org/XML/1998/namespace}"
    # Element paths for words/characters, sentences, and paragraphs.
    word_path = "TEI/text/body/div/div/p/s/(w|c)"
    sent_path = "TEI/text/body/div/div/p/s"
    para_path = "TEI/text/body/div/div/p"

    def __init__(self, file_path):
        self.__file_path = file_path

    @classmethod
    def _word_elt(cls, elt, context):
        # A word/character element's text is the token itself.
        return elt.text

    @classmethod
    def _sent_elt(cls, elt, context):
        return [cls._word_elt(w, None) for w in xpath(elt, "*", cls.ns)]

    @classmethod
    def _para_elt(cls, elt, context):
        return [cls._sent_elt(s, None) for s in xpath(elt, "*", cls.ns)]

    @classmethod
    def _tagged_word_elt(cls, elt, context):
        """Return (word, tag) for one element, or None when the element's
        "ana" tag does not match the active filter (``cls.__tags``)."""
        if "ana" not in elt.attrib:
            return (elt.text, "")

        # No filter: return the MSD tag as-is, or converted to universal.
        if cls.__tags == "" and cls.__tagset == "msd":
            return (elt.text, elt.attrib["ana"])
        elif cls.__tags == "" and cls.__tagset == "universal":
            return (elt.text, MTETagConverter.msd_to_universal(elt.attrib["ana"]))
        else:
            # '-' in the filter acts as a single-character wildcard.
            tags = re.compile("^" + re.sub("-", ".", cls.__tags) + ".*$")
            if tags.match(elt.attrib["ana"]):
                if cls.__tagset == "msd":
                    return (elt.text, elt.attrib["ana"])
                else:
                    return (
                        elt.text,
                        MTETagConverter.msd_to_universal(elt.attrib["ana"]),
                    )
            else:
                return None

    @classmethod
    def _tagged_sent_elt(cls, elt, context):
        # Drop words filtered out (None) by _tagged_word_elt.
        return list(
            filter(
                lambda x: x is not None,
                [cls._tagged_word_elt(w, None) for w in xpath(elt, "*", cls.ns)],
            )
        )

    @classmethod
    def _tagged_para_elt(cls, elt, context):
        return list(
            filter(
                lambda x: x is not None,
                [cls._tagged_sent_elt(s, None) for s in xpath(elt, "*", cls.ns)],
            )
        )

    @classmethod
    def _lemma_word_elt(cls, elt, context):
        if "lemma" not in elt.attrib:
            return (elt.text, "")
        else:
            return (elt.text, elt.attrib["lemma"])

    @classmethod
    def _lemma_sent_elt(cls, elt, context):
        return [cls._lemma_word_elt(w, None) for w in xpath(elt, "*", cls.ns)]

    @classmethod
    def _lemma_para_elt(cls, elt, context):
        return [cls._lemma_sent_elt(s, None) for s in xpath(elt, "*", cls.ns)]

    def words(self):
        """Lazy view over all word/character tokens."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.word_path, MTEFileReader._word_elt
        )

    def sents(self):
        """Lazy view over sentences (lists of tokens)."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.sent_path, MTEFileReader._sent_elt
        )

    def paras(self):
        """Lazy view over paragraphs (lists of sentences)."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.para_path, MTEFileReader._para_elt
        )

    def lemma_words(self):
        """Lazy view over (word, lemma) pairs."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.word_path, MTEFileReader._lemma_word_elt
        )

    def tagged_words(self, tagset, tags):
        # The filter is stored in class attributes shared by all readers, so
        # concurrent readers with different tagset/tags would interfere.
        MTEFileReader.__tagset = tagset
        MTEFileReader.__tags = tags
        return MTECorpusView(
            self.__file_path, MTEFileReader.word_path, MTEFileReader._tagged_word_elt
        )

    def lemma_sents(self):
        """Lazy view over sentences of (word, lemma) pairs."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.sent_path, MTEFileReader._lemma_sent_elt
        )

    def tagged_sents(self, tagset, tags):
        # See tagged_words: filter state is class-level.
        MTEFileReader.__tagset = tagset
        MTEFileReader.__tags = tags
        return MTECorpusView(
            self.__file_path, MTEFileReader.sent_path, MTEFileReader._tagged_sent_elt
        )

    def lemma_paras(self):
        """Lazy view over paragraphs of lemma sentences."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.para_path, MTEFileReader._lemma_para_elt
        )

    def tagged_paras(self, tagset, tags):
        # See tagged_words: filter state is class-level.
        MTEFileReader.__tagset = tagset
        MTEFileReader.__tags = tags
        return MTECorpusView(
            self.__file_path, MTEFileReader.para_path, MTEFileReader._tagged_para_elt
        )
|
||||
|
||||
|
||||
class MTETagConverter:
    """
    Class for converting msd tags to universal tags, more conversion
    options are currently not implemented.
    """

    # First-letter MSD category -> universal POS tag.
    mapping_msd_universal = {
        "A": "ADJ",
        "S": "ADP",
        "R": "ADV",
        "C": "CONJ",
        "D": "DET",
        "N": "NOUN",
        "M": "NUM",
        "Q": "PRT",
        "P": "PRON",
        "V": "VERB",
        ".": ".",
        "-": "X",
    }

    @staticmethod
    def msd_to_universal(tag):
        """
        Convert an MSD annotation to the universal tagset as described in
        Chapter 5 of the NLTK book.

        The category letter is the first character of *tag* (the second
        when the tag is prefixed with '#'); unknown categories map to X.
        """
        pos_letter = tag[1] if tag[0] == "#" else tag[0]

        if pos_letter not in MTETagConverter.mapping_msd_universal:
            pos_letter = "-"

        return MTETagConverter.mapping_msd_universal[pos_letter]
|
||||
|
||||
|
||||
class MTECorpusReader(TaggedCorpusReader):
|
||||
"""
|
||||
Reader for corpora following the TEI-p5 xml scheme, such as MULTEXT-East.
|
||||
MULTEXT-East contains part-of-speech-tagged words with a quite precise tagging
|
||||
scheme. These tags can be converted to the Universal tagset
|
||||
"""
|
||||
|
||||
def __init__(self, root=None, fileids=None, encoding="utf8"):
|
||||
"""
|
||||
Construct a new MTECorpusreader for a set of documents
|
||||
located at the given root directory. Example usage:
|
||||
|
||||
>>> root = '/...path to corpus.../'
|
||||
>>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP
|
||||
|
||||
:param root: The root directory for this corpus. (default points to location in multext config file)
|
||||
:param fileids: A list or regexp specifying the fileids in this corpus. (default is oana-en.xml)
|
||||
:param encoding: The encoding of the given files (default is utf8)
|
||||
"""
|
||||
TaggedCorpusReader.__init__(self, root, fileids, encoding)
|
||||
self._readme = "00README.txt"
|
||||
|
||||
def __fileids(self, fileids):
|
||||
if fileids is None:
|
||||
fileids = self._fileids
|
||||
elif isinstance(fileids, str):
|
||||
fileids = [fileids]
|
||||
# filter wrong userinput
|
||||
fileids = filter(lambda x: x in self._fileids, fileids)
|
||||
# filter multext-east sourcefiles that are not compatible to the teip5 specification
|
||||
fileids = filter(lambda x: x not in ["oana-bg.xml", "oana-mk.xml"], fileids)
|
||||
if not fileids:
|
||||
print("No valid multext-east file specified")
|
||||
return fileids
|
||||
|
||||
def words(self, fileids=None):
|
||||
"""
|
||||
:param fileids: A list specifying the fileids that should be used.
|
||||
:return: the given file(s) as a list of words and punctuation symbols.
|
||||
:rtype: list(str)
|
||||
"""
|
||||
return concat(
|
||||
[
|
||||
MTEFileReader(os.path.join(self._root, f)).words()
|
||||
for f in self.__fileids(fileids)
|
||||
]
|
||||
)
|
||||
|
||||
def sents(self, fileids=None):
|
||||
"""
|
||||
:param fileids: A list specifying the fileids that should be used.
|
||||
:return: the given file(s) as a list of sentences or utterances,
|
||||
each encoded as a list of word strings
|
||||
:rtype: list(list(str))
|
||||
"""
|
||||
return concat(
|
||||
[
|
||||
MTEFileReader(os.path.join(self._root, f)).sents()
|
||||
for f in self.__fileids(fileids)
|
||||
]
|
||||
)
|
||||
|
||||
def paras(self, fileids=None):
    """
    Return the given file(s) as a list of paragraphs, each encoded as
    a list of sentences, which are in turn encoded as lists of word
    strings.

    :param fileids: A list specifying the fileids that should be used.
    :rtype: list(list(list(str)))
    """
    views = []
    for fileid in self.__fileids(fileids):
        reader = MTEFileReader(os.path.join(self._root, fileid))
        views.append(reader.paras())
    return concat(views)
|
||||
|
||||
def lemma_words(self, fileids=None):
    """
    Return the given file(s) as a list of words, the corresponding
    lemmas and punctuation symbols, encoded as (word, lemma) tuples.

    :param fileids: A list specifying the fileids that should be used.
    :rtype: list(tuple(str, str))
    """
    views = []
    for fileid in self.__fileids(fileids):
        reader = MTEFileReader(os.path.join(self._root, fileid))
        views.append(reader.lemma_words())
    return concat(views)
|
||||
|
||||
def tagged_words(self, fileids=None, tagset="msd", tags=""):
    """
    Return the given file(s) as a list of tagged words and punctuation
    symbols, encoded as (word, tag) tuples.

    :param fileids: A list specifying the fileids that should be used.
    :param tagset: Either "universal" or "msd" (the default).
    :param tags: An MSD tag used to keep only tokens whose tag is at
        least as precise as the given one.
    :rtype: list(tuple(str, str))
    """
    # Guard clause replaces the original if/else: unknown tagsets only
    # print a message (and implicitly return None).
    if tagset not in ("universal", "msd"):
        print("Unknown tagset specified.")
        return None
    views = []
    for fileid in self.__fileids(fileids):
        reader = MTEFileReader(os.path.join(self._root, fileid))
        views.append(reader.tagged_words(tagset, tags))
    return concat(views)
|
||||
|
||||
def lemma_sents(self, fileids=None):
    """
    Return the given file(s) as a list of sentences or utterances,
    each encoded as a list of (word, lemma) tuples.

    :param fileids: A list specifying the fileids that should be used.
    :rtype: list(list(tuple(str, str)))
    """
    views = []
    for fileid in self.__fileids(fileids):
        reader = MTEFileReader(os.path.join(self._root, fileid))
        views.append(reader.lemma_sents())
    return concat(views)
|
||||
|
||||
def tagged_sents(self, fileids=None, tagset="msd", tags=""):
    """
    Return the given file(s) as a list of sentences or utterances,
    each encoded as a list of (word, tag) tuples.

    :param fileids: A list specifying the fileids that should be used.
    :param tagset: Either "universal" or "msd" (the default).
    :param tags: An MSD tag used to keep only tokens whose tag is at
        least as precise as the given one.
    :rtype: list(list(tuple(str, str)))
    """
    if tagset not in ("universal", "msd"):
        print("Unknown tagset specified.")
        return None
    views = []
    for fileid in self.__fileids(fileids):
        reader = MTEFileReader(os.path.join(self._root, fileid))
        views.append(reader.tagged_sents(tagset, tags))
    return concat(views)
|
||||
|
||||
def lemma_paras(self, fileids=None):
    """
    Return the given file(s) as a list of paragraphs, each encoded as
    a list of sentences, which are in turn encoded as lists of
    (word, lemma) tuples.

    :param fileids: A list specifying the fileids that should be used.
    :rtype: list(list(list(tuple(str, str))))
    """
    views = []
    for fileid in self.__fileids(fileids):
        reader = MTEFileReader(os.path.join(self._root, fileid))
        views.append(reader.lemma_paras())
    return concat(views)
|
||||
|
||||
def tagged_paras(self, fileids=None, tagset="msd", tags=""):
    """
    Return the given file(s) as a list of paragraphs, each encoded as
    a list of sentences, which are in turn encoded as lists of
    (word, tag) tuples.

    :param fileids: A list specifying the fileids that should be used.
    :param tagset: Either "universal" or "msd" (the default).
    :param tags: An MSD tag used to keep only tokens whose tag is at
        least as precise as the given one.
    :rtype: list(list(list(tuple(str, str))))
    """
    if tagset not in ("universal", "msd"):
        print("Unknown tagset specified.")
        return None
    views = []
    for fileid in self.__fileids(fileids):
        reader = MTEFileReader(os.path.join(self._root, fileid))
        views.append(reader.tagged_paras(tagset, tags))
    return concat(views)
|
||||
486
backend/venv/Lib/site-packages/nltk/corpus/reader/nkjp.py
Normal file
486
backend/venv/Lib/site-packages/nltk/corpus/reader/nkjp.py
Normal file
@@ -0,0 +1,486 @@
|
||||
# Natural Language Toolkit: NKJP Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Gabriela Kaczka
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import functools
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
|
||||
from nltk.corpus.reader.util import concat
|
||||
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
|
||||
|
||||
|
||||
def _parse_args(fun):
    """
    Decorator for NKJPCorpusReader query methods: when ``fileids`` is
    not supplied (or empty), substitute the reader's full document
    path list (``self._paths``) before delegating to ``fun``.
    """

    @functools.wraps(fun)
    def wrapper(self, fileids=None, **kwargs):
        # ``fileids or self._paths`` matches the original falsy check.
        return fun(self, fileids or self._paths, **kwargs)

    return wrapper
|
||||
|
||||
|
||||
class NKJPCorpusReader(XMLCorpusReader):
    # Query modes accepted by ``_view`` via its ``mode`` keyword.
    WORDS_MODE = 0
    SENTS_MODE = 1
    HEADER_MODE = 2
    RAW_MODE = 3

    def __init__(self, root, fileids=".*"):
        """
        Corpus reader designed to work with National Corpus of Polish.
        See http://nkjp.pl/ for more details about NKJP.
        use example:
        import nltk
        import nkjp
        from nkjp import NKJPCorpusReader
        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='') # obtain the whole corpus
        x.header()
        x.raw()
        x.words()
        x.tagged_words(tags=['subst', 'comp']) #Link to find more tags: nkjp.pl/poliqarp/help/ense2.html
        x.sents()
        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='Wilk*') # obtain particular file(s)
        x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'])
        x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp'])
        """
        # The corpus is keyed on each document's header.xml file.  A
        # string ``fileids`` is treated as a directory-name regexp; a
        # list is taken as explicit document directories.
        if isinstance(fileids, str):
            XMLCorpusReader.__init__(self, root, fileids + ".*/header.xml")
        else:
            XMLCorpusReader.__init__(
                self, root, [fileid + "/header.xml" for fileid in fileids]
            )
        self._paths = self.get_paths()

    def get_paths(self):
        """Return the absolute directory path of every corpus document
        (the ``header.xml`` suffix stripped from each fileid)."""
        return [
            os.path.join(str(self._root), f.split("header.xml")[0])
            for f in self._fileids
        ]

    def fileids(self):
        """
        Returns a list of file identifiers for the fileids that make up
        this corpus.
        """
        return [f.split("header.xml")[0] for f in self._fileids]

    def _view(self, filename, tags=None, **kwargs):
        """
        Returns a view specialised for use with particular corpus file.

        :param filename: document directory the view should read from.
        :param tags: optional tag filter, forwarded to the view.
        :raises NameError: if ``mode`` is not one of the *_MODE constants.
        """
        mode = kwargs.pop("mode", NKJPCorpusReader.WORDS_MODE)
        if mode is NKJPCorpusReader.WORDS_MODE:
            return NKJPCorpus_Morph_View(filename, tags=tags)
        elif mode is NKJPCorpusReader.SENTS_MODE:
            return NKJPCorpus_Segmentation_View(filename, tags=tags)
        elif mode is NKJPCorpusReader.HEADER_MODE:
            return NKJPCorpus_Header_View(filename, tags=tags)
        elif mode is NKJPCorpusReader.RAW_MODE:
            return NKJPCorpus_Text_View(
                filename, tags=tags, mode=NKJPCorpus_Text_View.RAW_MODE
            )
        else:
            raise NameError("No such mode!")

    def add_root(self, fileid):
        """
        Add root if necessary to specified fileid.
        """
        if self.root in fileid:
            return fileid
        return self.root + fileid

    @_parse_args
    def header(self, fileids=None, **kwargs):
        """
        Returns header(s) of specified fileids.
        """
        return concat(
            [
                self._view(
                    self.add_root(fileid), mode=NKJPCorpusReader.HEADER_MODE, **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )

    @_parse_args
    def sents(self, fileids=None, **kwargs):
        """
        Returns sentences in specified fileids.
        """
        return concat(
            [
                self._view(
                    self.add_root(fileid), mode=NKJPCorpusReader.SENTS_MODE, **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )

    @_parse_args
    def words(self, fileids=None, **kwargs):
        """
        Returns words in specified fileids.
        """
        return concat(
            [
                self._view(
                    self.add_root(fileid), mode=NKJPCorpusReader.WORDS_MODE, **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )

    @_parse_args
    def tagged_words(self, fileids=None, **kwargs):
        """
        Call with specified tags as a list, e.g. tags=['subst', 'comp'].
        Returns tagged words in specified fileids.
        """
        tags = kwargs.pop("tags", [])
        return concat(
            [
                self._view(
                    self.add_root(fileid),
                    mode=NKJPCorpusReader.WORDS_MODE,
                    tags=tags,
                    **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )

    @_parse_args
    def raw(self, fileids=None, **kwargs):
        """
        Returns the raw text of specified fileids.
        """
        return concat(
            [
                self._view(
                    self.add_root(fileid), mode=NKJPCorpusReader.RAW_MODE, **kwargs
                ).handle_query()
                for fileid in fileids
            ]
        )
|
||||
|
||||
|
||||
class NKJPCorpus_Header_View(XMLCorpusView):
    def __init__(self, filename, **kwargs):
        """
        HEADER_MODE
        A stream backed corpus view specialized for use with
        header.xml files in NKJP corpus.
        """
        # Match the sourceDesc element, which carries the bibliographic data.
        self.tagspec = ".*/sourceDesc$"
        XMLCorpusView.__init__(self, filename + "header.xml", self.tagspec)

    def handle_query(self):
        """Read every matching block of the header file and return the
        accumulated list of bibliographic dicts (see ``handle_elt``)."""
        self._open()
        header = []
        while True:
            segm = XMLCorpusView.read_block(self, self._stream)
            if len(segm) == 0:
                break
            header.extend(segm)
        self.close()
        return header

    def handle_elt(self, elt, context):
        # Each bibliographic field defaults to an empty list when absent,
        # otherwise becomes a newline-joined string of the element texts.
        # (The generator variables deliberately shadow the accumulators.)
        titles = elt.findall("bibl/title")
        title = []
        if titles:
            title = "\n".join(title.text.strip() for title in titles)

        authors = elt.findall("bibl/author")
        author = []
        if authors:
            author = "\n".join(author.text.strip() for author in authors)

        dates = elt.findall("bibl/date")
        date = []
        if dates:
            date = "\n".join(date.text.strip() for date in dates)

        publishers = elt.findall("bibl/publisher")
        publisher = []
        if publishers:
            publisher = "\n".join(publisher.text.strip() for publisher in publishers)

        idnos = elt.findall("bibl/idno")
        idno = []
        if idnos:
            idno = "\n".join(idno.text.strip() for idno in idnos)

        notes = elt.findall("bibl/note")
        note = []
        if notes:
            note = "\n".join(note.text.strip() for note in notes)

        return {
            "title": title,
            "author": author,
            "date": date,
            "publisher": publisher,
            "idno": idno,
            "note": note,
        }
|
||||
|
||||
|
||||
class XML_Tool:
    """
    Helper class creating xml file to one without references to nkjp: namespace.
    That's needed because the XMLCorpusView assumes that one can find short substrings
    of XML that are valid XML, which is not true if a namespace is declared at top level
    """

    # Literal markup fragments removed from the source files; the nkjp:
    # attribute references are handled separately by a regex.
    _LITERAL_TAGS = ("<nkjp:paren>", "</nkjp:paren>", "<choice>", "</choice>")

    def __init__(self, root, filename):
        self.read_file = os.path.join(root, filename)
        # Open the scratch file in *text* mode: build_preprocessed_file()
        # writes str fragments, which the original default binary-mode
        # handle would reject with a TypeError.
        self.write_file = tempfile.NamedTemporaryFile("w", delete=False)

    def build_preprocessed_file(self):
        """
        Copy ``self.read_file`` into the temporary file with all nkjp:
        namespace markup stripped, and return the temporary file's name.

        On any failure the temporary file is removed before re-raising.
        """
        try:
            # Context managers guarantee both handles are closed even if
            # a read/write fails part-way through.
            with open(self.read_file) as fr, self.write_file as fw:
                for line in fr:
                    # drop nkjp:... attribute references (in all files)
                    line = " ".join(re.split(r"nkjp:[^ ]* ", line))
                    # drop wrapper tags (in ann_segmentation.xml)
                    for tag in self._LITERAL_TAGS:
                        line = " ".join(line.split(tag))
                    fw.write(line)
            return self.write_file.name
        except Exception as e:
            self.remove_preprocessed_file()
            # NOTE(review): re-raising a bare Exception loses the original
            # type; kept for compatibility with existing callers.
            raise Exception from e

    def remove_preprocessed_file(self):
        """Delete the temporary preprocessed file from disk."""
        os.remove(self.write_file.name)
|
||||
|
||||
|
||||
class NKJPCorpus_Segmentation_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    ann_segmentation.xml files in NKJP corpus.
    """

    def __init__(self, filename, **kwargs):
        self.tagspec = ".*p/.*s"
        # intersperse NKJPCorpus_Text_View: running its query fills
        # text_view.segm_dict, which get_sentences() slices below.
        self.text_view = NKJPCorpus_Text_View(
            filename, mode=NKJPCorpus_Text_View.SENTS_MODE
        )
        self.text_view.handle_query()
        # xml preprocessing
        self.xml_tool = XML_Tool(filename, "ann_segmentation.xml")
        # base class init
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )

    def get_segm_id(self, example_word):
        # corresp values look like "...(<segm_id>,<beg>,<len>)"; take the id
        # between the "(" and the first ",".
        return example_word.split("(")[1].split(",")[0]

    def get_sent_beg(self, beg_word):
        # returns index of beginning letter in sentence
        return int(beg_word.split(",")[1])

    def get_sent_end(self, end_word):
        # returns index of end letter in sentence (begin + length)
        splitted = end_word.split(")")[0].split(",")
        return int(splitted[1]) + int(splitted[2])

    def get_sentences(self, sent_segm):
        # returns one sentence: slice the referenced text segment from the
        # first word's begin offset to the last word's end offset.
        id = self.get_segm_id(sent_segm[0])
        segm = self.text_view.segm_dict[id]  # text segment
        beg = self.get_sent_beg(sent_segm[0])
        end = self.get_sent_end(sent_segm[len(sent_segm) - 1])
        return segm[beg:end]

    def remove_choice(self, segm):
        ret = []
        prev_txt_end = -1
        prev_txt_nr = -1
        for word in segm:
            txt_nr = self.get_segm_id(word)
            # get increasing sequence of ids: in case of choice get first possibility
            if self.get_sent_beg(word) > prev_txt_end - 1 or prev_txt_nr != txt_nr:
                ret.append(word)
                prev_txt_end = self.get_sent_end(word)
                prev_txt_nr = txt_nr

        return ret

    def handle_query(self):
        """Return every sentence of the document as a text slice; the
        preprocessed temp file is removed whether or not parsing succeeds."""
        try:
            self._open()
            sentences = []
            while True:
                sent_segm = XMLCorpusView.read_block(self, self._stream)
                if len(sent_segm) == 0:
                    break
                for segm in sent_segm:
                    segm = self.remove_choice(segm)
                    sentences.append(self.get_sentences(segm))
            self.close()
            self.xml_tool.remove_preprocessed_file()
            return sentences
        except Exception as e:
            self.xml_tool.remove_preprocessed_file()
            raise Exception from e

    def handle_elt(self, elt, context):
        # One <s> element -> the list of its segments' "corresp" pointers.
        ret = []
        for seg in elt:
            ret.append(seg.get("corresp"))
        return ret
|
||||
|
||||
|
||||
class NKJPCorpus_Text_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    text.xml files in NKJP corpus.
    """

    # Output modes: SENTS_MODE additionally records each <ab> segment's
    # text in ``segm_dict`` (consumed by NKJPCorpus_Segmentation_View).
    SENTS_MODE = 0
    RAW_MODE = 1

    def __init__(self, filename, **kwargs):
        self.mode = kwargs.pop("mode", 0)
        self.tagspec = ".*/div/ab"
        self.segm_dict = dict()
        # xml preprocessing
        self.xml_tool = XML_Tool(filename, "text.xml")
        # base class init
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )

    def handle_query(self):
        """Read the whole document; the preprocessed temp file is removed
        whether or not parsing succeeds."""
        try:
            self._open()
            x = self.read_block(self._stream)
            self.close()
            self.xml_tool.remove_preprocessed_file()
            return x
        except Exception as e:
            self.xml_tool.remove_preprocessed_file()
            raise Exception from e

    def read_block(self, stream, tagspec=None, elt_handler=None):
        """
        Returns text as a list of sentences.
        """
        txt = []
        while True:
            segm = XMLCorpusView.read_block(self, stream)
            if len(segm) == 0:
                break
            for part in segm:
                txt.append(part)

        # Single-element list: all collected parts joined by spaces.
        return [" ".join([segm for segm in txt])]

    def get_segm_id(self, elt):
        # The id attribute may be namespace-qualified; match by suffix.
        for attr in elt.attrib:
            if attr.endswith("id"):
                return elt.get(attr)

    def handle_elt(self, elt, context):
        # fill dictionary to use later in sents mode
        if self.mode is NKJPCorpus_Text_View.SENTS_MODE:
            self.segm_dict[self.get_segm_id(elt)] = elt.text
        return elt.text
|
||||
|
||||
|
||||
class NKJPCorpus_Morph_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    ann_morphosyntax.xml files in NKJP corpus.
    """

    def __init__(self, filename, **kwargs):
        # Optional list of part-of-speech tags to keep; None keeps all words.
        self.tags = kwargs.pop("tags", None)
        self.tagspec = ".*/seg/fs"
        self.xml_tool = XML_Tool(filename, "ann_morphosyntax.xml")
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )

    def handle_query(self):
        """Collect every non-None word returned by ``handle_elt``; the
        preprocessed temp file is removed whether or not parsing succeeds."""
        try:
            self._open()
            words = []
            while True:
                segm = XMLCorpusView.read_block(self, self._stream)
                if len(segm) == 0:
                    break
                for part in segm:
                    if part is not None:
                        words.append(part)
            self.close()
            self.xml_tool.remove_preprocessed_file()
            return words
        except Exception as e:
            self.xml_tool.remove_preprocessed_file()
            raise Exception from e

    def handle_elt(self, elt, context):
        # Extract one <seg>'s orthographic form and decide whether to keep
        # it, based on the requested tag filter and the "interp" ctag
        # (punctuation) check.
        word = ""
        flag = False           # becomes True once a requested ctag value is seen
        is_not_interp = True   # becomes False when the ctag value is "interp"
        # if tags not specified, then always return word
        if self.tags is None:
            flag = True

        for child in elt:
            # get word
            if "name" in child.keys() and child.attrib["name"] == "orth":
                for symbol in child:
                    if symbol.tag == "string":
                        word = symbol.text
            elif "name" in child.keys() and child.attrib["name"] == "interps":
                for symbol in child:
                    if "type" in symbol.keys() and symbol.attrib["type"] == "lex":
                        for symbol2 in symbol:
                            if (
                                "name" in symbol2.keys()
                                and symbol2.attrib["name"] == "ctag"
                            ):
                                for symbol3 in symbol2:
                                    if (
                                        "value" in symbol3.keys()
                                        and self.tags is not None
                                        and symbol3.attrib["value"] in self.tags
                                    ):
                                        flag = True
                                    elif (
                                        "value" in symbol3.keys()
                                        and symbol3.attrib["value"] == "interp"
                                    ):
                                        is_not_interp = False
        if flag and is_not_interp:
            return word
|
||||
465
backend/venv/Lib/site-packages/nltk/corpus/reader/nombank.py
Normal file
465
backend/venv/Lib/site-packages/nltk/corpus/reader/nombank.py
Normal file
@@ -0,0 +1,465 @@
|
||||
# Natural Language Toolkit: NomBank Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Authors: Paul Bedaride <paul.bedaride@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from functools import total_ordering
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.internals import raise_unorderable_types
|
||||
from nltk.tree import Tree
|
||||
|
||||
|
||||
class NombankCorpusReader(CorpusReader):
    """
    Corpus reader for the nombank corpus, which augments the Penn
    Treebank with information about the predicate argument structure
    of every noun instance.  The corpus consists of two parts: the
    predicate-argument annotations themselves, and a set of "frameset
    files" which define the argument labels used by the annotations,
    on a per-noun basis.  Each "frameset file" contains one or more
    predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
    divided into coarse-grained word senses called "rolesets".  For
    each "roleset", the frameset file provides descriptions of the
    argument roles, along with examples.
    """

    def __init__(
        self,
        root,
        nomfile,
        framefiles="",
        nounsfile=None,
        parse_fileid_xform=None,
        parse_corpus=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param nomfile: The name of the file containing the predicate-
            argument annotations (relative to ``root``).
        :param framefiles: A list or regexp specifying the frameset
            fileids for this corpus.
        :param nounsfile: The name of the file listing all noun lemmas
            (relative to ``root``).
        :param parse_fileid_xform: A transform that should be applied
            to the fileids in this corpus.  This should be a function
            of one argument (a fileid) that returns a string (the new
            fileid).
        :param parse_corpus: The corpus containing the parse trees
            corresponding to this corpus.  These parse trees are
            necessary to resolve the tree pointers used by nombank.
        """
        # If framefiles is specified as a regexp, expand it.
        # (The previous version expanded into self._fileids, then clobbered
        # that with list(framefiles) -- splitting the regexp string into
        # single characters -- and handed the unexpanded regexp on to
        # CorpusReader.  Expand ``framefiles`` itself instead.)
        if isinstance(framefiles, str):
            framefiles = find_corpus_fileids(root, framefiles)
        framefiles = list(framefiles)
        # Initialize the corpus reader.
        CorpusReader.__init__(self, root, framefiles, encoding)

        # Record our nom file & nouns file.
        self._nomfile = nomfile
        self._nounsfile = nounsfile
        self._parse_fileid_xform = parse_fileid_xform
        self._parse_corpus = parse_corpus

    def instances(self, baseform=None):
        """
        :return: a corpus view that acts as a list of
            ``NombankInstance`` objects, one for each noun in the corpus.
        """
        kwargs = {}
        if baseform is not None:
            kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
        return StreamBackedCorpusView(
            self.abspath(self._nomfile),
            lambda stream: self._read_instance_block(stream, **kwargs),
            encoding=self.encoding(self._nomfile),
        )

    def lines(self):
        """
        :return: a corpus view that acts as a list of strings, one for
            each line in the predicate-argument annotation file.
        """
        return StreamBackedCorpusView(
            self.abspath(self._nomfile),
            read_line_block,
            encoding=self.encoding(self._nomfile),
        )

    def roleset(self, roleset_id):
        """
        :return: the xml description for the given roleset.
        :raises ValueError: if the frameset file or roleset is missing.
        """
        baseform = roleset_id.split(".")[0]
        baseform = baseform.replace("perc-sign", "%")
        baseform = baseform.replace("oneslashonezero", "1/10").replace(
            "1/10", "1-slash-10"
        )
        framefile = "frames/%s.xml" % baseform
        if framefile not in self.fileids():
            raise ValueError("Frameset file for %s not found" % roleset_id)

        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        with self.abspath(framefile).open() as fp:
            etree = ElementTree.parse(fp).getroot()
        for roleset in etree.findall("predicate/roleset"):
            if roleset.attrib["id"] == roleset_id:
                return roleset
        raise ValueError(f"Roleset {roleset_id} not found in {framefile}")

    def rolesets(self, baseform=None):
        """
        :return: list of xml descriptions for rolesets.
        :raises ValueError: if ``baseform`` has no frameset file.
        """
        if baseform is not None:
            framefile = "frames/%s.xml" % baseform
            if framefile not in self.fileids():
                raise ValueError("Frameset file for %s not found" % baseform)
            framefiles = [framefile]
        else:
            framefiles = self.fileids()

        rsets = []
        for framefile in framefiles:
            # n.b.: The encoding for XML fileids is specified by the file
            # itself; so we ignore self._encoding here.
            with self.abspath(framefile).open() as fp:
                etree = ElementTree.parse(fp).getroot()
            rsets.append(etree.findall("predicate/roleset"))
        return LazyConcatenation(rsets)

    def nouns(self):
        """
        :return: a corpus view that acts as a list of all noun lemmas
            in this corpus (from the nombank.1.0.words file).
        """
        return StreamBackedCorpusView(
            self.abspath(self._nounsfile),
            read_line_block,
            encoding=self.encoding(self._nounsfile),
        )

    def _read_instance_block(self, stream, instance_filter=lambda inst: True):
        """Read up to 100 annotation lines from ``stream`` and return the
        parsed instances that pass ``instance_filter``."""
        block = []

        # Read 100 at a time.
        for _ in range(100):
            line = stream.readline().strip()
            if line:
                inst = NombankInstance.parse(
                    line, self._parse_fileid_xform, self._parse_corpus
                )
                if instance_filter(inst):
                    block.append(inst)

        return block
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Nombank Instance & related datatypes
|
||||
######################################################################
|
||||
|
||||
|
||||
class NombankInstance:
    """
    A single predicate-argument annotation from the nombank corpus: one
    noun predicate in one treebank sentence, together with the locations
    and labels of its arguments.
    """

    def __init__(
        self,
        fileid,
        sentnum,
        wordnum,
        baseform,
        sensenumber,
        predicate,
        predid,
        arguments,
        parse_corpus=None,
    ):
        self.fileid = fileid
        """The name of the file containing the parse tree for this
        instance's sentence."""

        self.sentnum = sentnum
        """The sentence number of this sentence within ``fileid``.
        Indexing starts from zero."""

        self.wordnum = wordnum
        """The word number of this instance's predicate within its
        containing sentence.  Word numbers are indexed starting from
        zero, and include traces and other empty parse elements."""

        self.baseform = baseform
        """The baseform of the predicate."""

        self.sensenumber = sensenumber
        """The sense number of the predicate."""

        self.predicate = predicate
        """A ``NombankTreePointer`` indicating the position of this
        instance's predicate within its containing sentence."""

        self.predid = predid
        """Identifier of the predicate."""

        self.arguments = tuple(arguments)
        """A list of tuples (argloc, argid), specifying the location
        and identifier for each of the predicate's argument in the
        containing sentence.  Argument identifiers are strings such as
        ``'ARG0'`` or ``'ARGM-TMP'``.  This list does *not* contain
        the predicate."""

        self.parse_corpus = parse_corpus
        """A corpus reader for the parse trees corresponding to the
        instances in this nombank corpus."""

    @property
    def roleset(self):
        """The name of the roleset used by this instance's predicate.
        Use ``nombank.roleset() <NombankCorpusReader.roleset>`` to
        look up information about the roleset."""
        r = self.baseform.replace("%", "perc-sign")
        r = r.replace("1/10", "1-slash-10").replace("1-slash-10", "oneslashonezero")
        return f"{r}.{self.sensenumber}"

    def __repr__(self):
        return "<NombankInstance: {}, sent {}, word {}>".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
        )

    def __str__(self):
        s = "{} {} {} {} {}".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
            self.baseform,
            self.sensenumber,
        )
        items = self.arguments + ((self.predicate, "rel"),)
        for argloc, argid in sorted(items):
            s += f" {argloc}-{argid}"
        return s

    def _get_tree(self):
        # Resolve lazily: the parse corpus may be absent or incomplete.
        if self.parse_corpus is None:
            return None
        if self.fileid not in self.parse_corpus.fileids():
            return None
        return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]

    tree = property(
        _get_tree,
        doc="""
        The parse tree corresponding to this instance, or None if
        the corresponding tree is not available.""",
    )

    @staticmethod
    def parse(s, parse_fileid_xform=None, parse_corpus=None):
        """
        Build a ``NombankInstance`` from one line of the nombank
        annotation file.

        :param s: one raw annotation line.
        :param parse_fileid_xform: optional transform applied to the
            fileid field.
        :param parse_corpus: corpus reader used to resolve tree pointers.
        :raises ValueError: if the line has fewer than six fields or does
            not contain exactly one ``-rel`` item.
        """
        pieces = s.split()
        if len(pieces) < 6:
            raise ValueError("Badly formatted nombank line: %r" % s)

        # Divide the line into its basic pieces.
        (fileid, sentnum, wordnum, baseform, sensenumber) = pieces[:5]

        args = pieces[5:]
        # Separate the predicate ("rel") item from the arguments.  The
        # previous version popped from ``args`` while enumerating it,
        # which shifts indices and skips elements after a match;
        # filtering is order-safe and reliably detects malformed lines.
        rel = [p for p in args if "-rel" in p]
        args = [p for p in args if "-rel" not in p]
        if len(rel) != 1:
            raise ValueError("Badly formatted nombank line: %r" % s)

        # Apply the fileid selector, if any.
        if parse_fileid_xform is not None:
            fileid = parse_fileid_xform(fileid)

        # Convert sentence & word numbers to ints.
        sentnum = int(sentnum)
        wordnum = int(wordnum)

        # Parse the predicate location.
        predloc, predid = rel[0].split("-", 1)
        predicate = NombankTreePointer.parse(predloc)

        # Parse the arguments.
        arguments = []
        for arg in args:
            argloc, argid = arg.split("-", 1)
            arguments.append((NombankTreePointer.parse(argloc), argid))

        # Put it all together.
        return NombankInstance(
            fileid,
            sentnum,
            wordnum,
            baseform,
            sensenumber,
            predicate,
            predid,
            arguments,
            parse_corpus,
        )
|
||||
|
||||
|
||||
class NombankPointer:
    """
    A pointer used by nombank to identify one or more constituents in
    a parse tree.  ``NombankPointer`` is an abstract base class with
    three concrete subclasses:

    - ``NombankTreePointer`` is used to point to single constituents.
    - ``NombankSplitTreePointer`` is used to point to 'split'
      constituents, which consist of a sequence of two or more
      ``NombankTreePointer`` pointers.
    - ``NombankChainTreePointer`` is used to point to entire trace
      chains in a tree.  It consists of a sequence of pieces, which
      can be ``NombankTreePointer`` or ``NombankSplitTreePointer`` pointers.
    """

    def __init__(self):
        # Abstract base: refuse direct instantiation, allow subclasses.
        if type(self) is NombankPointer:
            raise NotImplementedError()
|
||||
|
||||
|
||||
class NombankChainTreePointer(NombankPointer):
    """A pointer to an entire trace chain: a '*'-joined sequence of
    pieces, each a ``NombankSplitTreePointer`` or ``NombankTreePointer``."""

    def __init__(self, pieces):
        # The chain's constituent pointers, in order.
        self.pieces = pieces

    def __str__(self):
        return "*".join(str(piece) for piece in self.pieces)

    def __repr__(self):
        return f"<NombankChainTreePointer: {self}>"

    def select(self, tree):
        """Return a ``Tree`` rooted at ``*CHAIN*`` whose children are the
        constituents each piece selects from ``tree``."""
        if tree is None:
            raise ValueError("Parse tree not available")
        children = [piece.select(tree) for piece in self.pieces]
        return Tree("*CHAIN*", children)
|
||||
|
||||
|
||||
class NombankSplitTreePointer(NombankPointer):
    """Pointer to a 'split' constituent in a parse tree."""

    def __init__(self, pieces):
        # The pieces making up this split constituent; all elements are
        # ``NombankTreePointer`` pointers.
        self.pieces = pieces

    def __str__(self):
        return ",".join(str(piece) for piece in self.pieces)

    def __repr__(self):
        return "<NombankSplitTreePointer: %s>" % self

    def select(self, tree):
        """Return a ``*SPLIT*`` Tree whose children are the selections
        of each piece from *tree*."""
        if tree is None:
            raise ValueError("Parse tree not available")
        children = [piece.select(tree) for piece in self.pieces]
        return Tree("*SPLIT*", children)
|
||||
|
||||
|
||||
@total_ordering
class NombankTreePointer(NombankPointer):
    """
    Pointer to a single constituent, parsed from the string form
    ``wordnum:height``.  Pointer strings that combine several tree
    pointers are dispatched by :meth:`parse` to the compound classes:

    - ``wordnum:height*wordnum:height*...`` -> ``NombankChainTreePointer``
    - ``wordnum:height,wordnum:height,...`` -> ``NombankSplitTreePointer``
    """

    def __init__(self, wordnum, height):
        # Index of the first word covered by the constituent.
        self.wordnum = wordnum
        # Number of levels above that word's leaf position
        # (see ``treepos``, which trims ``height + 1`` path elements).
        self.height = height

    @staticmethod
    def parse(s):
        """
        Parse a nombank pointer string, returning a
        ``NombankChainTreePointer``, ``NombankSplitTreePointer`` or
        ``NombankTreePointer`` as appropriate.

        :raises ValueError: if ``s`` is not a valid ``wordnum:height``
            pointer (or a ``*``/``,`` combination of such pointers).
        """
        # Deal with chains (xx*yy*zz)
        pieces = s.split("*")
        if len(pieces) > 1:
            return NombankChainTreePointer(
                [NombankTreePointer.parse(elt) for elt in pieces]
            )

        # Deal with split args (xx,yy,zz)
        pieces = s.split(",")
        if len(pieces) > 1:
            return NombankSplitTreePointer(
                [NombankTreePointer.parse(elt) for elt in pieces]
            )

        # Deal with normal pointers.
        pieces = s.split(":")
        if len(pieces) != 2:
            raise ValueError("bad nombank pointer %r" % s)
        return NombankTreePointer(int(pieces[0]), int(pieces[1]))

    def __str__(self):
        return f"{self.wordnum}:{self.height}"

    def __repr__(self):
        return "NombankTreePointer(%d, %d)" % (self.wordnum, self.height)

    def __eq__(self, other):
        # Compound pointers compare via their first piece.
        while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)):
            other = other.pieces[0]

        if not isinstance(other, NombankTreePointer):
            return self is other

        return self.wordnum == other.wordnum and self.height == other.height

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        # Compound pointers compare via their first piece.
        while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)):
            other = other.pieces[0]

        if not isinstance(other, NombankTreePointer):
            # Arbitrary but stable ordering for unrelated objects.
            return id(self) < id(other)

        # Order by word position; for equal words, higher constituents
        # (larger height) sort first.
        return (self.wordnum, -self.height) < (other.wordnum, -other.height)

    def select(self, tree):
        """Return the constituent of *tree* that this pointer picks out."""
        if tree is None:
            raise ValueError("Parse tree not available")
        return tree[self.treepos(tree)]

    def treepos(self, tree):
        """
        Convert this pointer to a standard 'tree position' pointer,
        given that it points to the given tree.
        """
        if tree is None:
            raise ValueError("Parse tree not available")
        # Depth-first walk: ``stack`` holds the path of nodes from the
        # root down to the current node, and ``treepos`` the child
        # indices taken at each level.
        stack = [tree]
        treepos = []

        wordnum = 0
        while True:
            # tree node:
            if isinstance(stack[-1], Tree):
                # Select the next child.
                if len(treepos) < len(stack):
                    treepos.append(0)
                else:
                    treepos[-1] += 1
                # Update the stack.
                if treepos[-1] < len(stack[-1]):
                    stack.append(stack[-1][treepos[-1]])
                else:
                    # End of node's child list: pop up a level.
                    stack.pop()
                    treepos.pop()
            # word node:
            else:
                if wordnum == self.wordnum:
                    # Trim height+1 path elements so the position names
                    # the constituent ``height`` levels above this leaf.
                    return tuple(treepos[: len(treepos) - self.height - 1])
                else:
                    wordnum += 1
                    stack.pop()
|
||||
@@ -0,0 +1,90 @@
|
||||
# Natural Language Toolkit: NPS Chat Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import re
|
||||
import textwrap
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.xmldocs import *
|
||||
from nltk.internals import ElementWrapper
|
||||
from nltk.tag import map_tag
|
||||
from nltk.util import LazyConcatenation
|
||||
|
||||
|
||||
class NPSChatCorpusReader(XMLCorpusReader):
    """
    Reader for the NPS Chat corpus: XML session files of chat-room
    posts, where each ``Post`` element carries tokenized, POS-tagged
    text in its ``terminals`` child.
    """

    def __init__(self, root, fileids, wrap_etree=False, tagset=None):
        XMLCorpusReader.__init__(self, root, fileids, wrap_etree)
        # Native tagset of the corpus files; used by tagged_posts() to
        # map tags to a caller-requested tagset.
        self._tagset = tagset

    def xml_posts(self, fileids=None):
        """
        Return the given file(s) as a list of post XML elements
        (wrapped in ``ElementWrapper`` objects when the reader was
        constructed with ``wrap_etree=True``).
        """
        if self._wrap_etree:
            return concat(
                [
                    XMLCorpusView(fileid, "Session/Posts/Post", self._wrap_elt)
                    for fileid in self.abspaths(fileids)
                ]
            )
        else:
            return concat(
                [
                    XMLCorpusView(fileid, "Session/Posts/Post")
                    for fileid in self.abspaths(fileids)
                ]
            )

    def posts(self, fileids=None):
        """
        Return the given file(s) as a list of posts, each encoded as a
        list of word strings.
        """
        return concat(
            [
                XMLCorpusView(
                    fileid, "Session/Posts/Post/terminals", self._elt_to_words
                )
                for fileid in self.abspaths(fileids)
            ]
        )

    def tagged_posts(self, fileids=None, tagset=None):
        """
        Return the given file(s) as a list of posts, each encoded as a
        list of ``(word, tag)`` tuples, mapped to *tagset* when given.
        """

        def reader(elt, handler):
            return self._elt_to_tagged_words(elt, handler, tagset)

        return concat(
            [
                XMLCorpusView(fileid, "Session/Posts/Post/terminals", reader)
                for fileid in self.abspaths(fileids)
            ]
        )

    def words(self, fileids=None):
        """Return the given file(s) as a flat list of words."""
        return LazyConcatenation(self.posts(fileids))

    def tagged_words(self, fileids=None, tagset=None):
        """Return the given file(s) as a flat list of (word, tag) tuples."""
        return LazyConcatenation(self.tagged_posts(fileids, tagset))

    def _wrap_elt(self, elt, handler):
        # Used by xml_posts() when wrap_etree is enabled.
        return ElementWrapper(elt)

    def _elt_to_words(self, elt, handler):
        # One post: the 'word' attribute of each <t> terminal.
        return [self._simplify_username(t.attrib["word"]) for t in elt.findall("t")]

    def _elt_to_tagged_words(self, elt, handler, tagset=None):
        tagged_post = [
            (self._simplify_username(t.attrib["word"]), t.attrib["pos"])
            for t in elt.findall("t")
        ]
        # Map tags only when a different tagset was requested.
        if tagset and tagset != self._tagset:
            tagged_post = [
                (w, map_tag(self._tagset, tagset, t)) for (w, t) in tagged_post
            ]
        return tagged_post

    @staticmethod
    def _simplify_username(word):
        """Collapse anonymized usernames (``...UserNN`` -> ``UNN``).

        Bug fix: decode ``bytes`` *before* the substring test.  The
        original code ran ``"User" in word`` first, which raises
        TypeError for bytes input on Python 3, making its decode branch
        unreachable.
        """
        if isinstance(word, bytes):
            word = word.decode("ascii")
        if "User" in word:
            word = "U" + word.split("User", 1)[1]
        return word
|
||||
@@ -0,0 +1,125 @@
|
||||
# Natural Language Toolkit: Opinion Lexicon Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
CorpusReader for the Opinion Lexicon.
|
||||
|
||||
Opinion Lexicon information
|
||||
===========================
|
||||
|
||||
Authors: Minqing Hu and Bing Liu, 2004.
|
||||
Department of Computer Science
|
||||
University of Illinois at Chicago
|
||||
|
||||
Contact: Bing Liu, liub@cs.uic.edu
|
||||
https://www.cs.uic.edu/~liub
|
||||
|
||||
Distributed with permission.
|
||||
|
||||
Related papers:
|
||||
|
||||
- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
|
||||
Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery
|
||||
& Data Mining (KDD-04), Aug 22-25, 2004, Seattle, Washington, USA.
|
||||
|
||||
- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and
|
||||
Comparing Opinions on the Web". Proceedings of the 14th International World
|
||||
Wide Web conference (WWW-2005), May 10-14, 2005, Chiba, Japan.
|
||||
"""
|
||||
|
||||
from nltk.corpus.reader import WordListCorpusReader
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
|
||||
class IgnoreReadmeCorpusView(StreamBackedCorpusView):
    """
    A ``StreamBackedCorpusView`` that skips the initial readme block of
    each file, so iteration starts at the first real data line.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Open the underlying stream so we can seek past the header.
        self._open()
        # Consume the readme: everything up to the first blank line.
        read_blankline_block(self._stream)
        # Record the post-readme offset as the view's first block start.
        self._filepos = [self._stream.tell()]
|
||||
|
||||
|
||||
class OpinionLexiconCorpusReader(WordListCorpusReader):
    """
    Reader for Liu and Hu opinion lexicon.  Blank lines and readme are ignored.

        >>> from nltk.corpus import opinion_lexicon
        >>> opinion_lexicon.words()
        ['2-faced', '2-faces', 'abnormal', 'abolish', ...]

    The OpinionLexiconCorpusReader provides shortcuts to retrieve positive/negative
    words:

        >>> opinion_lexicon.negative()
        ['2-faced', '2-faces', 'abnormal', 'abolish', ...]

    Note that words from `words()` method are sorted by file id, not alphabetically:

        >>> opinion_lexicon.words()[0:10] # doctest: +NORMALIZE_WHITESPACE
        ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably',
        'abominate', 'abomination', 'abort', 'aborted']
        >>> sorted(opinion_lexicon.words())[0:10] # doctest: +NORMALIZE_WHITESPACE
        ['2-faced', '2-faces', 'a+', 'abnormal', 'abolish', 'abominable', 'abominably',
        'abominate', 'abomination', 'abort']
    """

    # Corpus view that skips the leading readme block of each file.
    CorpusView = IgnoreReadmeCorpusView

    def words(self, fileids=None):
        """
        Return all words in the opinion lexicon. Note that these words are not
        sorted in alphabetical order.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def positive(self):
        """
        Return all positive words in alphabetical order.

        :return: a list of positive words.
        :rtype: list(str)
        """
        return self.words("positive-words.txt")

    def negative(self):
        """
        Return all negative words in alphabetical order.

        :return: a list of negative words.
        :rtype: list(str)
        """
        return self.words("negative-words.txt")

    def _read_word_block(self, stream):
        """Read one block of up to 20 lines, returning the non-empty
        stripped lines as a list of words."""
        words = []
        for _ in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # End of file: stop instead of spinning through the
                # rest of the batch (the original ``continue``d here).
                break
            word = line.strip()
            # Skip blank lines, per the class contract ("Blank lines
            # ... are ignored"); the original appended '' for them.
            if word:
                words.append(word)
        return words
|
||||
174
backend/venv/Lib/site-packages/nltk/corpus/reader/panlex_lite.py
Normal file
174
backend/venv/Lib/site-packages/nltk/corpus/reader/panlex_lite.py
Normal file
@@ -0,0 +1,174 @@
|
||||
# Natural Language Toolkit: PanLex Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: David Kamholz <kamholz@panlex.org>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
CorpusReader for PanLex Lite, a stripped down version of PanLex distributed
|
||||
as an SQLite database. See the README.txt in the panlex_lite corpus directory
|
||||
for more information on PanLex Lite.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sqlite3
|
||||
|
||||
from nltk.corpus.reader.api import CorpusReader
|
||||
|
||||
|
||||
class PanLexLiteCorpusReader(CorpusReader):
    """
    Reader for PanLex Lite, a stripped-down version of PanLex shipped
    as an SQLite database.  Provides lookups of language varieties,
    meanings, and translations.
    """

    # Rows of (meaning, quality, source, source-group, text, variety)
    # for every expression sharing a meaning with the given one.
    MEANING_Q = """
        SELECT dnx2.mn, dnx2.uq, dnx2.ap, dnx2.ui, ex2.tt, ex2.lv
        FROM dnx
        JOIN ex ON (ex.ex = dnx.ex)
        JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
        JOIN ex ex2 ON (ex2.ex = dnx2.ex)
        WHERE dnx.ex != dnx2.ex AND ex.tt = ? AND ex.lv = ?
        ORDER BY dnx2.uq DESC
    """

    # Candidate translations ranked by summed per-source quality.
    TRANSLATION_Q = """
        SELECT s.tt, sum(s.uq) AS trq FROM (
            SELECT ex2.tt, max(dnx.uq) AS uq
            FROM dnx
            JOIN ex ON (ex.ex = dnx.ex)
            JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
            JOIN ex ex2 ON (ex2.ex = dnx2.ex)
            WHERE dnx.ex != dnx2.ex AND ex.lv = ? AND ex.tt = ? AND ex2.lv = ?
            GROUP BY ex2.tt, dnx.ui
        ) s
        GROUP BY s.tt
        ORDER BY trq DESC, s.tt
    """

    def __init__(self, root):
        self._c = sqlite3.connect(os.path.join(root, "db.sqlite")).cursor()

        # Two-way mapping between uniform identifiers and the integer
        # variety ids used inside the database.
        self._uid_lv = {}
        self._lv_uid = {}

        for uid, lv in self._c.execute("SELECT uid, lv FROM lv"):
            self._uid_lv[uid] = lv
            self._lv_uid[lv] = uid

    def language_varieties(self, lc=None):
        """
        Return a list of PanLex language varieties.

        :param lc: ISO 639 alpha-3 code. If specified, filters returned varieties
            by this code. If unspecified, all varieties are returned.
        :return: the specified language varieties as a list of tuples. The first
            element is the language variety's seven-character uniform identifier,
            and the second element is its default name.
        :rtype: list(tuple)
        """
        if lc is None:
            query = "SELECT uid, tt FROM lv ORDER BY uid"
            return self._c.execute(query).fetchall()
        query = "SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid"
        return self._c.execute(query, (lc,)).fetchall()

    def meanings(self, expr_uid, expr_tt):
        """
        Return a list of meanings for an expression.

        :param expr_uid: the expression's language variety, as a seven-character
            uniform identifier.
        :param expr_tt: the expression's text.
        :return: a list of Meaning objects.
        :rtype: list(Meaning)
        """
        expr_lv = self._uid_lv[expr_uid]

        mn_info = {}
        for row in self._c.execute(self.MEANING_Q, (expr_tt, expr_lv)):
            mn = row[0]
            uid = self._lv_uid[row[5]]

            if mn not in mn_info:
                mn_info[mn] = {
                    "uq": row[1],
                    "ap": row[2],
                    "ui": row[3],
                    "ex": {expr_uid: [expr_tt]},
                }

            mn_info[mn]["ex"].setdefault(uid, []).append(row[4])

        return [Meaning(mn, info) for mn, info in mn_info.items()]

    def translations(self, from_uid, from_tt, to_uid):
        """
        Return a list of translations for an expression into a single language
        variety.

        :param from_uid: the source expression's language variety, as a
            seven-character uniform identifier.
        :param from_tt: the source expression's text.
        :param to_uid: the target language variety, as a seven-character
            uniform identifier.
        :return: a list of translation tuples. The first element is the expression
            text and the second element is the translation quality.
        :rtype: list(tuple)
        """
        from_lv = self._uid_lv[from_uid]
        to_lv = self._uid_lv[to_uid]

        params = (from_lv, from_tt, to_lv)
        return self._c.execute(self.TRANSLATION_Q, params).fetchall()
|
||||
|
||||
|
||||
class Meaning(dict):
    """
    A single PanLex meaning: a translation set derived from a single
    source.  Behaves like a ``dict`` of the meaning's attributes, with
    accessor methods for the common fields.
    """

    def __init__(self, mn, attr):
        super().__init__(**attr)
        self["mn"] = mn

    def id(self):
        """
        :return: the meaning's id.
        :rtype: int
        """
        return self["mn"]

    def quality(self):
        """
        :return: the meaning's source's quality (0=worst, 9=best).
        :rtype: int
        """
        return self["uq"]

    def source(self):
        """
        :return: the meaning's source id.
        :rtype: int
        """
        return self["ap"]

    def source_group(self):
        """
        :return: the meaning's source group id.
        :rtype: int
        """
        return self["ui"]

    def expressions(self):
        """
        :return: the meaning's expressions as a dictionary whose keys are language
            variety uniform identifiers and whose values are lists of expression
            texts.
        :rtype: dict
        """
        return self["ex"]
|
||||
@@ -0,0 +1,95 @@
|
||||
# Natural Language Toolkit: Word List Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
|
||||
import re
|
||||
from collections import defaultdict, namedtuple
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.wordlist import WordListCorpusReader
|
||||
from nltk.tokenize import line_tokenize
|
||||
|
||||
# One row of the PanLex language table, in the column order used by the
# langs*.txt files.
_PANLEX_LANGUAGE_FIELDS = [
    "panlex_uid",  # (1) PanLex UID
    "iso639",  # (2) ISO 639 language code
    "iso639_type",  # (3) ISO 639 language type, see README
    "script",  # (4) normal scripts of expressions
    "name",  # (5) PanLex default name
    "langvar_uid",  # (6) UID of the language variety in which the default name is an expression
]

PanlexLanguage = namedtuple("PanlexLanguage", _PANLEX_LANGUAGE_FIELDS)
|
||||
|
||||
|
||||
class PanlexSwadeshCorpusReader(WordListCorpusReader):
    """
    Reader for the PanLex Swadesh lists from

        David Kamholz, Jonathan Pool, and Susan M. Colowick (2014).
        PanLex: Building a Resource for Panlingual Lexical Translation.
        In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf

    License: CC0 1.0 Universal
    https://creativecommons.org/publicdomain/zero/1.0/legalcode
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The Swadesh list size (e.g. "110") is encoded in the fileid
        # paths: "swadesh110/...".
        self.swadesh_size = re.match(r"swadesh([0-9].*)\/", self.fileids()[0]).group(1)
        self._languages = {lang.panlex_uid: lang for lang in self.get_languages()}
        self._macro_languages = self.get_macrolanguages()

    def license(self):
        return "CC0 1.0 Universal"

    def language_codes(self):
        return self._languages.keys()

    def get_languages(self):
        """Yield one ``PanlexLanguage`` tuple per line of the langs file."""
        for raw_line in self.raw(f"langs{self.swadesh_size}.txt").split("\n"):
            line = raw_line.strip()
            if not line:  # Skip empty lines.
                continue
            yield PanlexLanguage(*line.split("\t"))

    def get_macrolanguages(self):
        """Group the language-variety UIDs by their ISO 639 code."""
        grouped = defaultdict(list)
        for lang in self._languages.values():
            grouped[lang.iso639].append(lang.panlex_uid)
        return grouped

    def words_by_lang(self, lang_code):
        """
        :return: a list of list(str)
        """
        fileid = f"swadesh{self.swadesh_size}/{lang_code}.txt"
        return [concept.split("\t") for concept in self.words(fileid)]

    def words_by_iso639(self, iso63_code):
        """
        :return: a list of list(str)
        """
        fileids = [
            f"swadesh{self.swadesh_size}/{lang_code}.txt"
            for lang_code in self._macro_languages[iso63_code]
        ]
        return [
            concept.split("\t") for fileid in fileids for concept in self.words(fileid)
        ]

    def entries(self, fileids=None):
        """
        :return: a tuple of words for the specified fileids.
        """
        if not fileids:
            fileids = self.fileids()
        per_file = [self.words(f) for f in fileids]
        return list(zip(*per_file))
|
||||
373
backend/venv/Lib/site-packages/nltk/corpus/reader/pl196x.py
Normal file
373
backend/venv/Lib/site-packages/nltk/corpus/reader/pl196x.py
Normal file
@@ -0,0 +1,373 @@
|
||||
# Natural Language Toolkit:
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Piotr Kasprzyk <p.j.kasprzyk@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.xmldocs import XMLCorpusReader
|
||||
|
||||
# Paragraph and sentence elements of the TEI-encoded corpus files;
# group 1 captures the element's inner text.
PARA = re.compile(r"<p(?: [^>]*){0,1}>(.*?)</p>")
SENT = re.compile(r"<s(?: [^>]*){0,1}>(.*?)</s>")

# Word (<w>) and punctuation (<c>) tokens.  TAGGEDWORD also captures
# the opening tag (with its attributes) so the tag can be parsed out;
# WORD captures only the token text.
TAGGEDWORD = re.compile(r"<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>")
WORD = re.compile(r"<[wc](?: [^>]*){0,1}>(.*?)</[wc]>")

# Attribute values: 'type' on <c> elements, 'ana' on <w> elements.
TYPE = re.compile(r'type="(.*?)"')
ANA = re.compile(r'ana="(.*?)"')

# The id attribute of a <text> element.
TEXTID = re.compile(r'text id="(.*?)"')
|
||||
|
||||
|
||||
class TEICorpusView(StreamBackedCorpusView):
    """
    Corpus view for the TEI-encoded pl196x files.  Each block read from
    the stream covers whole ``<text>`` elements; words are extracted
    (optionally with their tags) and grouped by sentence and/or
    paragraph according to the constructor flags.
    """

    def __init__(
        self,
        corpus_file,
        tagged,
        group_by_sent,
        group_by_para,
        tagset=None,  # NOTE(review): accepted but never used here -- confirm intent
        head_len=0,
        textids=None,
    ):
        # Whether to emit (word, tag) tuples instead of bare words.
        self._tagged = tagged
        # Optional collection of text ids to keep; others are excised.
        self._textids = textids

        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        # WARNING -- skip header (head_len bytes of TEI preamble)
        StreamBackedCorpusView.__init__(self, corpus_file, startpos=head_len)

    # Hint passed to readlines(): roughly how many bytes to pull per block.
    _pagesize = 4096

    def read_block(self, stream):
        """Read one block of complete <text> elements and return its
        tokens, grouped per the constructor flags."""
        block = stream.readlines(self._pagesize)
        block = concat(block)
        # Keep reading until every opened <text ...> is closed (and at
        # least one <text ...> has been seen).
        while (block.count("<text id") > block.count("</text>")) or block.count(
            "<text id"
        ) == 0:
            tmp = stream.readline()
            if len(tmp) <= 0:
                break
            block += tmp

        block = block.replace("\n", "")

        # Drop any <text> whose id was not requested.
        textids = TEXTID.findall(block)
        if self._textids:
            for tid in textids:
                if tid not in self._textids:
                    # beg points just before the id; excise through the
                    # matching </text> close tag.
                    beg = block.find(tid) - 1
                    end = block[beg:].find("</text>") + len("</text>")
                    block = block[:beg] + block[beg + end :]

        output = []
        for para_str in PARA.findall(block):
            para = []
            for sent_str in SENT.findall(para_str):
                if not self._tagged:
                    sent = WORD.findall(sent_str)
                else:
                    sent = list(map(self._parse_tag, TAGGEDWORD.findall(sent_str)))
                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)
            if self._group_by_para:
                output.append(para)
            else:
                output.extend(para)
        return output

    def _parse_tag(self, tag_word_tuple):
        """Turn a (opening-tag, token-text) pair into (word, tag)."""
        (tag, word) = tag_word_tuple
        if tag.startswith("w"):
            # <w> elements carry the tag in their 'ana' attribute.
            tag = ANA.search(tag).group(1)
        else:  # tag.startswith('c')
            # <c> punctuation elements carry it in 'type'.
            tag = TYPE.search(tag).group(1)
        return word, tag
|
||||
|
||||
|
||||
class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
    """
    Reader for the pl196x corpus of categorized, TEI-encoded XML files.
    Each category is stored in a single file; the non-standard
    ``textids`` selector allows finer-grained selection of individual
    texts inside those files.
    """

    # Byte length of the TEI header in every corpus file; TEICorpusView
    # skips this many bytes before reading blocks.
    head_len = 2770

    def __init__(self, *args, **kwargs):
        # Optional file mapping each fileid to the text ids it contains.
        if "textid_file" in kwargs:
            self._textids = kwargs["textid_file"]
        else:
            self._textids = None

        XMLCorpusReader.__init__(self, *args)
        CategorizedCorpusReader.__init__(self, kwargs)

        self._init_textids()

    def _init_textids(self):
        """Populate the fileid<->textid maps from the textid file, if any."""
        self._f2t = defaultdict(list)
        self._t2f = defaultdict(list)
        if self._textids is not None:
            with open(self._textids) as fp:
                for line in fp:
                    line = line.strip()
                    file_id, text_ids = line.split(" ", 1)
                    if file_id not in self.fileids():
                        raise ValueError(
                            "In text_id mapping file %s: %s not found"
                            % (self._textids, file_id)
                        )
                    # NOTE(review): ``self._delimiter`` is not defined on
                    # this class, so reaching this line with a textid
                    # file raises AttributeError -- confirm the intended
                    # delimiter before relying on this feature.
                    for text_id in text_ids.split(self._delimiter):
                        self._add_textids(file_id, text_id)

    def _add_textids(self, file_id, text_id):
        # Record the association in both directions.
        self._f2t[file_id].append(text_id)
        self._t2f[text_id].append(file_id)

    def _resolve(self, fileids, categories, textids=None):
        """
        Resolve a selection given by at most one of *fileids*,
        *categories* or *textids*.

        :return: a ``(fileids, textid_map)`` tuple, where ``textid_map``
            maps each fileid to the requested text ids within it (or is
            None when whole files were selected).  Returns
            ``(None, None)`` when no selector was given, so callers fall
            back to all fileids.
        :raises ValueError: if more than one selector is supplied.
        """
        # Bug fix: the original guard counted the selectors that were
        # None and raised unless exactly one of the three was None --
        # i.e. it rejected the documented single-selector usage and
        # demanded two selectors.  Raise only for over-specification.
        if sum(sel is not None for sel in (fileids, categories, textids)) > 1:
            raise ValueError("Specify exactly one of: fileids, categories or textids")

        if fileids is not None:
            return fileids, None

        if categories is not None:
            return self.fileids(categories), None

        if textids is not None:
            if isinstance(textids, str):
                textids = [textids]
            # Every file containing any requested text, plus a per-file
            # map of which requested texts each file holds.
            files = sum((self._t2f[t] for t in textids), [])
            tdict = {f: set(self._f2t[f]) & set(textids) for f in files}
            return files, tdict

        return None, None

    def _views(
        self, fileids, categories, textids, tagged, group_by_sent, group_by_para
    ):
        """Shared body of the words/sents/paras accessors: resolve the
        selection, then build one TEICorpusView per selected file."""
        fileids, textids = self._resolve(fileids, categories, textids)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]

        return concat(
            [
                TEICorpusView(
                    self.abspath(fileid),
                    tagged,
                    group_by_sent,
                    group_by_para,
                    head_len=self.head_len,
                    # Restrict the view to the requested texts, if any.
                    textids=textids[fileid] if textids else None,
                )
                for fileid in fileids
            ]
        )

    def decode_tag(self, tag):
        # to be implemented
        return tag

    def textids(self, fileids=None, categories=None):
        """
        In the pl196x corpus each category is stored in single
        file and thus both methods provide identical functionality. In order
        to accommodate finer granularity, a non-standard textids() method was
        implemented. All the main functions can be supplied with a list
        of required chunks---giving much more control to the user.
        """
        fileids, _ = self._resolve(fileids, categories)
        if fileids is None:
            return sorted(self._t2f)

        if isinstance(fileids, str):
            fileids = [fileids]
        return sorted(sum((self._f2t[d] for d in fileids), []))

    def words(self, fileids=None, categories=None, textids=None):
        """:return: the selected texts as a flat list of words."""
        return self._views(fileids, categories, textids, False, False, False)

    def sents(self, fileids=None, categories=None, textids=None):
        """:return: the selected texts as a list of sentences (word lists)."""
        return self._views(fileids, categories, textids, False, True, False)

    def paras(self, fileids=None, categories=None, textids=None):
        """:return: the selected texts as a list of paragraphs (lists of sentences)."""
        return self._views(fileids, categories, textids, False, True, True)

    def tagged_words(self, fileids=None, categories=None, textids=None):
        """:return: the selected texts as a flat list of (word, tag) tuples."""
        return self._views(fileids, categories, textids, True, False, False)

    def tagged_sents(self, fileids=None, categories=None, textids=None):
        """:return: the selected texts as a list of tagged sentences."""
        return self._views(fileids, categories, textids, True, True, False)

    def tagged_paras(self, fileids=None, categories=None, textids=None):
        """:return: the selected texts as a list of tagged paragraphs."""
        return self._views(fileids, categories, textids, True, True, True)

    def xml(self, fileids=None, categories=None):
        """Return the XML parse of the single selected file."""
        fileids, _ = self._resolve(fileids, categories)
        if fileids is not None and len(fileids) == 1:
            return XMLCorpusReader.xml(self, fileids[0])
        raise TypeError("Expected a single file")
|
||||
237
backend/venv/Lib/site-packages/nltk/corpus/reader/plaintext.py
Normal file
237
backend/venv/Lib/site-packages/nltk/corpus/reader/plaintext.py
Normal file
@@ -0,0 +1,237 @@
|
||||
# Natural Language Toolkit: Plaintext Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# Nitin Madnani <nmadnani@umiacs.umd.edu>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A reader for corpora that consist of plaintext documents.
|
||||
"""
|
||||
|
||||
import nltk.data
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.tokenize import *
|
||||
|
||||
|
||||
class PlaintextCorpusReader(CorpusReader):
    """
    Reader for corpora that consist of plaintext documents.  Paragraphs
    are assumed to be split using blank lines.  Sentences and words can
    be tokenized using the default tokenizers, or by custom tokenizers
    specified as parameters to the constructor.

    This corpus reader can be customized (e.g., to skip preface
    sections of specific document formats) by creating a subclass and
    overriding the ``CorpusView`` class variable.
    """

    CorpusView = StreamBackedCorpusView
    """The corpus view class used by this reader.  Subclasses of
    ``PlaintextCorpusReader`` may specify alternative corpus view
    classes (e.g., to skip the preface sections of documents.)"""

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WordPunctTokenizer(),
        sent_tokenizer=None,
        para_block_reader=read_blankline_block,
        encoding="utf8",
    ):
        r"""
        Construct a new plaintext corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/usr/local/share/nltk_data/corpora/webtext/'
            >>> reader = PlaintextCorpusReader(root, '.*\.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking sentences or
            paragraphs into words.
        :param sent_tokenizer: Tokenizer for breaking paragraphs
            into sentences.  If None, a default ``PunktTokenizer`` is
            loaded lazily the first time sentences are requested.
        :param para_block_reader: The block reader used to divide the
            corpus into paragraph blocks.
        :param encoding: The encoding used by the corpus files.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader

    def _require_sent_tokenizer(self):
        """Lazily create the default sentence tokenizer if none was given.

        :raises ValueError: if the default ``PunktTokenizer`` cannot be
            constructed (e.g. because its model data is unavailable).
        """
        if self._sent_tokenizer is None:
            try:
                self._sent_tokenizer = PunktTokenizer()
            # Fix: this was a bare ``except:``, which also swallowed
            # KeyboardInterrupt/SystemExit; narrow it and chain the cause
            # so the original failure is visible in the traceback.
            except Exception as e:
                raise ValueError("No sentence tokenizer for this corpus") from e

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        self._require_sent_tokenizer()
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        self._require_sent_tokenizer()
        return concat(
            [
                self.CorpusView(path, self._read_para_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_word_block(self, stream):
        # Tokenize a bounded number of lines per call so the corpus
        # view can stream large files lazily.
        words = []
        for _ in range(20):  # Read 20 lines at a time.
            words.extend(self._word_tokenizer.tokenize(stream.readline()))
        return words

    def _read_sent_block(self, stream):
        # Flatten each paragraph block into tokenized sentences.
        sents = []
        for para in self._para_block_reader(stream):
            sents.extend(
                [
                    self._word_tokenizer.tokenize(sent)
                    for sent in self._sent_tokenizer.tokenize(para)
                ]
            )
        return sents

    def _read_para_block(self, stream):
        # Like _read_sent_block, but keep the paragraph grouping.
        paras = []
        for para in self._para_block_reader(stream):
            paras.append(
                [
                    self._word_tokenizer.tokenize(sent)
                    for sent in self._sent_tokenizer.tokenize(para)
                ]
            )
        return paras
||||
|
||||
|
||||
class CategorizedPlaintextCorpusReader(CategorizedCorpusReader, PlaintextCorpusReader):
    """
    A reader for plaintext corpora whose documents are divided into
    categories based on their file identifiers.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``PlaintextCorpusReader`` constructor.
        """
        # NOTE(review): the call order appears intentional —
        # CategorizedCorpusReader.__init__ is expected to consume the
        # categorization keywords from ``kwargs`` before the plaintext
        # constructor sees them; confirm against api.py.
        CategorizedCorpusReader.__init__(self, kwargs)
        PlaintextCorpusReader.__init__(self, *args, **kwargs)
|
||||
|
||||
|
||||
class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader):
    """
    This class is identical with CategorizedPlaintextCorpusReader,
    except that it initializes a Portuguese PunktTokenizer:

    >>> from nltk.corpus import machado
    >>> print(machado._sent_tokenizer._lang)
    portuguese

    """

    def __init__(self, *args, **kwargs):
        CategorizedPlaintextCorpusReader.__init__(self, *args, **kwargs)
        # Fixed (@ekaf 2025), new way to invoke Punkt:
        # Replace whatever sentence tokenizer the parent installed with
        # a Portuguese-specific one, after parent initialization.
        self._sent_tokenizer = PunktTokenizer("portuguese")
|
||||
|
||||
class EuroparlCorpusReader(PlaintextCorpusReader):
    """
    Reader for Europarl corpora that consist of plaintext documents.
    Documents are divided into chapters instead of paragraphs as
    for regular plaintext documents.  Chapters are separated using blank
    lines.  Everything is inherited from ``PlaintextCorpusReader`` except
    that:

    - Since the corpus is pre-processed and pre-tokenized, the
      word tokenizer should just split the line at whitespaces.
    - For the same reason, the sentence tokenizer should just
      split the paragraph at line breaks.
    - There is a new 'chapters()' method that returns chapters
      instead of paragraphs.
    - The 'paras()' method inherited from PlaintextCorpusReader is
      made non-functional to remove any confusion between chapters
      and paragraphs for Europarl.
    """

    def _read_word_block(self, stream):
        # The corpus is pre-tokenized, so a plain whitespace split
        # replaces the word tokenizer used by the base class.
        words = []
        for _ in range(20):  # Read 20 lines at a time.
            words.extend(stream.readline().split())
        return words

    def _read_sent_block(self, stream):
        # One pre-tokenized sentence per line within each
        # blank-line-separated block.
        sents = []
        for para in self._para_block_reader(stream):
            sents.extend([sent.split() for sent in para.splitlines()])
        return sents

    def _read_para_block(self, stream):
        # Like _read_sent_block, but keep the block (chapter) grouping.
        paras = []
        for para in self._para_block_reader(stream):
            paras.append([sent.split() for sent in para.splitlines()])
        return paras

    def chapters(self, fileids=None):
        """
        :return: the given file(s) as a list of
            chapters, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        return concat(
            [
                self.CorpusView(fileid, self._read_para_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def paras(self, fileids=None):
        """Not supported for Europarl; use :meth:`chapters` instead.

        :raises NotImplementedError: always.
        """
        raise NotImplementedError(
            "The Europarl corpus reader does not support paragraphs. Please use chapters() instead."
        )
|
||||
@@ -0,0 +1,95 @@
|
||||
# Natural Language Toolkit: PP Attachment Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Read lines from the Prepositional Phrase Attachment Corpus.
|
||||
|
||||
The PP Attachment Corpus contains several files having the format:
|
||||
|
||||
sentence_id verb noun1 preposition noun2 attachment
|
||||
|
||||
For example:
|
||||
|
||||
42960 gives authority to administration V
|
||||
46742 gives inventors of microchip N
|
||||
|
||||
The PP attachment is to the verb phrase (V) or noun phrase (N), i.e.:
|
||||
|
||||
(VP gives (NP authority) (PP to administration))
|
||||
(VP gives (NP inventors (PP of microchip)))
|
||||
|
||||
The corpus contains the following files:
|
||||
|
||||
training: training set
|
||||
devset: development test set, used for algorithm development.
|
||||
test: test set, used to report results
|
||||
bitstrings: word classes derived from Mutual Information Clustering for the Wall Street Journal.
|
||||
|
||||
Ratnaparkhi, Adwait (1994). A Maximum Entropy Model for Prepositional
|
||||
Phrase Attachment. Proceedings of the ARPA Human Language Technology
|
||||
Conference. [http://www.cis.upenn.edu/~adwait/papers/hlt94.ps]
|
||||
|
||||
The PP Attachment Corpus is distributed with NLTK with the permission
|
||||
of the author.
|
||||
"""
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
|
||||
|
||||
class PPAttachment:
    """A single prepositional-phrase attachment instance.

    Holds the fields of one corpus line: the sentence id, the verb,
    the two nouns, the preposition, and the attachment decision
    ('V' for a verb-phrase attachment, 'N' for a noun-phrase one).
    """

    def __init__(self, sent, verb, noun1, prep, noun2, attachment):
        self.sent = sent
        self.verb = verb
        self.noun1 = noun1
        self.prep = prep
        self.noun2 = noun2
        self.attachment = attachment

    def __repr__(self):
        # Render all six fields in declaration order.
        fields = (
            self.sent,
            self.verb,
            self.noun1,
            self.prep,
            self.noun2,
            self.attachment,
        )
        return (
            "PPAttachment(sent=%r, verb=%r, noun1=%r, prep=%r, "
            "noun2=%r, attachment=%r)" % fields
        )
|
||||
|
||||
|
||||
class PPAttachmentCorpusReader(CorpusReader):
    """
    sentence_id verb noun1 preposition noun2 attachment
    """

    def attachments(self, fileids):
        """Return a corpus view of ``PPAttachment`` objects."""
        views = [
            StreamBackedCorpusView(fileid, self._read_obj_block, encoding=enc)
            for (fileid, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)

    def tuples(self, fileids):
        """Return a corpus view of raw field tuples."""
        views = [
            StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc)
            for (fileid, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)

    def _read_tuple_block(self, stream):
        # One corpus line per block; an empty read means EOF.
        line = stream.readline()
        return [tuple(line.split())] if line else []

    def _read_obj_block(self, stream):
        # Same framing as _read_tuple_block, but build an object.
        line = stream.readline()
        return [PPAttachment(*line.split())] if line else []
|
||||
519
backend/venv/Lib/site-packages/nltk/corpus/reader/propbank.py
Normal file
519
backend/venv/Lib/site-packages/nltk/corpus/reader/propbank.py
Normal file
@@ -0,0 +1,519 @@
|
||||
# Natural Language Toolkit: PropBank Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import re
|
||||
from functools import total_ordering
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.internals import raise_unorderable_types
|
||||
from nltk.tree import Tree
|
||||
|
||||
|
||||
class PropbankCorpusReader(CorpusReader):
    """
    Corpus reader for the propbank corpus, which augments the Penn
    Treebank with information about the predicate argument structure
    of every verb instance.  The corpus consists of two parts: the
    predicate-argument annotations themselves, and a set of "frameset
    files" which define the argument labels used by the annotations,
    on a per-verb basis.  Each "frameset file" contains one or more
    predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
    divided into coarse-grained word senses called "rolesets".  For
    each "roleset", the frameset file provides descriptions of the
    argument roles, along with examples.
    """

    def __init__(
        self,
        root,
        propfile,
        framefiles="",
        verbsfile=None,
        parse_fileid_xform=None,
        parse_corpus=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param propfile: The name of the file containing the predicate-
            argument annotations (relative to ``root``).
        :param framefiles: A list or regexp specifying the frameset
            fileids for this corpus.
        :param verbsfile: The name of the file listing verb lemmas
            (relative to ``root``); read by ``verbs()``.
        :param parse_fileid_xform: A transform that should be applied
            to the fileids in this corpus.  This should be a function
            of one argument (a fileid) that returns a string (the new
            fileid).
        :param parse_corpus: The corpus containing the parse trees
            corresponding to this corpus.  These parse trees are
            necessary to resolve the tree pointers used by propbank.
        :param encoding: The encoding used by the annotation files.
        """
        # If framefiles is specified as a regexp, expand it.
        if isinstance(framefiles, str):
            framefiles = find_corpus_fileids(root, framefiles)
        framefiles = list(framefiles)
        # Initialize the corpus reader.
        CorpusReader.__init__(self, root, [propfile, verbsfile] + framefiles, encoding)

        # Record our frame fileids & prop file.
        self._propfile = propfile
        self._framefiles = framefiles
        self._verbsfile = verbsfile
        self._parse_fileid_xform = parse_fileid_xform
        self._parse_corpus = parse_corpus

    def instances(self, baseform=None):
        """
        :param baseform: if given, only include instances whose
            predicate has this base form.
        :return: a corpus view that acts as a list of
            ``PropBankInstance`` objects, one for each predicate
            instance annotated in the corpus.
        """
        kwargs = {}
        if baseform is not None:
            # Filter is applied lazily as each block is read.
            kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
        return StreamBackedCorpusView(
            self.abspath(self._propfile),
            lambda stream: self._read_instance_block(stream, **kwargs),
            encoding=self.encoding(self._propfile),
        )

    def lines(self):
        """
        :return: a corpus view that acts as a list of strings, one for
            each line in the predicate-argument annotation file.
        """
        return StreamBackedCorpusView(
            self.abspath(self._propfile),
            read_line_block,
            encoding=self.encoding(self._propfile),
        )

    def roleset(self, roleset_id):
        """
        :return: the xml description for the given roleset.
        :raises ValueError: if the roleset's frameset file is not part
            of this corpus, or does not define ``roleset_id``.
        """
        # Roleset ids look like "<baseform>.<sensenumber>"; the frameset
        # file is keyed by the baseform.
        baseform = roleset_id.split(".")[0]
        framefile = "frames/%s.xml" % baseform
        if framefile not in self._framefiles:
            raise ValueError("Frameset file for %s not found" % roleset_id)

        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        with self.abspath(framefile).open() as fp:
            etree = ElementTree.parse(fp).getroot()
        for roleset in etree.findall("predicate/roleset"):
            if roleset.attrib["id"] == roleset_id:
                return roleset
        raise ValueError(f"Roleset {roleset_id} not found in {framefile}")

    def rolesets(self, baseform=None):
        """
        :return: list of xml descriptions for rolesets.
        :raises ValueError: if ``baseform`` is given but has no
            frameset file in this corpus.
        """
        if baseform is not None:
            framefile = "frames/%s.xml" % baseform
            if framefile not in self._framefiles:
                raise ValueError("Frameset file for %s not found" % baseform)
            framefiles = [framefile]
        else:
            framefiles = self._framefiles

        rsets = []
        for framefile in framefiles:
            # n.b.: The encoding for XML fileids is specified by the file
            # itself; so we ignore self._encoding here.
            with self.abspath(framefile).open() as fp:
                etree = ElementTree.parse(fp).getroot()
            rsets.append(etree.findall("predicate/roleset"))
        return LazyConcatenation(rsets)

    def verbs(self):
        """
        :return: a corpus view that acts as a list of all verb lemmas
            in this corpus (from the verbs.txt file).
        """
        return StreamBackedCorpusView(
            self.abspath(self._verbsfile),
            read_line_block,
            encoding=self.encoding(self._verbsfile),
        )

    def _read_instance_block(self, stream, instance_filter=lambda inst: True):
        # Parse up to 100 annotation lines into PropbankInstance objects,
        # skipping blank lines and instances rejected by the filter.
        block = []

        # Read 100 at a time.
        for i in range(100):
            line = stream.readline().strip()
            if line:
                inst = PropbankInstance.parse(
                    line, self._parse_fileid_xform, self._parse_corpus
                )
                if instance_filter(inst):
                    block.append(inst)

        return block
||||
|
||||
|
||||
######################################################################
|
||||
# { Propbank Instance & related datatypes
|
||||
######################################################################
|
||||
|
||||
|
||||
class PropbankInstance:
    """A single predicate-argument annotation from the propbank corpus.

    Each instance identifies one predicate occurrence in a treebank
    sentence, together with tree pointers to its arguments.  Instances
    are normally built from annotation-file lines via :meth:`parse`.
    """

    def __init__(
        self,
        fileid,
        sentnum,
        wordnum,
        tagger,
        roleset,
        inflection,
        predicate,
        arguments,
        parse_corpus=None,
    ):
        self.fileid = fileid
        """The name of the file containing the parse tree for this
        instance's sentence."""

        self.sentnum = sentnum
        """The sentence number of this sentence within ``fileid``.
        Indexing starts from zero."""

        self.wordnum = wordnum
        """The word number of this instance's predicate within its
        containing sentence.  Word numbers are indexed starting from
        zero, and include traces and other empty parse elements."""

        self.tagger = tagger
        """An identifier for the tagger who tagged this instance; or
        ``'gold'`` if this is an adjuticated instance."""

        self.roleset = roleset
        """The name of the roleset used by this instance's predicate.
        Use ``propbank.roleset() <PropbankCorpusReader.roleset>`` to
        look up information about the roleset."""

        self.inflection = inflection
        """A ``PropbankInflection`` object describing the inflection of
        this instance's predicate."""

        self.predicate = predicate
        """A ``PropbankTreePointer`` indicating the position of this
        instance's predicate within its containing sentence."""

        self.arguments = tuple(arguments)
        """A list of tuples (argloc, argid), specifying the location
        and identifier for each of the predicate's argument in the
        containing sentence.  Argument identifiers are strings such as
        ``'ARG0'`` or ``'ARGM-TMP'``.  This list does *not* contain
        the predicate."""

        self.parse_corpus = parse_corpus
        """A corpus reader for the parse trees corresponding to the
        instances in this propbank corpus."""

    @property
    def baseform(self):
        """The baseform of the predicate."""
        return self.roleset.split(".")[0]

    @property
    def sensenumber(self):
        """The sense number of the predicate."""
        return self.roleset.split(".")[1]

    @property
    def predid(self):
        """Identifier of the predicate."""
        return "rel"

    def __repr__(self):
        return "<PropbankInstance: {}, sent {}, word {}>".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
        )

    def __str__(self):
        # Render the instance back into the one-line annotation-file
        # format: the six location/metadata fields, then sorted
        # "argloc-argid" pairs with the predicate included as "rel".
        s = "{} {} {} {} {} {}".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
            self.tagger,
            self.roleset,
            self.inflection,
        )
        items = self.arguments + ((self.predicate, "rel"),)
        for argloc, argid in sorted(items):
            s += f" {argloc}-{argid}"
        return s

    def _get_tree(self):
        # Best-effort lookup: return None rather than raising when no
        # parse corpus is attached or the file is not in it.
        if self.parse_corpus is None:
            return None
        if self.fileid not in self.parse_corpus.fileids():
            return None
        return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]

    tree = property(
        _get_tree,
        doc="""
        The parse tree corresponding to this instance, or None if
        the corresponding tree is not available.""",
    )

    @staticmethod
    def parse(s, parse_fileid_xform=None, parse_corpus=None):
        """Parse one line of a propbank annotation file into an instance.

        :param s: the raw annotation line.
        :param parse_fileid_xform: optional transform applied to the
            treebank fileid found on the line.
        :param parse_corpus: corpus reader used later to resolve the
            instance's tree pointers.
        :raises ValueError: if the line has fewer than 7 fields, or
            does not contain exactly one ``-rel`` item.
        """
        pieces = s.split()
        if len(pieces) < 7:
            raise ValueError("Badly formatted propbank line: %r" % s)

        # Divide the line into its basic pieces.
        (fileid, sentnum, wordnum, tagger, roleset, inflection) = pieces[:6]
        rel = [p for p in pieces[6:] if p.endswith("-rel")]
        args = [p for p in pieces[6:] if not p.endswith("-rel")]
        if len(rel) != 1:
            raise ValueError("Badly formatted propbank line: %r" % s)

        # Apply the fileid selector, if any.
        if parse_fileid_xform is not None:
            fileid = parse_fileid_xform(fileid)

        # Convert sentence & word numbers to ints.
        sentnum = int(sentnum)
        wordnum = int(wordnum)

        # Parse the inflection
        inflection = PropbankInflection.parse(inflection)

        # Parse the predicate location.
        # (the [:-4] strips the trailing "-rel" suffix)
        predicate = PropbankTreePointer.parse(rel[0][:-4])

        # Parse the arguments.
        arguments = []
        for arg in args:
            # Split only on the first "-" so argids like "ARGM-TMP" survive.
            argloc, argid = arg.split("-", 1)
            arguments.append((PropbankTreePointer.parse(argloc), argid))

        # Put it all together.
        return PropbankInstance(
            fileid,
            sentnum,
            wordnum,
            tagger,
            roleset,
            inflection,
            predicate,
            arguments,
            parse_corpus,
        )
|
||||
|
||||
|
||||
class PropbankPointer:
    """
    A pointer used by propbank to identify one or more constituents in
    a parse tree.  ``PropbankPointer`` is an abstract base class with
    three concrete subclasses:

    - ``PropbankTreePointer`` is used to point to single constituents.
    - ``PropbankSplitTreePointer`` is used to point to 'split'
      constituents, which consist of a sequence of two or more
      ``PropbankTreePointer`` pointers.
    - ``PropbankChainTreePointer`` is used to point to entire trace
      chains in a tree.  It consists of a sequence of pieces, which
      can be ``PropbankTreePointer`` or ``PropbankSplitTreePointer`` pointers.
    """

    def __init__(self):
        # Abstract base: refuse direct instantiation, but let
        # subclasses call through freely.
        if type(self) is PropbankPointer:
            raise NotImplementedError()
|
||||
|
||||
class PropbankChainTreePointer(PropbankPointer):
    """Pointer to an entire trace chain in a parse tree."""

    def __init__(self, pieces):
        # The pieces making up this chain; each element may be either a
        # ``PropbankSplitTreePointer`` or a ``PropbankTreePointer``.
        self.pieces = pieces

    def __str__(self):
        return "*".join(str(piece) for piece in self.pieces)

    def __repr__(self):
        return "<PropbankChainTreePointer: %s>" % self

    def select(self, tree):
        """Return a ``*CHAIN*`` tree gathering every piece's constituent."""
        if tree is None:
            raise ValueError("Parse tree not available")
        children = [piece.select(tree) for piece in self.pieces]
        return Tree("*CHAIN*", children)
||||
|
||||
|
||||
class PropbankSplitTreePointer(PropbankPointer):
    """Pointer to a 'split' constituent spanning several subtrees."""

    def __init__(self, pieces):
        # The pieces making up this split constituent; every element is
        # a ``PropbankTreePointer``.
        self.pieces = pieces

    def __str__(self):
        return ",".join(str(piece) for piece in self.pieces)

    def __repr__(self):
        return "<PropbankSplitTreePointer: %s>" % self

    def select(self, tree):
        """Return a ``*SPLIT*`` tree gathering every piece's constituent."""
        if tree is None:
            raise ValueError("Parse tree not available")
        children = [piece.select(tree) for piece in self.pieces]
        return Tree("*SPLIT*", children)
||||
|
||||
|
||||
@total_ordering
class PropbankTreePointer(PropbankPointer):
    """
    A pointer to a single constituent, written in the propbank file
    format as ``wordnum:height``.  :meth:`parse` also accepts the
    composite forms and returns the matching pointer type:

    wordnum:height*wordnum:height*...   (trace chains)
    wordnum:height,wordnum:height,...   (split constituents)
    """

    def __init__(self, wordnum, height):
        # Token index within the sentence (counting empty elements).
        self.wordnum = wordnum
        # Number of tree levels to climb above that token.
        self.height = height

    @staticmethod
    def parse(s):
        """Parse a pointer string.

        :return: a ``PropbankChainTreePointer`` for ``*``-joined forms,
            a ``PropbankSplitTreePointer`` for ``,``-joined forms, or a
            plain ``PropbankTreePointer`` otherwise.
        :raises ValueError: if a simple pointer is not ``int:int``.
        """
        # Deal with chains (xx*yy*zz)
        pieces = s.split("*")
        if len(pieces) > 1:
            return PropbankChainTreePointer(
                [PropbankTreePointer.parse(elt) for elt in pieces]
            )

        # Deal with split args (xx,yy,zz)
        pieces = s.split(",")
        if len(pieces) > 1:
            return PropbankSplitTreePointer(
                [PropbankTreePointer.parse(elt) for elt in pieces]
            )

        # Deal with normal pointers.
        pieces = s.split(":")
        if len(pieces) != 2:
            raise ValueError("bad propbank pointer %r" % s)
        return PropbankTreePointer(int(pieces[0]), int(pieces[1]))

    def __str__(self):
        return f"{self.wordnum}:{self.height}"

    def __repr__(self):
        return "PropbankTreePointer(%d, %d)" % (self.wordnum, self.height)

    def __eq__(self, other):
        # Composite pointers are compared via their first piece.
        while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
            other = other.pieces[0]

        if not isinstance(other, PropbankTreePointer):
            return self is other

        return self.wordnum == other.wordnum and self.height == other.height

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        # Composite pointers are compared via their first piece.
        while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
            other = other.pieces[0]

        if not isinstance(other, PropbankTreePointer):
            # Arbitrary but consistent ordering for unrelated types.
            return id(self) < id(other)

        # At equal word positions, pointers with larger ``height`` sort
        # first (note the negated height).
        return (self.wordnum, -self.height) < (other.wordnum, -other.height)

    def select(self, tree):
        """Return the constituent of ``tree`` that this pointer names.

        :raises ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError("Parse tree not available")
        return tree[self.treepos(tree)]

    def treepos(self, tree):
        """
        Convert this pointer to a standard 'tree position' pointer,
        given that it points to the given tree.
        """
        if tree is None:
            raise ValueError("Parse tree not available")
        # Depth-first walk: ``stack`` holds the path of nodes from the
        # root, ``treepos`` the matching child indices, and ``wordnum``
        # counts leaves seen so far.
        stack = [tree]
        treepos = []

        wordnum = 0
        while True:
            # tree node:
            if isinstance(stack[-1], Tree):
                # Select the next child.
                if len(treepos) < len(stack):
                    treepos.append(0)
                else:
                    treepos[-1] += 1
                # Update the stack.
                if treepos[-1] < len(stack[-1]):
                    stack.append(stack[-1][treepos[-1]])
                else:
                    # End of node's child list: pop up a level.
                    stack.pop()
                    treepos.pop()
            # word node:
            else:
                if wordnum == self.wordnum:
                    # Target leaf found: trim ``height`` levels (plus
                    # the leaf's own index) off the position.
                    return tuple(treepos[: len(treepos) - self.height - 1])
                else:
                    wordnum += 1
                    stack.pop()
||||
|
||||
|
||||
class PropbankInflection:
    """Five-character inflection code attached to a propbank predicate.

    Each position holds one single-character field (or ``'-'`` for
    "unspecified"): form, tense, aspect, person, voice.
    """

    # { Inflection Form
    INFINITIVE = "i"
    GERUND = "g"
    PARTICIPLE = "p"
    FINITE = "v"
    # { Inflection Tense
    FUTURE = "f"
    PAST = "p"
    PRESENT = "n"
    # { Inflection Aspect
    PERFECT = "p"
    PROGRESSIVE = "o"
    PERFECT_AND_PROGRESSIVE = "b"
    # { Inflection Person
    THIRD_PERSON = "3"
    # { Inflection Voice
    ACTIVE = "a"
    PASSIVE = "p"
    # { Inflection
    NONE = "-"
    # }

    def __init__(self, form="-", tense="-", aspect="-", person="-", voice="-"):
        self.form = form
        self.tense = tense
        self.aspect = aspect
        self.person = person
        self.voice = voice

    def __str__(self):
        # Re-assemble the canonical five-character code.
        return "".join((self.form, self.tense, self.aspect, self.person, self.voice))

    def __repr__(self):
        return "<PropbankInflection: %s>" % self

    # One character class per field, anchored at the end; combined with
    # the length check in parse() this accepts exactly five characters.
    _VALIDATE = re.compile(r"[igpv\-][fpn\-][pob\-][3\-][ap\-]$")

    @staticmethod
    def parse(s):
        """Parse a five-character inflection string.

        :raises TypeError: if ``s`` is not a string.
        :raises ValueError: if ``s`` is not a valid inflection code.
        """
        if not isinstance(s, str):
            raise TypeError("expected a string")
        valid = len(s) == 5 and PropbankInflection._VALIDATE.match(s)
        if not valid:
            raise ValueError("Bad propbank inflection string %r" % s)
        return PropbankInflection(*s)
|
||||
133
backend/venv/Lib/site-packages/nltk/corpus/reader/pros_cons.py
Normal file
133
backend/venv/Lib/site-packages/nltk/corpus/reader/pros_cons.py
Normal file
@@ -0,0 +1,133 @@
|
||||
# Natural Language Toolkit: Pros and Cons Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
CorpusReader for the Pros and Cons dataset.
|
||||
|
||||
- Pros and Cons dataset information -
|
||||
|
||||
Contact: Bing Liu, liub@cs.uic.edu
|
||||
https://www.cs.uic.edu/~liub
|
||||
|
||||
Distributed with permission.
|
||||
|
||||
Related papers:
|
||||
|
||||
- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
|
||||
Proceedings of the 22nd International Conference on Computational Linguistics
|
||||
(Coling-2008), Manchester, 18-22 August, 2008.
|
||||
|
||||
- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing
|
||||
Opinions on the Web". Proceedings of the 14th international World Wide Web
|
||||
conference (WWW-2005), May 10-14, 2005, in Chiba, Japan.
|
||||
"""
|
||||
import re
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.tokenize import *
|
||||
|
||||
|
||||
class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    Reader for the Pros and Cons sentence dataset.

        >>> from nltk.corpus import pros_cons
        >>> pros_cons.sents(categories='Cons') # doctest: +NORMALIZE_WHITESPACE
        [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy',
        'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'],
        ...]
        >>> pros_cons.words('IntegratedPros.txt')
        ['Easy', 'to', 'use', ',', 'economical', '!', ...]
    """

    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WordPunctTokenizer(),
        encoding="utf8",
        **kwargs
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        """
        # Fix: the docstring previously claimed the default tokenizer
        # was WhitespaceTokenizer; the actual default is WordPunctTokenizer.
        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
        self._word_tokenizer = word_tokenizer

    def sents(self, fileids=None, categories=None):
        """
        Return all sentences in the corpus or in the specified files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences
            have to be returned.
        :return: the given file(s) as a list of sentences. Each sentence is
            tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None, categories=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have
            to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_sent_block(self, stream):
        sents = []
        for _ in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # Fix: was ``continue`` — at EOF readline() returns ''
                # forever, so continuing just spins through the
                # remaining iterations; break is behavior-identical
                # (blank mid-file lines still contain '\n') and faster.
                break
            sent = re.match(r"^(?!\n)\s*<(Pros|Cons)>(.*)</(?:Pros|Cons)>", line)
            if sent:
                sents.append(self._word_tokenizer.tokenize(sent.group(2).strip()))
        return sents

    def _read_word_block(self, stream):
        # Flatten one sentence block into a single word list.
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words
|
||||
331
backend/venv/Lib/site-packages/nltk/corpus/reader/reviews.py
Normal file
331
backend/venv/Lib/site-packages/nltk/corpus/reader/reviews.py
Normal file
@@ -0,0 +1,331 @@
|
||||
# Natural Language Toolkit: Product Reviews Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
CorpusReader for reviews corpora (syntax based on Customer Review Corpus).
|
||||
|
||||
Customer Review Corpus information
|
||||
==================================
|
||||
|
||||
Annotated by: Minqing Hu and Bing Liu, 2004.
|
||||
Department of Computer Science
|
||||
University of Illinois at Chicago
|
||||
|
||||
Contact: Bing Liu, liub@cs.uic.edu
|
||||
https://www.cs.uic.edu/~liub
|
||||
|
||||
Distributed with permission.
|
||||
|
||||
The "product_reviews_1" and "product_reviews_2" datasets respectively contain
|
||||
annotated customer reviews of 5 and 9 products from amazon.com.
|
||||
|
||||
Related papers:
|
||||
|
||||
- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
|
||||
Proceedings of the ACM SIGKDD International Conference on Knowledge
|
||||
Discovery & Data Mining (KDD-04), 2004.
|
||||
|
||||
- Minqing Hu and Bing Liu. "Mining Opinion Features in Customer Reviews".
|
||||
Proceedings of Nineteenth National Conference on Artificial Intelligence
|
||||
(AAAI-2004), 2004.
|
||||
|
||||
- Xiaowen Ding, Bing Liu and Philip S. Yu. "A Holistic Lexicon-Based Approach to
|
||||
Opinion Mining." Proceedings of First ACM International Conference on Web
|
||||
Search and Data Mining (WSDM-2008), Feb 11-12, 2008, Stanford University,
|
||||
Stanford, California, USA.
|
||||
|
||||
Symbols used in the annotated reviews:
|
||||
|
||||
:[t]: the title of the review: Each [t] tag starts a review.
|
||||
:xxxx[+|-n]: xxxx is a product feature.
|
||||
:[+n]: Positive opinion, n is the opinion strength: 3 strongest, and 1 weakest.
|
||||
Note that the strength is quite subjective.
|
||||
You may want to ignore it, and only consider + and -
|
||||
:[-n]: Negative opinion
|
||||
:##: start of each sentence. Each line is a sentence.
|
||||
:[u]: feature not appeared in the sentence.
|
||||
:[p]: feature not appeared in the sentence. Pronoun resolution is needed.
|
||||
:[s]: suggestion or recommendation.
|
||||
:[cc]: comparison with a competing product from a different brand.
|
||||
:[cs]: comparison with a competing product from the same brand.
|
||||
|
||||
Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not
|
||||
provide separation between different reviews. This is due to the fact that
|
||||
the dataset was specifically designed for aspect/feature-based sentiment
|
||||
analysis, for which sentence-level annotation is sufficient. For document-
|
||||
level classification and analysis, this peculiarity should be taken into
|
||||
consideration.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.tokenize import *
|
||||
|
||||
# Regular expressions for the markup used in the Customer Review Corpus.
TITLE = re.compile(r"^\[t\](.*)$")  # matches a review title line: "[t] Title"
FEATURES = re.compile(
    r"((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]"
)  # captures ("feature", "+3") from an annotation like "feature[+3]"
NOTES = re.compile(r"\[(?!t)(p|u|s|cc|cs)\]")  # captures the note code, e.g. 'p' in "camera[+2][p]"
SENT = re.compile(r"##(.*)$")  # captures the tokenized sentence text following "##"
|
||||
|
||||
|
||||
class Review:
    """
    A Review is the main block of a ReviewsCorpusReader.
    """

    def __init__(self, title=None, review_lines=None):
        """
        :param title: the title of the review.
        :param review_lines: the list of the ReviewLines that belong to the Review.
        """
        self.title = title
        # Avoid a shared mutable default by creating the list per instance.
        self.review_lines = [] if review_lines is None else review_lines

    def add_line(self, review_line):
        """
        Add a line (ReviewLine) to the review.

        :param review_line: a ReviewLine instance that belongs to the Review.
        """
        assert isinstance(review_line, ReviewLine)
        self.review_lines.append(review_line)

    def features(self):
        """
        Return a list of features in the review. Each feature is a tuple made of
        the specific item feature and the opinion strength about that feature.

        :return: all features of the review as a list of tuples (feat, score).
        :rtype: list(tuple)
        """
        return [
            feature
            for line in self.review_lines
            for feature in line.features
        ]

    def sents(self):
        """
        Return all tokenized sentences in the review.

        :return: all sentences of the review as lists of tokens.
        :rtype: list(list(str))
        """
        return [line.sent for line in self.review_lines]

    def __repr__(self):
        return f'Review(title="{self.title}", review_lines={self.review_lines})'
|
||||
|
||||
|
||||
class ReviewLine:
    """
    A ReviewLine represents a sentence of the review, together with (optional)
    annotations of its features and notes about the reviewed item.
    """

    def __init__(self, sent, features=None, notes=None):
        """
        :param sent: the tokenized sentence (list of word strings).
        :param features: optional list of (feature, score) tuples.
        :param notes: optional list of note codes (e.g. 'p', 'u', 'cc').
        """
        self.sent = sent
        # Per-instance lists instead of shared mutable defaults.
        self.features = [] if features is None else features
        self.notes = [] if notes is None else notes

    def __repr__(self):
        return (
            f"ReviewLine(features={self.features}, "
            f"notes={self.notes}, sent={self.sent})"
        )
|
||||
|
||||
|
||||
class ReviewsCorpusReader(CorpusReader):
    """
    Reader for the Customer Review Data dataset by Hu, Liu (2004).
    Note: we are not applying any sentence tokenization at the moment, just word
    tokenization.

    >>> from nltk.corpus import product_reviews_1
    >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt')
    >>> review = camera_reviews[0]
    >>> review.sents()[0] # doctest: +NORMALIZE_WHITESPACE
    ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am',
    'extremely', 'satisfied', 'with', 'the', 'purchase', '.']
    >>> review.features() # doctest: +NORMALIZE_WHITESPACE
    [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'),
    ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'),
    ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'),
    ('option', '+1')]

    We can also reach the same information directly from the stream:

    >>> product_reviews_1.features('Canon_G3.txt')
    [('canon powershot g3', '+3'), ('use', '+2'), ...]

    We can compute stats for specific product features:

    >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
    >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
    >>> mean = tot / n_reviews
    >>> print(n_reviews, tot, mean)
    15 24 1.6
    """

    # Views over the corpus are backed directly by the file stream.
    CorpusView = StreamBackedCorpusView

    def __init__(
        self, root, fileids, word_tokenizer=WordPunctTokenizer(), encoding="utf8"
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        """

        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._readme = "README.txt"

    def features(self, fileids=None):
        """
        Return a list of features. Each feature is a tuple made of the specific
        item feature and the opinion strength about that feature.

        :param fileids: a list or regexp specifying the ids of the files whose
            features have to be returned.
        :return: all features for the item(s) in the given file(s).
        :rtype: list(tuple)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(fileid, self._read_features, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def reviews(self, fileids=None):
        """
        Return all the reviews as a list of Review objects. If `fileids` is
        specified, return all the reviews from each of the specified files.

        :param fileids: a list or regexp specifying the ids of the files whose
            reviews have to be returned.
        :return: the given file(s) as a list of reviews.
        :rtype: list(Review)
        """
        if fileids is None:
            fileids = self._fileids
        return concat(
            [
                self.CorpusView(fileid, self._read_review_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def sents(self, fileids=None):
        """
        Return all sentences in the corpus or in the specified files.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: the given file(s) as a list of sentences, each encoded as a
            list of word strings.
        :rtype: list(list(str))
        """
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        files.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_features(self, stream):
        # Read up to 20 lines and collect every (feature, score) annotation
        # found by the module-level FEATURES regexp.
        features = []
        for i in range(20):
            line = stream.readline()
            if not line:
                return features
            features.extend(re.findall(FEATURES, line))
        return features

    def _read_review_block(self, stream):
        # Scan forward to the start of the next review (a "[t]" title line);
        # everything before it is skipped.
        while True:
            line = stream.readline()
            if not line:
                return []  # end of file.
            title_match = re.match(TITLE, line)
            if title_match:
                review = Review(
                    title=title_match.group(1).strip()
                )  # We create a new review
                break

        # Scan until we find another line matching the regexp, or EOF.
        while True:
            # Remember the position before each read so we can rewind when
            # the next review's title line is encountered.
            oldpos = stream.tell()
            line = stream.readline()
            # End of file:
            if not line:
                return [review]
            # Start of a new review: backup to just before it starts, and
            # return the review we've already collected.
            if re.match(TITLE, line):
                stream.seek(oldpos)
                return [review]
            # Anything else is part of the review line.
            feats = re.findall(FEATURES, line)
            notes = re.findall(NOTES, line)
            sent = re.findall(SENT, line)
            if sent:
                sent = self._word_tokenizer.tokenize(sent[0])
            review_line = ReviewLine(sent=sent, features=feats, notes=notes)
            review.add_line(review_line)

    def _read_sent_block(self, stream):
        # Flatten one review block into its list of tokenized sentences.
        sents = []
        for review in self._read_review_block(stream):
            sents.extend([sent for sent in review.sents()])
        return sents

    def _read_word_block(self, stream):
        # Read 20 lines at a time and tokenize every "##"-annotated sentence.
        words = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            sent = re.findall(SENT, line)
            if sent:
                words.extend(self._word_tokenizer.tokenize(sent[0]))
        return words
|
||||
146
backend/venv/Lib/site-packages/nltk/corpus/reader/rte.py
Normal file
146
backend/venv/Lib/site-packages/nltk/corpus/reader/rte.py
Normal file
@@ -0,0 +1,146 @@
|
||||
# Natural Language Toolkit: RTE Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Corpus reader for the Recognizing Textual Entailment (RTE) Challenge Corpora.
|
||||
|
||||
The files were taken from the RTE1, RTE2 and RTE3 datasets and the files
|
||||
were regularized.
|
||||
|
||||
Filenames are of the form rte*_dev.xml and rte*_test.xml. The latter are the
|
||||
gold standard annotated files.
|
||||
|
||||
Each entailment corpus is a list of 'text'/'hypothesis' pairs. The following
|
||||
example is taken from RTE3::
|
||||
|
||||
<pair id="1" entailment="YES" task="IE" length="short" >
|
||||
|
||||
<t>The sale was made to pay Yukos' US$ 27.5 billion tax bill,
|
||||
Yuganskneftegaz was originally sold for US$ 9.4 billion to a little known
|
||||
company Baikalfinansgroup which was later bought by the Russian
|
||||
state-owned oil company Rosneft .</t>
|
||||
|
||||
<h>Baikalfinansgroup was sold to Rosneft.</h>
|
||||
</pair>
|
||||
|
||||
In order to provide globally unique IDs for each pair, a new attribute
|
||||
``challenge`` has been added to the root element ``entailment-corpus`` of each
|
||||
file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the
|
||||
challenge number and 'n' is the pair ID.
|
||||
"""
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.xmldocs import *
|
||||
|
||||
|
||||
def norm(value_string):
    """
    Normalize the string value in an RTE pair's ``value`` or ``entailment``
    attribute as an integer (1, 0).

    :param value_string: the label used to classify a text/hypothesis pair
    :type value_string: str
    :rtype: int
    """
    # Both the boolean-style (TRUE/FALSE) and yes/no-style labels map onto
    # the same 1/0 encoding; the lookup is case-insensitive.
    return {"TRUE": 1, "FALSE": 0, "YES": 1, "NO": 0}[value_string.upper()]
|
||||
|
||||
|
||||
class RTEPair:
    """
    Container for RTE text-hypothesis pairs.

    The entailment relation is signalled by the ``value`` attribute in RTE1, and by
    ``entailment`` in RTE2 and RTE3. These both get mapped on to the ``entailment``
    attribute of this class.
    """

    def __init__(
        self,
        pair,
        challenge=None,
        id=None,
        text=None,
        hyp=None,
        value=None,
        task=None,
        length=None,
    ):
        """
        :param challenge: version of the RTE challenge (i.e., RTE1, RTE2 or RTE3)
        :param id: identifier for the pair
        :param text: the text component of the pair
        :param hyp: the hypothesis component of the pair
        :param value: classification label for the pair
        :param task: attribute for the particular NLP task that the data was drawn from
        :param length: attribute for the length of the text of the pair

        NOTE(review): the ``id``, ``text`` and ``hyp`` arguments are accepted
        but never read -- those fields are always taken from the XML element
        itself. Kept for interface compatibility.
        """
        attrib = pair.attrib
        self.challenge = challenge
        self.id = attrib["id"]
        self.gid = f"{self.challenge}-{self.id}"
        # By convention the first child element is <t> (text) and the
        # second is <h> (hypothesis).
        self.text = pair[0].text
        self.hyp = pair[1].text

        # RTE1 uses "value", RTE2/RTE3 use "entailment"; fall back to the
        # explicit argument only when neither attribute is present.
        if "value" in attrib:
            self.value = norm(attrib["value"])
        elif "entailment" in attrib:
            self.value = norm(attrib["entailment"])
        else:
            self.value = value
        self.task = attrib.get("task", task)
        self.length = attrib.get("length", length)

    def __repr__(self):
        if self.challenge:
            return f"<RTEPair: gid={self.challenge}-{self.id}>"
        return "<RTEPair: id=%s>" % self.id
|
||||
|
||||
|
||||
class RTECorpusReader(XMLCorpusReader):
    """
    Corpus reader for corpora in RTE challenges.

    This is just a wrapper around the XMLCorpusReader. See module docstring above for the expected
    structure of input documents.
    """

    def _read_etree(self, doc):
        """
        Map the XML input into an RTEPair.

        This uses the ``iter()`` method from the ElementTree package to
        find all the ``<pair>`` elements.

        :param doc: a parsed XML document
        :rtype: list(RTEPair)
        """
        try:
            challenge = doc.attrib["challenge"]
        except KeyError:
            # Files predating the globally-unique-ID convention may lack
            # the "challenge" attribute; RTEPair tolerates None.
            challenge = None
        pairiter = doc.iter("pair")
        return [RTEPair(pair, challenge=challenge) for pair in pairiter]

    def pairs(self, fileids):
        """
        Build a list of RTEPairs from a RTE corpus.

        :param fileids: a list of RTE corpus fileids
        :type: list
        :rtype: list(RTEPair)
        """
        if isinstance(fileids, str):
            fileids = [fileids]
        return concat([self._read_etree(self.xml(fileid)) for fileid in fileids])
|
||||
296
backend/venv/Lib/site-packages/nltk/corpus/reader/semcor.py
Normal file
296
backend/venv/Lib/site-packages/nltk/corpus/reader/semcor.py
Normal file
@@ -0,0 +1,296 @@
|
||||
# Natural Language Toolkit: SemCor Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Nathan Schneider <nschneid@cs.cmu.edu>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Corpus reader for the SemCor Corpus.
|
||||
"""
|
||||
|
||||
__docformat__ = "epytext en"
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
|
||||
from nltk.tree import Tree
|
||||
|
||||
|
||||
class SemcorCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the SemCor Corpus.
    For access to the complete XML data structure, use the ``xml()``
    method. For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
    """

    def __init__(self, root, fileids, wordnet, lazy=True):
        """
        :param root: the root directory of the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param wordnet: the WordNet corpus reader used to resolve sense keys.
        :param lazy: if True (the default), use stream-backed XML views;
            otherwise parse each file eagerly.
        """
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy
        self._wordnet = wordnet

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return self._items(fileids, "word", False, False, False)

    def chunks(self, fileids=None):
        """
        :return: the given file(s) as a list of chunks,
            each of which is a list of words and punctuation symbols
            that form a unit.
        :rtype: list(list(str))
        """
        return self._items(fileids, "chunk", False, False, False)

    def tagged_chunks(self, fileids=None, tag="pos"):
        """
        :return: the given file(s) as a list of tagged chunks, represented
            in tree form.
        :rtype: list(Tree)

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include. Semantic tags consist of
            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
            without a specific entry in WordNet. (Named entities of type 'other'
            have no lemma. Other chunks not in WordNet have no semantic tag.
            Punctuation tokens have `None` for their part of speech tag.)
        """
        # NOTE: the default was historically written ``("pos" or "sem" or "both")``,
        # an expression that always evaluates to "pos"; it is spelled out
        # explicitly here, preserving the effective behavior.
        return self._items(fileids, "chunk", False, tag != "sem", tag != "pos")

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of word strings.
        :rtype: list(list(str))
        """
        return self._items(fileids, "word", True, False, False)

    def chunk_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of chunks.
        :rtype: list(list(list(str)))
        """
        return self._items(fileids, "chunk", True, False, False)

    def tagged_sents(self, fileids=None, tag="pos"):
        """
        :return: the given file(s) as a list of sentences. Each sentence
            is represented as a list of tagged chunks (in tree form).
        :rtype: list(list(Tree))

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include. Semantic tags consist of
            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
            without a specific entry in WordNet. (Named entities of type 'other'
            have no lemma. Other chunks not in WordNet have no semantic tag.
            Punctuation tokens have `None` for their part of speech tag.)
        """
        # Same explicit default as in tagged_chunks(); see the note there.
        return self._items(fileids, "chunk", True, tag != "sem", tag != "pos")

    def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag):
        # Build one view per file, lazily (XML views) or eagerly (_words).
        if unit == "word" and not bracket_sent:
            # the result of the SemcorWordView may be a multiword unit, so the
            # LazyConcatenation will make sure the sentence is flattened
            def make_view(*args):
                return LazyConcatenation(
                    (SemcorWordView if self._lazy else self._words)(*args)
                )

        else:
            make_view = SemcorWordView if self._lazy else self._words
        return concat(
            [
                make_view(fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet)
                for fileid in self.abspaths(fileids)
            ]
        )

    def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
        """
        Helper used to implement the view methods -- returns a list of
        tokens, (segmented) words, chunks, or sentences. The tokens
        and chunks may optionally be tagged (with POS and sense
        information).

        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        assert unit in ("token", "word", "chunk")
        result = []

        xmldoc = ElementTree.parse(fileid).getroot()
        for xmlsent in xmldoc.findall(".//s"):
            sent = []
            for xmlword in _all_xmlwords_in(xmlsent):
                itm = SemcorCorpusReader._word(
                    xmlword, unit, pos_tag, sem_tag, self._wordnet
                )
                if unit == "word":
                    # A multiword token expands to several words.
                    sent.extend(itm)
                else:
                    sent.append(itm)

            if bracket_sent:
                result.append(SemcorSentence(xmlsent.attrib["snum"], sent))
            else:
                result.extend(sent)

        assert None not in result
        return result

    @staticmethod
    def _word(xmlword, unit, pos_tag, sem_tag, wordnet):
        """
        Convert a single ``<wf>``/``<punc>`` element into the requested
        representation: a bare token (or tagged tuple), a list of component
        words, or a (possibly tagged) chunk / Tree.
        """
        tkn = xmlword.text
        if not tkn:
            tkn = ""  # fixes issue 337?

        lemma = xmlword.get("lemma", tkn)  # lemma or NE class
        lexsn = xmlword.get("lexsn")  # lex_sense (locator for the lemma's sense)
        if lexsn is not None:
            sense_key = lemma + "%" + lexsn
            wnpos = ("n", "v", "a", "r", "s")[
                int(lexsn.split(":")[0]) - 1
            ]  # see http://wordnet.princeton.edu/man/senseidx.5WN.html
        else:
            sense_key = wnpos = None
        redef = xmlword.get(
            "rdf", tkn
        )  # redefinition--this indicates the lookup string
        # does not exactly match the enclosed string, e.g. due to typographical adjustments
        # or discontinuity of a multiword expression. If a redefinition has occurred,
        # the "rdf" attribute holds its inflected form and "lemma" holds its lemma.
        # For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class).
        sensenum = xmlword.get("wnsn")  # WordNet sense number
        isOOVEntity = "pn" in xmlword.keys()  # a "personal name" (NE) not in WordNet
        pos = xmlword.get(
            "pos"
        )  # part of speech for the whole chunk (None for punctuation)

        if unit == "token":
            if not pos_tag and not sem_tag:
                itm = tkn
            else:
                itm = (
                    (tkn,)
                    + ((pos,) if pos_tag else ())
                    + ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ())
                )
            return itm
        else:
            ww = tkn.split("_")  # TODO: case where punctuation intervenes in MWE
            if unit == "word":
                return ww
            else:
                if sensenum is not None:
                    try:
                        sense = wordnet.lemma_from_key(sense_key)  # Lemma object
                    except Exception:
                        # cannot retrieve the wordnet.Lemma object. possible reasons:
                        # (a) the wordnet corpus is not downloaded;
                        # (b) a nonexistent sense is annotated: e.g., such.s.00 triggers:
                        # nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00'
                        # solution: just use the lemma name as a string
                        try:
                            sense = "%s.%s.%02d" % (
                                lemma,
                                wnpos,
                                int(sensenum),
                            )  # e.g.: reach.v.02
                        except ValueError:
                            sense = (
                                lemma + "." + wnpos + "." + sensenum
                            )  # e.g. the sense number may be "2;1"

                bottom = [Tree(pos, ww)] if pos_tag else ww

                if sem_tag and isOOVEntity:
                    if sensenum is not None:
                        return Tree(sense, [Tree("NE", bottom)])
                    else:  # 'other' NE
                        return Tree("NE", bottom)
                elif sem_tag and sensenum is not None:
                    return Tree(sense, bottom)
                elif pos_tag:
                    return bottom[0]
                else:
                    return bottom  # chunk as a list
|
||||
|
||||
|
||||
def _all_xmlwords_in(elt, result=None):
|
||||
if result is None:
|
||||
result = []
|
||||
for child in elt:
|
||||
if child.tag in ("wf", "punc"):
|
||||
result.append(child)
|
||||
else:
|
||||
_all_xmlwords_in(child, result)
|
||||
return result
|
||||
|
||||
|
||||
class SemcorSentence(list):
    """
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (the ``n`` attribute from the XML).
    """

    def __init__(self, num, items):
        super().__init__(items)
        self.num = num  # sentence identifier from the source XML
|
||||
|
||||
|
||||
class SemcorWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the SemCor corpus.
    """

    def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet):
        """
        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        :param wordnet: the WordNet corpus reader used to resolve sense keys.
        """
        # Yield whole sentences, or individual word/punctuation elements,
        # depending on whether sentence bracketing was requested.
        if bracket_sent:
            tagspec = ".*/s"
        else:
            tagspec = ".*/s/(punc|wf)"

        self._unit = unit
        self._sent = bracket_sent
        self._pos_tag = pos_tag
        self._sem_tag = sem_tag
        self._wordnet = wordnet

        XMLCorpusView.__init__(self, fileid, tagspec)

    def handle_elt(self, elt, context):
        # Dispatch based on whether this view produces sentences or words.
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        # Token construction is shared with the eager path via the reader's
        # static helper.
        return SemcorCorpusReader._word(
            elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet
        )

    def handle_sent(self, elt):
        sent = []
        for child in elt:
            if child.tag in ("wf", "punc"):
                itm = self.handle_word(child)
                if self._unit == "word":
                    # A multiword token is split into its component words.
                    sent.extend(itm)
                else:
                    sent.append(itm)
            else:
                raise ValueError("Unexpected element %s" % child.tag)
        return SemcorSentence(elt.attrib["snum"], sent)
|
||||
196
backend/venv/Lib/site-packages/nltk/corpus/reader/senseval.py
Normal file
196
backend/venv/Lib/site-packages/nltk/corpus/reader/senseval.py
Normal file
@@ -0,0 +1,196 @@
|
||||
# Natural Language Toolkit: Senseval 2 Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
|
||||
# Steven Bird <stevenbird1@gmail.com> (modifications)
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Read from the Senseval 2 Corpus.
|
||||
|
||||
SENSEVAL [http://www.senseval.org/]
|
||||
Evaluation exercises for Word Sense Disambiguation.
|
||||
Organized by ACL-SIGLEX [https://www.siglex.org/]
|
||||
|
||||
Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,
|
||||
https://www.d.umn.edu/~tpederse/data.html
|
||||
Distributed with permission.
|
||||
|
||||
The NLTK version of the Senseval 2 files uses well-formed XML.
|
||||
Each instance of the ambiguous words "hard", "interest", "line", and "serve"
|
||||
is tagged with a sense identifier, and supplied with context.
|
||||
"""
|
||||
|
||||
import re
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.tokenize import *
|
||||
|
||||
|
||||
class SensevalInstance:
    """
    One tagged occurrence of an ambiguous word.

    Holds the target word, its position in the context, the tagged context
    itself, and the annotated sense identifier(s).
    """

    def __init__(self, word, position, context, senses):
        self.word = word
        # Senses are frozen into a tuple so they are hashable and immutable.
        self.senses = tuple(senses)
        self.position = position
        self.context = context

    def __repr__(self):
        return (
            f"SensevalInstance(word={self.word!r}, position={self.position!r}, "
            f"context={self.context!r}, senses={self.senses!r})"
        )
|
||||
|
||||
|
||||
class SensevalCorpusReader(CorpusReader):
    """
    Reader for the NLTK Senseval 2 corpus files.

    ``instances()`` yields the tagged occurrences of the ambiguous words as
    ``SensevalInstance`` objects.
    """

    def instances(self, fileids=None):
        """Return the SensevalInstances found in the given file(s)."""
        views = [
            SensevalCorpusView(path, enc)
            for (path, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)

    def _entry(self, tree):
        """Extract (senseid, tagged-context) pairs from a parsed lexelt tree."""
        entries = []
        for lexelt in tree.findall("lexelt"):
            for inst in lexelt.findall("instance"):
                sense = inst[0].attrib["senseid"]
                tagged_context = [(w.text, w.attrib["pos"]) for w in inst[1]]
                entries.append((sense, tagged_context))
        return entries
|
||||
|
||||
|
||||
class SensevalCorpusView(StreamBackedCorpusView):
    """A corpus view that reads one Senseval ``<instance>`` element per block.

    The source files are pseudo-XML: each block of lines is repaired with
    ``_fixXML`` before being handed to ElementTree for parsing.
    """

    def __init__(self, fileid, encoding):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)

        self._word_tokenizer = WhitespaceTokenizer()
        # Parallel lists mapping lexelt index -> stream offset / lexelt name,
        # so read_block() can recover which <lexelt> a given stream position
        # falls inside when the view seeks back into the file.
        self._lexelt_starts = [0]  # list of streampos
        self._lexelts = [None]  # list of lexelt names

    def read_block(self, stream):
        """Read the next <instance>; return it as a one-element list.

        Returns ``[]`` at end of file.
        """
        # Decide which lexical element we're in.
        lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell()) - 1
        lexelt = self._lexelts[lexelt_num]

        instance_lines = []
        in_instance = False
        while True:
            line = stream.readline()
            if line == "":
                # End of file: there must be no half-read instance pending.
                assert instance_lines == []
                return []

            # Start of a lexical element?
            if line.lstrip().startswith("<lexelt"):
                lexelt_num += 1
                m = re.search("item=(\"[^\"]+\"|'[^']+')", line)
                assert m is not None  # <lexelt> has no 'item=...'
                lexelt = m.group(1)[1:-1]
                if lexelt_num < len(self._lexelts):
                    # Re-visiting a lexelt we already recorded: sanity-check it.
                    assert lexelt == self._lexelts[lexelt_num]
                else:
                    self._lexelts.append(lexelt)
                    self._lexelt_starts.append(stream.tell())

            # Start of an instance?
            if line.lstrip().startswith("<instance"):
                assert instance_lines == []
                in_instance = True

            # Body of an instance?
            if in_instance:
                instance_lines.append(line)

            # End of an instance?
            if line.lstrip().startswith("</instance"):
                xml_block = "\n".join(instance_lines)
                xml_block = _fixXML(xml_block)
                inst = ElementTree.fromstring(xml_block)
                return [self._parse_instance(inst, lexelt)]

    def _parse_instance(self, instance, lexelt):
        """Convert one parsed <instance> element into a SensevalInstance."""
        senses = []
        context = []
        position = None
        for child in instance:
            if child.tag == "answer":
                senses.append(child.attrib["senseid"])
            elif child.tag == "context":
                context += self._word_tokenizer.tokenize(child.text)
                for cword in child:
                    if cword.tag == "compound":
                        cword = cword[0]  # is this ok to do?

                    if cword.tag == "head":
                        # Some sanity checks:
                        assert position is None, "head specified twice"
                        # A head carries either literal text or one child, not both.
                        assert cword.text.strip() or len(cword) == 1
                        assert not (cword.text.strip() and len(cword) == 1)
                        # Record the position of the head:
                        position = len(context)
                        # Add on the head word itself:
                        if cword.text.strip():
                            context.append(cword.text.strip())
                        elif cword[0].tag == "wf":
                            context.append((cword[0].text, cword[0].attrib["pos"]))
                            if cword[0].tail:
                                context += self._word_tokenizer.tokenize(cword[0].tail)
                        else:
                            assert False, "expected CDATA or wf in <head>"
                    elif cword.tag == "wf":
                        context.append((cword.text, cword.attrib["pos"]))
                    elif cword.tag == "s":
                        pass  # Sentence boundary marker.

                    else:
                        print("ACK", cword.tag)
                        assert False, "expected CDATA or <wf> or <head>"
                    # Trailing text after any tag is tokenized into the context.
                    if cword.tail:
                        context += self._word_tokenizer.tokenize(cword.tail)
            else:
                assert False, "unexpected tag %s" % child.tag
        return SensevalInstance(lexelt, position, context, senses)
|
||||
|
||||
|
||||
def _fixXML(text):
|
||||
"""
|
||||
Fix the various issues with Senseval pseudo-XML.
|
||||
"""
|
||||
# <~> or <^> => ~ or ^
|
||||
text = re.sub(r"<([~\^])>", r"\1", text)
|
||||
# fix lone &
|
||||
text = re.sub(r"(\s+)\&(\s+)", r"\1&\2", text)
|
||||
# fix """
|
||||
text = re.sub(r'"""', "'\"'", text)
|
||||
# fix <s snum=dd> => <s snum="dd"/>
|
||||
text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text)
|
||||
# fix foreign word tag
|
||||
text = re.sub(r"<\&frasl>\s*<p[^>]*>", "FRASL", text)
|
||||
# remove <&I .>
|
||||
text = re.sub(r"<\&I[^>]*>", "", text)
|
||||
# fix <{word}>
|
||||
text = re.sub(r"<{([^}]+)}>", r"\1", text)
|
||||
# remove <@>, <p>, </p>
|
||||
text = re.sub(r"<(@|/?p)>", r"", text)
|
||||
# remove <&M .> and <&T .> and <&Ms .>
|
||||
text = re.sub(r"<&\w+ \.>", r"", text)
|
||||
# remove <!DOCTYPE... > lines
|
||||
text = re.sub(r"<!DOCTYPE[^>]*>", r"", text)
|
||||
# remove <[hi]> and <[/p]> etc
|
||||
text = re.sub(r"<\[\/?[^>]+\]*>", r"", text)
|
||||
# take the thing out of the brackets: <…>
|
||||
text = re.sub(r"<(\&\w+;)>", r"\1", text)
|
||||
# and remove the & for those patterns that aren't regular XML
|
||||
text = re.sub(r"&(?!amp|gt|lt|apos|quot)", r"", text)
|
||||
# fix 'abc <p="foo"/>' style tags - now <wf pos="foo">abc</wf>
|
||||
text = re.sub(
|
||||
r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>', r' <wf pos="\2">\1</wf>', text
|
||||
)
|
||||
text = re.sub(r'\s*"\s*<p=\'"\'/>', " <wf pos='\"'>\"</wf>", text)
|
||||
return text
|
||||
@@ -0,0 +1,136 @@
|
||||
# Natural Language Toolkit: SentiWordNet
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Christopher Potts <cgpotts@stanford.edu>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
An NLTK interface for SentiWordNet
|
||||
|
||||
SentiWordNet is a lexical resource for opinion mining.
|
||||
SentiWordNet assigns to each synset of WordNet three
|
||||
sentiment scores: positivity, negativity, and objectivity.
|
||||
|
||||
For details about SentiWordNet see:
|
||||
http://sentiwordnet.isti.cnr.it/
|
||||
|
||||
>>> from nltk.corpus import sentiwordnet as swn
|
||||
>>> print(swn.senti_synset('breakdown.n.03'))
|
||||
<breakdown.n.03: PosScore=0.0 NegScore=0.25>
|
||||
>>> list(swn.senti_synsets('slow'))
|
||||
[SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'),\
|
||||
SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),\
|
||||
SentiSynset('slow.a.02'), SentiSynset('dense.s.04'),\
|
||||
SentiSynset('slow.a.04'), SentiSynset('boring.s.01'),\
|
||||
SentiSynset('dull.s.05'), SentiSynset('slowly.r.01'),\
|
||||
SentiSynset('behind.r.03')]
|
||||
>>> happy = swn.senti_synsets('happy', 'a')
|
||||
>>> happy0 = list(happy)[0]
|
||||
>>> happy0.pos_score()
|
||||
0.875
|
||||
>>> happy0.neg_score()
|
||||
0.0
|
||||
>>> happy0.obj_score()
|
||||
0.125
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from nltk.corpus.reader import CorpusReader
|
||||
|
||||
|
||||
class SentiWordNetCorpusReader(CorpusReader):
    """Corpus reader for the SentiWordNet sentiment lexicon.

    Maps each WordNet synset, keyed by ``(pos, offset)``, to a
    ``(positive, negative)`` score pair; objectivity is derived by
    SentiSynset as ``1 - (pos + neg)``.
    """

    def __init__(self, root, fileids, encoding="utf-8"):
        """
        Construct a new SentiWordNet Corpus Reader, using data from
        the specified file.
        """
        super().__init__(root, fileids, encoding=encoding)
        if len(self._fileids) != 1:
            raise ValueError("Exactly one file must be specified")
        self._db = {}  # (pos, offset) -> (pos_score, neg_score)
        self._parse_src_file()

    def _parse_src_file(self):
        """Populate ``self._db`` from the tab-separated source file.

        :raises ValueError: if a non-comment line does not have exactly
            six tab-separated fields.
        """
        fp = self.open(self._fileids[0])
        try:
            lines = fp.read().splitlines()
        finally:
            # Close the stream explicitly instead of leaking it.
            fp.close()
        # Skip comment lines (leading '#', possibly after whitespace).
        lines = filter((lambda x: not re.search(r"^\s*#", x)), lines)
        for i, line in enumerate(lines):
            fields = [field.strip() for field in re.split(r"\t+", line)]
            try:
                pos, offset, pos_score, neg_score, synset_terms, gloss = fields
            except ValueError as e:
                # Wrong number of tab-separated fields on this line.
                raise ValueError(f"Line {i} formatted incorrectly: {line}\n") from e
            if pos and offset:
                offset = int(offset)
                self._db[(pos, offset)] = (float(pos_score), float(neg_score))

    def senti_synset(self, *vals):
        """Look up one SentiSynset, by (pos, offset) pair or by synset name.

        Returns ``None`` if the synset has no SentiWordNet entry.
        """
        from nltk.corpus import wordnet as wn

        if tuple(vals) in self._db:
            pos_score, neg_score = self._db[tuple(vals)]
            pos, offset = vals
            # SentiWordNet uses "a" where WordNet distinguishes
            # satellite adjectives ("s").
            if pos == "s":
                pos = "a"
            synset = wn.synset_from_pos_and_offset(pos, offset)
            return SentiSynset(pos_score, neg_score, synset)
        else:
            # Treat the single argument as a synset name, e.g. 'slow.a.01'.
            synset = wn.synset(vals[0])
            pos = synset.pos()
            if pos == "s":
                pos = "a"
            offset = synset.offset()
            if (pos, offset) in self._db:
                pos_score, neg_score = self._db[(pos, offset)]
                return SentiSynset(pos_score, neg_score, synset)
            else:
                return None

    def senti_synsets(self, string, pos=None):
        """Return an iterator of SentiSynsets for all synsets of ``string``."""
        from nltk.corpus import wordnet as wn

        sentis = []
        synset_list = wn.synsets(string, pos)
        for synset in synset_list:
            sentis.append(self.senti_synset(synset.name()))
        # Drop synsets that had no SentiWordNet entry (None results).
        sentis = filter(lambda x: x, sentis)
        return sentis

    def all_senti_synsets(self):
        """Yield every SentiSynset in the database."""
        from nltk.corpus import wordnet as wn

        for key, fields in self._db.items():
            pos, offset = key
            pos_score, neg_score = fields
            synset = wn.synset_from_pos_and_offset(pos, offset)
            yield SentiSynset(pos_score, neg_score, synset)
|
||||
|
||||
|
||||
class SentiSynset:
    """A WordNet synset paired with its SentiWordNet sentiment scores."""

    def __init__(self, pos_score, neg_score, synset):
        self._pos_score = pos_score
        self._neg_score = neg_score
        # Objectivity is the probability mass left over after pos and neg.
        self._obj_score = 1.0 - (self._pos_score + self._neg_score)
        self.synset = synset

    def pos_score(self):
        """Return the positivity score."""
        return self._pos_score

    def neg_score(self):
        """Return the negativity score."""
        return self._neg_score

    def obj_score(self):
        """Return the objectivity score, ``1 - (pos + neg)``."""
        return self._obj_score

    def __str__(self):
        """Prints just the Pos/Neg scores for now."""
        return "<%s: PosScore=%s NegScore=%s>" % (
            self.synset.name(),
            self._pos_score,
            self._neg_score,
        )

    def __repr__(self):
        return "Senti" + repr(self.synset)
|
||||
@@ -0,0 +1,75 @@
|
||||
# Natural Language Toolkit: Sinica Treebank Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Sinica Treebank Corpus Sample
|
||||
|
||||
http://rocling.iis.sinica.edu.tw/CKIP/engversion/treebank.htm
|
||||
|
||||
10,000 parsed sentences, drawn from the Academia Sinica Balanced
|
||||
Corpus of Modern Chinese. Parse tree notation is based on
|
||||
Information-based Case Grammar. Tagset documentation is available
|
||||
at https://www.sinica.edu.tw/SinicaCorpus/modern_e_wordtype.html
|
||||
|
||||
Language and Knowledge Processing Group, Institute of Information
|
||||
Science, Academia Sinica
|
||||
|
||||
The data is distributed with the Natural Language Toolkit under the terms of
|
||||
the Creative Commons Attribution-NonCommercial-ShareAlike License
|
||||
[https://creativecommons.org/licenses/by-nc-sa/2.5/].
|
||||
|
||||
References:
|
||||
|
||||
Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999)
|
||||
The Construction of Sinica Treebank. Computational Linguistics and
|
||||
Chinese Language Processing, 4, pp 87-104.
|
||||
|
||||
Huang Chu-Ren, Keh-Jiann Chen, Feng-Yi Chen, Keh-Jiann Chen, Zhao-Ming
|
||||
Gao, and Kuang-Yu Chen. 2000. Sinica Treebank: Design Criteria,
|
||||
Annotation Guidelines, and On-line Interface. Proceedings of 2nd
|
||||
Chinese Language Processing Workshop, Association for Computational
|
||||
Linguistics.
|
||||
|
||||
Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar
|
||||
Extraction, Proceedings of IJCNLP-04, pp560-565.
|
||||
"""
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.tag import map_tag
|
||||
from nltk.tree import sinica_parse
|
||||
|
||||
# Regexps used to strip metadata and pull (tag, word) pairs out of one
# line of the Sinica treebank's parenthesized parse-tree format.
IDENTIFIER = re.compile(r"^#\S+\s")  # leading "#<identifier> " marker
APPENDIX = re.compile(r"(?<=\))#.*$")  # trailing "#..." after the final ")"
TAGWORD = re.compile(r":([^:()|]+):([^:()|]+)")  # ":tag:word" pairs
WORD = re.compile(r":[^:()|]+:([^:()|]+)")  # just the word of each pair
|
||||
|
||||
|
||||
class SinicaTreebankCorpusReader(SyntaxCorpusReader):
    """
    Reader for the sinica treebank.
    """

    def _read_block(self, stream):
        # One sentence per line; strip the leading identifier and the
        # trailing appendix comment before handing the line back.
        raw = stream.readline()
        return [APPENDIX.sub("", IDENTIFIER.sub("", raw))]

    def _parse(self, sent):
        # Delegate bracket parsing to the shared Sinica tree parser.
        return sinica_parse(sent)

    def _tag(self, sent, tagset=None):
        """Return the (word, tag) pairs of ``sent``, optionally remapped."""
        pairs = []
        for tag, word in TAGWORD.findall(sent):
            pairs.append((word, tag))
        if tagset and tagset != self._tagset:
            pairs = [
                (word, map_tag(self._tagset, tagset, tag)) for (word, tag) in pairs
            ]
        return pairs

    def _word(self, sent):
        # The WORD regexp captures only the word of each ":tag:word" pair.
        return WORD.findall(sent)
|
||||
@@ -0,0 +1,56 @@
|
||||
# Natural Language Toolkit: String Category Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Read tuples from a corpus consisting of categorized strings.
|
||||
For example, from the question classification corpus:
|
||||
|
||||
NUM:dist How far is it from Denver to Aspen ?
|
||||
LOC:city What county is Modesto , California in ?
|
||||
HUM:desc Who was Galileo ?
|
||||
DESC:def What is an atom ?
|
||||
NUM:date When did Hawaii become a state ?
|
||||
"""
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
# based on PPAttachmentCorpusReader
|
||||
from nltk.corpus.reader.util import *
|
||||
|
||||
|
||||
# [xx] Should the order of the tuple be reversed -- in most other places
|
||||
# in nltk, we use the form (data, tag) -- e.g., tagged words and
|
||||
# labeled texts for classifiers.
|
||||
class StringCategoryCorpusReader(CorpusReader):
    """Reader for corpora of categorized strings, one record per line."""

    def __init__(self, root, fileids, delimiter=" ", encoding="utf8"):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param delimiter: Field delimiter
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._delimiter = delimiter

    def tuples(self, fileids=None):
        """Return a view of (category, text) tuples from ``fileids``."""
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        views = [
            StreamBackedCorpusView(path, self._read_tuple_block, encoding=enc)
            for (path, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)

    def _read_tuple_block(self, stream):
        # Each non-blank line splits into (category, rest-of-line).
        line = stream.readline().strip()
        if not line:
            return []
        return [tuple(line.split(self._delimiter, 1))]
|
||||
125
backend/venv/Lib/site-packages/nltk/corpus/reader/switchboard.py
Normal file
125
backend/venv/Lib/site-packages/nltk/corpus/reader/switchboard.py
Normal file
@@ -0,0 +1,125 @@
|
||||
# Natural Language Toolkit: Switchboard Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
import re
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.tag import map_tag, str2tuple
|
||||
|
||||
|
||||
class SwitchboardTurn(list):
    """
    A specialized list object used to encode switchboard utterances.
    The elements of the list are the words in the utterance; and two
    attributes, ``speaker`` and ``id``, are provided to retrieve the
    speaker identifier and utterance id.  Note that utterance ids
    are only unique within a given discourse.
    """

    def __init__(self, words, speaker, id):
        super().__init__(words)
        self.speaker = speaker
        # Utterance ids arrive as strings in the source files.
        self.id = int(id)

    def __repr__(self):
        if not self:
            text = ""
        elif isinstance(self[0], tuple):
            # Tagged words render as word/tag.
            text = " ".join("%s/%s" % pair for pair in self)
        else:
            text = " ".join(self)
        return f"<{self.speaker}.{self.id}: {text!r}>"
|
||||
|
||||
|
||||
class SwitchboardCorpusReader(CorpusReader):
    """Corpus reader for the tagged Switchboard dialogue transcripts.

    All access methods are backed by the single "tagged" file; the
    untagged views simply drop the tags after parsing.
    """

    _FILES = ["tagged"]
    # Use the "tagged" file even for non-tagged data methods, since
    # it's tokenized.

    def __init__(self, root, tagset=None):
        CorpusReader.__init__(self, root, self._FILES)
        # Native tagset of the corpus; used by map_tag conversions below.
        self._tagset = tagset

    def words(self):
        """Return a corpus view of all words, untagged."""
        return StreamBackedCorpusView(self.abspath("tagged"), self._words_block_reader)

    def tagged_words(self, tagset=None):
        """Return a corpus view of (word, tag) pairs, optionally mapped to ``tagset``."""

        def tagged_words_block_reader(stream):
            return self._tagged_words_block_reader(stream, tagset)

        return StreamBackedCorpusView(self.abspath("tagged"), tagged_words_block_reader)

    def turns(self):
        """Return a corpus view of untagged SwitchboardTurns."""
        return StreamBackedCorpusView(self.abspath("tagged"), self._turns_block_reader)

    def tagged_turns(self, tagset=None):
        """Return a corpus view of tagged SwitchboardTurns."""

        def tagged_turns_block_reader(stream):
            return self._tagged_turns_block_reader(stream, tagset)

        return StreamBackedCorpusView(self.abspath("tagged"), tagged_turns_block_reader)

    def discourses(self):
        """Return a corpus view of discourses (lists of untagged turns)."""
        return StreamBackedCorpusView(
            self.abspath("tagged"), self._discourses_block_reader
        )

    def tagged_discourses(self, tagset=False):
        """Return a corpus view of discourses of tagged turns.

        NOTE(review): the default here is ``False`` while the sibling
        methods use ``None``; both are falsy so behavior matches, but the
        inconsistency looks accidental — confirm before relying on it.
        """

        def tagged_discourses_block_reader(stream):
            return self._tagged_discourses_block_reader(stream, tagset)

        return StreamBackedCorpusView(
            self.abspath("tagged"), tagged_discourses_block_reader
        )

    def _discourses_block_reader(self, stream):
        # returns at most 1 discourse.  (The other methods depend on this.)
        # A discourse is one blank-line-separated paragraph; each non-empty
        # line within it is one utterance.
        return [
            [
                self._parse_utterance(u, include_tag=False)
                for b in read_blankline_block(stream)
                for u in b.split("\n")
                if u.strip()
            ]
        ]

    def _tagged_discourses_block_reader(self, stream, tagset=None):
        # returns at most 1 discourse.  (The other methods depend on this.)
        return [
            [
                self._parse_utterance(u, include_tag=True, tagset=tagset)
                for b in read_blankline_block(stream)
                for u in b.split("\n")
                if u.strip()
            ]
        ]

    def _turns_block_reader(self, stream):
        # The turns of the single discourse read from this block.
        return self._discourses_block_reader(stream)[0]

    def _tagged_turns_block_reader(self, stream, tagset=None):
        return self._tagged_discourses_block_reader(stream, tagset)[0]

    def _words_block_reader(self, stream):
        # Flatten the discourse's turns (which are lists) into one word list.
        return sum(self._discourses_block_reader(stream)[0], [])

    def _tagged_words_block_reader(self, stream, tagset=None):
        return sum(self._tagged_discourses_block_reader(stream, tagset)[0], [])

    # One utterance line looks like "A.1: word/TAG word/TAG ...".
    _UTTERANCE_RE = re.compile(r"(\w+)\.(\d+)\:\s*(.*)")
    _SEP = "/"

    def _parse_utterance(self, utterance, include_tag, tagset=None):
        """Parse one "speaker.id: text" line into a SwitchboardTurn.

        :raises ValueError: if the line does not match _UTTERANCE_RE.
        """
        m = self._UTTERANCE_RE.match(utterance)
        if m is None:
            raise ValueError("Bad utterance %r" % utterance)
        speaker, id, text = m.groups()
        words = [str2tuple(s, self._SEP) for s in text.split()]
        if not include_tag:
            words = [w for (w, t) in words]
        elif tagset and tagset != self._tagset:
            words = [(w, map_tag(self._tagset, tagset, t)) for (w, t) in words]
        return SwitchboardTurn(words, speaker, id)
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user