Initial commit
34 backend/venv/Lib/site-packages/nltk/stem/__init__.py Normal file
@@ -0,0 +1,34 @@
# Natural Language Toolkit: Stemmers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
#         Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
NLTK Stemmers

Interfaces used to remove morphological affixes from words, leaving
only the word stem. Stemming algorithms aim to remove those affixes
required for, e.g., grammatical role, tense, or derivational
morphology, leaving only the stem of the word. This is a difficult
problem due to irregular words (e.g. common verbs in English),
complicated morphological rules, and part-of-speech and sense
ambiguities (e.g. ``ceil-`` is not the stem of ``ceiling``).

StemmerI defines a standard interface for stemmers.
"""

from nltk.stem.api import StemmerI
from nltk.stem.arlstem import ARLSTem
from nltk.stem.arlstem2 import ARLSTem2
from nltk.stem.cistem import Cistem
from nltk.stem.isri import ISRIStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.regexp import RegexpStemmer
from nltk.stem.rslp import RSLPStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
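Editor's note: a minimal usage sketch of the interface this module re-exports (illustrative only, not part of the vendored file; assumes an installed nltk):

from nltk.stem import PorterStemmer, SnowballStemmer

porter = PorterStemmer()
print(porter.stem("running"))        # expected: 'run'

snowball = SnowballStemmer("german")
print(snowball.stem("Autobahnen"))   # expected: 'autobahn'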
27 backend/venv/Lib/site-packages/nltk/stem/api.py Normal file
@@ -0,0 +1,27 @@
# Natural Language Toolkit: Stemmer Interface
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
#         Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

from abc import ABCMeta, abstractmethod


class StemmerI(metaclass=ABCMeta):
    """
    A processing interface for removing morphological affixes from
    words. This process is known as stemming.

    """

    @abstractmethod
    def stem(self, token):
        """
        Strip affixes from the token and return the stem.

        :param token: The token that should be stemmed.
        :type token: str
        """
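Editor's note: to make the contract concrete, here is a minimal toy implementation of StemmerI (illustrative only; the class name and suffix list are the editor's, not NLTK's):

from nltk.stem.api import StemmerI


class SuffixStripper(StemmerI):
    """Toy stemmer: strips the longest matching suffix from a fixed set."""

    def __init__(self, suffixes=("ing", "ed", "s")):
        self._suffixes = sorted(suffixes, key=len, reverse=True)

    def stem(self, token):
        for suffix in self._suffixes:
            # keep at least a three-letter stem
            if token.endswith(suffix) and len(token) - len(suffix) >= 3:
                return token[: -len(suffix)]
        return token


print(SuffixStripper().stem("walking"))  # 'walk'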
361 backend/venv/Lib/site-packages/nltk/stem/arlstem.py Normal file
@@ -0,0 +1,361 @@
#
# Natural Language Toolkit: ARLSTem Stemmer
#
# Copyright (C) 2001-2025 NLTK Project
#
# Author: Kheireddine Abainia (x-programer) <k.abainia@gmail.com>
# Algorithms: Kheireddine Abainia <k.abainia@gmail.com>
#             Siham Ouamour
#             Halim Sayoud
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT


"""
ARLSTem Arabic Stemmer
The details about the implementation of this algorithm are described in:
K. Abainia, S. Ouamour and H. Sayoud, A Novel Robust Arabic Light Stemmer,
Journal of Experimental & Theoretical Artificial Intelligence (JETAI'17),
Vol. 29, No. 3, 2017, pp. 557-573.
The ARLSTem is a light Arabic stemmer that is based on removing the affixes
from the word (i.e. prefixes, suffixes and infixes). It was evaluated and
compared to several other stemmers using Paice's parameters (under-stemming
index, over-stemming index and stemming weight), and the results showed that
ARLSTem is promising and produces high performance. This stemmer is not
based on any dictionary and can be used on-line effectively.
"""
import re

from nltk.stem.api import StemmerI


class ARLSTem(StemmerI):
    """
    ARLSTem stemmer: a light Arabic stemming algorithm without any dictionary.
    Department of Telecommunication & Information Processing. USTHB University,
    Algiers, Algeria.
    ARLSTem.stem(token) returns the Arabic stem for the input token.
    The ARLSTem Stemmer requires that all tokens are encoded using Unicode
    encoding.
    """

    def __init__(self):
        # different Alif with hamza
        self.re_hamzated_alif = re.compile(r"[\u0622\u0623\u0625]")
        self.re_alifMaqsura = re.compile(r"[\u0649]")
        self.re_diacritics = re.compile(r"[\u064B-\u065F]")

        # Alif Laam, Laam Laam, Fa Laam, Fa Ba
        self.pr2 = ["\u0627\u0644", "\u0644\u0644", "\u0641\u0644", "\u0641\u0628"]
        # Ba Alif Laam, Kaaf Alif Laam, Waaw Alif Laam
        self.pr3 = ["\u0628\u0627\u0644", "\u0643\u0627\u0644", "\u0648\u0627\u0644"]
        # Fa Laam Laam, Waaw Laam Laam
        self.pr32 = ["\u0641\u0644\u0644", "\u0648\u0644\u0644"]
        # Fa Ba Alif Laam, Waaw Ba Alif Laam, Fa Kaaf Alif Laam
        self.pr4 = [
            "\u0641\u0628\u0627\u0644",
            "\u0648\u0628\u0627\u0644",
            "\u0641\u0643\u0627\u0644",
        ]

        # Kaf Yaa, Kaf Miim
        self.su2 = ["\u0643\u064A", "\u0643\u0645"]
        # Ha Alif, Ha Miim
        self.su22 = ["\u0647\u0627", "\u0647\u0645"]
        # Kaf Miim Alif, Kaf Noon Shadda
        self.su3 = ["\u0643\u0645\u0627", "\u0643\u0646\u0651"]
        # Ha Miim Alif, Ha Noon Shadda
        self.su32 = ["\u0647\u0645\u0627", "\u0647\u0646\u0651"]

        # Alif Noon, Ya Noon, Waaw Noon
        self.pl_si2 = ["\u0627\u0646", "\u064A\u0646", "\u0648\u0646"]
        # Taa Alif Noon, Taa Ya Noon
        self.pl_si3 = ["\u062A\u0627\u0646", "\u062A\u064A\u0646"]

        # Alif Noon, Waaw Noon
        self.verb_su2 = ["\u0627\u0646", "\u0648\u0646"]
        # Siin Taa, Siin Yaa
        self.verb_pr2 = ["\u0633\u062A", "\u0633\u064A"]
        # Siin Alif, Siin Noon
        self.verb_pr22 = ["\u0633\u0627", "\u0633\u0646"]
        # Lam Noon, Lam Taa, Lam Yaa, Lam Hamza
        self.verb_pr33 = [
            "\u0644\u0646",
            "\u0644\u062A",
            "\u0644\u064A",
            "\u0644\u0623",
        ]
        # Taa Miim Alif, Taa Noon Shadda
        self.verb_suf3 = ["\u062A\u0645\u0627", "\u062A\u0646\u0651"]
        # Noon Alif, Taa Miim, Taa Alif, Waaw Alif
        self.verb_suf2 = [
            "\u0646\u0627",
            "\u062A\u0645",
            "\u062A\u0627",
            "\u0648\u0627",
        ]
        # Taa, Alif, Noon
        self.verb_suf1 = ["\u062A", "\u0627", "\u0646"]

    def stem(self, token):
        """
        Call this function to get the word's stem based on ARLSTem.
        """
        try:
            if token is None:
                raise ValueError("The word could not be stemmed: it is empty!")
            # remove Arabic diacritics and replace some letters with others
            token = self.norm(token)
            # strip common prefixes of the nouns
            pre = self.pref(token)
            if pre is not None:
                token = pre
            # strip the suffixes which are common to nouns and verbs
            token = self.suff(token)
            # transform a plural noun to a singular noun
            ps = self.plur2sing(token)
            if ps is None:
                # transform from the feminine form to the masculine form
                fm = self.fem2masc(token)
                if fm is not None:
                    return fm
                else:
                    if pre is None:  # if the prefixes are not stripped
                        # strip the verb prefixes and suffixes
                        return self.verb(token)
            else:
                return ps
            return token
        except ValueError as e:
            print(e)

    def norm(self, token):
        """
        normalize the word by removing diacritics, replacing hamzated Alif
        with Alif, replacing AlifMaqsura with Yaa and removing Waaw at the
        beginning.
        """
        # strip Arabic diacritics
        token = self.re_diacritics.sub("", token)
        # replace Hamzated Alif with Alif bare
        token = self.re_hamzated_alif.sub("\u0627", token)
        # replace alifMaqsura with Yaa
        token = self.re_alifMaqsura.sub("\u064A", token)
        # strip the Waaw from the word beginning if the remaining is 3 letters
        # at least
        if token.startswith("\u0648") and len(token) > 3:
            token = token[1:]
        return token

    def pref(self, token):
        """
        remove prefixes from the words' beginning.
        """
        if len(token) > 5:
            for p3 in self.pr3:
                if token.startswith(p3):
                    return token[3:]
        if len(token) > 6:
            for p4 in self.pr4:
                if token.startswith(p4):
                    return token[4:]
        if len(token) > 5:
            for p3 in self.pr32:
                if token.startswith(p3):
                    return token[3:]
        if len(token) > 4:
            for p2 in self.pr2:
                if token.startswith(p2):
                    return token[2:]

    def suff(self, token):
        """
        remove suffixes from the word's end.
        """
        if token.endswith("\u0643") and len(token) > 3:
            return token[:-1]
        if len(token) > 4:
            for s2 in self.su2:
                if token.endswith(s2):
                    return token[:-2]
        if len(token) > 5:
            for s3 in self.su3:
                if token.endswith(s3):
                    return token[:-3]
        if token.endswith("\u0647") and len(token) > 3:
            token = token[:-1]
            return token
        if len(token) > 4:
            for s2 in self.su22:
                if token.endswith(s2):
                    return token[:-2]
        if len(token) > 5:
            for s3 in self.su32:
                if token.endswith(s3):
                    return token[:-3]
        if token.endswith("\u0646\u0627") and len(token) > 4:
            return token[:-2]
        return token

    def fem2masc(self, token):
        """
        transform the word from the feminine form to the masculine form.
        """
        if token.endswith("\u0629") and len(token) > 3:
            return token[:-1]

    def plur2sing(self, token):
        """
        transform the word from the plural form to the singular form.
        """
        if len(token) > 4:
            for ps2 in self.pl_si2:
                if token.endswith(ps2):
                    return token[:-2]
        if len(token) > 5:
            for ps3 in self.pl_si3:
                if token.endswith(ps3):
                    return token[:-3]
        if len(token) > 3 and token.endswith("\u0627\u062A"):
            return token[:-2]
        if len(token) > 3 and token.startswith("\u0627") and token[2] == "\u0627":
            return token[:2] + token[3:]
        if len(token) > 4 and token.startswith("\u0627") and token[-2] == "\u0627":
            return token[1:-2] + token[-1]

    def verb(self, token):
        """
        stem the verb prefixes and suffixes or both
        """
        vb = self.verb_t1(token)
        if vb is not None:
            return vb
        vb = self.verb_t2(token)
        if vb is not None:
            return vb
        vb = self.verb_t3(token)
        if vb is not None:
            return vb
        vb = self.verb_t4(token)
        if vb is not None:
            return vb
        vb = self.verb_t5(token)
        if vb is not None:
            return vb
        return self.verb_t6(token)

    def verb_t1(self, token):
        """
        stem the present prefixes and suffixes
        """
        if len(token) > 5 and token.startswith("\u062A"):  # Taa
            for s2 in self.pl_si2:
                if token.endswith(s2):
                    return token[1:-2]
        if len(token) > 5 and token.startswith("\u064A"):  # Yaa
            for s2 in self.verb_su2:
                if token.endswith(s2):
                    return token[1:-2]
        if len(token) > 4 and token.startswith("\u0627"):  # Alif
            # Waaw Alif
            if len(token) > 5 and token.endswith("\u0648\u0627"):
                return token[1:-2]
            # Yaa
            if token.endswith("\u064A"):
                return token[1:-1]
            # Alif
            if token.endswith("\u0627"):
                return token[1:-1]
            # Noon
            if token.endswith("\u0646"):
                return token[1:-1]
        # ^Yaa, Noon$
        if len(token) > 4 and token.startswith("\u064A") and token.endswith("\u0646"):
            return token[1:-1]
        # ^Taa, Noon$
        if len(token) > 4 and token.startswith("\u062A") and token.endswith("\u0646"):
            return token[1:-1]

    def verb_t2(self, token):
        """
        stem the future prefixes and suffixes
        """
        if len(token) > 6:
            for s2 in self.pl_si2:
                # ^Siin Taa
                if token.startswith(self.verb_pr2[0]) and token.endswith(s2):
                    return token[2:-2]
            # ^Siin Yaa, Alif Noon$
            if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[0]):
                return token[2:-2]
            # ^Siin Yaa, Waaw Noon$
            if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[2]):
                return token[2:-2]
        # ^Siin Taa, Noon$
        if (
            len(token) > 5
            and token.startswith(self.verb_pr2[0])
            and token.endswith("\u0646")
        ):
            return token[2:-1]
        # ^Siin Yaa, Noon$
        if (
            len(token) > 5
            and token.startswith(self.verb_pr2[1])
            and token.endswith("\u0646")
        ):
            return token[2:-1]

    def verb_t3(self, token):
        """
        stem the present suffixes
        """
        if len(token) > 5:
            for su3 in self.verb_suf3:
                if token.endswith(su3):
                    return token[:-3]
        if len(token) > 4:
            for su2 in self.verb_suf2:
                if token.endswith(su2):
                    return token[:-2]
        if len(token) > 3:
            for su1 in self.verb_suf1:
                if token.endswith(su1):
                    return token[:-1]

    def verb_t4(self, token):
        """
        stem the present prefixes
        """
        if len(token) > 3:
            for pr1 in self.verb_suf1:
                if token.startswith(pr1):
                    return token[1:]
            if token.startswith("\u064A"):
                return token[1:]

    def verb_t5(self, token):
        """
        stem the future prefixes
        """
        if len(token) > 4:
            for pr2 in self.verb_pr22:
                if token.startswith(pr2):
                    return token[2:]
            for pr2 in self.verb_pr2:
                if token.startswith(pr2):
                    return token[2:]
        return token

    def verb_t6(self, token):
        """
        stem the order prefixes
        """
        if len(token) > 4:
            for pr3 in self.verb_pr33:
                if token.startswith(pr3):
                    return token[2:]
        return token
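Editor's note: an illustrative call of the stemmer above (the Arabic example word is the editor's; tracing the rule cascade, pref() strips the definite article ال and plur2sing() strips the plural suffix ون):

from nltk.stem.arlstem import ARLSTem

stemmer = ARLSTem()
print(stemmer.stem("المعلمون"))  # "the teachers" -> expected stem 'معلم'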
457 backend/venv/Lib/site-packages/nltk/stem/arlstem2.py Normal file
@@ -0,0 +1,457 @@
#
# Natural Language Toolkit: ARLSTem Stemmer v2
#
# Copyright (C) 2001-2025 NLTK Project
#
# Author: Kheireddine Abainia (x-programer) <k.abainia@gmail.com>
# Algorithms: Kheireddine Abainia <k.abainia@gmail.com>
#             Hamza Rebbani <hamrebbani@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT


"""
ARLSTem2 Arabic Light Stemmer
The details about the implementation of this algorithm are described in:
K. Abainia and H. Rebbani, Comparing the Effectiveness of the Improved ARLSTem
Algorithm with Existing Arabic Light Stemmers, International Conference on
Theoretical and Applicative Aspects of Computer Science (ICTAACS'19), Skikda,
Algeria, December 15-16, 2019.
ARLSTem2 is an Arabic light stemmer based on removing the affixes from
the words (i.e. prefixes, suffixes and infixes). It is an improvement
of the previous Arabic light stemmer (ARLSTem). The new version was compared to
the original algorithm and several existing Arabic light stemmers, where the
results showed that the new version considerably improves the under-stemming
errors that are common to light stemmers. Both ARLSTem and ARLSTem2 can be run
online and do not use any dictionary.
"""
import re

from nltk.stem.api import StemmerI


class ARLSTem2(StemmerI):
    """
    Return a stemmed Arabic word after removing affixes. This is an improved
    version of the previous algorithm, which reduces under-stemming errors.
    Typically used in Arabic search engines, information retrieval and NLP.

        >>> from nltk.stem import ARLSTem2
        >>> stemmer = ARLSTem2()
        >>> word = stemmer.stem('يعمل')
        >>> print(word)
        عمل

    :param token: The input Arabic word (unicode) to be stemmed
    :type token: unicode
    :return: A unicode Arabic word
    """

    def __init__(self):
        # different Alif with hamza
        self.re_hamzated_alif = re.compile(r"[\u0622\u0623\u0625]")
        self.re_alifMaqsura = re.compile(r"[\u0649]")
        self.re_diacritics = re.compile(r"[\u064B-\u065F]")

        # Alif Laam, Laam Laam, Fa Laam, Fa Ba
        self.pr2 = ["\u0627\u0644", "\u0644\u0644", "\u0641\u0644", "\u0641\u0628"]
        # Ba Alif Laam, Kaaf Alif Laam, Waaw Alif Laam
        self.pr3 = ["\u0628\u0627\u0644", "\u0643\u0627\u0644", "\u0648\u0627\u0644"]
        # Fa Laam Laam, Waaw Laam Laam
        self.pr32 = ["\u0641\u0644\u0644", "\u0648\u0644\u0644"]
        # Fa Ba Alif Laam, Waaw Ba Alif Laam, Fa Kaaf Alif Laam
        self.pr4 = [
            "\u0641\u0628\u0627\u0644",
            "\u0648\u0628\u0627\u0644",
            "\u0641\u0643\u0627\u0644",
        ]

        # Kaf Yaa, Kaf Miim
        self.su2 = ["\u0643\u064A", "\u0643\u0645"]
        # Ha Alif, Ha Miim
        self.su22 = ["\u0647\u0627", "\u0647\u0645"]
        # Kaf Miim Alif, Kaf Noon Shadda
        self.su3 = ["\u0643\u0645\u0627", "\u0643\u0646\u0651"]
        # Ha Miim Alif, Ha Noon Shadda
        self.su32 = ["\u0647\u0645\u0627", "\u0647\u0646\u0651"]

        # Alif Noon, Ya Noon, Waaw Noon
        self.pl_si2 = ["\u0627\u0646", "\u064A\u0646", "\u0648\u0646"]
        # Taa Alif Noon, Taa Ya Noon
        self.pl_si3 = ["\u062A\u0627\u0646", "\u062A\u064A\u0646"]

        # Alif Noon, Waaw Noon
        self.verb_su2 = ["\u0627\u0646", "\u0648\u0646"]
        # Siin Taa, Siin Yaa
        self.verb_pr2 = ["\u0633\u062A", "\u0633\u064A"]
        # Siin Alif, Siin Noon
        self.verb_pr22 = ["\u0633\u0627", "\u0633\u0646"]
        # Lam Noon, Lam Taa, Lam Yaa, Lam Hamza
        self.verb_pr33 = [
            "\u0644\u0646",
            "\u0644\u062A",
            "\u0644\u064A",
            "\u0644\u0623",
        ]
        # Taa Miim Alif, Taa Noon Shadda
        self.verb_suf3 = ["\u062A\u0645\u0627", "\u062A\u0646\u0651"]
        # Noon Alif, Taa Miim, Taa Alif, Waaw Alif
        self.verb_suf2 = [
            "\u0646\u0627",
            "\u062A\u0645",
            "\u062A\u0627",
            "\u0648\u0627",
        ]
        # Taa, Alif, Noon
        self.verb_suf1 = ["\u062A", "\u0627", "\u0646"]

    def stem1(self, token):
        """
        call this function to get the first stem
        """
        try:
            if token is None:
                raise ValueError("The word could not be stemmed: it is empty!")
            self.is_verb = False
            # remove Arabic diacritics and replace some letters with others
            token = self.norm(token)
            # strip the common noun prefixes
            pre = self.pref(token)
            if pre is not None:
                token = pre
            # transform the feminine form to masculine form
            fm = self.fem2masc(token)
            if fm is not None:
                return fm
            # strip the adjective affixes
            adj = self.adjective(token)
            if adj is not None:
                return adj
            # strip the suffixes that are common to nouns and verbs
            token = self.suff(token)
            # transform a plural noun to a singular noun
            ps = self.plur2sing(token)
            if ps is None:
                if pre is None:  # if the noun prefixes are not stripped
                    # strip the verb prefixes and suffixes
                    verb = self.verb(token)
                    if verb is not None:
                        self.is_verb = True
                        return verb
            else:
                return ps
            return token
        except ValueError as e:
            print(e)

    def stem(self, token):
        # stem the input word
        try:
            if token is None:
                raise ValueError("The word could not be stemmed: it is empty!")
            # run the first round of stemming
            token = self.stem1(token)
            # check if there are some additional noun affixes
            if len(token) > 4:
                # ^Taa, $Yaa + char
                if token.startswith("\u062A") and token[-2] == "\u064A":
                    token = token[1:-2] + token[-1]
                    return token
                # ^Miim, $Waaw + char
                if token.startswith("\u0645") and token[-2] == "\u0648":
                    token = token[1:-2] + token[-1]
                    return token
            if len(token) > 3:
                # !^Alif, $Yaa
                if not token.startswith("\u0627") and token.endswith("\u064A"):
                    token = token[:-1]
                    return token
                # ^Laam
                if token.startswith("\u0644"):
                    return token[1:]
            return token
        except ValueError as e:
            print(e)

    def norm(self, token):
        """
        normalize the word by removing diacritics, replacing hamzated Alif
        with bare Alif, replacing AlifMaqsura with Yaa and removing Waaw at
        the beginning.
        """
        # strip Arabic diacritics
        token = self.re_diacritics.sub("", token)
        # replace Hamzated Alif with Alif bare
        token = self.re_hamzated_alif.sub("\u0627", token)
        # replace alifMaqsura with Yaa
        token = self.re_alifMaqsura.sub("\u064A", token)
        # strip the Waaw from the word beginning if the remaining is
        # tri-literal at least
        if token.startswith("\u0648") and len(token) > 3:
            token = token[1:]
        return token

    def pref(self, token):
        """
        remove prefixes from the words' beginning.
        """
        if len(token) > 5:
            for p3 in self.pr3:
                if token.startswith(p3):
                    return token[3:]
        if len(token) > 6:
            for p4 in self.pr4:
                if token.startswith(p4):
                    return token[4:]
        if len(token) > 5:
            for p3 in self.pr32:
                if token.startswith(p3):
                    return token[3:]
        if len(token) > 4:
            for p2 in self.pr2:
                if token.startswith(p2):
                    return token[2:]

    def adjective(self, token):
        """
        remove the infixes from adjectives
        """
        # ^Alif, Alif, $Yaa
        if len(token) > 5:
            if (
                token.startswith("\u0627")
                and token[-3] == "\u0627"
                and token.endswith("\u064A")
            ):
                return token[:-3] + token[-2]

    def suff(self, token):
        """
        remove the suffixes from the word's ending.
        """
        if token.endswith("\u0643") and len(token) > 3:
            return token[:-1]
        if len(token) > 4:
            for s2 in self.su2:
                if token.endswith(s2):
                    return token[:-2]
        if len(token) > 5:
            for s3 in self.su3:
                if token.endswith(s3):
                    return token[:-3]
        if token.endswith("\u0647") and len(token) > 3:
            token = token[:-1]
            return token
        if len(token) > 4:
            for s2 in self.su22:
                if token.endswith(s2):
                    return token[:-2]
        if len(token) > 5:
            for s3 in self.su32:
                if token.endswith(s3):
                    return token[:-3]
        # $Noon and Alif
        if token.endswith("\u0646\u0627") and len(token) > 4:
            return token[:-2]
        return token

    def fem2masc(self, token):
        """
        transform the word from the feminine form to the masculine form.
        """
        if len(token) > 6:
            # ^Taa, Yaa, $Yaa and Taa Marbuta
            if (
                token.startswith("\u062A")
                and token[-4] == "\u064A"
                and token.endswith("\u064A\u0629")
            ):
                return token[1:-4] + token[-3]
            # ^Alif, Alif, $Yaa and Taa Marbuta
            if (
                token.startswith("\u0627")
                and token[-4] == "\u0627"
                and token.endswith("\u064A\u0629")
            ):
                return token[:-4] + token[-3]
        # $Alif, Yaa and Taa Marbuta
        if token.endswith("\u0627\u064A\u0629") and len(token) > 5:
            return token[:-2]
        if len(token) > 4:
            # Alif, $Taa Marbuta
            if token[1] == "\u0627" and token.endswith("\u0629"):
                return token[0] + token[2:-1]
            # $Yaa and Taa Marbuta
            if token.endswith("\u064A\u0629"):
                return token[:-2]
        # $Taa Marbuta
        if token.endswith("\u0629") and len(token) > 3:
            return token[:-1]

    def plur2sing(self, token):
        """
        transform the word from the plural form to the singular form.
        """
        # ^Miim, $Waaw Noon
        if len(token) > 5:
            if token.startswith("\u0645") and token.endswith("\u0648\u0646"):
                return token[1:-2]
        if len(token) > 4:
            for ps2 in self.pl_si2:
                if token.endswith(ps2):
                    return token[:-2]
        if len(token) > 5:
            for ps3 in self.pl_si3:
                if token.endswith(ps3):
                    return token[:-3]
        if len(token) > 4:
            # $Alif, Taa
            if token.endswith("\u0627\u062A"):
                return token[:-2]
            # ^Alif, Alif in third position
            if token.startswith("\u0627") and token[2] == "\u0627":
                return token[:2] + token[3:]
            # ^Alif, Alif before the last letter
            if token.startswith("\u0627") and token[-2] == "\u0627":
                return token[1:-2] + token[-1]

    def verb(self, token):
        """
        stem the verb prefixes and suffixes or both
        """
        vb = self.verb_t1(token)
        if vb is not None:
            return vb
        vb = self.verb_t2(token)
        if vb is not None:
            return vb
        vb = self.verb_t3(token)
        if vb is not None:
            return vb
        vb = self.verb_t4(token)
        if vb is not None:
            return vb
        vb = self.verb_t5(token)
        if vb is not None:
            return vb
        vb = self.verb_t6(token)
        return vb

    def verb_t1(self, token):
        """
        stem the present tense co-occurred prefixes and suffixes
        """
        if len(token) > 5 and token.startswith("\u062A"):  # Taa
            for s2 in self.pl_si2:
                if token.endswith(s2):
                    return token[1:-2]
        if len(token) > 5 and token.startswith("\u064A"):  # Yaa
            for s2 in self.verb_su2:
                if token.endswith(s2):
                    return token[1:-2]
        if len(token) > 4 and token.startswith("\u0627"):  # Alif
            # Waaw Alif
            if len(token) > 5 and token.endswith("\u0648\u0627"):
                return token[1:-2]
            # Yaa
            if token.endswith("\u064A"):
                return token[1:-1]
            # Alif
            if token.endswith("\u0627"):
                return token[1:-1]
            # Noon
            if token.endswith("\u0646"):
                return token[1:-1]
        # ^Yaa, Noon$
        if len(token) > 4 and token.startswith("\u064A") and token.endswith("\u0646"):
            return token[1:-1]
        # ^Taa, Noon$
        if len(token) > 4 and token.startswith("\u062A") and token.endswith("\u0646"):
            return token[1:-1]

    def verb_t2(self, token):
        """
        stem the future tense co-occurred prefixes and suffixes
        """
        if len(token) > 6:
            for s2 in self.pl_si2:
                # ^Siin Taa
                if token.startswith(self.verb_pr2[0]) and token.endswith(s2):
                    return token[2:-2]
            # ^Siin Yaa, Alif Noon$
            if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[0]):
                return token[2:-2]
            # ^Siin Yaa, Waaw Noon$
            if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[2]):
                return token[2:-2]
        # ^Siin Taa, Noon$
        if (
            len(token) > 5
            and token.startswith(self.verb_pr2[0])
            and token.endswith("\u0646")
        ):
            return token[2:-1]
        # ^Siin Yaa, Noon$
        if (
            len(token) > 5
            and token.startswith(self.verb_pr2[1])
            and token.endswith("\u0646")
        ):
            return token[2:-1]

    def verb_t3(self, token):
        """
        stem the present tense suffixes
        """
        if len(token) > 5:
            for su3 in self.verb_suf3:
                if token.endswith(su3):
                    return token[:-3]
        if len(token) > 4:
            for su2 in self.verb_suf2:
                if token.endswith(su2):
                    return token[:-2]
        if len(token) > 3:
            for su1 in self.verb_suf1:
                if token.endswith(su1):
                    return token[:-1]

    def verb_t4(self, token):
        """
        stem the present tense prefixes
        """
        if len(token) > 3:
            for pr1 in self.verb_suf1:
                if token.startswith(pr1):
                    return token[1:]
            if token.startswith("\u064A"):
                return token[1:]

    def verb_t5(self, token):
        """
        stem the future tense prefixes
        """
        if len(token) > 4:
            for pr2 in self.verb_pr22:
                if token.startswith(pr2):
                    return token[2:]
            for pr2 in self.verb_pr2:
                if token.startswith(pr2):
                    return token[2:]

    def verb_t6(self, token):
        """
        stem the imperative tense prefixes
        """
        if len(token) > 4:
            for pr3 in self.verb_pr33:
                if token.startswith(pr3):
                    return token[2:]

        return token
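Editor's note: a small sketch contrasting the two entry points above; stem1() performs only the first round of affix stripping, stem() adds a second pass for residual noun affixes, and stem1() sets is_verb as a side effect (example word taken from the class doctest):

from nltk.stem.arlstem2 import ARLSTem2

stemmer = ARLSTem2()
print(stemmer.stem1("يعمل"))  # first round only -> 'عمل'
print(stemmer.stem("يعمل"))   # both rounds      -> 'عمل'
print(stemmer.is_verb)        # True: the verb branch produced the stem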
209 backend/venv/Lib/site-packages/nltk/stem/cistem.py Normal file
@@ -0,0 +1,209 @@
# Natural Language Toolkit: CISTEM Stemmer for German
# Copyright (C) 2001-2025 NLTK Project
# Author: Leonie Weissweiler <l.weissweiler@outlook.de>
#         Tom Aarsen <> (modifications)
# Algorithm: Leonie Weissweiler <l.weissweiler@outlook.de>
#            Alexander Fraser <fraser@cis.lmu.de>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import re
from typing import Tuple

from nltk.stem.api import StemmerI


class Cistem(StemmerI):
    """
    CISTEM Stemmer for German

    This is the official Python implementation of the CISTEM stemmer.
    It is based on the paper
    Leonie Weissweiler, Alexander Fraser (2017). Developing a Stemmer for German
    Based on a Comparative Analysis of Publicly Available Stemmers.
    In Proceedings of the German Society for Computational Linguistics and Language
    Technology (GSCL)
    which can be read here:
    https://www.cis.lmu.de/~weissweiler/cistem/

    In the paper, we conducted an analysis of publicly available stemmers,
    developed two gold standards for German stemming and evaluated the stemmers
    based on the two gold standards. We then proposed the stemmer implemented here
    and show that it achieves slightly better f-measure than the other stemmers and
    is thrice as fast as the Snowball stemmer for German while being about as fast
    as most other stemmers.

    case_insensitive is a boolean specifying whether case-insensitive stemming
    should be used. Case insensitivity improves performance only if words in the
    text may be incorrectly upper case. For all-lowercase and correctly cased
    text, best performance is achieved by setting case_insensitive to False.

    :param case_insensitive: if True, the stemming is case insensitive. False by default.
    :type case_insensitive: bool
    """

    strip_ge = re.compile(r"^ge(.{4,})")
    repl_xx = re.compile(r"(.)\1")
    strip_emr = re.compile(r"e[mr]$")
    strip_nd = re.compile(r"nd$")
    strip_t = re.compile(r"t$")
    strip_esn = re.compile(r"[esn]$")
    repl_xx_back = re.compile(r"(.)\*")

    def __init__(self, case_insensitive: bool = False):
        self._case_insensitive = case_insensitive

    @staticmethod
    def replace_to(word: str) -> str:
        word = word.replace("sch", "$")
        word = word.replace("ei", "%")
        word = word.replace("ie", "&")
        word = Cistem.repl_xx.sub(r"\1*", word)

        return word

    @staticmethod
    def replace_back(word: str) -> str:
        word = Cistem.repl_xx_back.sub(r"\1\1", word)
        word = word.replace("%", "ei")
        word = word.replace("&", "ie")
        word = word.replace("$", "sch")

        return word

    def stem(self, word: str) -> str:
        """Stems the input word.

        :param word: The word that is to be stemmed.
        :type word: str
        :return: The stemmed word.
        :rtype: str

        >>> from nltk.stem.cistem import Cistem
        >>> stemmer = Cistem()
        >>> s1 = "Speicherbehältern"
        >>> stemmer.stem(s1)
        'speicherbehalt'
        >>> s2 = "Grenzpostens"
        >>> stemmer.stem(s2)
        'grenzpost'
        >>> s3 = "Ausgefeiltere"
        >>> stemmer.stem(s3)
        'ausgefeilt'
        >>> stemmer = Cistem(True)
        >>> stemmer.stem(s1)
        'speicherbehal'
        >>> stemmer.stem(s2)
        'grenzpo'
        >>> stemmer.stem(s3)
        'ausgefeil'
        """
        if len(word) == 0:
            return word

        upper = word[0].isupper()
        word = word.lower()

        word = word.replace("ü", "u")
        word = word.replace("ö", "o")
        word = word.replace("ä", "a")
        word = word.replace("ß", "ss")

        word = Cistem.strip_ge.sub(r"\1", word)

        return self._segment_inner(word, upper)[0]

    def segment(self, word: str) -> Tuple[str, str]:
        """
        This method works very similarly to stem (:func:`cistem.stem`). The difference is that in
        addition to returning the stem, it also returns the rest that was removed at
        the end. To be able to return the stem unchanged so the stem and the rest
        can be concatenated to form the original word, all substitutions that altered
        the stem in any other way than by removing letters at the end were left out.

        :param word: The word that is to be stemmed.
        :type word: str
        :return: A tuple of the stemmed word and the removed suffix.
        :rtype: Tuple[str, str]

        >>> from nltk.stem.cistem import Cistem
        >>> stemmer = Cistem()
        >>> s1 = "Speicherbehältern"
        >>> stemmer.segment(s1)
        ('speicherbehält', 'ern')
        >>> s2 = "Grenzpostens"
        >>> stemmer.segment(s2)
        ('grenzpost', 'ens')
        >>> s3 = "Ausgefeiltere"
        >>> stemmer.segment(s3)
        ('ausgefeilt', 'ere')
        >>> stemmer = Cistem(True)
        >>> stemmer.segment(s1)
        ('speicherbehäl', 'tern')
        >>> stemmer.segment(s2)
        ('grenzpo', 'stens')
        >>> stemmer.segment(s3)
        ('ausgefeil', 'tere')
        """
        if len(word) == 0:
            return ("", "")

        upper = word[0].isupper()
        word = word.lower()

        return self._segment_inner(word, upper)

    def _segment_inner(self, word: str, upper: bool):
        """Inner method for iteratively applying the code stemming regexes.
        This method receives a pre-processed variant of the word to be stemmed,
        or the word to be segmented, and returns a tuple of the word and the
        removed suffix.

        :param word: A pre-processed variant of the word that is to be stemmed.
        :type word: str
        :param upper: Whether the original word started with a capital letter.
        :type upper: bool
        :return: A tuple of the stemmed word and the removed suffix.
        :rtype: Tuple[str, str]
        """

        rest_length = 0
        word_copy = word[:]

        # Pre-processing before applying the substitution patterns
        word = Cistem.replace_to(word)
        rest = ""

        # Apply the substitution patterns
        while len(word) > 3:
            if len(word) > 5:
                word, n = Cistem.strip_emr.subn("", word)
                if n != 0:
                    rest_length += 2
                    continue

                word, n = Cistem.strip_nd.subn("", word)
                if n != 0:
                    rest_length += 2
                    continue

            if not upper or self._case_insensitive:
                word, n = Cistem.strip_t.subn("", word)
                if n != 0:
                    rest_length += 1
                    continue

            word, n = Cistem.strip_esn.subn("", word)
            if n != 0:
                rest_length += 1
                continue
            else:
                break

        # Post-processing after applying the substitution patterns
        word = Cistem.replace_back(word)

        if rest_length:
            rest = word_copy[-rest_length:]

        return (word, rest)
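Editor's note: segment() above is built so that the stem and the removed rest re-concatenate to the lowercased input, which this sketch (the editor's, mirroring the doctest) checks:

from nltk.stem.cistem import Cistem

stemmer = Cistem()
stem, rest = stemmer.segment("Speicherbehältern")
assert stem + rest == "speicherbehältern"
print(stem, rest)  # per the doctest: 'speicherbehält' 'ern'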
395 backend/venv/Lib/site-packages/nltk/stem/isri.py Normal file
@@ -0,0 +1,395 @@
#
# Natural Language Toolkit: The ISRI Arabic Stemmer
#
# Copyright (C) 2001-2025 NLTK Project
# Algorithm: Kazem Taghva, Rania Elkhoury, and Jeffrey Coombs (2005)
# Author: Hosam Algasaier <hosam_hme@yahoo.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
ISRI Arabic Stemmer

The algorithm for this stemmer is described in:

Taghva, K., Elkhoury, R., and Coombs, J. 2005. Arabic Stemming without a root dictionary.
Information Science Research Institute. University of Nevada, Las Vegas, USA.

The Information Science Research Institute's (ISRI) Arabic stemmer shares many features
with the Khoja stemmer. However, the main difference is that the ISRI stemmer does not
use a root dictionary. Also, if a root is not found, the ISRI stemmer returns a
normalized form, rather than returning the original unmodified word.

Additional adjustments were made to improve the algorithm:

1- Adding 60 stop words.
2- Adding the pattern (تفاعيل) to the ISRI pattern set.
3- Step 2 of the original algorithm normalized all hamza. This step is discarded
because it increases word ambiguities and changes the original root.

"""
import re

from nltk.stem.api import StemmerI


class ISRIStemmer(StemmerI):
    """
    ISRI Arabic stemmer based on the algorithm: Arabic Stemming without a root dictionary.
    Information Science Research Institute. University of Nevada, Las Vegas, USA.

    A few minor modifications have been made to the basic ISRI algorithm.
    See the source code of this module for more information.

    isri.stem(token) returns the Arabic root for the given token.

    The ISRI Stemmer requires that all tokens have Unicode string types.
    If you use Python IDLE on Arabic Windows you have to decode text first
    using Arabic '1256' coding.
    """

    def __init__(self):
        # length three prefixes
        self.p3 = [
            "\u0643\u0627\u0644",
            "\u0628\u0627\u0644",
            "\u0648\u0644\u0644",
            "\u0648\u0627\u0644",
        ]

        # length two prefixes
        self.p2 = ["\u0627\u0644", "\u0644\u0644"]

        # length one prefixes
        self.p1 = [
            "\u0644",
            "\u0628",
            "\u0641",
            "\u0633",
            "\u0648",
            "\u064a",
            "\u062a",
            "\u0646",
            "\u0627",
        ]

        # length three suffixes
        self.s3 = [
            "\u062a\u0645\u0644",
            "\u0647\u0645\u0644",
            "\u062a\u0627\u0646",
            "\u062a\u064a\u0646",
            "\u0643\u0645\u0644",
        ]

        # length two suffixes
        self.s2 = [
            "\u0648\u0646",
            "\u0627\u062a",
            "\u0627\u0646",
            "\u064a\u0646",
            "\u062a\u0646",
            "\u0643\u0645",
            "\u0647\u0646",
            "\u0646\u0627",
            "\u064a\u0627",
            "\u0647\u0627",
            "\u062a\u0645",
            "\u0643\u0646",
            "\u0646\u064a",
            "\u0648\u0627",
            "\u0645\u0627",
            "\u0647\u0645",
        ]

        # length one suffixes
        self.s1 = ["\u0629", "\u0647", "\u064a", "\u0643", "\u062a", "\u0627", "\u0646"]

        # groups of length four patterns
        self.pr4 = {
            0: ["\u0645"],
            1: ["\u0627"],
            2: ["\u0627", "\u0648", "\u064A"],
            3: ["\u0629"],
        }

        # Groups of length five patterns and length three roots
        self.pr53 = {
            0: ["\u0627", "\u062a"],
            1: ["\u0627", "\u064a", "\u0648"],
            2: ["\u0627", "\u062a", "\u0645"],
            3: ["\u0645", "\u064a", "\u062a"],
            4: ["\u0645", "\u062a"],
            5: ["\u0627", "\u0648"],
            6: ["\u0627", "\u0645"],
        }

        self.re_short_vowels = re.compile(r"[\u064B-\u0652]")
        self.re_hamza = re.compile(r"[\u0621\u0624\u0626]")
        self.re_initial_hamza = re.compile(r"^[\u0622\u0623\u0625]")

        self.stop_words = [
            "\u064a\u0643\u0648\u0646",
            "\u0648\u0644\u064a\u0633",
            "\u0648\u0643\u0627\u0646",
            "\u0643\u0630\u0644\u0643",
            "\u0627\u0644\u062a\u064a",
            "\u0648\u0628\u064a\u0646",
            "\u0639\u0644\u064a\u0647\u0627",
            "\u0645\u0633\u0627\u0621",
            "\u0627\u0644\u0630\u064a",
            "\u0648\u0643\u0627\u0646\u062a",
            "\u0648\u0644\u0643\u0646",
            "\u0648\u0627\u0644\u062a\u064a",
            "\u062a\u0643\u0648\u0646",
            "\u0627\u0644\u064a\u0648\u0645",
            "\u0627\u0644\u0644\u0630\u064a\u0646",
            "\u0639\u0644\u064a\u0647",
            "\u0643\u0627\u0646\u062a",
            "\u0644\u0630\u0644\u0643",
            "\u0623\u0645\u0627\u0645",
            "\u0647\u0646\u0627\u0643",
            "\u0645\u0646\u0647\u0627",
            "\u0645\u0627\u0632\u0627\u0644",
            "\u0644\u0627\u0632\u0627\u0644",
            "\u0644\u0627\u064a\u0632\u0627\u0644",
            "\u0645\u0627\u064a\u0632\u0627\u0644",
            "\u0627\u0635\u0628\u062d",
            "\u0623\u0635\u0628\u062d",
            "\u0623\u0645\u0633\u0649",
            "\u0627\u0645\u0633\u0649",
            "\u0623\u0636\u062d\u0649",
            "\u0627\u0636\u062d\u0649",
            "\u0645\u0627\u0628\u0631\u062d",
            "\u0645\u0627\u0641\u062a\u0626",
            "\u0645\u0627\u0627\u0646\u0641\u0643",
            "\u0644\u0627\u0633\u064a\u0645\u0627",
            "\u0648\u0644\u0627\u064a\u0632\u0627\u0644",
            "\u0627\u0644\u062d\u0627\u0644\u064a",
            "\u0627\u0644\u064a\u0647\u0627",
            "\u0627\u0644\u0630\u064a\u0646",
            "\u0641\u0627\u0646\u0647",
            "\u0648\u0627\u0644\u0630\u064a",
            "\u0648\u0647\u0630\u0627",
            "\u0644\u0647\u0630\u0627",
            "\u0641\u0643\u0627\u0646",
            "\u0633\u062a\u0643\u0648\u0646",
            "\u0627\u0644\u064a\u0647",
            "\u064a\u0645\u0643\u0646",
            "\u0628\u0647\u0630\u0627",
            "\u0627\u0644\u0630\u0649",
        ]

    def stem(self, token):
        """
        Stemming a word token using the ISRI stemmer.
        """
        token = self.norm(
            token, 1
        )  # remove diacritics which represent Arabic short vowels
        if token in self.stop_words:
            return token  # exclude stop words from being processed
        token = self.pre32(
            token
        )  # remove length three and length two prefixes in this order
        token = self.suf32(
            token
        )  # remove length three and length two suffixes in this order
        token = self.waw(
            token
        )  # remove connective ‘و’ if it precedes a word beginning with ‘و’
        token = self.norm(token, 2)  # normalize initial hamza to bare alif
        # if 4 <= word length <= 7, then stem; otherwise, no stemming
        if len(token) == 4:  # length 4 word
            token = self.pro_w4(token)
        elif len(token) == 5:  # length 5 word
            token = self.pro_w53(token)
            token = self.end_w5(token)
        elif len(token) == 6:  # length 6 word
            token = self.pro_w6(token)
            token = self.end_w6(token)
        elif len(token) == 7:  # length 7 word
            token = self.suf1(token)
            if len(token) == 7:
                token = self.pre1(token)
            if len(token) == 6:
                token = self.pro_w6(token)
                token = self.end_w6(token)
        return token

    def norm(self, word, num=3):
        """
        normalization:
        num=1  normalize diacritics
        num=2  normalize initial hamza
        num=3  both 1&2
        """
        if num == 1:
            word = self.re_short_vowels.sub("", word)
        elif num == 2:
            word = self.re_initial_hamza.sub("\u0627", word)
        elif num == 3:
            word = self.re_short_vowels.sub("", word)
            word = self.re_initial_hamza.sub("\u0627", word)
        return word

    def pre32(self, word):
        """remove length three and length two prefixes in this order"""
        if len(word) >= 6:
            for pre3 in self.p3:
                if word.startswith(pre3):
                    return word[3:]
        if len(word) >= 5:
            for pre2 in self.p2:
                if word.startswith(pre2):
                    return word[2:]
        return word

    def suf32(self, word):
        """remove length three and length two suffixes in this order"""
        if len(word) >= 6:
            for suf3 in self.s3:
                if word.endswith(suf3):
                    return word[:-3]
        if len(word) >= 5:
            for suf2 in self.s2:
                if word.endswith(suf2):
                    return word[:-2]
        return word

    def waw(self, word):
        """remove connective ‘و’ if it precedes a word beginning with ‘و’"""
        if len(word) >= 4 and word[:2] == "\u0648\u0648":
            word = word[1:]
        return word

    def pro_w4(self, word):
        """process length four patterns and extract length three roots"""
        if word[0] in self.pr4[0]:  # مفعل
            word = word[1:]
        elif word[1] in self.pr4[1]:  # فاعل
            word = word[:1] + word[2:]
        elif word[2] in self.pr4[2]:  # فعال - فعول - فعيل
            word = word[:2] + word[3]
        elif word[3] in self.pr4[3]:  # فعلة
            word = word[:-1]
        else:
            word = self.suf1(word)  # normalize short suffix
            if len(word) == 4:
                word = self.pre1(word)  # normalize short prefix
        return word

    def pro_w53(self, word):
        """process length five patterns and extract length three roots"""
        if word[2] in self.pr53[0] and word[0] == "\u0627":  # افتعل - افاعل
            word = word[1] + word[3:]
        elif word[3] in self.pr53[1] and word[0] == "\u0645":  # مفعول - مفعال - مفعيل
            word = word[1:3] + word[4]
        elif word[0] in self.pr53[2] and word[4] == "\u0629":  # مفعلة - تفعلة - افعلة
            word = word[1:4]
        elif word[0] in self.pr53[3] and word[2] == "\u062a":  # مفتعل - يفتعل - تفتعل
            word = word[1] + word[3:]
        elif word[0] in self.pr53[4] and word[2] == "\u0627":  # مفاعل - تفاعل
            word = word[1] + word[3:]
        elif word[2] in self.pr53[5] and word[4] == "\u0629":  # فعولة - فعالة
            word = word[:2] + word[3]
        elif word[0] in self.pr53[6] and word[1] == "\u0646":  # انفعل - منفعل
            word = word[2:]
        elif word[3] == "\u0627" and word[0] == "\u0627":  # افعال
            word = word[1:3] + word[4]
        elif word[4] == "\u0646" and word[3] == "\u0627":  # فعلان
            word = word[:3]
        elif word[3] == "\u064a" and word[0] == "\u062a":  # تفعيل
            word = word[1:3] + word[4]
        elif word[3] == "\u0648" and word[1] == "\u0627":  # فاعول
            word = word[0] + word[2] + word[4]
        elif word[2] == "\u0627" and word[1] == "\u0648":  # فواعل
            word = word[0] + word[3:]
        elif word[3] == "\u0626" and word[2] == "\u0627":  # فعائل
            word = word[:2] + word[4]
        elif word[4] == "\u0629" and word[1] == "\u0627":  # فاعلة
            word = word[0] + word[2:4]
        elif word[4] == "\u064a" and word[2] == "\u0627":  # فعالي
            word = word[:2] + word[3]
        else:
            word = self.suf1(word)  # normalize short suffix
            if len(word) == 5:
                word = self.pre1(word)  # normalize short prefix
        return word

    def pro_w54(self, word):
        """process length five patterns and extract length four roots"""
        if word[0] in self.pr53[2]:  # تفعلل - افعلل - مفعلل
            word = word[1:]
        elif word[4] == "\u0629":  # فعللة
            word = word[:4]
        elif word[2] == "\u0627":  # فعالل
            word = word[:2] + word[3:]
        return word

    def end_w5(self, word):
        """ending step (word of length five)"""
        if len(word) == 4:
            word = self.pro_w4(word)
        elif len(word) == 5:
            word = self.pro_w54(word)
        return word

    def pro_w6(self, word):
        """process length six patterns and extract length three roots"""
        if word.startswith("\u0627\u0633\u062a") or word.startswith(
            "\u0645\u0633\u062a"
        ):  # مستفعل - استفعل
            word = word[3:]
        elif (
            word[0] == "\u0645" and word[3] == "\u0627" and word[5] == "\u0629"
        ):  # مفعالة
            word = word[1:3] + word[4]
        elif (
            word[0] == "\u0627" and word[2] == "\u062a" and word[4] == "\u0627"
        ):  # افتعال
            word = word[1] + word[3] + word[5]
        elif (
            word[0] == "\u0627" and word[3] == "\u0648" and word[2] == word[4]
        ):  # افعوعل
            word = word[1] + word[4:]
        elif (
            word[0] == "\u062a" and word[2] == "\u0627" and word[4] == "\u064a"
        ):  # تفاعيل new pattern
            word = word[1] + word[3] + word[5]
        else:
            word = self.suf1(word)  # normalize short suffix
            if len(word) == 6:
                word = self.pre1(word)  # normalize short prefix
        return word

    def pro_w64(self, word):
        """process length six patterns and extract length four roots"""
        if word[0] == "\u0627" and word[4] == "\u0627":  # افعلال
            word = word[1:4] + word[5]
        elif word.startswith("\u0645\u062a"):  # متفعلل
            word = word[2:]
        return word

    def end_w6(self, word):
        """ending step (word of length six)"""
        if len(word) == 5:
            word = self.pro_w53(word)
            word = self.end_w5(word)
        elif len(word) == 6:
            word = self.pro_w64(word)
        return word

    def suf1(self, word):
        """normalize short suffix"""
        for sf1 in self.s1:
            if word.endswith(sf1):
                return word[:-1]
        return word

    def pre1(self, word):
        """normalize short prefix"""
        for sp1 in self.p1:
            if word.startswith(sp1):
                return word[1:]
        return word
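Editor's note: an illustrative call (the example word is the editor's; following the code above, pre32() removes the prefix ال and the فعالة branch of pro_w53() extracts the root):

from nltk.stem.isri import ISRIStemmer

stemmer = ISRIStemmer()
print(stemmer.stem("الكتابة"))  # "the writing" -> expected root 'كتب'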
342 backend/venv/Lib/site-packages/nltk/stem/lancaster.py Normal file
@@ -0,0 +1,342 @@
# Natural Language Toolkit: Stemmers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Tomcavage <stomcava@law.upenn.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm.
Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
"""
import re

from nltk.stem.api import StemmerI


class LancasterStemmer(StemmerI):
    """
    Lancaster Stemmer

    >>> from nltk.stem.lancaster import LancasterStemmer
    >>> st = LancasterStemmer()
    >>> st.stem('maximum')     # Remove "-um" when word is intact
    'maxim'
    >>> st.stem('presumably')  # Don't remove "-um" when word is not intact
    'presum'
    >>> st.stem('multiply')    # No action taken if word ends with "-ply"
    'multiply'
    >>> st.stem('provision')   # Replace "-sion" with "-j" to trigger "j" set of rules
    'provid'
    >>> st.stem('owed')        # Word starting with vowel must contain at least 2 letters
    'ow'
    >>> st.stem('ear')         # ditto
    'ear'
    >>> st.stem('saying')      # Words starting with consonant must contain at least 3
    'say'
    >>> st.stem('crying')      # letters and one of those letters must be a vowel
    'cry'
    >>> st.stem('string')      # ditto
    'string'
    >>> st.stem('meant')       # ditto
    'meant'
    >>> st.stem('cement')      # ditto
    'cem'
    >>> st_pre = LancasterStemmer(strip_prefix_flag=True)
    >>> st_pre.stem('kilometer')  # Test Prefix
    'met'
    >>> st_custom = LancasterStemmer(rule_tuple=("ssen4>", "s1t."))
    >>> st_custom.stem("ness")  # Change s to t
    'nest'
    """

    # The rule list is static since it doesn't change between instances
    default_rule_tuple = (
        "ai*2.",     # -ia > -   if intact
        "a*1.",      # -a > -    if intact
        "bb1.",      # -bb > -b
        "city3s.",   # -ytic > -ys
        "ci2>",      # -ic > -
        "cn1t>",     # -nc > -nt
        "dd1.",      # -dd > -d
        "dei3y>",    # -ied > -y
        "deec2ss.",  # -ceed > -cess
        "dee1.",     # -eed > -ee
        "de2>",      # -ed > -
        "dooh4>",    # -hood > -
        "e1>",       # -e > -
        "feil1v.",   # -lief > -liev
        "fi2>",      # -if > -
        "gni3>",     # -ing > -
        "gai3y.",    # -iag > -y
        "ga2>",      # -ag > -
        "gg1.",      # -gg > -g
        "ht*2.",     # -th > -   if intact
        "hsiug5ct.", # -guish > -ct
        "hsi3>",     # -ish > -
        "i*1.",      # -i > -    if intact
        "i1y>",      # -i > -y
        "ji1d.",     # -ij > -id --  see nois4j> & vis3j>
        "juf1s.",    # -fuj > -fus
        "ju1d.",     # -uj > -ud
        "jo1d.",     # -oj > -od
        "jeh1r.",    # -hej > -her
        "jrev1t.",   # -verj > -vert
        "jsim2t.",   # -misj > -mit
        "jn1d.",     # -nj > -nd
        "j1s.",      # -j > -s
        "lbaifi6.",  # -ifiabl > -
        "lbai4y.",   # -iabl > -y
        "lba3>",     # -abl > -
        "lbi3.",     # -ibl > -
        "lib2l>",    # -bil > -bl
        "lc1.",      # -cl > -c
        "lufi4y.",   # -iful > -y
        "luf3>",     # -ful > -
        "lu2.",      # -ul > -
        "lai3>",     # -ial > -
        "lau3>",     # -ual > -
        "la2>",      # -al > -
        "ll1.",      # -ll > -l
        "mui3.",     # -ium > -
        "mu*2.",     # -um > -   if intact
        "msi3>",     # -ism > -
        "mm1.",      # -mm > -m
        "nois4j>",   # -sion > -j
        "noix4ct.",  # -xion > -ct
        "noi3>",     # -ion > -
        "nai3>",     # -ian > -
        "na2>",      # -an > -
        "nee0.",     # protect -een
        "ne2>",      # -en > -
        "nn1.",      # -nn > -n
        "pihs4>",    # -ship > -
        "pp1.",      # -pp > -p
        "re2>",      # -er > -
        "rae0.",     # protect -ear
        "ra2.",      # -ar > -
        "ro2>",      # -or > -
        "ru2>",      # -ur > -
        "rr1.",      # -rr > -r
        "rt1>",      # -tr > -t
        "rei3y>",    # -ier > -y
        "sei3y>",    # -ies > -y
        "sis2.",     # -sis > -s
        "si2>",      # -is > -
        "ssen4>",    # -ness > -
        "ss0.",      # protect -ss
        "suo3>",     # -ous > -
        "su*2.",     # -us > -   if intact
        "s*1>",      # -s > -    if intact
        "s0.",       # -s > -s
        "tacilp4y.", # -plicat > -ply
        "ta2>",      # -at > -
        "tnem4>",    # -ment > -
        "tne3>",     # -ent > -
        "tna3>",     # -ant > -
        "tpir2b.",   # -ript > -rib
        "tpro2b.",   # -orpt > -orb
        "tcud1.",    # -duct > -duc
        "tpmus2.",   # -sumpt > -sum
        "tpec2iv.",  # -cept > -ceiv
        "tulo2v.",   # -olut > -olv
        "tsis0.",    # protect -sist
        "tsi3>",     # -ist > -
        "tt1.",      # -tt > -t
        "uqi3.",     # -iqu > -
        "ugo1.",     # -ogu > -og
        "vis3j>",    # -siv > -j
        "vie0.",     # protect -eiv
        "vi2>",      # -iv > -
        "ylb1>",     # -bly > -bl
        "yli3y>",    # -ily > -y
        "ylp0.",     # protect -ply
        "yl2>",      # -ly > -
        "ygo1.",     # -ogy > -og
        "yhp1.",     # -phy > -ph
        "ymo1.",     # -omy > -om
        "ypo1.",     # -opy > -op
        "yti3>",     # -ity > -
        "yte3>",     # -ety > -
        "ytl2.",     # -lty > -l
        "yrtsi5.",   # -istry > -
        "yra3>",     # -ary > -
        "yro3>",     # -ory > -
        "yfi3.",     # -ify > -
        "ycn2t>",    # -ncy > -nt
        "yca3>",     # -acy > -
        "zi2>",      # -iz > -
        "zy1s.",     # -yz > -ys
    )

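    # Editor's note (not part of the NLTK source): each rule string uses the
    # Paice/Husk notation -- a reversed ending, an optional "*" meaning the
    # rule fires only while the word is still intact, a digit for how many
    # characters to remove, optional replacement letters, and ">" (continue)
    # or "." (stop). Decoded with the same regex used in __doStemming below:
    #
    #   m = re.match(r"^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$", "nois4j>")
    #   ending, intact, remove, append, cont = m.groups()
    #   ending[::-1]  -> "sion"  (the suffix matched)
    #   int(remove)   -> 4       (strip four characters)
    #   append        -> "j"     (then append "j")
    #   cont          -> ">"     (and keep stemming)
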
def __init__(self, rule_tuple=None, strip_prefix_flag=False):
|
||||
"""Create an instance of the Lancaster stemmer."""
|
||||
# Setup an empty rule dictionary - this will be filled in later
|
||||
self.rule_dictionary = {}
|
||||
# Check if a user wants to strip prefix
|
||||
self._strip_prefix = strip_prefix_flag
|
||||
# Check if a user wants to use his/her own rule tuples.
|
||||
self._rule_tuple = rule_tuple if rule_tuple else self.default_rule_tuple
|
||||
|
||||
    def parseRules(self, rule_tuple=None):
        """Validate the set of rules used in this stemmer.

        If this function is called as an individual method, without using
        the stem method, the rule_tuple argument will be compiled into
        self.rule_dictionary. If this function is called within stem,
        self._rule_tuple will be used.
        """
        # If no argument is given, use the class's own rule tuple.
        rule_tuple = rule_tuple if rule_tuple else self._rule_tuple
        valid_rule = re.compile(r"^[a-z]+\*?\d[a-z]*[>\.]?$")
        # Empty any old rules from the rule set before adding new ones
        self.rule_dictionary = {}

        for rule in rule_tuple:
            if not valid_rule.match(rule):
                raise ValueError(f"The rule {rule} is invalid")
            first_letter = rule[0:1]
            if first_letter in self.rule_dictionary:
                self.rule_dictionary[first_letter].append(rule)
            else:
                self.rule_dictionary[first_letter] = [rule]

    def stem(self, word):
        """Stem a word using the Lancaster stemmer."""
        # Lower-case the word, since all the rules are lower-cased
        word = word.lower()
        word = self.__stripPrefix(word) if self._strip_prefix else word

        # Save a copy of the original word
        intact_word = word

        # If the rule dictionary is empty, parse the rule tuple.
        if not self.rule_dictionary:
            self.parseRules()

        return self.__doStemming(word, intact_word)

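    # A short usage sketch (expected outputs taken from NLTK's own
    # Lancaster examples):
    #
    #     >>> from nltk.stem import LancasterStemmer
    #     >>> LancasterStemmer().stem("maximum")
    #     'maxim'
    #     >>> LancasterStemmer(strip_prefix_flag=True).stem("kilometer")
    #     'met'
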
    def __doStemming(self, word, intact_word):
        """Perform the actual word stemming"""

        valid_rule = re.compile(r"^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$")

        proceed = True

        while proceed:
            # Find the position of the last letter of the word to be stemmed
            last_letter_position = self.__getLastLetter(word)

            # Only stem the word if it has a last letter and a rule
            # matching that last letter
            if (
                last_letter_position < 0
                or word[last_letter_position] not in self.rule_dictionary
            ):
                proceed = False

            else:
                rule_was_applied = False

                # Go through each rule that matches the word's final letter
                for rule in self.rule_dictionary[word[last_letter_position]]:
                    rule_match = valid_rule.match(rule)
                    if rule_match:
                        (
                            ending_string,
                            intact_flag,
                            remove_total,
                            append_string,
                            cont_flag,
                        ) = rule_match.groups()

                        # Convert the number of chars to remove when
                        # stemming from a string to an integer
                        remove_total = int(remove_total)

                        # Proceed if word's ending matches rule's word ending
                        if word.endswith(ending_string[::-1]):
                            if intact_flag:
                                if word == intact_word and self.__isAcceptable(
                                    word, remove_total
                                ):
                                    word = self.__applyRule(
                                        word, remove_total, append_string
                                    )
                                    rule_was_applied = True
                                    if cont_flag == ".":
                                        proceed = False
                                    break
                            elif self.__isAcceptable(word, remove_total):
                                word = self.__applyRule(
                                    word, remove_total, append_string
                                )
                                rule_was_applied = True
                                if cont_flag == ".":
                                    proceed = False
                                break
                # If no rules apply, the word doesn't need any more stemming
                if not rule_was_applied:
                    proceed = False
        return word

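    # Example trace through the loop above: "maximum" matches "mu*2."
    # (suffix "um", intact-only, remove two characters, then stop),
    # yielding "maxim". For "presumably" the suffixes "-bly" and "-abl"
    # are stripped first, so by the time the word ends in "um" it is no
    # longer intact, "mu*2." is skipped, and the result is "presum".
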
    def __getLastLetter(self, word):
        """Get the zero-based index of the last character in the word's
        leading alphabetic run, or -1 if the word does not start with a
        letter."""
        last_letter = -1
        for position in range(len(word)):
            if word[position].isalpha():
                last_letter = position
            else:
                break
        return last_letter

    def __isAcceptable(self, word, remove_total):
        """Determine if the word is acceptable for stemming."""
        word_is_acceptable = False
        # If the word starts with a vowel, it must be at least 2
        # characters long to be stemmed
        if word[0] in "aeiouy":
            if len(word) - remove_total >= 2:
                word_is_acceptable = True
        # If the word starts with a consonant, it must be at least 3
        # characters long (including one vowel) to be stemmed
        elif len(word) - remove_total >= 3:
            if word[1] in "aeiouy":
                word_is_acceptable = True
            elif word[2] in "aeiouy":
                word_is_acceptable = True
        return word_is_acceptable

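    # For example, "string" keeps its "-ing": removing three characters
    # would leave "str", and since neither the second nor the third
    # letter of "string" is a vowel, the rule is rejected and the word
    # is left unchanged.
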
    def __applyRule(self, word, remove_total, append_string):
        """Apply the stemming rule to the word"""
        # Remove letters from the end of the word
        new_word_length = len(word) - remove_total
        word = word[0:new_word_length]

        # And add new letters to the end of the truncated word
        if append_string:
            word += append_string
        return word

    def __stripPrefix(self, word):
        """Remove a prefix from a word.

        This function was originally taken from Whoosh.
        """
        for prefix in (
            "kilo",
            "micro",
            "milli",
            "intra",
            "ultra",
            "mega",
            "nano",
            "pico",
            "pseudo",
        ):
            if word.startswith(prefix):
                return word[len(prefix) :]
        return word

    def __repr__(self):
        return "<LancasterStemmer>"
717
backend/venv/Lib/site-packages/nltk/stem/porter.py
Normal file
@@ -0,0 +1,717 @@
"""
|
||||
Porter Stemmer
|
||||
|
||||
This is the Porter stemming algorithm. It follows the algorithm
|
||||
presented in
|
||||
|
||||
Porter, M. "An algorithm for suffix stripping." Program 14.3 (1980): 130-137.
|
||||
|
||||
with some optional deviations that can be turned on or off with the
|
||||
`mode` argument to the constructor.
|
||||
|
||||
Martin Porter, the algorithm's inventor, maintains a web page about the
|
||||
algorithm at
|
||||
|
||||
https://www.tartarus.org/~martin/PorterStemmer/
|
||||
|
||||
which includes another Python implementation and other implementations
|
||||
in many languages.
|
||||
"""
|
||||
|
||||
__docformat__ = "plaintext"
|
||||
|
||||
import re
|
||||
|
||||
from nltk.stem.api import StemmerI
|
||||
|
||||
|
||||
class PorterStemmer(StemmerI):
    """
    A word stemmer based on the Porter stemming algorithm.

        Porter, M. "An algorithm for suffix stripping."
        Program 14.3 (1980): 130-137.

    See https://www.tartarus.org/~martin/PorterStemmer/ for the homepage
    of the algorithm.

    Martin Porter has endorsed several modifications to the Porter
    algorithm since writing his original paper, and those extensions are
    included in the implementations on his website. Additionally, others
    have proposed further improvements to the algorithm, including NLTK
    contributors. There are thus three modes that can be selected by
    passing the appropriate constant to the class constructor's `mode`
    attribute:

    - PorterStemmer.ORIGINAL_ALGORITHM

        An implementation that is faithful to the original paper.

        Note that Martin Porter has deprecated this version of the
        algorithm. Martin distributes implementations of the Porter
        Stemmer in many languages, hosted at:

        https://www.tartarus.org/~martin/PorterStemmer/

        and all of these implementations include his extensions. He
        strongly recommends against using the original, published
        version of the algorithm; only use this mode if you clearly
        understand why you are choosing to do so.

    - PorterStemmer.MARTIN_EXTENSIONS

        An implementation that only uses the modifications to the
        algorithm that are included in the implementations on Martin
        Porter's website. He has declared Porter frozen, so the
        behaviour of those implementations should never change.

    - PorterStemmer.NLTK_EXTENSIONS (default)

        An implementation that includes further improvements devised by
        NLTK contributors or taken from other modified implementations
        found on the web.

    For the best stemming, you should use the default NLTK_EXTENSIONS
    version. However, if you need to get the same results as either the
    original algorithm or one of Martin Porter's hosted versions for
    compatibility with an existing implementation or dataset, you can use
    one of the other modes instead.
    """

    # Modes the Stemmer can be instantiated in
    NLTK_EXTENSIONS = "NLTK_EXTENSIONS"
    MARTIN_EXTENSIONS = "MARTIN_EXTENSIONS"
    ORIGINAL_ALGORITHM = "ORIGINAL_ALGORITHM"

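    # A short usage sketch (expected outputs worked out by hand from the
    # steps below, under the default NLTK_EXTENSIONS mode):
    #
    #     >>> from nltk.stem.porter import PorterStemmer
    #     >>> p = PorterStemmer(mode=PorterStemmer.NLTK_EXTENSIONS)
    #     >>> [p.stem(w) for w in ["caresses", "ponies", "meeting", "agreed"]]
    #     ['caress', 'poni', 'meet', 'agre']
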
    def __init__(self, mode=NLTK_EXTENSIONS):
        if mode not in (
            self.NLTK_EXTENSIONS,
            self.MARTIN_EXTENSIONS,
            self.ORIGINAL_ALGORITHM,
        ):
            raise ValueError(
                "Mode must be one of PorterStemmer.NLTK_EXTENSIONS, "
                "PorterStemmer.MARTIN_EXTENSIONS, or "
                "PorterStemmer.ORIGINAL_ALGORITHM"
            )

        self.mode = mode

        if self.mode == self.NLTK_EXTENSIONS:
            # This is a table of irregular forms. It is quite short,
            # but still reflects the errors actually drawn to Martin
            # Porter's attention over a 20 year period!
            irregular_forms = {
                "sky": ["sky", "skies"],
                "die": ["dying"],
                "lie": ["lying"],
                "tie": ["tying"],
                "news": ["news"],
                "inning": ["innings", "inning"],
                "outing": ["outings", "outing"],
                "canning": ["cannings", "canning"],
                "howe": ["howe"],
                "proceed": ["proceed"],
                "exceed": ["exceed"],
                "succeed": ["succeed"],
            }

            self.pool = {}
            for key in irregular_forms:
                for val in irregular_forms[key]:
                    self.pool[val] = key

        self.vowels = frozenset(["a", "e", "i", "o", "u"])

    def _is_consonant(self, word, i):
        """Returns True if word[i] is a consonant, False otherwise

        A consonant is defined in the paper as follows:

            A consonant in a word is a letter other than A, E, I, O or
            U, and other than Y preceded by a consonant. (The fact that
            the term `consonant' is defined to some extent in terms of
            itself does not make it ambiguous.) So in TOY the consonants
            are T and Y, and in SYZYGY they are S, Z and G. If a letter
            is not a consonant it is a vowel.
        """
        if word[i] in self.vowels:
            return False
        if word[i] == "y":
            if i == 0:
                return True
            else:
                return not self._is_consonant(word, i - 1)
        return True

    def _measure(self, stem):
        r"""Returns the 'measure' of stem, per definition in the paper

        From the paper:

            A consonant will be denoted by c, a vowel by v. A list
            ccc... of length greater than 0 will be denoted by C, and a
            list vvv... of length greater than 0 will be denoted by V.
            Any word, or part of a word, therefore has one of the four
            forms:

                CVCV ... C
                CVCV ... V
                VCVC ... C
                VCVC ... V

            These may all be represented by the single form

                [C]VCVC ... [V]

            where the square brackets denote arbitrary presence of their
            contents. Using (VC){m} to denote VC repeated m times, this
            may again be written as

                [C](VC){m}[V].

            m will be called the \measure\ of any word or word part when
            represented in this form. The case m = 0 covers the null
            word. Here are some examples:

                m=0    TR,  EE,  TREE,  Y,  BY.
                m=1    TROUBLE,  OATS,  TREES,  IVY.
                m=2    TROUBLES,  PRIVATE,  OATEN,  ORRERY.
        """
        cv_sequence = ""

        # Construct a string of 'c's and 'v's representing whether each
        # character in `stem` is a consonant or a vowel.
        # e.g. 'falafel' becomes 'cvcvcvc',
        #      'architecture' becomes 'vcccvcvccvcv'
        for i in range(len(stem)):
            if self._is_consonant(stem, i):
                cv_sequence += "c"
            else:
                cv_sequence += "v"

        # Count the number of 'vc' occurrences, which is equivalent to
        # the number of 'VC' occurrences in Porter's reduced form in the
        # docstring above, which is in turn equivalent to `m`
        return cv_sequence.count("vc")

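    # Worked example: for "troubles" the cv map is
    #     t r o u b l e s
    #     c c v v c c v c   ->  "ccvvccvc"
    # which contains "vc" twice, so m = 2, matching the paper's
    # TROUBLES example above.
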
    def _has_positive_measure(self, stem):
        return self._measure(stem) > 0

    def _contains_vowel(self, stem):
        """Returns True if stem contains a vowel, else False"""
        for i in range(len(stem)):
            if not self._is_consonant(stem, i):
                return True
        return False

    def _ends_double_consonant(self, word):
        """Implements condition *d from the paper

        Returns True if word ends with a double consonant
        """
        return (
            len(word) >= 2
            and word[-1] == word[-2]
            and self._is_consonant(word, len(word) - 1)
        )

    def _ends_cvc(self, word):
        """Implements condition *o from the paper

        From the paper:

            *o  - the stem ends cvc, where the second c is not W, X or Y
                  (e.g. -WIL, -HOP).
        """
        return (
            len(word) >= 3
            and self._is_consonant(word, len(word) - 3)
            and not self._is_consonant(word, len(word) - 2)
            and self._is_consonant(word, len(word) - 1)
            and word[-1] not in ("w", "x", "y")
        ) or (
            self.mode == self.NLTK_EXTENSIONS
            and len(word) == 2
            and not self._is_consonant(word, 0)
            and self._is_consonant(word, 1)
        )

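    # For instance, "fil" (from "filing") ends c-v-c with a final "l"
    # (not w/x/y), so *o holds and step 1b's (m=1 and *o) rule restores
    # the "e", giving "file"; "low" fails the check because its final
    # consonant is "w".
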
    def _replace_suffix(self, word, suffix, replacement):
        """Replaces `suffix` of `word` with `replacement`"""
        assert word.endswith(suffix), "Given word doesn't end with given suffix"
        if suffix == "":
            return word + replacement
        else:
            return word[: -len(suffix)] + replacement

    def _apply_rule_list(self, word, rules):
        """Applies the first applicable suffix-removal rule to the word

        Takes a word and a list of suffix-removal rules represented as
        3-tuples, with the first element being the suffix to remove,
        the second element being the string to replace it with, and the
        final element being the condition for the rule to be applicable,
        or None if the rule is unconditional.
        """
        for rule in rules:
            suffix, replacement, condition = rule
            if suffix == "*d" and self._ends_double_consonant(word):
                stem = word[:-2]
                if condition is None or condition(stem):
                    return stem + replacement
                else:
                    # Don't try any further rules
                    return word
            if word.endswith(suffix):
                stem = self._replace_suffix(word, suffix, "")
                if condition is None or condition(stem):
                    return stem + replacement
                else:
                    # Don't try any further rules
                    return word

        return word

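    # As a concrete reading of the tuple format: ("sses", "ss", None)
    # from step 1a below unconditionally maps "caresses" -> "caress",
    # while a conditional rule such as ("ll", "l", <measure check>) in
    # step 5b only fires when its condition holds for the stem.
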
    def _step1a(self, word):
        """Implements Step 1a from "An algorithm for suffix stripping"

        From the paper:

            SSES -> SS                         caresses  ->  caress
            IES  -> I                          ponies    ->  poni
                                               ties      ->  ti
            SS   -> SS                         caress    ->  caress
            S    ->                            cats      ->  cat
        """
        # this NLTK-only rule extends the original algorithm, so
        # that 'flies'->'fli' but 'dies'->'die' etc
        if self.mode == self.NLTK_EXTENSIONS:
            if word.endswith("ies") and len(word) == 4:
                return self._replace_suffix(word, "ies", "ie")

        return self._apply_rule_list(
            word,
            [
                ("sses", "ss", None),  # SSES -> SS
                ("ies", "i", None),  # IES -> I
                ("ss", "ss", None),  # SS -> SS
                ("s", "", None),  # S ->
            ],
        )

    def _step1b(self, word):
        """Implements Step 1b from "An algorithm for suffix stripping"

        From the paper:

            (m>0) EED -> EE                    feed      ->  feed
                                               agreed    ->  agree
            (*v*) ED  ->                       plastered ->  plaster
                                               bled      ->  bled
            (*v*) ING ->                       motoring  ->  motor
                                               sing      ->  sing

        If the second or third of the rules in Step 1b is successful,
        the following is done:

            AT -> ATE                       conflat(ed)  ->  conflate
            BL -> BLE                       troubl(ed)   ->  trouble
            IZ -> IZE                       siz(ed)      ->  size
            (*d and not (*L or *S or *Z))
               -> single letter
                                            hopp(ing)    ->  hop
                                            tann(ed)     ->  tan
                                            fall(ing)    ->  fall
                                            hiss(ing)    ->  hiss
                                            fizz(ed)     ->  fizz
            (m=1 and *o) -> E               fail(ing)    ->  fail
                                            fil(ing)     ->  file

        The rule to map to a single letter causes the removal of one of
        the double letter pair. The -E is put back on -AT, -BL and -IZ,
        so that the suffixes -ATE, -BLE and -IZE can be recognised
        later. This E may be removed in step 4.
        """
        # this NLTK-only block extends the original algorithm, so that
        # 'spied'->'spi' but 'died'->'die' etc
        if self.mode == self.NLTK_EXTENSIONS:
            if word.endswith("ied"):
                if len(word) == 4:
                    return self._replace_suffix(word, "ied", "ie")
                else:
                    return self._replace_suffix(word, "ied", "i")

        # (m>0) EED -> EE
        if word.endswith("eed"):
            stem = self._replace_suffix(word, "eed", "")
            if self._measure(stem) > 0:
                return stem + "ee"
            else:
                return word

        rule_2_or_3_succeeded = False

        for suffix in ["ed", "ing"]:
            if word.endswith(suffix):
                intermediate_stem = self._replace_suffix(word, suffix, "")
                if self._contains_vowel(intermediate_stem):
                    rule_2_or_3_succeeded = True
                    break

        if not rule_2_or_3_succeeded:
            return word

        return self._apply_rule_list(
            intermediate_stem,
            [
                ("at", "ate", None),  # AT -> ATE
                ("bl", "ble", None),  # BL -> BLE
                ("iz", "ize", None),  # IZ -> IZE
                # (*d and not (*L or *S or *Z))
                # -> single letter
                (
                    "*d",
                    intermediate_stem[-1],
                    lambda stem: intermediate_stem[-1] not in ("l", "s", "z"),
                ),
                # (m=1 and *o) -> E
                (
                    "",
                    "e",
                    lambda stem: (self._measure(stem) == 1 and self._ends_cvc(stem)),
                ),
            ],
        )

    def _step1c(self, word):
        """Implements Step 1c from "An algorithm for suffix stripping"

        From the paper:

            Step 1c

                (*v*) Y -> I                    happy  ->  happi
                                                sky    ->  sky
        """

        def nltk_condition(stem):
            """
            This has been modified from the original Porter algorithm so
            that y->i is only done when y is preceded by a consonant,
            but not if the stem is only a single consonant, i.e.

                (*c and not c) Y -> I

            So 'happy' -> 'happi', but
               'enjoy' -> 'enjoy' etc

            This is a much better rule. Formerly 'enjoy'->'enjoi' and
            'enjoyment'->'enjoy'. Step 1c is perhaps done too soon; but
            with this modification that no longer really matters.

            Also, the removal of the contains_vowel(z) condition means
            that 'spy', 'fly', 'try' ... stem to 'spi', 'fli', 'tri' and
            conflate with 'spied', 'tried', 'flies' ...
            """
            return len(stem) > 1 and self._is_consonant(stem, len(stem) - 1)

        def original_condition(stem):
            return self._contains_vowel(stem)

        return self._apply_rule_list(
            word,
            [
                (
                    "y",
                    "i",
                    (
                        nltk_condition
                        if self.mode == self.NLTK_EXTENSIONS
                        else original_condition
                    ),
                )
            ],
        )

    def _step2(self, word):
        """Implements Step 2 from "An algorithm for suffix stripping"

        From the paper:

            Step 2

                (m>0) ATIONAL ->  ATE       relational     ->  relate
                (m>0) TIONAL  ->  TION      conditional    ->  condition
                                            rational       ->  rational
                (m>0) ENCI    ->  ENCE      valenci        ->  valence
                (m>0) ANCI    ->  ANCE      hesitanci      ->  hesitance
                (m>0) IZER    ->  IZE       digitizer      ->  digitize
                (m>0) ABLI    ->  ABLE      conformabli    ->  conformable
                (m>0) ALLI    ->  AL        radicalli      ->  radical
                (m>0) ENTLI   ->  ENT       differentli    ->  different
                (m>0) ELI     ->  E         vileli         ->  vile
                (m>0) OUSLI   ->  OUS       analogousli    ->  analogous
                (m>0) IZATION ->  IZE       vietnamization ->  vietnamize
                (m>0) ATION   ->  ATE       predication    ->  predicate
                (m>0) ATOR    ->  ATE       operator       ->  operate
                (m>0) ALISM   ->  AL        feudalism      ->  feudal
                (m>0) IVENESS ->  IVE       decisiveness   ->  decisive
                (m>0) FULNESS ->  FUL       hopefulness    ->  hopeful
                (m>0) OUSNESS ->  OUS       callousness    ->  callous
                (m>0) ALITI   ->  AL        formaliti      ->  formal
                (m>0) IVITI   ->  IVE       sensitiviti    ->  sensitive
                (m>0) BILITI  ->  BLE       sensibiliti    ->  sensible
        """

        if self.mode == self.NLTK_EXTENSIONS:
            # Instead of applying the ALLI -> AL rule after '(a)bli' per
            # the published algorithm, we apply it first, and, if it
            # succeeds, run the result through step 2 again.
            if word.endswith("alli") and self._has_positive_measure(
                self._replace_suffix(word, "alli", "")
            ):
                return self._step2(self._replace_suffix(word, "alli", "al"))

        bli_rule = ("bli", "ble", self._has_positive_measure)
        abli_rule = ("abli", "able", self._has_positive_measure)

        rules = [
            ("ational", "ate", self._has_positive_measure),
            ("tional", "tion", self._has_positive_measure),
            ("enci", "ence", self._has_positive_measure),
            ("anci", "ance", self._has_positive_measure),
            ("izer", "ize", self._has_positive_measure),
            abli_rule if self.mode == self.ORIGINAL_ALGORITHM else bli_rule,
            ("alli", "al", self._has_positive_measure),
            ("entli", "ent", self._has_positive_measure),
            ("eli", "e", self._has_positive_measure),
            ("ousli", "ous", self._has_positive_measure),
            ("ization", "ize", self._has_positive_measure),
            ("ation", "ate", self._has_positive_measure),
            ("ator", "ate", self._has_positive_measure),
            ("alism", "al", self._has_positive_measure),
            ("iveness", "ive", self._has_positive_measure),
            ("fulness", "ful", self._has_positive_measure),
            ("ousness", "ous", self._has_positive_measure),
            ("aliti", "al", self._has_positive_measure),
            ("iviti", "ive", self._has_positive_measure),
            ("biliti", "ble", self._has_positive_measure),
        ]

        if self.mode == self.NLTK_EXTENSIONS:
            rules.append(("fulli", "ful", self._has_positive_measure))

            # The 'l' of the 'logi' -> 'log' rule is put with the stem,
            # so that short stems like 'geo' 'theo' etc work like
            # 'archaeo' 'philo' etc.
            rules.append(
                ("logi", "log", lambda stem: self._has_positive_measure(word[:-3]))
            )

        if self.mode == self.MARTIN_EXTENSIONS:
            rules.append(("logi", "log", self._has_positive_measure))

        return self._apply_rule_list(word, rules)

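    # e.g. under NLTK_EXTENSIONS, "radicalli" is rewritten to "radical"
    # by the early ALLI -> AL branch above and then fed through _step2
    # again, where no further suffix rule matches.
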
    def _step3(self, word):
        """Implements Step 3 from "An algorithm for suffix stripping"

        From the paper:

            Step 3

                (m>0) ICATE ->  IC          triplicate     ->  triplic
                (m>0) ATIVE ->              formative      ->  form
                (m>0) ALIZE ->  AL          formalize      ->  formal
                (m>0) ICITI ->  IC          electriciti    ->  electric
                (m>0) ICAL  ->  IC          electrical     ->  electric
                (m>0) FUL   ->              hopeful        ->  hope
                (m>0) NESS  ->              goodness       ->  good
        """
        return self._apply_rule_list(
            word,
            [
                ("icate", "ic", self._has_positive_measure),
                ("ative", "", self._has_positive_measure),
                ("alize", "al", self._has_positive_measure),
                ("iciti", "ic", self._has_positive_measure),
                ("ical", "ic", self._has_positive_measure),
                ("ful", "", self._has_positive_measure),
                ("ness", "", self._has_positive_measure),
            ],
        )

    def _step4(self, word):
        """Implements Step 4 from "An algorithm for suffix stripping"

        Step 4

            (m>1) AL    ->              revival        ->  reviv
            (m>1) ANCE  ->              allowance      ->  allow
            (m>1) ENCE  ->              inference      ->  infer
            (m>1) ER    ->              airliner       ->  airlin
            (m>1) IC    ->              gyroscopic     ->  gyroscop
            (m>1) ABLE  ->              adjustable     ->  adjust
            (m>1) IBLE  ->              defensible     ->  defens
            (m>1) ANT   ->              irritant       ->  irrit
            (m>1) EMENT ->              replacement    ->  replac
            (m>1) MENT  ->              adjustment     ->  adjust
            (m>1) ENT   ->              dependent      ->  depend
            (m>1 and (*S or *T)) ION -> adoption       ->  adopt
            (m>1) OU    ->              homologou      ->  homolog
            (m>1) ISM   ->              communism      ->  commun
            (m>1) ATE   ->              activate       ->  activ
            (m>1) ITI   ->              angulariti     ->  angular
            (m>1) OUS   ->              homologous     ->  homolog
            (m>1) IVE   ->              effective      ->  effect
            (m>1) IZE   ->              bowdlerize     ->  bowdler

        The suffixes are now removed. All that remains is a little
        tidying up.
        """
        measure_gt_1 = lambda stem: self._measure(stem) > 1

        return self._apply_rule_list(
            word,
            [
                ("al", "", measure_gt_1),
                ("ance", "", measure_gt_1),
                ("ence", "", measure_gt_1),
                ("er", "", measure_gt_1),
                ("ic", "", measure_gt_1),
                ("able", "", measure_gt_1),
                ("ible", "", measure_gt_1),
                ("ant", "", measure_gt_1),
                ("ement", "", measure_gt_1),
                ("ment", "", measure_gt_1),
                ("ent", "", measure_gt_1),
                # (m>1 and (*S or *T)) ION ->
                (
                    "ion",
                    "",
                    lambda stem: self._measure(stem) > 1 and stem[-1] in ("s", "t"),
                ),
                ("ou", "", measure_gt_1),
                ("ism", "", measure_gt_1),
                ("ate", "", measure_gt_1),
                ("iti", "", measure_gt_1),
                ("ous", "", measure_gt_1),
                ("ive", "", measure_gt_1),
                ("ize", "", measure_gt_1),
            ],
        )

    def _step5a(self, word):
        """Implements Step 5a from "An algorithm for suffix stripping"

        From the paper:

            Step 5a

                (m>1) E     ->              probate        ->  probat
                                            rate           ->  rate
                (m=1 and not *o) E ->       cease          ->  ceas
        """
        # Note that Martin's test vocabulary and reference
        # implementations are inconsistent in how they handle the case
        # where two rules both refer to a suffix that matches the word
        # to be stemmed, but only the condition of the second one is
        # true.
        # Earlier, in step 1b, we had the rules:
        #     (m>0) EED -> EE
        #     (*v*) ED  ->
        # but the examples in the paper included "feed"->"feed", even
        # though (*v*) is true for "fe" and therefore the second rule
        # alone would map "feed"->"fe".
        # However, in THIS case, we need to handle the consecutive rules
        # differently and try both conditions (obviously; the second
        # rule here would be redundant otherwise). Martin's paper makes
        # no explicit mention of the inconsistency; you have to infer it
        # from the examples.
        # For this reason, we can't use _apply_rule_list here.
        if word.endswith("e"):
            stem = self._replace_suffix(word, "e", "")
            if self._measure(stem) > 1:
                return stem
            if self._measure(stem) == 1 and not self._ends_cvc(stem):
                return stem
        return word

    def _step5b(self, word):
        """Implements Step 5b from "An algorithm for suffix stripping"

        From the paper:

            Step 5b

                (m > 1 and *d and *L) -> single letter
                                            controll       ->  control
                                            roll           ->  roll
        """
        return self._apply_rule_list(
            word, [("ll", "l", lambda stem: self._measure(word[:-1]) > 1)]
        )

    def stem(self, word, to_lowercase=True):
        """
        :param to_lowercase: if `to_lowercase=True`, the word is
            lowercased before stemming
        """
        stem = word.lower() if to_lowercase else word

        if self.mode == self.NLTK_EXTENSIONS and stem in self.pool:
            return self.pool[stem]

        if self.mode != self.ORIGINAL_ALGORITHM and len(word) <= 2:
            # With this line, strings of length 1 or 2 don't go through
            # the stemming process, although no mention is made of this
            # in the published algorithm.
            return stem

        stem = self._step1a(stem)
        stem = self._step1b(stem)
        stem = self._step1c(stem)
        stem = self._step2(stem)
        stem = self._step3(stem)
        stem = self._step4(stem)
        stem = self._step5a(stem)
        stem = self._step5b(stem)

        return stem

    def __repr__(self):
        return "<PorterStemmer>"


def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """

    from nltk import stem
    from nltk.corpus import treebank

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    for item in treebank.fileids()[:3]:
        for word, tag in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = " ".join(stemmed)
    results = re.sub(r"(.{,70})\s", r"\1\n", results + " ").rstrip()

    # Convert the original to a string, and word wrap it.
    original = " ".join(orig)
    original = re.sub(r"(.{,70})\s", r"\1\n", original + " ").rstrip()

    # Print the results.
    print("-Original-".center(70).replace(" ", "*").replace("-", " "))
    print(original)
    print("-Results-".center(70).replace(" ", "*").replace("-", " "))
    print(results)
    print("*" * 70)
55
backend/venv/Lib/site-packages/nltk/stem/regexp.py
Normal file
@@ -0,0 +1,55 @@
# Natural Language Toolkit: Stemmers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
#         Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re

from nltk.stem.api import StemmerI


class RegexpStemmer(StemmerI):
    """
    A stemmer that uses regular expressions to identify morphological
    affixes. Any substrings that match the regular expressions will
    be removed.

    >>> from nltk.stem import RegexpStemmer
    >>> st = RegexpStemmer('ing$|s$|e$|able$', min=4)
    >>> st.stem('cars')
    'car'
    >>> st.stem('mass')
    'mas'
    >>> st.stem('was')
    'was'
    >>> st.stem('bee')
    'bee'
    >>> st.stem('compute')
    'comput'
    >>> st.stem('advisable')
    'advis'

    :type regexp: str or regexp
    :param regexp: The regular expression that should be used to
        identify morphological affixes.
    :type min: int
    :param min: The minimum length of string to stem
    """

    def __init__(self, regexp, min=0):
        if not hasattr(regexp, "pattern"):
            regexp = re.compile(regexp)
        self._regexp = regexp
        self._min = min

    def stem(self, word):
        if len(word) < self._min:
            return word
        else:
            return self._regexp.sub("", word)

    def __repr__(self):
        return f"<RegexpStemmer: {self._regexp.pattern!r}>"
137
backend/venv/Lib/site-packages/nltk/stem/rslp.py
Normal file
@@ -0,0 +1,137 @@
# Natural Language Toolkit: RSLP Stemmer
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Tiago Tresoldi <tresoldi@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

# This code is based on the algorithm presented in the paper "A Stemming
# Algorithm for the Portuguese Language" by Viviane Moreira Orengo and
# Christian Huyck, which unfortunately I had no access to. The code is a
# Python version, with some minor modifications of mine, of the description
# presented at https://www.webcitation.org/5NnvdIzOb and of the C source code
# available at http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html.
# Please note that this stemmer is intended for demonstration and educational
# purposes only. Feel free to write me for any comments, including the
# development of a different and/or better stemmer for Portuguese. I also
# suggest using NLTK's mailing list for Portuguese for any discussion.

from nltk.data import load
from nltk.stem.api import StemmerI


class RSLPStemmer(StemmerI):
    """
    A stemmer for Portuguese.

    >>> from nltk.stem import RSLPStemmer
    >>> st = RSLPStemmer()
    >>> # opening lines of Erico Verissimo's "Música ao Longe"
    >>> text = '''
    ... Clarissa risca com giz no quadro-negro a paisagem que os alunos
    ... devem copiar . Uma casinha de porta e janela , em cima duma
    ... coxilha .'''
    >>> for token in text.split():  # doctest: +NORMALIZE_WHITESPACE
    ...     print(st.stem(token))
    clariss risc com giz no quadro-negr a pais que os alun dev copi .
    uma cas de port e janel , em cim dum coxilh .
    """

    def __init__(self):
        self._model = []

        self._model.append(self.read_rule("step0.pt"))
        self._model.append(self.read_rule("step1.pt"))
        self._model.append(self.read_rule("step2.pt"))
        self._model.append(self.read_rule("step3.pt"))
        self._model.append(self.read_rule("step4.pt"))
        self._model.append(self.read_rule("step5.pt"))
        self._model.append(self.read_rule("step6.pt"))

    def read_rule(self, filename):
        rules = load("nltk:stemmers/rslp/" + filename, format="raw").decode("utf8")
        lines = rules.split("\n")

        lines = [line for line in lines if line != ""]  # remove blank lines
        lines = [line for line in lines if line[0] != "#"]  # remove comments

        # NOTE: a simple but ugly hack to make this parser happy with double '\t's
        lines = [line.replace("\t\t", "\t") for line in lines]

        # parse rules
        rules = []
        for line in lines:
            rule = []
            tokens = line.split("\t")

            # text to be searched for at the end of the string
            rule.append(tokens[0][1:-1])  # remove quotes

            # minimum stem size to perform the replacement
            rule.append(int(tokens[1]))

            # text to be replaced into
            rule.append(tokens[2][1:-1])  # remove quotes

            # exceptions to this rule
            rule.append([token[1:-1] for token in tokens[3].split(",")])

            # append to the results
            rules.append(rule)

        return rules

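    # Illustration of the parsed shape (the values below are made up;
    # the real rules live in the step*.pt data files): a tab-separated
    # line such as
    #     "inho" <TAB> 3 <TAB> "" <TAB> {"caminho","carinho"}
    # becomes the list ["inho", 3, "", ["caminho", "carinho"]], meaning:
    # strip the suffix "inho" when the remaining stem has at least 3
    # characters and the word is not one of the listed exceptions.
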
    def stem(self, word):
        word = word.lower()

        # if the word ends in 's', apply the rule for plural reduction
        if word[-1] == "s":
            word = self.apply_rule(word, 0)

        # if the word ends in 'a', apply the rule for feminine reduction
        if word[-1] == "a":
            word = self.apply_rule(word, 1)

        # augmentative reduction
        word = self.apply_rule(word, 3)

        # adverb reduction
        word = self.apply_rule(word, 2)

        # noun reduction
        prev_word = word
        word = self.apply_rule(word, 4)
        if word == prev_word:
            # verb reduction
            prev_word = word
            word = self.apply_rule(word, 5)
            if word == prev_word:
                # vowel removal
                word = self.apply_rule(word, 6)

        return word

    def apply_rule(self, word, rule_index):
        rules = self._model[rule_index]
        for rule in rules:
            suffix_length = len(rule[0])
            if word[-suffix_length:] == rule[0]:  # if suffix matches
                if len(word) >= suffix_length + rule[1]:  # if we have minimum size
                    if word not in rule[3]:  # if not an exception
                        word = word[:-suffix_length] + rule[2]
                        break

        return word
5921
backend/venv/Lib/site-packages/nltk/stem/snowball.py
Normal file
File diff suppressed because it is too large
25
backend/venv/Lib/site-packages/nltk/stem/util.py
Normal file
@@ -0,0 +1,25 @@
# Natural Language Toolkit: Stemmer Utilities
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Helder <he7d3r@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT


def suffix_replace(original, old, new):
    """
    Replaces the old suffix of the original string with a new suffix
    """
    return original[: -len(old)] + new


def prefix_replace(original, old, new):
    """
    Replaces the old prefix of the original string with a new prefix

    :param original: string
    :param old: string
    :param new: string
    :return: string
    """
    return new + original[len(old) :]
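

# Illustrative doctests (consistent with the slicing above):
#
#     >>> suffix_replace("carinho", "inho", "")
#     'car'
#     >>> prefix_replace("unhappy", "un", "")
#     'happy'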
89
backend/venv/Lib/site-packages/nltk/stem/wordnet.py
Normal file
@@ -0,0 +1,89 @@
# Natural Language Toolkit: WordNet stemmer interface
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
#         Eric Kafe <kafe.eric@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT


class WordNetLemmatizer:
    """
    WordNet Lemmatizer

    Provides 3 lemmatizer modes: _morphy(), morphy() and lemmatize().

    lemmatize() is a permissive wrapper around _morphy().
    It returns the shortest lemma found in WordNet,
    or the input string unchanged if nothing is found.

    >>> from nltk.stem import WordNetLemmatizer as wnl
    >>> print(wnl().lemmatize('us', 'n'))
    u

    >>> print(wnl().lemmatize('Anythinggoeszxcv'))
    Anythinggoeszxcv

    """

    def _morphy(self, form, pos, check_exceptions=True):
        """
        _morphy() is WordNet's _morphy lemmatizer.
        It returns a list of all lemmas found in WordNet.

        >>> from nltk.stem import WordNetLemmatizer as wnl
        >>> print(wnl()._morphy('us', 'n'))
        ['us', 'u']
        """
        from nltk.corpus import wordnet as wn

        return wn._morphy(form, pos, check_exceptions)

    def morphy(self, form, pos=None, check_exceptions=True):
        """
        morphy() is a restrictive wrapper around _morphy().
        It returns the first lemma found in WordNet,
        or None if no lemma is found.

        >>> from nltk.stem import WordNetLemmatizer as wnl
        >>> print(wnl().morphy('us', 'n'))
        us

        >>> print(wnl().morphy('catss'))
        None
        """
        from nltk.corpus import wordnet as wn

        return wn.morphy(form, pos, check_exceptions)

    def lemmatize(self, word: str, pos: str = "n") -> str:
        """Lemmatize `word` by picking the shortest of the possible lemmas,
        using the wordnet corpus reader's built-in _morphy function.
        Returns the input word unchanged if it cannot be found in WordNet.

        >>> from nltk.stem import WordNetLemmatizer as wnl
        >>> print(wnl().lemmatize('dogs'))
        dog
        >>> print(wnl().lemmatize('churches'))
        church
        >>> print(wnl().lemmatize('aardwolves'))
        aardwolf
        >>> print(wnl().lemmatize('abaci'))
        abacus
        >>> print(wnl().lemmatize('hardrock'))
        hardrock

        :param word: The input word to lemmatize.
        :type word: str
        :param pos: The Part Of Speech tag. Valid options are `"n"` for nouns,
            `"v"` for verbs, `"a"` for adjectives, `"r"` for adverbs and `"s"`
            for satellite adjectives.
        :type pos: str
        :return: The shortest lemma of `word`, for the given `pos`.
        """
        lemmas = self._morphy(word, pos)
        return min(lemmas, key=len) if lemmas else word

    def __repr__(self):
        return "<WordNetLemmatizer>"