Initial commit
This commit is contained in:
139
backend/venv/Lib/site-packages/textblob/en/__init__.py
Normal file
139
backend/venv/Lib/site-packages/textblob/en/__init__.py
Normal file
@@ -0,0 +1,139 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
'''This file is based on pattern.en. See the bundled NOTICE file for
|
||||
license information.
|
||||
'''
|
||||
from __future__ import absolute_import
|
||||
import os
|
||||
|
||||
from textblob._text import (Parser as _Parser, Sentiment as _Sentiment, Lexicon,
|
||||
WORD, POS, CHUNK, PNP, PENN, UNIVERSAL, Spelling)
|
||||
|
||||
from textblob.compat import text_type, unicode
|
||||
|
||||
try:
|
||||
MODULE = os.path.dirname(os.path.abspath(__file__))
|
||||
except:
|
||||
MODULE = ""
|
||||
|
||||
spelling = Spelling(
|
||||
path = os.path.join(MODULE, "en-spelling.txt")
|
||||
)
|
||||
|
||||
#--- ENGLISH PARSER --------------------------------------------------------------------------------
|
||||
|
||||
def find_lemmata(tokens):
|
||||
""" Annotates the tokens with lemmata for plural nouns and conjugated verbs,
|
||||
where each token is a [word, part-of-speech] list.
|
||||
"""
|
||||
for token in tokens:
|
||||
word, pos, lemma = token[0], token[1], token[0]
|
||||
# cats => cat
|
||||
if pos == "NNS":
|
||||
lemma = singularize(word)
|
||||
# sat => sit
|
||||
if pos.startswith(("VB", "MD")):
|
||||
lemma = conjugate(word, INFINITIVE) or word
|
||||
token.append(lemma.lower())
|
||||
return tokens
|
||||
|
||||
class Parser(_Parser):
|
||||
|
||||
def find_lemmata(self, tokens, **kwargs):
|
||||
return find_lemmata(tokens)
|
||||
|
||||
def find_tags(self, tokens, **kwargs):
|
||||
if kwargs.get("tagset") in (PENN, None):
|
||||
kwargs.setdefault("map", lambda token, tag: (token, tag))
|
||||
if kwargs.get("tagset") == UNIVERSAL:
|
||||
kwargs.setdefault("map", lambda token, tag: penntreebank2universal(token, tag))
|
||||
return _Parser.find_tags(self, tokens, **kwargs)
|
||||
|
||||
class Sentiment(_Sentiment):
|
||||
|
||||
def load(self, path=None):
|
||||
_Sentiment.load(self, path)
|
||||
# Map "terrible" to adverb "terribly" (+1% accuracy)
|
||||
if not path:
|
||||
for w, pos in list(dict.items(self)):
|
||||
if "JJ" in pos:
|
||||
if w.endswith("y"):
|
||||
w = w[:-1] + "i"
|
||||
if w.endswith("le"):
|
||||
w = w[:-2]
|
||||
p, s, i = pos["JJ"]
|
||||
self.annotate(w + "ly", "RB", p, s, i)
|
||||
|
||||
|
||||
lexicon = Lexicon(
|
||||
path = os.path.join(MODULE, "en-lexicon.txt"),
|
||||
morphology = os.path.join(MODULE, "en-morphology.txt"),
|
||||
context = os.path.join(MODULE, "en-context.txt"),
|
||||
entities = os.path.join(MODULE, "en-entities.txt"),
|
||||
language = "en"
|
||||
)
|
||||
parser = Parser(
|
||||
lexicon = lexicon,
|
||||
default = ("NN", "NNP", "CD"),
|
||||
language = "en"
|
||||
)
|
||||
|
||||
sentiment = Sentiment(
|
||||
path = os.path.join(MODULE, "en-sentiment.xml"),
|
||||
synset = "wordnet_id",
|
||||
negations = ("no", "not", "n't", "never"),
|
||||
modifiers = ("RB",),
|
||||
modifier = lambda w: w.endswith("ly"),
|
||||
tokenizer = parser.find_tokens,
|
||||
language = "en"
|
||||
)
|
||||
|
||||
|
||||
def tokenize(s, *args, **kwargs):
|
||||
""" Returns a list of sentences, where punctuation marks have been split from words.
|
||||
"""
|
||||
return parser.find_tokens(text_type(s), *args, **kwargs)
|
||||
|
||||
def parse(s, *args, **kwargs):
|
||||
""" Returns a tagged Unicode string.
|
||||
"""
|
||||
return parser.parse(unicode(s), *args, **kwargs)
|
||||
|
||||
def parsetree(s, *args, **kwargs):
|
||||
""" Returns a parsed Text from the given string.
|
||||
"""
|
||||
return Text(parse(unicode(s), *args, **kwargs))
|
||||
|
||||
def split(s, token=[WORD, POS, CHUNK, PNP]):
|
||||
""" Returns a parsed Text from the given parsed string.
|
||||
"""
|
||||
return Text(text_type(s), token)
|
||||
|
||||
def tag(s, tokenize=True, encoding="utf-8"):
|
||||
""" Returns a list of (token, tag)-tuples from the given string.
|
||||
"""
|
||||
tags = []
|
||||
for sentence in parse(s, tokenize, True, False, False, False, encoding).split():
|
||||
for token in sentence:
|
||||
tags.append((token[0], token[1]))
|
||||
return tags
|
||||
|
||||
def suggest(w):
|
||||
""" Returns a list of (word, confidence)-tuples of spelling corrections.
|
||||
"""
|
||||
return spelling.suggest(w)
|
||||
|
||||
def polarity(s, **kwargs):
|
||||
""" Returns the sentence polarity (positive/negative) between -1.0 and 1.0.
|
||||
"""
|
||||
return sentiment(unicode(s), **kwargs)[0]
|
||||
|
||||
def subjectivity(s, **kwargs):
|
||||
""" Returns the sentence subjectivity (objective/subjective) between 0.0 and 1.0.
|
||||
"""
|
||||
return sentiment(unicode(s), **kwargs)[1]
|
||||
|
||||
def positive(s, threshold=0.1, **kwargs):
|
||||
""" Returns True if the given sentence has a positive sentiment (polarity >= threshold).
|
||||
"""
|
||||
return polarity(unicode(s), **kwargs) >= threshold
|
||||
|
||||
294
backend/venv/Lib/site-packages/textblob/en/en-context.txt
Normal file
294
backend/venv/Lib/site-packages/textblob/en/en-context.txt
Normal file
@@ -0,0 +1,294 @@
|
||||
;;;
|
||||
;;; The contextual rules are based on Brill's rule based tagger v1.14,
|
||||
;;; trained on Brown corpus and Penn Treebank.
|
||||
;;;
|
||||
IN VB PREVTAG PRP
|
||||
NN VB PREVTAG TO
|
||||
VBP VB PREV1OR2OR3TAG MD
|
||||
NN VB PREV1OR2TAG MD
|
||||
VB NN PREV1OR2TAG DT
|
||||
VBD VBN PREV1OR2OR3TAG VBZ
|
||||
VBN VBD PREVTAG PRP
|
||||
VBN VBD PREVTAG NNP
|
||||
VBD VBN PREVTAG VBD
|
||||
VBP VB PREVTAG TO
|
||||
POS VBZ PREVTAG PRP
|
||||
VB VBP PREVTAG NNS
|
||||
IN RB WDAND2AFT as as
|
||||
VBD VBN PREV1OR2WD have
|
||||
IN WDT NEXT1OR2TAG VB
|
||||
VB VBP PREVTAG PRP
|
||||
VBP VB PREV1OR2WD n't
|
||||
IN WDT NEXTTAG VBZ
|
||||
JJ NNP NEXTTAG NNP
|
||||
IN WDT NEXTTAG VBD
|
||||
JJ NN NEXTWD of
|
||||
VBD VBN PREV1OR2WD be
|
||||
JJR RBR NEXTTAG JJ
|
||||
IN WDT NEXTTAG VBP
|
||||
JJS RBS WDNEXTTAG most JJ
|
||||
VBN VBD SURROUNDTAG NN DT
|
||||
NNS VBZ PREVTAG PRP
|
||||
POS VBZ NEXT1OR2TAG DT
|
||||
NNP NN SURROUNDTAG STAART NNS
|
||||
VBD VBN NEXTWD by
|
||||
VB NN PREV1OR2TAG IN
|
||||
VB VBP PREVTAG WDT
|
||||
VBG NN PREVTAG JJ
|
||||
NNS VBZ NEXTTAG DT
|
||||
VBN VBD PREVTAG WP
|
||||
NN VBP PREVTAG NNS
|
||||
VB NN PREVTAG NN
|
||||
NN VB PREVWD n't
|
||||
NN VBG NEXTTAG DT
|
||||
RB JJ NEXTTAG NN
|
||||
NN VBP PREVTAG PRP
|
||||
VBN VBD SURROUNDTAG NNS DT
|
||||
VB NN PREV1OR2TAG POS
|
||||
JJ NN NEXTTAG VBD
|
||||
RB RP WDNEXTTAG up DT
|
||||
JJ VB PREVTAG TO
|
||||
VBN VBD SURROUNDTAG , DT
|
||||
VBN VBD PREVWD that
|
||||
VB VBP PREVBIGRAM NNS RB
|
||||
NNP JJ SURROUNDTAG STAART NN
|
||||
VB VBN PREVTAG VBZ
|
||||
NNP JJ WDNEXTTAG American NNS
|
||||
JJ RB NEXTTAG JJR
|
||||
NNS NN CURWD yen
|
||||
IN WDT NEXTTAG VBD
|
||||
DT IN WDAND2TAGAFT that NNS
|
||||
POS VBZ PREVWD that
|
||||
JJ VB PREVTAG MD
|
||||
VB NN PREVTAG JJ
|
||||
JJR RBR NEXTTAG RB
|
||||
VBD VBN PREV1OR2WD are
|
||||
NN JJ WDNEXTTAG executive NN
|
||||
NNP JJ WDNEXTTAG American NN
|
||||
VBN VBD PREVTAG WDT
|
||||
VBD VBN PREVBIGRAM VBD RB
|
||||
JJ NN SURROUNDTAG DT .
|
||||
NNP JJ NEXTWD German
|
||||
VBN VB PREVTAG TO
|
||||
VBN VBD PREVBIGRAM NNP RB
|
||||
RB IN RBIGRAM up to
|
||||
VB VBP PREVTAG WP
|
||||
JJ NN SURROUNDTAG DT IN
|
||||
IN DT NEXTWD 's
|
||||
VBD VBN WDNEXTTAG ended NNP
|
||||
VBD VBN SURROUNDTAG DT NN
|
||||
NNS NNP NEXTTAG NNP
|
||||
NN NNP NEXTTAG NNP
|
||||
VBG NN SURROUNDTAG DT IN
|
||||
NNP JJ SURROUNDTAG STAART NNS
|
||||
RB RP WDPREVTAG VB up
|
||||
VBN VBD PREVBIGRAM PRP RB
|
||||
JJ RB NEXTTAG VBN
|
||||
NN VBP PREVTAG RB
|
||||
NNS VBZ PREVTAG RB
|
||||
POS VBZ PREVTAG WP
|
||||
VB VBN PREVWD have
|
||||
NN PDT WDNEXTTAG half DT
|
||||
IN WDT NEXTTAG MD
|
||||
POS VBZ PREVTAG DT
|
||||
NN NNP CURWD Integrated
|
||||
POS '' NEXT1OR2TAG ''
|
||||
VBD VBN PREVTAG IN
|
||||
JJR RBR NEXT1OR2TAG VBN
|
||||
JJS RBS WDNEXTTAG most RB
|
||||
JJ NN SURROUNDTAG JJ IN
|
||||
VBZ NNS PREVTAG JJ
|
||||
NNS VBZ WDPREVTAG JJ is
|
||||
JJ NN NEXTTAG VBZ
|
||||
VBP NN PREVTAG DT
|
||||
JJ NN SURROUNDTAG JJ .
|
||||
NNPS NNP NEXTTAG NNP
|
||||
WDT DT PREVTAG CC
|
||||
RB IN WDNEXTTAG so PRP
|
||||
VBP NN PREVWD earnings
|
||||
NN VBG PREVWD is
|
||||
NNS VBZ PREV1OR2WD Mr.
|
||||
VBZ NNS PREVWD the
|
||||
RB RP WDPREVTAG VBN up
|
||||
NNPS NNS PREVTAG STAART
|
||||
VBN VBD SURROUNDTAG NN JJ
|
||||
VBP VB PREV2TAG VB
|
||||
RBR JJR NEXTTAG NNS
|
||||
JJ NN SURROUNDTAG DT ,
|
||||
JJ NN SURROUNDTAG IN .
|
||||
NN VB PREVTAG TO
|
||||
VB NN PREVTAG VB
|
||||
NN VBP PREVWD who
|
||||
RB RP WDPREVTAG VBG up
|
||||
NN RB WDNEXTTAG right RB
|
||||
VBZ POS WDPREVTAG NNP 's
|
||||
JJ RP WDNEXTTAG up NN
|
||||
VBN VBD SURROUNDTAG NN NN
|
||||
VBN VBD SURROUNDTAG CC DT
|
||||
JJ NN NEXTBIGRAM MD VB
|
||||
JJ RB WDNEXTTAG early IN
|
||||
JJ VBN SURROUNDTAG STAART IN
|
||||
IN RB RBIGRAM though ,
|
||||
VBD VBN PREV1OR2WD been
|
||||
DT PDT WDNEXTTAG all DT
|
||||
VBN VBD PREVBIGRAM NN RB
|
||||
NN VB PREVWD help
|
||||
VBP VB PREV1OR2WD not
|
||||
VBP NN PREVTAG JJ
|
||||
DT WDT PREVTAG NNS
|
||||
NN VBP PREVTAG WDT
|
||||
VB RB RBIGRAM close to
|
||||
NNS VBZ PREVBIGRAM , WDT
|
||||
IN RP WDNEXTTAG out DT
|
||||
DT RB NEXTWD longer
|
||||
IN JJ SURROUNDTAG DT NN
|
||||
DT WDT SURROUNDTAG NN VBZ
|
||||
IN VB NEXT2TAG VB
|
||||
IN NN PREVTAG DT
|
||||
VBN VBD SURROUNDTAG NNS NNS
|
||||
IN RB RBIGRAM about $
|
||||
EX RB NEXT1OR2TAG IN
|
||||
NN VBG NEXTTAG PRP$
|
||||
NN VBG CURWD living
|
||||
VBZ NNS PREVTAG PRP$
|
||||
RBR JJR NEXTTAG NN
|
||||
RBR JJR CURWD higher
|
||||
VB VBP PREVBIGRAM PRP RB
|
||||
NN VB PREVTAG MD
|
||||
VB NN PREV1OR2TAG PRP$
|
||||
RP IN PREV1OR2TAG ,
|
||||
VB JJ PREVTAG DT
|
||||
DT IN PREVWD out
|
||||
POS VBZ PREVTAG EX
|
||||
JJ NN NEXTTAG POS
|
||||
NN JJ CURWD first
|
||||
VBD VBN PREVWD the
|
||||
NNS VBZ WDPREVTAG NNP plans
|
||||
NNP NNS SURROUNDTAG STAART IN
|
||||
RB JJ NEXTTAG NNS
|
||||
JJ RB CURWD just
|
||||
VBP NN PREVWD sales
|
||||
NNS NNPS PREVWD Orange
|
||||
VB VBN PREVTAG VBD
|
||||
WDT DT PREVTAG IN
|
||||
NN JJ WDNEXTTAG right NN
|
||||
NN VBG WDNEXTTAG operating IN
|
||||
JJ VBN CURWD insured
|
||||
JJ NNP LBIGRAM STAART U.S.
|
||||
IN DT NEXTTAG STAART
|
||||
POS '' PREV1OR2OR3TAG ``
|
||||
NN JJ WDNEXTTAG official NN
|
||||
NNP JJ CURWD Irish
|
||||
JJ RB NEXTTAG RBR
|
||||
VBG NN WDPREVTAG DT selling
|
||||
VBP VB PREV1OR2OR3TAG MD
|
||||
WDT IN NEXTTAG PRP
|
||||
EX RB NEXTTAG .
|
||||
VBN VBD SURROUNDTAG NNS PRP$
|
||||
VBN VBD CURWD said
|
||||
JJ RB PREVTAG MD
|
||||
NN VBG NEXTBIGRAM JJ NNS
|
||||
JJ RB WDNEXTTAG late IN
|
||||
VBG NN PREVTAG PRP$
|
||||
VBZ NNS NEXTTAG VBP
|
||||
NN NNP WDPREVTAG DT CD
|
||||
NN VBN PREVWD be
|
||||
JJS RBS NEXTTAG VBN
|
||||
VBN VBD SURROUNDTAG NN PRP$
|
||||
VBN VBD SURROUNDTAG NNS JJ
|
||||
VBN VBD SURROUNDTAG NNS NN
|
||||
VBD VBN WDNEXTTAG increased NN
|
||||
VBZ NNS NEXTWD of
|
||||
IN RP WDAND2TAGAFT out NNS
|
||||
JJ NNP NEXTTAG POS
|
||||
RB RP WDNEXTTAG down DT
|
||||
CD NNS CURWD 1970s
|
||||
VBG NNP CURWD Working
|
||||
VBN VB PREVTAG MD
|
||||
JJ NN NEXTBIGRAM CC NN
|
||||
NN JJ SURROUNDTAG STAART NNS
|
||||
VBN VBD PREVBIGRAM , CC
|
||||
IN RB NEXTBIGRAM . STAART
|
||||
NN VBG PREVWD was
|
||||
NNP NNPS CURWD Cowboys
|
||||
VBZ NNS PREVWD phone
|
||||
NNP NNS SURROUNDTAG STAART VBP
|
||||
RBR JJR WDNEXTTAG lower JJ
|
||||
PRP$ PRP NEXTTAG IN
|
||||
VBD VB PREVTAG TO
|
||||
JJ NN WDPREVTAG NN chief
|
||||
JJ NN SURROUNDTAG JJ ,
|
||||
NN JJ WDPREVTAG DT third
|
||||
VBN VBD SURROUNDTAG NNS NNP
|
||||
NNP NN SURROUNDTAG STAART NN
|
||||
NNP NN CURWD HDTV
|
||||
VBG NN SURROUNDTAG DT ,
|
||||
VBG NN SURROUNDTAG DT .
|
||||
NNS VBZ PREVTAG WP
|
||||
NN VB SURROUNDTAG CC DT
|
||||
NNPS NNP WDAND2TAGBFR IN Securities
|
||||
RP IN PREVTAG NNS
|
||||
VBP NN LBIGRAM funds rate
|
||||
VBP NN WDPREVTAG NNS market
|
||||
DT RB RBIGRAM either .
|
||||
VBN NN SURROUNDTAG DT IN
|
||||
VBD VB PREV1OR2OR3TAG MD
|
||||
NN JJ NEXTWD oil
|
||||
VBN VBD SURROUNDTAG , $
|
||||
VBD VBN PREVBIGRAM DT RB
|
||||
VBN JJ PREVWD by
|
||||
NNP JJ WDNEXTTAG American JJ
|
||||
NN VBG PREVTAG VBP
|
||||
JJ RB LBIGRAM very much
|
||||
NN VBG RBIGRAM operating officer
|
||||
RB IN RBIGRAM up for
|
||||
NNS VBZ NEXTBIGRAM JJ NNS
|
||||
NNS VBZ SURROUNDTAG , IN
|
||||
VB VBP PREVTAG NNPS
|
||||
IN RP WDAND2TAGAFT out IN
|
||||
NNPS NNP PREVBIGRAM CC NNP
|
||||
NN RB RBIGRAM close to
|
||||
RBR RB PREVWD no
|
||||
JJ VBD NEXTTAG DT
|
||||
RB NNP PREVTAG NNP
|
||||
MD NN PREVWD good
|
||||
JJ NN WDPREVTAG NN giant
|
||||
NN JJ WDNEXTTAG official NNS
|
||||
VBN VBD SURROUNDTAG , PRP$
|
||||
VBN VBD SURROUNDTAG , RB
|
||||
VBN VBD SURROUNDTAG NN PRP
|
||||
NNP JJ WDNEXTTAG South JJ
|
||||
NN VBG PREVTAG RB
|
||||
NNS VBZ SURROUNDTAG , TO
|
||||
VBZ NNS SURROUNDTAG NN .
|
||||
NN VB NEXTTAG PRP$
|
||||
VBP VB PREV1OR2WD do
|
||||
VB JJ NEXTWD countries
|
||||
IN WDT NEXTBIGRAM RB VBZ
|
||||
JJ VB NEXTTAG DT
|
||||
WDT DT NEXTBIGRAM VBZ ,
|
||||
NNP RB RBIGRAM First ,
|
||||
DT NNP WDNEXTTAG A VBZ
|
||||
JJ RBR RBIGRAM further ,
|
||||
CD PRP WDNEXTTAG one MD
|
||||
POS '' PREV1OR2OR3TAG .
|
||||
PRP NN PREVTAG -LRB-
|
||||
VBN VBD SURROUNDTAG , PRP
|
||||
VBN VBD SURROUNDTAG NN NNS
|
||||
VBN VBD SURROUNDTAG NN RP
|
||||
NNP NN LBIGRAM STAART Business
|
||||
VBD VBN PREVTAG VBG
|
||||
IN RB RBIGRAM before ,
|
||||
IN RB WDAND2AFT As as
|
||||
NNP JJ LBIGRAM New York-based
|
||||
NNP JJ CURWD Mexican
|
||||
NNP NNPS WDNEXTTAG Motors NNP
|
||||
NNP NNPS WDPREVTAG NNP Enterprises
|
||||
JJ RB WDNEXTTAG long IN
|
||||
VBG JJ SURROUNDTAG DT NN
|
||||
NN PRP PREVWD are mine
|
||||
* IN CURWD with
|
||||
* VB CURWD be
|
||||
* JJ RBIGRAM such as
|
||||
* IN LBIGRAM such as
|
||||
* IN CURWD from
|
||||
646
backend/venv/Lib/site-packages/textblob/en/en-entities.txt
Normal file
646
backend/venv/Lib/site-packages/textblob/en/en-entities.txt
Normal file
@@ -0,0 +1,646 @@
|
||||
50 Cent PERS
|
||||
AIDS
|
||||
AK-47
|
||||
AT&T ORG
|
||||
Abraham Lincoln PERS
|
||||
Acropolis LOC
|
||||
Adam Sandler PERS
|
||||
Adolf Hitler PERS
|
||||
Adriana Lima PERS
|
||||
Afghanistan LOC
|
||||
Africa LOC
|
||||
Al Capone PERS
|
||||
Al Pacino PERS
|
||||
Alaska LOC
|
||||
Albert Einstein PERS
|
||||
Albert Hofmann PERS
|
||||
Albert Schweitzer PERS
|
||||
Alexander the Great PERS
|
||||
Alfred Hitchcock PERS
|
||||
Alice Cooper PERS
|
||||
Alice in Wonderland
|
||||
Amazon.com ORG
|
||||
Amber Heard PERS
|
||||
Amelia Earhart PERS
|
||||
American Express
|
||||
American Idol
|
||||
Amsterdam LOC
|
||||
Amy Adams PERS
|
||||
Amy Winehouse PERS
|
||||
Ancient Egypt LOC
|
||||
Ancient Rome LOC
|
||||
Android
|
||||
Angelina Jolie PERS
|
||||
Angry Birds
|
||||
Anne Frank PERS
|
||||
Anne Hathaway PERS
|
||||
Antartica LOC
|
||||
Apple Inc. ORG
|
||||
Archimedes PERS
|
||||
Aretha Franklin PERS
|
||||
Argentina LOC
|
||||
Aristotle PERS
|
||||
Arnold Schwarzenegger PERS
|
||||
Audi ORG
|
||||
Audrey Hepburn PERS
|
||||
Aung San Suu Kyi PERS
|
||||
Australia LOC
|
||||
Austria LOC
|
||||
Avatar
|
||||
Avril Lavigne PERS
|
||||
Ayn Rand PERS
|
||||
Aztec
|
||||
BMW ORG
|
||||
Babe Ruth PERS
|
||||
Bacardi ORG
|
||||
Backstreet Boys
|
||||
Bangladesh LOC
|
||||
Barack Obama PERS
|
||||
Barbra Streisand PERS
|
||||
Barcelona LOC
|
||||
Batman PERS
|
||||
Beethoven PERS
|
||||
Belarus LOC
|
||||
Belgium LOC
|
||||
Ben Affleck PERS
|
||||
Ben Folds PERS
|
||||
Ben Stiller PERS
|
||||
Benazir Bhutto PERS
|
||||
Benjamin Franklin PERS
|
||||
Benjamin Millepied PERS
|
||||
Bernard Madoff PERS
|
||||
Beyoncé Knowles PERS
|
||||
Bill Clinton PERS
|
||||
Bill Gates PERS
|
||||
Billie Holiday PERS
|
||||
Billie Jean King PERS
|
||||
Bing Crosby PERS
|
||||
Black Sabbath
|
||||
Blake Edwards PERS
|
||||
Blake Lively PERS
|
||||
Bob Dylan PERS
|
||||
Bob Geldof PERS
|
||||
Bob Marley PERS
|
||||
Brad Pitt PERS
|
||||
Bradley Manning PERS
|
||||
Brazil LOC
|
||||
Brett Favre PERS
|
||||
Britney Spears PERS
|
||||
Bruce Lee PERS
|
||||
Bruce Willis PERS
|
||||
Bruno Mars PERS
|
||||
Buddhism
|
||||
Bulgaria LOC
|
||||
Burger King
|
||||
Burma LOC
|
||||
C.S. Lewis PERS
|
||||
Cadillac ORG
|
||||
California LOC
|
||||
Cameron Diaz PERS
|
||||
Cameron Newton PERS
|
||||
Canada LOC
|
||||
Captain Beefheart PERS
|
||||
Carl Lewis PERS
|
||||
Charles Darwin PERS
|
||||
Charles Dickens PERS
|
||||
Charles Kindbergh PERS
|
||||
Charles de Gaulle PERS
|
||||
Charlie Sheen PERS
|
||||
Che Guevara PERS
|
||||
Cheryl Cole PERS
|
||||
Chicago LOC
|
||||
China LOC
|
||||
Chopin PERS
|
||||
Chris Colfer PERS
|
||||
Christian Bale PERS
|
||||
Christiano Ronaldo PERS
|
||||
Christina Aguilera PERS
|
||||
Christmas
|
||||
Christopher Nolan PERS
|
||||
Chuck Norris PERS
|
||||
Clint Eastwood PERS
|
||||
Coca Cola ORG
|
||||
Coco Chanel ORG
|
||||
Coldplay
|
||||
Colombia LOC
|
||||
Conan PERS
|
||||
Cristiano Ronaldo PERS
|
||||
Crystal Harris PERS
|
||||
Cthulhu PERS
|
||||
Cuba LOC
|
||||
DNA
|
||||
Daft Punk
|
||||
Dalai Lama PERS
|
||||
Daniel Radcliffe PERS
|
||||
Darren Aronofsky PERS
|
||||
Darren Criss PERS
|
||||
Darth Vader PERS
|
||||
David Beckham PERS
|
||||
David Bowie PERS
|
||||
David Cook PERS
|
||||
Demi Lovato PERS
|
||||
Demi Moore PERS
|
||||
Denmark LOC
|
||||
Desmond Tutu PERS
|
||||
Dexter PERS
|
||||
Diana PERS
|
||||
Diego Maradona PERS
|
||||
Disney ORG
|
||||
Dmitry Medvedev PERS
|
||||
Doctor Who PERS
|
||||
Dr. Dre PERS
|
||||
Dr. Seuss PERS
|
||||
Dragon Ball
|
||||
Dubai LOC
|
||||
Dwayne Johnson PERS
|
||||
Earth LOC
|
||||
Ebenezer Scrooge PERS
|
||||
Eddie Murphy PERS
|
||||
Eduardo Saverin PERS
|
||||
Egypt LOC
|
||||
El Salvador LOC
|
||||
Elizabeth Edwards PERS
|
||||
Elizabeth Hurley PERS
|
||||
Ellen Page PERS
|
||||
Elton John PERS
|
||||
Elvis Presley PERS
|
||||
Emile Zatopek PERS
|
||||
Eminem PERS
|
||||
Emma Roberts PERS
|
||||
Emma Stone PERS
|
||||
Emma Watson PERS
|
||||
Emmeline Pankhurst PERS
|
||||
England LOC
|
||||
Enrique Iglesias PERS
|
||||
Ernest Hemingway PERS
|
||||
Ernest Hemingway PERS
|
||||
Europe LOC
|
||||
Eva Peron PERS
|
||||
Exxon Mobil PERS
|
||||
FC Barcelona ORG
|
||||
FIFA ORG
|
||||
Facebook ORG
|
||||
Fahrenheit
|
||||
Family Guy
|
||||
Faye Resnick PERS
|
||||
FedEx ORG
|
||||
Fidel Castro PERS
|
||||
Finland LOC
|
||||
Firefox ORG
|
||||
Florence Nightingale PERS
|
||||
Florida LOC
|
||||
Fort Wayne LOC
|
||||
France LOC
|
||||
Frank Sinatra PERS
|
||||
Franklin D. Roosevelt PERS
|
||||
Freddie Mercury PERS
|
||||
Frédéric Chopin PERS
|
||||
Futurama
|
||||
Garrett Hedlund PERS
|
||||
Gene Simmons PERS
|
||||
General Electric
|
||||
Genghis Khan PERS
|
||||
George Bush PERS
|
||||
George Clooney PERS
|
||||
George Harrison PERS
|
||||
George Orwell PERS
|
||||
George W. Bush PERS
|
||||
George Washington PERS
|
||||
Georges St-Pierre PERS
|
||||
Germany LOC
|
||||
Google ORG
|
||||
Google Chrome
|
||||
Gorillaz
|
||||
Grand Theft Auto
|
||||
Greece LOC
|
||||
Gucci ORG
|
||||
Gulf War
|
||||
Gulliver's Travels
|
||||
Guns N' Roses
|
||||
Gwyneth Paltrow PERS
|
||||
HIV
|
||||
HSBC
|
||||
Haile Selassie PERS
|
||||
Haiti LOC
|
||||
Halliburton ORG
|
||||
Halloween
|
||||
Hank Baskett PERS
|
||||
Hannah Montana PERS
|
||||
Hanukkah
|
||||
Harrison Ford PERS
|
||||
Harry Potter PERS
|
||||
Hawaii LOC
|
||||
He-Man PERS
|
||||
Heath Ledger PERS
|
||||
Helen Keller PERS
|
||||
Helena Bonham Carter PERS
|
||||
Henry Ford PERS
|
||||
Henry IV PERS
|
||||
Henry V PERS
|
||||
Henry VIII PERS
|
||||
Hilary Duff PERS
|
||||
Hillary Clinton PERS
|
||||
Honda ORG
|
||||
Hong Kong LOC
|
||||
Hotmail
|
||||
Hugh Hefner PERS
|
||||
Humphrey Bogart PERS
|
||||
Hungary LOC
|
||||
IBM ORG
|
||||
IKEA ORG
|
||||
Iceland LOC
|
||||
India LOC
|
||||
Indiana Jones PERS
|
||||
Indira Gandhi PERS
|
||||
Indonesia LOC
|
||||
Internet Explorer
|
||||
Iran LOC
|
||||
Ireland LOC
|
||||
Iron Man PERS
|
||||
Isaac Newton PERS
|
||||
Isabelle Caro PERS
|
||||
Islam
|
||||
Israel LOC
|
||||
Italy LOC
|
||||
Ivy League ORG
|
||||
J. Robert Oppenheimer PERS
|
||||
J.K. Rowling PERS
|
||||
J.R.R. Tolkien PERS
|
||||
JFK PERS
|
||||
Jack the Ripper PERS
|
||||
Jackie Chan PERS
|
||||
Jacqueline Kennedy Onassis PERS
|
||||
Jaden Smith PERS
|
||||
Jake Gyllenhaal PERS
|
||||
James Bond PERS
|
||||
James Franco PERS
|
||||
Jane Austen PERS
|
||||
Janet Jackson PERS
|
||||
Japan LOC
|
||||
Jared Leto PERS
|
||||
Jason Statham PERS
|
||||
Jawaharlal Nehru PERS
|
||||
Jay-Z PERS
|
||||
Jeff Bridges PERS
|
||||
Jeff Buckley PERS
|
||||
Jenna Jameson PERS
|
||||
Jennifer Aniston PERS
|
||||
Jesse Owens PERS
|
||||
Jessica Alba PERS
|
||||
Jesus PERS
|
||||
Jim Carrey PERS
|
||||
Jim Morrisson PERS
|
||||
Jimi Hendrix PERS
|
||||
Jimmy Wales PERS
|
||||
Joaquin Phoenix PERS
|
||||
John Cena PERS
|
||||
John Edwards PERS
|
||||
John F. Kennedy PERS
|
||||
John Lennon PERS
|
||||
John M. Keynes PERS
|
||||
John McCain PERS
|
||||
John Wayne PERS
|
||||
Johnnie Walker PERS
|
||||
Johnny Cash PERS
|
||||
Johnny Depp PERS
|
||||
Joseph Stalin PERS
|
||||
Judy Garland PERS
|
||||
Julia Roberts PERS
|
||||
Julian Assange PERS
|
||||
Julie Andrews PERS
|
||||
Julius Caesar PERS
|
||||
Justin Bieber PERS
|
||||
Justin Timberlake PERS
|
||||
KFC ORG
|
||||
KLM ORG
|
||||
Kama Sutra
|
||||
Kanye West PERS
|
||||
Kate Middleton PERS
|
||||
Katherine Hepburn PERS
|
||||
Katrina Kaif PERS
|
||||
Katy Perry PERS
|
||||
Keira Knightley PERS
|
||||
Ken Livingstone PERS
|
||||
Keri Hilson PERS
|
||||
Kesha PERS
|
||||
Kevin Bacon PERS
|
||||
Kid Cudi PERS
|
||||
Kim Kardashian PERS
|
||||
Kinect
|
||||
King Arthur PERS
|
||||
Kobe Bryant PERS
|
||||
Kosovo LOC
|
||||
Kristallnacht
|
||||
Kristen Stewart PERS
|
||||
Kurt Cobain PERS
|
||||
L'Oreal ORG
|
||||
L. Ron Hubbard PERS
|
||||
Lady Gaga PERS
|
||||
Lea Michele PERS
|
||||
Lebanon LOC
|
||||
Lech Walesa PERS
|
||||
Led Zeppelin
|
||||
Lego
|
||||
Lenin PERS
|
||||
Leo Tolstoy PERS
|
||||
Leon Trotsky PERS
|
||||
Leonardo DiCaprio PERS
|
||||
Leonardo da Vinci PERS
|
||||
Leslie Nielsen PERS
|
||||
Lexus ORG
|
||||
Liam Neeson PERS
|
||||
Lil Wayne PERS
|
||||
Lindsay Lohan PERS
|
||||
Linkin Park PERS
|
||||
Lionel Messi PERS
|
||||
Loch Ness LOC
|
||||
London LOC
|
||||
Lord Baden Powell PERS
|
||||
Los Angeles LOC
|
||||
Louis Pasteur PERS
|
||||
Louis Vuitton PERS
|
||||
Louvre LOC
|
||||
Ludwig van Beethoven PERS
|
||||
Lyndon Johnson PERS
|
||||
MDMA
|
||||
Mac OS X
|
||||
Macaulay Culkin PERS
|
||||
Madagascar LOC
|
||||
Madonna PERS
|
||||
Mahatma Gandhi PERS
|
||||
Malaysia LOC
|
||||
Malcolm X PERS
|
||||
Manchester LOC
|
||||
Manchester United ORG
|
||||
Margaret Thatcher PERS
|
||||
Mariah Carey PERS
|
||||
Marilyn Monroe PERS
|
||||
Mario Gómez PERS
|
||||
Mario Kart
|
||||
Mark David Chapman PERS
|
||||
Mark Wahlberg PERS
|
||||
Mark Zuckerberg PERS
|
||||
Martin Luther King PERS
|
||||
Massachussetts LOC
|
||||
Mata Hari PERS
|
||||
Matt Damon PERS
|
||||
Mattel ORG
|
||||
Maya Angelou PERS
|
||||
McDonald's ORG
|
||||
McGill University ORG
|
||||
Megan Fox PERS
|
||||
Mercedes-Benz ORG
|
||||
Merlin PERS
|
||||
Metallica
|
||||
Mexico LOC
|
||||
Miami LOC
|
||||
Miami Vice
|
||||
Michael C. Hall PERS
|
||||
Michael Jackson PERS
|
||||
Michael Jordan PERS
|
||||
Michael Vick PERS
|
||||
Michelin ORG
|
||||
Michigan LOC
|
||||
Micky Ward PERS
|
||||
Microsoft ORG
|
||||
Microsoft Windows
|
||||
Middle Ages
|
||||
Mike Tyson PERS
|
||||
Mila Kunis PERS
|
||||
Miley Cyrus PERS
|
||||
Minecraft
|
||||
Mohammed Ali PERS
|
||||
Mona Lisa PERS
|
||||
Montreal LOC
|
||||
Morocco LOC
|
||||
Mother Teresa PERS
|
||||
Mother's Day
|
||||
Mozart PERS
|
||||
Mozilla Firefox
|
||||
Muhammad PERS
|
||||
Muhammad Ali PERS
|
||||
Myanmar LOC
|
||||
Napoleon PERS
|
||||
Narnia LOC
|
||||
Natalie Portman PERS
|
||||
Nazi Germany
|
||||
Neil Armstrong PERS
|
||||
Neil Patrick Harris PERS
|
||||
Nelson Mandela PERS
|
||||
Nepal LOC
|
||||
Netherlands LOC
|
||||
New York LOC
|
||||
New York City LOC
|
||||
New Zealand LOC
|
||||
Nicki Minaj PERS
|
||||
Nicolas Cage PERS
|
||||
Nicole Scherzinger PERS
|
||||
Nigeria LOC
|
||||
Nike ORG
|
||||
Nivea ORG
|
||||
North America LOC
|
||||
North Korea LOC
|
||||
Norway LOC
|
||||
Olivia Wilde PERS
|
||||
Oprah Winfrey PERS
|
||||
Osama Bin Laden PERS
|
||||
Oscar Wilde PERS
|
||||
Owen Wilson PERS
|
||||
Ozzfest
|
||||
Pablo Picasso PERS
|
||||
Pakistan LOC
|
||||
Panasonic ORG
|
||||
Paris LOC
|
||||
Paul McCartney PERS
|
||||
Pele PERS
|
||||
Pepsi ORG
|
||||
Peter Sellers PERS
|
||||
Philadelphia LOC
|
||||
Philips ORG
|
||||
Phillipines LOC
|
||||
Pink Floyd PERS
|
||||
PlayStation 3
|
||||
Pocahontas PERS
|
||||
Pokemon
|
||||
Pokémon
|
||||
Poland LOC
|
||||
Pope John Paul II PERS
|
||||
Premier League ORG
|
||||
Prince Charles PERS
|
||||
Priory of Sion LOC
|
||||
Procter & Gamble
|
||||
Puerto Rico LOC
|
||||
Qatar LOC
|
||||
Queen Elizabeth II PERS
|
||||
Queen Victoria PERS
|
||||
Rachmaninoff PERS
|
||||
Raiders of the Lost Ark
|
||||
Raisa Gorbachev PERS
|
||||
Real Madrid ORG
|
||||
Red Hot Chili Peppers PERS
|
||||
Reese Witherspoon PERS
|
||||
Resident Evil
|
||||
Richard PERS
|
||||
Richard Branson PERS
|
||||
Richard Dawkins PERS
|
||||
Richard Holbrooke PERS
|
||||
Richard Nixon PERS
|
||||
Rihanna PERS
|
||||
Ringo Starr PERS
|
||||
Robert De Niro PERS
|
||||
Robert Pattinson PERS
|
||||
Robin Hood PERS
|
||||
Roger Federer PERS
|
||||
Roman Empire ORG
|
||||
Romania LOC
|
||||
Rome LOC
|
||||
Romeo and Juliet
|
||||
Ronald Reagan PERS
|
||||
Ronnie O'Sullivan PERS
|
||||
Rosa Parks PERS
|
||||
Russell Brand PERS
|
||||
Russia LOC
|
||||
Ryan Reynolds PERS
|
||||
Saddam Hussein PERS
|
||||
Sahara LOC
|
||||
Saint Nicholas PERS
|
||||
Salman Khan PERS
|
||||
Samsung ORG
|
||||
Sandra Bullock PERS
|
||||
Santa Claus PERS
|
||||
Sarah Palin PERS
|
||||
Sasha Grey PERS
|
||||
Saudi Arabia LOC
|
||||
Scarlett Johansson PERS
|
||||
Scientology ORG
|
||||
Scotland LOC
|
||||
Sean Combs PERS
|
||||
Sean Parker PERS
|
||||
Selena Gomez PERS
|
||||
Serbia LOC
|
||||
Sergei Rachmaninoff PERS
|
||||
Shakira
|
||||
Shaquille O'Neal PERS
|
||||
Shaun Ryder PERS
|
||||
Sherlock Holmes PERS
|
||||
Shia LaBeouf PERS
|
||||
Shirley Temple PERS
|
||||
Siemens ORG
|
||||
Sigmund Freud PERS
|
||||
Silvio Berlusconi PERS
|
||||
Singapore LOC
|
||||
Skype
|
||||
Smirnoff ORG
|
||||
Snoop Dogg PERS
|
||||
Snow White PERS
|
||||
Socrates PERS
|
||||
Somalia LOC
|
||||
Sony ORG
|
||||
South Africa LOC
|
||||
South America LOC
|
||||
South Korea LOC
|
||||
South Park
|
||||
Soviet Union
|
||||
Spain LOC
|
||||
Spider-Man PERS
|
||||
Spiderman PERS
|
||||
Sri Lanka LOC
|
||||
Star Trek
|
||||
Star Wars
|
||||
Starbucks ORG
|
||||
Stephen Hawking PERS
|
||||
Stephen King PERS
|
||||
Steve Jobs PERS
|
||||
Steve Nash PERS
|
||||
Steven Spielberg PERS
|
||||
Sudan LOC
|
||||
Super Bowl
|
||||
Superman PERS
|
||||
Sweden LOC
|
||||
Switzerland LOC
|
||||
Sylvester Stallone PERS
|
||||
Taiwan LOC
|
||||
Taj Mahal LOC
|
||||
Take That
|
||||
Taylor Lautner PERS
|
||||
Taylor Momsem PERS
|
||||
Taylor Swift PERS
|
||||
Teena Marie PERS
|
||||
Tennessee LOC
|
||||
Texas LOC
|
||||
Thailand LOC
|
||||
The Beatles
|
||||
The Chronicles of Narnia
|
||||
The Godfather
|
||||
The Green Hornet
|
||||
The Lord of the Rings
|
||||
The Rolling Stones
|
||||
The Simpsons
|
||||
The Sims
|
||||
Theodore Roosevelt PERS
|
||||
Thomas Jefferson PERS
|
||||
Thor PERS
|
||||
Tiger Woods PERS
|
||||
Titanic
|
||||
Tom Brady PERS
|
||||
Tom Cruise PERS
|
||||
Tom Hanks PERS
|
||||
Toy Story
|
||||
Toyota ORG
|
||||
Transformers
|
||||
Tron
|
||||
Tupac Shakur PERS
|
||||
Twin Peaks
|
||||
Twitter
|
||||
UEFA Champions League
|
||||
Ubuntu
|
||||
Ukraine LOC
|
||||
United Kingdom LOC
|
||||
United Nations
|
||||
United States LOC
|
||||
Usain Bolt PERS
|
||||
Vanessa Hudgens PERS
|
||||
Venus LOC
|
||||
Vietnam LOC
|
||||
Vin Diesel PERS
|
||||
Virginia Woolf PERS
|
||||
Vladimir Putin PERS
|
||||
Vodafone ORG
|
||||
Volkswagen ORG
|
||||
Walmart ORG
|
||||
Walt Disney PERS
|
||||
Warren Buffet PERS
|
||||
Washington LOC
|
||||
Washington D.C. LOC
|
||||
Wesley Snipes PERS
|
||||
Wii
|
||||
WikiLeaks ORG
|
||||
Wikipedia ORG
|
||||
Will Ferrell PERS
|
||||
Will Smith PERS
|
||||
William Shakespeare PERS
|
||||
Willow Smith PERS
|
||||
Windows 7
|
||||
Windows 95
|
||||
Windows Vista
|
||||
Windows XP
|
||||
Winona Ryder PERS
|
||||
Winston Churchill PERS
|
||||
Wiz Khalifa PERS
|
||||
Wolfgang Amadeus Mozart PERS
|
||||
Woodrow Wilson PERS
|
||||
World War I
|
||||
World War II
|
||||
World of Warcraft
|
||||
Wright Brothers PERS
|
||||
X-Men
|
||||
Xbox 360
|
||||
Yoko Onen PERS
|
||||
Yoko Ono PERS
|
||||
YouTube ORG
|
||||
amazon.com ORG
|
||||
eBay ORG
|
||||
iPad
|
||||
iPhone
|
||||
iPod
|
||||
iPod touch
|
||||
94137
backend/venv/Lib/site-packages/textblob/en/en-lexicon.txt
Normal file
94137
backend/venv/Lib/site-packages/textblob/en/en-lexicon.txt
Normal file
File diff suppressed because it is too large
Load Diff
152
backend/venv/Lib/site-packages/textblob/en/en-morphology.txt
Normal file
152
backend/venv/Lib/site-packages/textblob/en/en-morphology.txt
Normal file
@@ -0,0 +1,152 @@
|
||||
;;;
|
||||
;;; The morphological rules are based on Brill's rule based tagger v1.14,
|
||||
;;; trained on Brown corpus and Penn Treebank.
|
||||
;;;
|
||||
NN s fhassuf 1 NNS x
|
||||
NN . fchar CD x
|
||||
NN - fchar JJ x
|
||||
NN ed fhassuf 2 VBN x
|
||||
NN ing fhassuf 3 VBG x
|
||||
ly hassuf 2 RB x
|
||||
ly addsuf 2 JJ x
|
||||
NN $ fgoodright CD x
|
||||
NN al fhassuf 2 JJ x
|
||||
NN would fgoodright VB x
|
||||
NN 0 fchar CD x
|
||||
NN be fgoodright JJ x
|
||||
NNS us fhassuf 2 JJ x
|
||||
NNS it fgoodright VBZ x
|
||||
NN ble fhassuf 3 JJ x
|
||||
NN ic fhassuf 2 JJ x
|
||||
NN 1 fchar CD x
|
||||
NNS ss fhassuf 2 NN x
|
||||
un deletepref 2 JJ x
|
||||
NN ive fhassuf 3 JJ x
|
||||
NNP ed fhassuf 2 JJ x
|
||||
NN n't fgoodright VB x
|
||||
VB the fgoodright NN x
|
||||
NNS he fgoodright VBZ x
|
||||
VBN he fgoodright VBD x
|
||||
NN are fgoodright JJ x
|
||||
JJ was fgoodleft NN x
|
||||
NN est fhassuf 3 JJS x
|
||||
VBZ The fgoodright NNS x
|
||||
NNP ts fhassuf 2 NNS x
|
||||
NN 4 fchar CD x
|
||||
NN ize fhassuf 3 VB x
|
||||
.. hassuf 2 : x
|
||||
ful hassuf 3 JJ x
|
||||
NN ate fhassuf 3 VB x
|
||||
NNP ing fhassuf 3 VBG x
|
||||
VBG is fgoodleft NN x
|
||||
NN less fhassuf 4 JJ x
|
||||
NN ary fhassuf 3 JJ x
|
||||
Co. goodleft NNP x
|
||||
NN ant fhassuf 3 JJ x
|
||||
million goodleft CD x
|
||||
JJ their fgoodleft IN x
|
||||
NN he fgoodright VBD x
|
||||
Mr. goodright NNP x
|
||||
JJ of fgoodleft NN x
|
||||
NN so fgoodright JJ x
|
||||
NN y fdeletesuf 1 JJ x
|
||||
VBN which fgoodright VBD x
|
||||
VBD been fgoodright VBN x
|
||||
VB a fgoodright NN x
|
||||
NN economic fgoodleft JJ x
|
||||
9 char CD x
|
||||
CD t fchar JJ x
|
||||
NN can fgoodright VB x
|
||||
VB the fgoodright NN x
|
||||
JJ S-T-A-R-T fgoodright VBN x
|
||||
VBN - fchar JJ x
|
||||
NN lar fhassuf 3 JJ x
|
||||
NNP ans fhassuf 3 NNPS x
|
||||
NN men fhassuf 3 NNS x
|
||||
CD d fchar JJ x
|
||||
JJ n fdeletesuf 1 VBN x
|
||||
JJ 's fgoodleft NN x
|
||||
NNS is fhassuf 2 NN x
|
||||
ES hassuf 2 NNS x
|
||||
JJ er fdeletesuf 2 JJR x
|
||||
Inc. goodleft NNP x
|
||||
NN 2 fchar CD x
|
||||
VBD be fgoodleft MD x
|
||||
ons hassuf 3 NNS x
|
||||
RB - fchar JJ x
|
||||
NN very fgoodright JJ x
|
||||
ous hassuf 3 JJ x
|
||||
NN a fdeletepref 1 RB x
|
||||
NNP people fgoodleft JJ x
|
||||
VB have fgoodleft RB x
|
||||
NNS It fgoodright VBZ x
|
||||
NN id fhassuf 2 JJ x
|
||||
JJ may fgoodleft NN x
|
||||
VBN but fgoodright VBD x
|
||||
RS hassuf 2 NNS x
|
||||
JJ stry fhassuf 4 NN x
|
||||
NNS them fgoodleft VBZ x
|
||||
VBZ were fgoodleft NNS x
|
||||
NN ing faddsuf 3 VB x
|
||||
JJ s faddsuf 1 NN x
|
||||
NN 7 fchar CD x
|
||||
NN d faddsuf 1 VB x
|
||||
VB but fgoodleft NN x
|
||||
NN 3 fchar CD x
|
||||
NN est faddsuf 3 JJ x
|
||||
NN en fhassuf 2 VBN x
|
||||
NN costs fgoodright IN x
|
||||
NN 8 fchar CD x
|
||||
VB b fhaspref 1 NN x
|
||||
zes hassuf 3 VBZ x
|
||||
VBN s faddsuf 1 NN x
|
||||
some hassuf 4 JJ x
|
||||
NN ic fhassuf 2 JJ x
|
||||
ly addsuf 2 JJ x
|
||||
ness addsuf 4 JJ x
|
||||
JJS s faddsuf 1 NN x
|
||||
NN ier fhassuf 3 JJR x
|
||||
NN ky fhassuf 2 JJ x
|
||||
tyle hassuf 4 JJ x
|
||||
NNS ates fhassuf 4 VBZ x
|
||||
fy hassuf 2 VB x
|
||||
body addsuf 4 DT x
|
||||
NN ways fgoodleft JJ x
|
||||
NNP ies fhassuf 3 NNPS x
|
||||
VB negative fgoodright NN x
|
||||
ders hassuf 4 NNS x
|
||||
ds hassuf 2 NNS x
|
||||
-day addsuf 4 CD x
|
||||
nian hassuf 4 JJ x
|
||||
JJR s faddsuf 1 NN x
|
||||
ppy hassuf 3 JJ x
|
||||
NN ish fhassuf 3 JJ x
|
||||
tors hassuf 4 NNS x
|
||||
oses hassuf 4 VBZ x
|
||||
NNS oves fhassuf 4 VBZ x
|
||||
VBN un fhaspref 2 JJ x
|
||||
lent hassuf 4 JJ x
|
||||
NN ward fdeletesuf 4 RB x
|
||||
VB k fchar NN x
|
||||
VB r fhassuf 1 NN x
|
||||
VB e fdeletesuf 1 NN x
|
||||
NNS Engelken fgoodright VBZ x
|
||||
NN ient fhassuf 4 JJ x
|
||||
ED hassuf 2 VBD x
|
||||
VBG B fchar NNP x
|
||||
VB le fhassuf 2 NN x
|
||||
ment addsuf 4 VB x
|
||||
ING hassuf 3 NN x
|
||||
JJ ery fhassuf 3 NN x
|
||||
JJ tus fhassuf 3 NN x
|
||||
JJ car fhassuf 3 NN x
|
||||
NN 6 fchar CD x
|
||||
NNS 0 fchar CD x
|
||||
JJ ing fdeletesuf 3 VBG x
|
||||
here hassuf 4 RB x
|
||||
VBN scr fhaspref 3 VBD x
|
||||
uces hassuf 4 VBZ x
|
||||
fies hassuf 4 VBZ x
|
||||
self deletesuf 4 PRP x
|
||||
NNP $ fchar $ x
|
||||
VBN wa fhaspref 2 VBD x
|
||||
2932
backend/venv/Lib/site-packages/textblob/en/en-sentiment.xml
Normal file
2932
backend/venv/Lib/site-packages/textblob/en/en-sentiment.xml
Normal file
File diff suppressed because it is too large
Load Diff
29162
backend/venv/Lib/site-packages/textblob/en/en-spelling.txt
Normal file
29162
backend/venv/Lib/site-packages/textblob/en/en-spelling.txt
Normal file
File diff suppressed because it is too large
Load Diff
472
backend/venv/Lib/site-packages/textblob/en/inflect.py
Normal file
472
backend/venv/Lib/site-packages/textblob/en/inflect.py
Normal file
@@ -0,0 +1,472 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
'''The pluralize and singular methods from the pattern library.
|
||||
|
||||
Licenced under the BSD.
|
||||
See here https://github.com/clips/pattern/blob/master/LICENSE.txt for
|
||||
complete license information.
|
||||
'''
|
||||
import re
|
||||
|
||||
VERB, NOUN, ADJECTIVE, ADVERB = "VB", "NN", "JJ", "RB"
|
||||
|
||||
#### PLURALIZE #####################################################################################
|
||||
# Based on "An Algorithmic Approach to English Pluralization" by Damian Conway:
|
||||
# http://www.csse.monash.edu.au/~damian/papers/HTML/Plurals.html
|
||||
|
||||
# Prepositions are used to solve things like
|
||||
# "mother-in-law" or "man at arms"
|
||||
plural_prepositions = [
|
||||
"about", "above", "across", "after", "among", "around", "at", "athwart", "before", "behind",
|
||||
"below", "beneath", "beside", "besides", "between", "betwixt", "beyond", "but", "by", "during",
|
||||
"except", "for", "from", "in", "into", "near", "of", "off", "on", "onto", "out", "over",
|
||||
"since", "till", "to", "under", "until", "unto", "upon", "with"
|
||||
]
|
||||
|
||||
# Inflection rules that are either general,
|
||||
# or apply to a certain category of words,
|
||||
# or apply to a certain category of words only in classical mode,
|
||||
# or apply only in classical mode.
|
||||
# Each rule consists of:
|
||||
# suffix, inflection, category and classic flag.
|
||||
plural_rules = [
|
||||
# 0) Indefinite articles and demonstratives.
|
||||
[["^a$|^an$", "some", None, False],
|
||||
["^this$", "these", None, False],
|
||||
["^that$", "those", None, False],
|
||||
["^any$", "all", None, False]
|
||||
],
|
||||
# 1) Possessive adjectives.
|
||||
# Overlaps with 1/ for "his" and "its".
|
||||
# Overlaps with 2/ for "her".
|
||||
[["^my$", "our", None, False],
|
||||
["^your$|^thy$", "your", None, False],
|
||||
["^her$|^his$|^its$|^their$", "their", None, False]
|
||||
],
|
||||
# 2) Possessive pronouns.
|
||||
[["^mine$", "ours", None, False],
|
||||
["^yours$|^thine$", "yours", None, False],
|
||||
["^hers$|^his$|^its$|^theirs$", "theirs", None, False]
|
||||
],
|
||||
# 3) Personal pronouns.
|
||||
[["^I$", "we", None, False],
|
||||
["^me$", "us", None, False],
|
||||
["^myself$", "ourselves", None, False],
|
||||
["^you$", "you", None, False],
|
||||
["^thou$|^thee$", "ye", None, False],
|
||||
["^yourself$|^thyself$", "yourself", None, False],
|
||||
["^she$|^he$|^it$|^they$", "they", None, False],
|
||||
["^her$|^him$|^it$|^them$", "them", None, False],
|
||||
["^herself$|^himself$|^itself$|^themself$", "themselves", None, False],
|
||||
["^oneself$", "oneselves", None, False]
|
||||
],
|
||||
# 4) Words that do not inflect.
|
||||
[["$", "", "uninflected", False],
|
||||
["$", "", "uncountable", False],
|
||||
["fish$", "fish", None, False],
|
||||
["([- ])bass$", "\\1bass", None, False],
|
||||
["ois$", "ois", None, False],
|
||||
["sheep$", "sheep", None, False],
|
||||
["deer$", "deer", None, False],
|
||||
["pox$", "pox", None, False],
|
||||
["([A-Z].*)ese$", "\\1ese", None, False],
|
||||
["itis$", "itis", None, False],
|
||||
["(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$", "\\1ose", None, False]
|
||||
],
|
||||
# 5) Irregular plurals (mongoose, oxen).
|
||||
[["atlas$", "atlantes", None, True],
|
||||
["atlas$", "atlases", None, False],
|
||||
["beef$", "beeves", None, True],
|
||||
["brother$", "brethren", None, True],
|
||||
["child$", "children", None, False],
|
||||
["corpus$", "corpora", None, True],
|
||||
["corpus$", "corpuses", None, False],
|
||||
["^cow$", "kine", None, True],
|
||||
["ephemeris$", "ephemerides", None, False],
|
||||
["ganglion$", "ganglia", None, True],
|
||||
["genie$", "genii", None, True],
|
||||
["genus$", "genera", None, False],
|
||||
["graffito$", "graffiti", None, False],
|
||||
["loaf$", "loaves", None, False],
|
||||
["money$", "monies", None, True],
|
||||
["mongoose$", "mongooses", None, False],
|
||||
["mythos$", "mythoi", None, False],
|
||||
["octopus$", "octopodes", None, True],
|
||||
["opus$", "opera", None, True],
|
||||
["opus$", "opuses", None, False],
|
||||
["^ox$", "oxen", None, False],
|
||||
["penis$", "penes", None, True],
|
||||
["penis$", "penises", None, False],
|
||||
["soliloquy$", "soliloquies", None, False],
|
||||
["testis$", "testes", None, False],
|
||||
["trilby$", "trilbys", None, False],
|
||||
["turf$", "turves", None, True],
|
||||
["numen$", "numena", None, False],
|
||||
["occiput$", "occipita", None, True]
|
||||
],
|
||||
# 6) Irregular inflections for common suffixes (synopses, mice, men).
|
||||
[["man$", "men", None, False],
|
||||
["person$", "people", None, False],
|
||||
["([lm])ouse$", "\\1ice", None, False],
|
||||
["tooth$", "teeth", None, False],
|
||||
["goose$", "geese", None, False],
|
||||
["foot$", "feet", None, False],
|
||||
["zoon$", "zoa", None, False],
|
||||
["([csx])is$", "\\1es", None, False]
|
||||
],
|
||||
# 7) Fully assimilated classical inflections (vertebrae, codices).
|
||||
[["ex$", "ices", "ex-ices", False],
|
||||
["ex$", "ices", "ex-ices-classical", True],
|
||||
["um$", "a", "um-a", False],
|
||||
["um$", "a", "um-a-classical", True],
|
||||
["on$", "a", "on-a", False],
|
||||
["a$", "ae", "a-ae", False],
|
||||
["a$", "ae", "a-ae-classical", True]
|
||||
],
|
||||
# 8) Classical variants of modern inflections (stigmata, soprani).
|
||||
[["trix$", "trices", None, True],
|
||||
["eau$", "eaux", None, True],
|
||||
["ieu$", "ieu", None, True],
|
||||
["([iay])nx$", "\\1nges", None, True],
|
||||
["en$", "ina", "en-ina-classical", True],
|
||||
["a$", "ata", "a-ata-classical", True],
|
||||
["is$", "ides", "is-ides-classical", True],
|
||||
["us$", "i", "us-i-classical", True],
|
||||
["us$", "us", "us-us-classical", True],
|
||||
["o$", "i", "o-i-classical", True],
|
||||
["$", "i", "-i-classical", True],
|
||||
["$", "im", "-im-classical", True]
|
||||
],
|
||||
# 9) -ch, -sh and -ss and the s-singular group take -es in the plural (churches, classes, lenses).
|
||||
[["([cs])h$", "\\1hes", None, False],
|
||||
["ss$", "sses", None, False],
|
||||
["x$", "xes", None, False],
|
||||
["s$", "ses", "s-singular", False]
|
||||
],
|
||||
# 10) Certain words ending in -f or -fe take -ves in the plural (lives, wolves).
|
||||
[["([aeo]l)f$", "\\1ves", None, False],
|
||||
["([^d]ea)f$", "\\1ves", None, False],
|
||||
["arf$", "arves", None, False],
|
||||
["([nlw]i)fe$", "\\1ves", None, False],
|
||||
],
|
||||
# 11) -y takes -ys if preceded by a vowel or when a proper noun,
|
||||
# but -ies if preceded by a consonant (storeys, Marys, stories).
|
||||
[["([aeiou])y$", "\\1ys", None, False],
|
||||
["([A-Z].*)y$", "\\1ys", None, False],
|
||||
["y$", "ies", None, False]
|
||||
],
|
||||
# 12) Some words ending in -o take -os, the rest take -oes.
|
||||
# Words in which the -o is preceded by a vowel always take -os (lassos, potatoes, bamboos).
|
||||
[["o$", "os", "o-os", False],
|
||||
["([aeiou])o$", "\\1os", None, False],
|
||||
["o$", "oes", None, False]
|
||||
],
|
||||
# 13) Miltary stuff (Major Generals).
|
||||
[["l$", "ls", "general-generals", False]
|
||||
],
|
||||
# 14) Otherwise, assume that the plural just adds -s (cats, programmes).
|
||||
[["$", "s", None, False]
|
||||
],
|
||||
]
|
||||
|
||||
# For performance, compile the regular expressions only once:
|
||||
# Compile each rule's suffix regex string in place (rule[0]: str -> re.Pattern)
# at import time, so pluralize() never recompiles patterns per call.
for ruleset in plural_rules:
    for rule in ruleset:
        rule[0] = re.compile(rule[0])
|
||||
|
||||
# Suffix categories.
|
||||
plural_categories = {
|
||||
"uninflected": [
|
||||
"aircraft", "antelope", "bison", "bream", "breeches", "britches", "carp", "cattle", "chassis",
|
||||
"clippers", "cod", "contretemps", "corps", "debris", "diabetes", "djinn", "eland", "elk",
|
||||
"flounder", "gallows", "graffiti", "headquarters", "herpes", "high-jinks", "homework", "innings",
|
||||
"jackanapes", "mackerel", "measles", "mews", "moose", "mumps", "offspring", "news", "pincers",
|
||||
"pliers", "proceedings", "rabies", "salmon", "scissors", "series", "shears", "species", "swine",
|
||||
"trout", "tuna", "whiting", "wildebeest"],
|
||||
"uncountable": [
|
||||
"advice", "bread", "butter", "cannabis", "cheese", "electricity", "equipment", "fruit", "furniture",
|
||||
"garbage", "gravel", "happiness", "information", "ketchup", "knowledge", "love", "luggage",
|
||||
"mathematics", "mayonnaise", "meat", "mustard", "news", "progress", "research", "rice",
|
||||
"sand", "software", "understanding", "water"],
|
||||
"s-singular": [
|
||||
"acropolis", "aegis", "alias", "asbestos", "bathos", "bias", "bus", "caddis", "canvas",
|
||||
"chaos", "christmas", "cosmos", "dais", "digitalis", "epidermis", "ethos", "gas", "glottis",
|
||||
"ibis", "lens", "mantis", "marquis", "metropolis", "pathos", "pelvis", "polis", "rhinoceros",
|
||||
"sassafras", "trellis"],
|
||||
"ex-ices": ["codex", "murex", "silex"],
|
||||
"ex-ices-classical": [
|
||||
"apex", "cortex", "index", "latex", "pontifex", "simplex", "vertex", "vortex"],
|
||||
"um-a": [
|
||||
"agendum", "bacterium", "candelabrum", "datum", "desideratum", "erratum", "extremum",
|
||||
"ovum", "stratum"],
|
||||
"um-a-classical": [
|
||||
"aquarium", "compendium", "consortium", "cranium", "curriculum", "dictum", "emporium",
|
||||
"enconium", "gymnasium", "honorarium", "interregnum", "lustrum", "maximum", "medium",
|
||||
"memorandum", "millenium", "minimum", "momentum", "optimum", "phylum", "quantum", "rostrum",
|
||||
"spectrum", "speculum", "stadium", "trapezium", "ultimatum", "vacuum", "velum"],
|
||||
"on-a": [
|
||||
"aphelion", "asyndeton", "criterion", "hyperbaton", "noumenon", "organon", "perihelion",
|
||||
"phenomenon", "prolegomenon"],
|
||||
"a-ae": ["alga", "alumna", "vertebra"],
|
||||
"a-ae-classical": [
|
||||
"abscissa", "amoeba", "antenna", "aurora", "formula", "hydra", "hyperbola", "lacuna",
|
||||
"medusa", "nebula", "nova", "parabola"],
|
||||
"en-ina-classical": ["foramen", "lumen", "stamen"],
|
||||
"a-ata-classical": [
|
||||
"anathema", "bema", "carcinoma", "charisma", "diploma", "dogma", "drama", "edema", "enema",
|
||||
"enigma", "gumma", "lemma", "lymphoma", "magma", "melisma", "miasma", "oedema", "sarcoma",
|
||||
"schema", "soma", "stigma", "stoma", "trauma"],
|
||||
"is-ides-classical": ["clitoris", "iris"],
|
||||
"us-i-classical": [
|
||||
"focus", "fungus", "genius", "incubus", "nimbus", "nucleolus", "radius", "stylus", "succubus",
|
||||
"torus", "umbilicus", "uterus"],
|
||||
"us-us-classical": [
|
||||
"apparatus", "cantus", "coitus", "hiatus", "impetus", "nexus", "plexus", "prospectus",
|
||||
"sinus", "status"],
|
||||
"o-i-classical": ["alto", "basso", "canto", "contralto", "crescendo", "solo", "soprano", "tempo"],
|
||||
"-i-classical": ["afreet", "afrit", "efreet"],
|
||||
"-im-classical": ["cherub", "goy", "seraph"],
|
||||
"o-os": [
|
||||
"albino", "archipelago", "armadillo", "commando", "ditto", "dynamo", "embryo", "fiasco",
|
||||
"generalissimo", "ghetto", "guano", "inferno", "jumbo", "lingo", "lumbago", "magneto",
|
||||
"manifesto", "medico", "octavo", "photo", "pro", "quarto", "rhino", "stylo"],
|
||||
"general-generals": [
|
||||
"Adjutant", "Brigadier", "Lieutenant", "Major", "Quartermaster",
|
||||
"adjutant", "brigadier", "lieutenant", "major", "quartermaster"],
|
||||
}
|
||||
|
||||
def pluralize(word, pos=NOUN, custom=None, classical=True):
    """Return the plural of a given word, e.g. child -> children.

    Handles nouns and adjectives, using classical inflection by default
    (e.g. where "matrix" pluralizes to "matrices" instead of "matrixes").

    :param word: Word to pluralize.
    :param pos: Part-of-speech tag (``NOUN`` or ``ADJECTIVE``); adjectives
        only apply the first two rule sets.
    :param custom: Optional {singular: plural} user-defined replacements.
    :param classical: Use classical Latin/Greek plurals where available.
    """
    custom = custom if custom is not None else {}
    if word in custom:
        return custom[word]

    # Recursion of genitives.
    # Remove the apostrophe and any trailing -s, form the plural of the
    # resultant noun, and then append an apostrophe (dog's -> dogs').
    if word.endswith("'s") or word.endswith("'"):
        # Slice off the exact suffix. The previous str.rstrip("'s") stripped
        # *characters*, so legitimate trailing s's were eaten too
        # (e.g. "boss'" -> "bo").
        owner = word[:-2] if word.endswith("'s") else word[:-1]
        owners = pluralize(owner, pos, custom, classical)
        return owners + "'" if owners.endswith("s") else owners + "'s"

    # Recursion of compound words
    # (Postmasters General, mothers-in-law, Roman deities).
    words = word.replace("-", " ").split(" ")
    if len(words) > 1:
        # Parenthesized deliberately: without them, "and" bound tighter than
        # "or", so lowercase "general" bypassed the category check and e.g.
        # "major general" became "majors general" instead of "major generals".
        if (words[1] in ("general", "General")
                and words[0] not in plural_categories["general-generals"]):
            return word.replace(words[0], pluralize(words[0], pos, custom, classical))
        elif words[1] in plural_prepositions:
            # "mother-in-law" -> "mothers-in-law": pluralize the head noun.
            return word.replace(words[0], pluralize(words[0], pos, custom, classical))
        else:
            # Otherwise pluralize the last component.
            return word.replace(words[-1], pluralize(words[-1], pos, custom, classical))

    # Only a very few number of adjectives inflect (rule sets 0 and 1).
    n = list(range(len(plural_rules)))
    if pos.startswith(ADJECTIVE):
        n = [0, 1]

    # Apply pluralization rules in priority order; first match wins.
    for i in n:
        for rule in plural_rules[i]:
            suffix, inflection, category, classic = rule
            if category is None:
                # A general rule, or a classic rule in classical mode.
                if not classic or classical:
                    if suffix.search(word) is not None:
                        return suffix.sub(inflection, word)
            else:
                # A rule relating to a specific category of words.
                if word in plural_categories[category] and (not classic or classical):
                    if suffix.search(word) is not None:
                        return suffix.sub(inflection, word)

    # Defensive fallback; the catch-all "$" -> "s" rule normally fires first.
    return word
|
||||
|
||||
#### SINGULARIZE ###################################################################################
|
||||
# Adapted from Bermi Ferrer's Inflector for Python:
|
||||
# http://www.bermi.org/inflector/
|
||||
|
||||
# Copyright (c) 2006 Bermi Ferrer Martinez
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software to deal in this software without restriction, including
|
||||
# without limitation the rights to use, copy, modify, merge, publish,
|
||||
# distribute, sublicense, and/or sell copies of this software, and to permit
|
||||
# persons to whom this software is furnished to do so, subject to the following
|
||||
# condition:
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THIS SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THIS SOFTWARE.
|
||||
|
||||
singular_rules = [
|
||||
['(?i)(.)ae$', '\\1a'],
|
||||
['(?i)(.)itis$', '\\1itis'],
|
||||
['(?i)(.)eaux$', '\\1eau'],
|
||||
['(?i)(quiz)zes$', '\\1'],
|
||||
['(?i)(matr)ices$', '\\1ix'],
|
||||
['(?i)(ap|vert|ind)ices$', '\\1ex'],
|
||||
['(?i)^(ox)en', '\\1'],
|
||||
['(?i)(alias|status)es$', '\\1'],
|
||||
['(?i)([octop|vir])i$', '\\1us'],
|
||||
['(?i)(cris|ax|test)es$', '\\1is'],
|
||||
['(?i)(shoe)s$', '\\1'],
|
||||
['(?i)(o)es$', '\\1'],
|
||||
['(?i)(bus)es$', '\\1'],
|
||||
['(?i)([m|l])ice$', '\\1ouse'],
|
||||
['(?i)(x|ch|ss|sh)es$', '\\1'],
|
||||
['(?i)(m)ovies$', '\\1ovie'],
|
||||
['(?i)(.)ombies$', '\\1ombie'],
|
||||
['(?i)(s)eries$', '\\1eries'],
|
||||
['(?i)([^aeiouy]|qu)ies$', '\\1y'],
|
||||
# Certain words ending in -f or -fe take -ves in the plural (lives, wolves).
|
||||
["([aeo]l)ves$", "\\1f"],
|
||||
["([^d]ea)ves$", "\\1f"],
|
||||
["arves$", "arf"],
|
||||
["erves$", "erve"],
|
||||
["([nlw]i)ves$", "\\1fe"],
|
||||
['(?i)([lr])ves$', '\\1f'],
|
||||
["([aeo])ves$", "\\1ve"],
|
||||
['(?i)(sive)s$', '\\1'],
|
||||
['(?i)(tive)s$', '\\1'],
|
||||
['(?i)(hive)s$', '\\1'],
|
||||
['(?i)([^f])ves$', '\\1fe'],
|
||||
# -es suffix.
|
||||
['(?i)(^analy)ses$', '\\1sis'],
|
||||
['(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$', '\\1\\2sis'],
|
||||
['(?i)(.)opses$', '\\1opsis'],
|
||||
['(?i)(.)yses$', '\\1ysis'],
|
||||
['(?i)(h|d|r|o|n|b|cl|p)oses$', '\\1ose'],
|
||||
['(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$', '\\1ose'],
|
||||
['(?i)(.)oses$', '\\1osis'],
|
||||
# -a
|
||||
['(?i)([ti])a$', '\\1um'],
|
||||
['(?i)(n)ews$', '\\1ews'],
|
||||
['(?i)s$', ''],
|
||||
]
|
||||
|
||||
# For performance, compile the regular expressions only once:
|
||||
# Compile each rule's suffix regex string in place (rule[0]: str -> re.Pattern)
# at import time, so singularize() never recompiles patterns per call.
for rule in singular_rules:
    rule[0] = re.compile(rule[0])
|
||||
|
||||
singular_uninflected = [
|
||||
"aircraft", "antelope", "bison", "bream", "breeches", "britches", "carp", "cattle", "chassis",
|
||||
"clippers", "cod", "contretemps", "corps", "debris", "diabetes", "djinn", "eland",
|
||||
"elk", "flounder", "gallows", "georgia", "graffiti", "headquarters", "herpes", "high-jinks",
|
||||
"homework", "innings", "jackanapes", "mackerel", "measles", "mews", "moose", "mumps", "news",
|
||||
"offspring", "pincers", "pliers", "proceedings", "rabies", "salmon", "scissors", "series",
|
||||
"shears", "species", "swine", "swiss", "trout", "tuna", "whiting", "wildebeest"
|
||||
]
|
||||
singular_uncountable = [
|
||||
"advice", "bread", "butter", "cannabis", "cheese", "electricity", "equipment", "fruit", "furniture",
|
||||
"garbage", "gravel", "happiness", "information", "ketchup", "knowledge", "love", "luggage",
|
||||
"mathematics", "mayonnaise", "meat", "mustard", "news", "progress", "research", "rice", "sand",
|
||||
"software", "understanding", "water"
|
||||
]
|
||||
singular_ie = [
|
||||
"algerie", "auntie", "beanie", "birdie", "bogie", "bombie", "bookie", "collie", "cookie", "cutie",
|
||||
"doggie", "eyrie", "freebie", "goonie", "groupie", "hankie", "hippie", "hoagie", "hottie",
|
||||
"indie", "junkie", "laddie", "laramie", "lingerie", "meanie", "nightie", "oldie", "^pie",
|
||||
"pixie", "quickie", "reverie", "rookie", "softie", "sortie", "stoolie", "sweetie", "techie",
|
||||
"^tie", "toughie", "valkyrie", "veggie", "weenie", "yuppie", "zombie"
|
||||
]
|
||||
singular_s = plural_categories['s-singular']
|
||||
|
||||
# key plural, value singular
|
||||
singular_irregular = {
|
||||
"men": "man",
|
||||
"people": "person",
|
||||
"children": "child",
|
||||
"sexes": "sex",
|
||||
"axes": "axe",
|
||||
"moves": "move",
|
||||
"teeth": "tooth",
|
||||
"geese": "goose",
|
||||
"feet": "foot",
|
||||
"zoa": "zoon",
|
||||
"atlantes": "atlas",
|
||||
"atlases": "atlas",
|
||||
"beeves": "beef",
|
||||
"brethren": "brother",
|
||||
"children": "child",
|
||||
"corpora": "corpus",
|
||||
"corpuses": "corpus",
|
||||
"kine": "cow",
|
||||
"ephemerides": "ephemeris",
|
||||
"ganglia": "ganglion",
|
||||
"genii": "genie",
|
||||
"genera": "genus",
|
||||
"graffiti": "graffito",
|
||||
"helves": "helve",
|
||||
"leaves": "leaf",
|
||||
"loaves": "loaf",
|
||||
"monies": "money",
|
||||
"mongooses": "mongoose",
|
||||
"mythoi": "mythos",
|
||||
"octopodes": "octopus",
|
||||
"opera": "opus",
|
||||
"opuses": "opus",
|
||||
"oxen": "ox",
|
||||
"penes": "penis",
|
||||
"penises": "penis",
|
||||
"soliloquies": "soliloquy",
|
||||
"testes": "testis",
|
||||
"trilbys": "trilby",
|
||||
"turves": "turf",
|
||||
"numena": "numen",
|
||||
"occipita": "occiput",
|
||||
"our": "my",
|
||||
}
|
||||
|
||||
def singularize(word, pos=NOUN, custom={}):
    """Return the singular form of *word* (e.g. "children" -> "child").

    :param word: The (possibly plural) word to singularize.
    :param pos: Part-of-speech tag; only forwarded when recursing into
        compound words.
    :param custom: User-defined {plural: singular} overrides. NOTE: the
        shared mutable default is only read here, never written.
    """

    if word in list(custom.keys()):
        return custom[word]

    # Recursion of compound words (e.g. mothers-in-law).
    if "-" in word:
        words = word.split("-")
        if len(words) > 1 and words[1] in plural_prepositions:
            return singularize(words[0], pos, custom)+"-"+"-".join(words[1:])
    # dogs' => dog's
    if word.endswith("'"):
        return singularize(word[:-1]) + "'s"

    lower = word.lower()
    # Words that keep the same form in the singular.
    # NOTE(review): the test is w.endswith(lower), i.e. it fires when a
    # *listed* word ends with the input (so "craft" matches "aircraft");
    # lower.endswith(w) may have been intended — confirm before changing.
    for w in singular_uninflected:
        if w.endswith(lower):
            return word
    for w in singular_uncountable:
        if w.endswith(lower):
            return word
    # -ies/-s plurals whose singular already ends in -ie ("cookies" -> "cookie").
    for w in singular_ie:
        if lower.endswith(w+"s"):
            return w
    # Singulars that already end in -s and take -es plurals ("buses" -> "bus").
    for w in singular_s:
        if lower.endswith(w + 'es'):
            return w
    # Irregular plurals, matched case-insensitively at the end of the word,
    # so compounds such as "grandchildren" also resolve.
    for w in list(singular_irregular.keys()):
        if lower.endswith(w):
            return re.sub('(?i)'+w+'$', singular_irregular[w], word)

    # General suffix rules, applied in priority order; first match wins.
    for rule in singular_rules:
        suffix, inflection = rule
        match = suffix.search(word)
        if match:
            groups = match.groups()
            # Remove backreferences to groups that did not participate in
            # the match, so re.sub does not choke on unmatched groups.
            for k in range(0, len(groups)):
                if groups[k] == None:
                    inflection = inflection.replace('\\'+str(k+1), '')
            return suffix.sub(inflection, word)

    # No rule applied: return the word unchanged.
    return word
|
||||
204
backend/venv/Lib/site-packages/textblob/en/np_extractors.py
Normal file
204
backend/venv/Lib/site-packages/textblob/en/np_extractors.py
Normal file
@@ -0,0 +1,204 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
'''Various noun phrase extractors.'''
|
||||
from __future__ import unicode_literals, absolute_import
|
||||
|
||||
import nltk
|
||||
|
||||
from textblob.taggers import PatternTagger
|
||||
from textblob.decorators import requires_nltk_corpus
|
||||
from textblob.utils import tree2str, filter_insignificant
|
||||
from textblob.base import BaseNPExtractor
|
||||
|
||||
|
||||
class ChunkParser(nltk.ChunkParserI):
    '''Noun-phrase chunker backed by a bigram tagger over chunk tags.'''

    def __init__(self):
        self._trained = False

    @requires_nltk_corpus
    def train(self):
        '''Train the Chunker on the ConLL-2000 corpus.'''
        corpus = nltk.corpus.conll2000.chunked_sents('train.txt',
                                                     chunk_types=['NP'])
        # Training examples are (POS tag, chunk tag) pairs; words are ignored.
        train_data = [
            [(tag, chunk) for _, tag, chunk in nltk.chunk.tree2conlltags(sent)]
            for sent in corpus
        ]
        unigram = nltk.UnigramTagger(train_data)
        self.tagger = nltk.BigramTagger(train_data, backoff=unigram)
        self._trained = True

    def parse(self, sentence):
        '''Return the parse tree for the sentence.'''
        if not self._trained:
            self.train()
        # Re-tag the sentence's POS tags with chunk tags, then rebuild a tree.
        pos_tags = [tag for (_, tag) in sentence]
        retagged = self.tagger.tag(pos_tags)
        chunk_tags = [chunk for (_, chunk) in retagged]
        conll = [(word, tag, chunk)
                 for ((word, tag), chunk) in zip(sentence, chunk_tags)]
        return nltk.chunk.util.conlltags2tree(conll)
|
||||
|
||||
|
||||
class ConllExtractor(BaseNPExtractor):

    '''A noun phrase extractor that uses chunk parsing trained with the
    ConLL-2000 training corpus.
    '''

    POS_TAGGER = PatternTagger()

    # The context-free grammar with which to filter the noun phrases
    CFG = {
        ('NNP', 'NNP'): 'NNP',
        ('NN', 'NN'): 'NNI',
        ('NNI', 'NN'): 'NNI',
        ('JJ', 'JJ'): 'JJ',
        ('JJ', 'NN'): 'NNI',
    }

    # POS suffixes that will be ignored
    INSIGNIFICANT_SUFFIXES = ['DT', 'CC', 'PRP$', 'PRP']

    def __init__(self, parser=None):
        self.parser = parser if parser else ChunkParser()

    def extract(self, text):
        '''Return a list of noun phrases (strings) for body of text.'''
        noun_phrases = []
        for sentence in nltk.tokenize.sent_tokenize(text):
            parsed = self._parse_sentence(sentence)
            # Keep only non-empty NP subtrees that match the grammar.
            for subtree in parsed:
                if not isinstance(subtree, nltk.tree.Tree):
                    continue
                if subtree.label() != 'NP':
                    continue
                if len(filter_insignificant(subtree)) < 1:
                    continue
                if not _is_match(subtree, cfg=self.CFG):
                    continue
                phrase = _normalize_tags(
                    filter_insignificant(subtree, self.INSIGNIFICANT_SUFFIXES))
                noun_phrases.append(tree2str(phrase))
        return noun_phrases

    def _parse_sentence(self, sentence):
        '''Tag and parse a sentence (a plain, untagged string).'''
        return self.parser.parse(self.POS_TAGGER.tag(sentence))
|
||||
|
||||
|
||||
class FastNPExtractor(BaseNPExtractor):

    '''A fast and simple noun phrase extractor.

    Credit to Shlomi Babluk. Link to original blog post:

    http://thetokenizer.com/2013/05/09/efficient-way-to-extract-the-main-topics-of-a-sentence/
    '''

    # Merge grammar: adjacent (left tag, right tag) pairs collapse into the
    # mapped tag until no pair matches.
    CFG = {
        ('NNP', 'NNP'): 'NNP',
        ('NN', 'NN'): 'NNI',
        ('NNI', 'NN'): 'NNI',
        ('JJ', 'JJ'): 'JJ',
        ('JJ', 'NN'): 'NNI',
    }

    def __init__(self):
        # Training is deferred until the first extract() call.
        self._trained = False

    @requires_nltk_corpus
    def train(self):
        '''Lazily build the tagger chain: bigram -> unigram -> regexp.'''
        train_data = nltk.corpus.brown.tagged_sents(categories='news')
        # Regexp fallback tagger; patterns are tried in order, last one
        # ('.*' -> NN) always matches.
        # NOTE(review): in '^-?[0-9]+(.[0-9]+)?$' the '.' is unescaped, and
        # r'\'*$' matches the empty suffix (tagging everything MD before the
        # later rules) — presumably harmless behind the trained taggers, but
        # looks unintended; confirm before changing.
        regexp_tagger = nltk.RegexpTagger([
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
            (r'(-|:|;)$', ':'),
            (r'\'*$', 'MD'),
            (r'(The|the|A|a|An|an)$', 'AT'),
            (r'.*able$', 'JJ'),
            (r'^[A-Z].*$', 'NNP'),
            (r'.*ness$', 'NN'),
            (r'.*ly$', 'RB'),
            (r'.*s$', 'NNS'),
            (r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'),
            (r'.*', 'NN'),
        ])
        unigram_tagger = nltk.UnigramTagger(train_data, backoff=regexp_tagger)
        self.tagger = nltk.BigramTagger(train_data, backoff=unigram_tagger)
        self._trained = True
        return None


    def _tokenize_sentence(self, sentence):
        '''Split the sentence into single words/tokens'''
        tokens = nltk.word_tokenize(sentence)
        return tokens

    def extract(self, sentence):
        '''Return a list of noun phrases (strings) for body of text.'''
        if not self._trained:
            self.train()
        tokens = self._tokenize_sentence(sentence)
        tagged = self.tagger.tag(tokens)
        tags = _normalize_tags(tagged)
        # Repeatedly merge the leftmost adjacent pair allowed by CFG,
        # restarting the scan after every merge, until a fixed point.
        merge = True
        while merge:
            merge = False
            for x in range(0, len(tags) - 1):
                t1 = tags[x]
                t2 = tags[x + 1]
                key = t1[1], t2[1]
                value = self.CFG.get(key, '')
                if value:
                    merge = True
                    # Replace the pair with one merged (text, tag) token.
                    tags.pop(x)
                    tags.pop(x)
                    match = '%s %s' % (t1[0], t2[0])
                    pos = value
                    tags.insert(x, (match, pos))
                    break

        # Only merged/plain noun chunks survive as noun phrases.
        matches = [t[0] for t in tags if t[1] in ['NNP', 'NNI']]
        return matches
|
||||
|
||||
|
||||
### Utility methods ###
|
||||
|
||||
def _normalize_tags(chunk):
|
||||
'''Normalize the corpus tags.
|
||||
("NN", "NN-PL", "NNS") -> "NN"
|
||||
'''
|
||||
ret = []
|
||||
for word, tag in chunk:
|
||||
if tag == 'NP-TL' or tag == 'NP':
|
||||
ret.append((word, 'NNP'))
|
||||
continue
|
||||
if tag.endswith('-TL'):
|
||||
ret.append((word, tag[:-3]))
|
||||
continue
|
||||
if tag.endswith('S'):
|
||||
ret.append((word, tag[:-1]))
|
||||
continue
|
||||
ret.append((word, tag))
|
||||
return ret
|
||||
|
||||
|
||||
def _is_match(tagged_phrase, cfg):
|
||||
'''Return whether or not a tagged phrases matches a context-free grammar.
|
||||
'''
|
||||
copy = list(tagged_phrase) # A copy of the list
|
||||
merge = True
|
||||
while merge:
|
||||
merge = False
|
||||
for i in range(len(copy) - 1):
|
||||
first, second = copy[i], copy[i + 1]
|
||||
key = first[1], second[1] # Tuple of tags e.g. ('NN', 'JJ')
|
||||
value = cfg.get(key, None)
|
||||
if value:
|
||||
merge = True
|
||||
copy.pop(i)
|
||||
copy.pop(i)
|
||||
match = '{0} {1}'.format(first[0], second[0])
|
||||
pos = value
|
||||
copy.insert(i, (match, pos))
|
||||
break
|
||||
match = any([t[1] in ('NNP', 'NNI') for t in copy])
|
||||
return match
|
||||
18
backend/venv/Lib/site-packages/textblob/en/parsers.py
Normal file
18
backend/venv/Lib/site-packages/textblob/en/parsers.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Various parser implementations.
|
||||
|
||||
.. versionadded:: 0.6.0
|
||||
"""
|
||||
from __future__ import absolute_import
|
||||
from textblob.en import parse as pattern_parse
|
||||
from textblob.base import BaseParser
|
||||
|
||||
|
||||
class PatternParser(BaseParser):
    """Parser that uses the implementation in Tom de Smedt's pattern library.
    http://www.clips.ua.ac.be/pages/pattern-en#parser
    """

    def parse(self, text):
        """Parse *text* by delegating to ``textblob.en.parse``.

        :param text: Plain string to parse.
        :returns: The tagged/chunked string produced by the pattern parser.
        """
        return pattern_parse(text)
|
||||
97
backend/venv/Lib/site-packages/textblob/en/sentiments.py
Normal file
97
backend/venv/Lib/site-packages/textblob/en/sentiments.py
Normal file
@@ -0,0 +1,97 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Sentiment analysis implementations.
|
||||
|
||||
.. versionadded:: 0.5.0
|
||||
"""
|
||||
from __future__ import absolute_import
|
||||
from collections import namedtuple
|
||||
|
||||
import nltk
|
||||
|
||||
from textblob.en import sentiment as pattern_sentiment
|
||||
from textblob.tokenizers import word_tokenize
|
||||
from textblob.decorators import requires_nltk_corpus
|
||||
from textblob.base import BaseSentimentAnalyzer, DISCRETE, CONTINUOUS
|
||||
|
||||
|
||||
class PatternAnalyzer(BaseSentimentAnalyzer):
    """Sentiment analyzer that uses the same implementation as the
    pattern library. Returns results as a named tuple of the form:

    ``Sentiment(polarity, subjectivity, [assessments])``

    where [assessments] is a list of the assessed tokens and their
    polarity and subjectivity scores
    """
    kind = CONTINUOUS
    # This is only here for backwards-compatibility.
    # The return type is actually determined upon calling analyze()
    RETURN_TYPE = namedtuple('Sentiment', ['polarity', 'subjectivity'])

    def analyze(self, text, keep_assessments=False):
        """Return the sentiment as a named tuple of the form:
        ``Sentiment(polarity, subjectivity, [assessments])``.

        :param text: Text to analyze.
        :param bool keep_assessments: If True, also return the per-token
            polarity/subjectivity assessments.
        """
        # Run the pattern sentiment analysis exactly once; the previous
        # implementation invoked pattern_sentiment(text) twice when
        # keep_assessments was True, doubling the work for the same result.
        score = pattern_sentiment(text)
        if keep_assessments:
            Sentiment = namedtuple('Sentiment',
                                   ['polarity', 'subjectivity', 'assessments'])
            polarity, subjectivity = score
            return Sentiment(polarity, subjectivity, score.assessments)
        else:
            Sentiment = namedtuple('Sentiment', ['polarity', 'subjectivity'])
            return Sentiment(*score)
|
||||
|
||||
|
||||
def _default_feature_extractor(words):
|
||||
"""Default feature extractor for the NaiveBayesAnalyzer."""
|
||||
return dict(((word, True) for word in words))
|
||||
|
||||
|
||||
class NaiveBayesAnalyzer(BaseSentimentAnalyzer):
    """Naive Bayes analyzer that is trained on a dataset of movie reviews.
    Returns results as a named tuple of the form:
    ``Sentiment(classification, p_pos, p_neg)``

    :param callable feature_extractor: Function that returns a dictionary of
        features, given a list of words.
    """

    kind = DISCRETE
    #: Return type declaration
    RETURN_TYPE = namedtuple('Sentiment', ['classification', 'p_pos', 'p_neg'])

    def __init__(self, feature_extractor=_default_feature_extractor):
        super(NaiveBayesAnalyzer, self).__init__()
        # Classifier is built lazily by train() on first use.
        self._classifier = None
        self.feature_extractor = feature_extractor

    @requires_nltk_corpus
    def train(self):
        """Train the Naive Bayes classifier on the movie review corpus."""
        super(NaiveBayesAnalyzer, self).train()
        reviews = nltk.corpus.movie_reviews
        # Label every review's feature dict with its corpus category,
        # negatives first, then positives.
        labeled_feats = []
        for label in ('neg', 'pos'):
            for fileid in reviews.fileids(label):
                feats = self.feature_extractor(reviews.words(fileids=[fileid]))
                labeled_feats.append((feats, label))
        self._classifier = nltk.classify.NaiveBayesClassifier.train(labeled_feats)

    def analyze(self, text):
        """Return the sentiment as a named tuple of the form:
        ``Sentiment(classification, p_pos, p_neg)``
        """
        # Lazily train the classifier
        super(NaiveBayesAnalyzer, self).analyze(text)
        # Lowercase tokens of length >= 3, punctuation excluded.
        words = (token.lower()
                 for token in word_tokenize(text, include_punc=False)
                 if len(token) >= 3)
        feats = self.feature_extractor(words)
        prob_dist = self._classifier.prob_classify(feats)
        return self.RETURN_TYPE(
            classification=prob_dist.max(),
            p_pos=prob_dist.prob('pos'),
            p_neg=prob_dist.prob("neg"),
        )
|
||||
38
backend/venv/Lib/site-packages/textblob/en/taggers.py
Normal file
38
backend/venv/Lib/site-packages/textblob/en/taggers.py
Normal file
@@ -0,0 +1,38 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Parts-of-speech tagger implementations."""
|
||||
from __future__ import absolute_import
|
||||
|
||||
import nltk
|
||||
import textblob.compat
|
||||
|
||||
import textblob as tb
|
||||
from textblob.en import tag as pattern_tag
|
||||
from textblob.decorators import requires_nltk_corpus
|
||||
from textblob.base import BaseTagger
|
||||
|
||||
|
||||
class PatternTagger(BaseTagger):
    """Tagger that uses the implementation in
    Tom de Smedt's pattern library
    (http://www.clips.ua.ac.be/pattern).
    """

    def tag(self, text, tokenize=True):
        """Tag a string or BaseBlob."""
        # Blob-like inputs expose their underlying string via ``raw``;
        # plain (unicode) strings are used as-is.
        if isinstance(text, textblob.compat.text_type):
            raw = text
        else:
            raw = text.raw
        return pattern_tag(raw, tokenize)
|
||||
|
||||
|
||||
class NLTKTagger(BaseTagger):
    """Tagger that uses NLTK's standard TreeBank tagger.
    NOTE: Requires numpy. Not yet supported with PyPy.
    """

    @requires_nltk_corpus
    def tag(self, text):
        """Tag a string or BaseBlob."""
        # Wrap plain strings in a TextBlob so we can use its tokenization;
        # blob-like inputs already provide ``.tokens``.
        blob = tb.TextBlob(text) if isinstance(text, textblob.compat.text_type) else text
        return nltk.tag.pos_tag(blob.tokens)
|
||||
Reference in New Issue
Block a user