Initial commit

This commit is contained in:
2026-02-01 09:31:38 +01:00
commit e02db93960
4396 changed files with 1511612 additions and 0 deletions

View File

@@ -0,0 +1,139 @@
# -*- coding: utf-8 -*-
'''This file is based on pattern.en. See the bundled NOTICE file for
license information.
'''
from __future__ import absolute_import
import os
from textblob._text import (Parser as _Parser, Sentiment as _Sentiment, Lexicon,
WORD, POS, CHUNK, PNP, PENN, UNIVERSAL, Spelling)
from textblob.compat import text_type, unicode
# Directory containing the bundled data files (lexicon, rules, entities).
try:
    MODULE = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is undefined in some embedded/frozen interpreters; fall back
    # to the current working directory. (A bare "except:" here would also
    # swallow SystemExit and KeyboardInterrupt.)
    MODULE = ""

# Spell checker backed by the bundled word list.
spelling = Spelling(
    path = os.path.join(MODULE, "en-spelling.txt")
)
#--- ENGLISH PARSER --------------------------------------------------------------------------------
def find_lemmata(tokens):
    """ Appends a lowercase lemma to each [word, part-of-speech] token in place.
        Plural nouns are singularized (cats => cat) and conjugated verbs are
        reduced to their infinitive (sat => sit); other words keep their form.
    """
    for t in tokens:
        w, tag = t[0], t[1]
        if tag == "NNS":
            base = singularize(w)
        elif tag.startswith(("VB", "MD")):
            base = conjugate(w, INFINITIVE) or w
        else:
            base = w
        t.append(base.lower())
    return tokens
class Parser(_Parser):
    """ English parser; plugs lemmatization and tagset mapping into _Parser. """

    def find_lemmata(self, tokens, **kwargs):
        # Delegate to the module-level helper.
        return find_lemmata(tokens)

    def find_tags(self, tokens, **kwargs):
        # Penn Treebank tags (the default) pass through unchanged;
        # the universal tagset is mapped per token.
        tagset = kwargs.get("tagset")
        if tagset in (PENN, None):
            kwargs.setdefault("map", lambda token, tag: (token, tag))
        if tagset == UNIVERSAL:
            kwargs.setdefault("map", lambda token, tag: penntreebank2universal(token, tag))
        return _Parser.find_tags(self, tokens, **kwargs)
class Sentiment(_Sentiment):

    def load(self, path=None):
        """ Loads the sentiment lexicon, then derives -ly adverb entries from
            the adjective scores when the default lexicon is used.
        """
        _Sentiment.load(self, path)
        # Map "terrible" to adverb "terribly" (+1% accuracy)
        if not path:
            # Iterate over a snapshot, since annotate() mutates the dict.
            for w, pos in list(dict.items(self)):
                if "JJ" in pos:
                    if w.endswith("y"):
                        w = w[:-1] + "i"  # happy => happi(ly)
                    if w.endswith("le"):
                        w = w[:-2]        # terrible => terrib(ly)
                    # p, s, i = polarity, subjectivity, intensity of the adjective.
                    p, s, i = pos["JJ"]
                    self.annotate(w + "ly", "RB", p, s, i)
# Word lexicon with part-of-speech tags, plus morphological rules, contextual
# rules and named entities, all loaded from the bundled data files.
lexicon = Lexicon(
    path = os.path.join(MODULE, "en-lexicon.txt"),
    morphology = os.path.join(MODULE, "en-morphology.txt"),
    context = os.path.join(MODULE, "en-context.txt"),
    entities = os.path.join(MODULE, "en-entities.txt"),
    language = "en"
)
# Shared parser instance used by the module-level functions below.
# default = fallback tags for unknown words — presumably (word, capitalized
# word, number); confirm against the _Parser implementation.
parser = Parser(
    lexicon = lexicon,
    default = ("NN", "NNP", "CD"),
    language = "en"
)
# Sentiment analyzer: -ly adverbs act as modifiers of following words,
# and the listed negation words flip polarity.
sentiment = Sentiment(
    path = os.path.join(MODULE, "en-sentiment.xml"),
    synset = "wordnet_id",
    negations = ("no", "not", "n't", "never"),
    modifiers = ("RB",),
    modifier = lambda w: w.endswith("ly"),
    tokenizer = parser.find_tokens,
    language = "en"
)
def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split from words.
    """
    text = text_type(s)
    return parser.find_tokens(text, *args, **kwargs)
def parse(s, *args, **kwargs):
    """ Returns a tagged Unicode string.
    """
    text = unicode(s)
    return parser.parse(text, *args, **kwargs)
def parsetree(s, *args, **kwargs):
    """ Returns a parsed Text from the given string.
    """
    tagged = parse(unicode(s), *args, **kwargs)
    return Text(tagged)
def split(s, token=None):
    """ Returns a parsed Text from the given parsed string.

        token defaults to the standard [WORD, POS, CHUNK, PNP] format.
    """
    # Avoid a mutable default argument; None selects the standard format.
    if token is None:
        token = [WORD, POS, CHUNK, PNP]
    return Text(text_type(s), token)
def tag(s, tokenize=True, encoding="utf-8"):
    """ Returns a list of (token, tag)-tuples from the given string.
    """
    # Parse with tagging only (no chunking, relations or lemmata),
    # then flatten the sentences into one token/tag list.
    tagged = parse(s, tokenize, True, False, False, False, encoding)
    return [(t[0], t[1]) for sentence in tagged.split() for t in sentence]
def suggest(w):
    """ Returns a list of (word, confidence)-tuples of spelling corrections.
    """
    corrections = spelling.suggest(w)
    return corrections
def polarity(s, **kwargs):
    """ Returns the sentence polarity (positive/negative) between -1.0 and 1.0.
    """
    scores = sentiment(unicode(s), **kwargs)
    return scores[0]
def subjectivity(s, **kwargs):
    """ Returns the sentence subjectivity (objective/subjective) between 0.0 and 1.0.
    """
    scores = sentiment(unicode(s), **kwargs)
    return scores[1]
def positive(s, threshold=0.1, **kwargs):
    """ Returns True if the given sentence has a positive sentiment (polarity >= threshold).
    """
    p = polarity(unicode(s), **kwargs)
    return p >= threshold

View File

@@ -0,0 +1,294 @@
;;;
;;; The contextual rules are based on Brill's rule based tagger v1.14,
;;; trained on Brown corpus and Penn Treebank.
;;;
IN VB PREVTAG PRP
NN VB PREVTAG TO
VBP VB PREV1OR2OR3TAG MD
NN VB PREV1OR2TAG MD
VB NN PREV1OR2TAG DT
VBD VBN PREV1OR2OR3TAG VBZ
VBN VBD PREVTAG PRP
VBN VBD PREVTAG NNP
VBD VBN PREVTAG VBD
VBP VB PREVTAG TO
POS VBZ PREVTAG PRP
VB VBP PREVTAG NNS
IN RB WDAND2AFT as as
VBD VBN PREV1OR2WD have
IN WDT NEXT1OR2TAG VB
VB VBP PREVTAG PRP
VBP VB PREV1OR2WD n't
IN WDT NEXTTAG VBZ
JJ NNP NEXTTAG NNP
IN WDT NEXTTAG VBD
JJ NN NEXTWD of
VBD VBN PREV1OR2WD be
JJR RBR NEXTTAG JJ
IN WDT NEXTTAG VBP
JJS RBS WDNEXTTAG most JJ
VBN VBD SURROUNDTAG NN DT
NNS VBZ PREVTAG PRP
POS VBZ NEXT1OR2TAG DT
NNP NN SURROUNDTAG STAART NNS
VBD VBN NEXTWD by
VB NN PREV1OR2TAG IN
VB VBP PREVTAG WDT
VBG NN PREVTAG JJ
NNS VBZ NEXTTAG DT
VBN VBD PREVTAG WP
NN VBP PREVTAG NNS
VB NN PREVTAG NN
NN VB PREVWD n't
NN VBG NEXTTAG DT
RB JJ NEXTTAG NN
NN VBP PREVTAG PRP
VBN VBD SURROUNDTAG NNS DT
VB NN PREV1OR2TAG POS
JJ NN NEXTTAG VBD
RB RP WDNEXTTAG up DT
JJ VB PREVTAG TO
VBN VBD SURROUNDTAG , DT
VBN VBD PREVWD that
VB VBP PREVBIGRAM NNS RB
NNP JJ SURROUNDTAG STAART NN
VB VBN PREVTAG VBZ
NNP JJ WDNEXTTAG American NNS
JJ RB NEXTTAG JJR
NNS NN CURWD yen
IN WDT NEXTTAG VBD
DT IN WDAND2TAGAFT that NNS
POS VBZ PREVWD that
JJ VB PREVTAG MD
VB NN PREVTAG JJ
JJR RBR NEXTTAG RB
VBD VBN PREV1OR2WD are
NN JJ WDNEXTTAG executive NN
NNP JJ WDNEXTTAG American NN
VBN VBD PREVTAG WDT
VBD VBN PREVBIGRAM VBD RB
JJ NN SURROUNDTAG DT .
NNP JJ NEXTWD German
VBN VB PREVTAG TO
VBN VBD PREVBIGRAM NNP RB
RB IN RBIGRAM up to
VB VBP PREVTAG WP
JJ NN SURROUNDTAG DT IN
IN DT NEXTWD 's
VBD VBN WDNEXTTAG ended NNP
VBD VBN SURROUNDTAG DT NN
NNS NNP NEXTTAG NNP
NN NNP NEXTTAG NNP
VBG NN SURROUNDTAG DT IN
NNP JJ SURROUNDTAG STAART NNS
RB RP WDPREVTAG VB up
VBN VBD PREVBIGRAM PRP RB
JJ RB NEXTTAG VBN
NN VBP PREVTAG RB
NNS VBZ PREVTAG RB
POS VBZ PREVTAG WP
VB VBN PREVWD have
NN PDT WDNEXTTAG half DT
IN WDT NEXTTAG MD
POS VBZ PREVTAG DT
NN NNP CURWD Integrated
POS '' NEXT1OR2TAG ''
VBD VBN PREVTAG IN
JJR RBR NEXT1OR2TAG VBN
JJS RBS WDNEXTTAG most RB
JJ NN SURROUNDTAG JJ IN
VBZ NNS PREVTAG JJ
NNS VBZ WDPREVTAG JJ is
JJ NN NEXTTAG VBZ
VBP NN PREVTAG DT
JJ NN SURROUNDTAG JJ .
NNPS NNP NEXTTAG NNP
WDT DT PREVTAG CC
RB IN WDNEXTTAG so PRP
VBP NN PREVWD earnings
NN VBG PREVWD is
NNS VBZ PREV1OR2WD Mr.
VBZ NNS PREVWD the
RB RP WDPREVTAG VBN up
NNPS NNS PREVTAG STAART
VBN VBD SURROUNDTAG NN JJ
VBP VB PREV2TAG VB
RBR JJR NEXTTAG NNS
JJ NN SURROUNDTAG DT ,
JJ NN SURROUNDTAG IN .
NN VB PREVTAG TO
VB NN PREVTAG VB
NN VBP PREVWD who
RB RP WDPREVTAG VBG up
NN RB WDNEXTTAG right RB
VBZ POS WDPREVTAG NNP 's
JJ RP WDNEXTTAG up NN
VBN VBD SURROUNDTAG NN NN
VBN VBD SURROUNDTAG CC DT
JJ NN NEXTBIGRAM MD VB
JJ RB WDNEXTTAG early IN
JJ VBN SURROUNDTAG STAART IN
IN RB RBIGRAM though ,
VBD VBN PREV1OR2WD been
DT PDT WDNEXTTAG all DT
VBN VBD PREVBIGRAM NN RB
NN VB PREVWD help
VBP VB PREV1OR2WD not
VBP NN PREVTAG JJ
DT WDT PREVTAG NNS
NN VBP PREVTAG WDT
VB RB RBIGRAM close to
NNS VBZ PREVBIGRAM , WDT
IN RP WDNEXTTAG out DT
DT RB NEXTWD longer
IN JJ SURROUNDTAG DT NN
DT WDT SURROUNDTAG NN VBZ
IN VB NEXT2TAG VB
IN NN PREVTAG DT
VBN VBD SURROUNDTAG NNS NNS
IN RB RBIGRAM about $
EX RB NEXT1OR2TAG IN
NN VBG NEXTTAG PRP$
NN VBG CURWD living
VBZ NNS PREVTAG PRP$
RBR JJR NEXTTAG NN
RBR JJR CURWD higher
VB VBP PREVBIGRAM PRP RB
NN VB PREVTAG MD
VB NN PREV1OR2TAG PRP$
RP IN PREV1OR2TAG ,
VB JJ PREVTAG DT
DT IN PREVWD out
POS VBZ PREVTAG EX
JJ NN NEXTTAG POS
NN JJ CURWD first
VBD VBN PREVWD the
NNS VBZ WDPREVTAG NNP plans
NNP NNS SURROUNDTAG STAART IN
RB JJ NEXTTAG NNS
JJ RB CURWD just
VBP NN PREVWD sales
NNS NNPS PREVWD Orange
VB VBN PREVTAG VBD
WDT DT PREVTAG IN
NN JJ WDNEXTTAG right NN
NN VBG WDNEXTTAG operating IN
JJ VBN CURWD insured
JJ NNP LBIGRAM STAART U.S.
IN DT NEXTTAG STAART
POS '' PREV1OR2OR3TAG ``
NN JJ WDNEXTTAG official NN
NNP JJ CURWD Irish
JJ RB NEXTTAG RBR
VBG NN WDPREVTAG DT selling
VBP VB PREV1OR2OR3TAG MD
WDT IN NEXTTAG PRP
EX RB NEXTTAG .
VBN VBD SURROUNDTAG NNS PRP$
VBN VBD CURWD said
JJ RB PREVTAG MD
NN VBG NEXTBIGRAM JJ NNS
JJ RB WDNEXTTAG late IN
VBG NN PREVTAG PRP$
VBZ NNS NEXTTAG VBP
NN NNP WDPREVTAG DT CD
NN VBN PREVWD be
JJS RBS NEXTTAG VBN
VBN VBD SURROUNDTAG NN PRP$
VBN VBD SURROUNDTAG NNS JJ
VBN VBD SURROUNDTAG NNS NN
VBD VBN WDNEXTTAG increased NN
VBZ NNS NEXTWD of
IN RP WDAND2TAGAFT out NNS
JJ NNP NEXTTAG POS
RB RP WDNEXTTAG down DT
CD NNS CURWD 1970s
VBG NNP CURWD Working
VBN VB PREVTAG MD
JJ NN NEXTBIGRAM CC NN
NN JJ SURROUNDTAG STAART NNS
VBN VBD PREVBIGRAM , CC
IN RB NEXTBIGRAM . STAART
NN VBG PREVWD was
NNP NNPS CURWD Cowboys
VBZ NNS PREVWD phone
NNP NNS SURROUNDTAG STAART VBP
RBR JJR WDNEXTTAG lower JJ
PRP$ PRP NEXTTAG IN
VBD VB PREVTAG TO
JJ NN WDPREVTAG NN chief
JJ NN SURROUNDTAG JJ ,
NN JJ WDPREVTAG DT third
VBN VBD SURROUNDTAG NNS NNP
NNP NN SURROUNDTAG STAART NN
NNP NN CURWD HDTV
VBG NN SURROUNDTAG DT ,
VBG NN SURROUNDTAG DT .
NNS VBZ PREVTAG WP
NN VB SURROUNDTAG CC DT
NNPS NNP WDAND2TAGBFR IN Securities
RP IN PREVTAG NNS
VBP NN LBIGRAM funds rate
VBP NN WDPREVTAG NNS market
DT RB RBIGRAM either .
VBN NN SURROUNDTAG DT IN
VBD VB PREV1OR2OR3TAG MD
NN JJ NEXTWD oil
VBN VBD SURROUNDTAG , $
VBD VBN PREVBIGRAM DT RB
VBN JJ PREVWD by
NNP JJ WDNEXTTAG American JJ
NN VBG PREVTAG VBP
JJ RB LBIGRAM very much
NN VBG RBIGRAM operating officer
RB IN RBIGRAM up for
NNS VBZ NEXTBIGRAM JJ NNS
NNS VBZ SURROUNDTAG , IN
VB VBP PREVTAG NNPS
IN RP WDAND2TAGAFT out IN
NNPS NNP PREVBIGRAM CC NNP
NN RB RBIGRAM close to
RBR RB PREVWD no
JJ VBD NEXTTAG DT
RB NNP PREVTAG NNP
MD NN PREVWD good
JJ NN WDPREVTAG NN giant
NN JJ WDNEXTTAG official NNS
VBN VBD SURROUNDTAG , PRP$
VBN VBD SURROUNDTAG , RB
VBN VBD SURROUNDTAG NN PRP
NNP JJ WDNEXTTAG South JJ
NN VBG PREVTAG RB
NNS VBZ SURROUNDTAG , TO
VBZ NNS SURROUNDTAG NN .
NN VB NEXTTAG PRP$
VBP VB PREV1OR2WD do
VB JJ NEXTWD countries
IN WDT NEXTBIGRAM RB VBZ
JJ VB NEXTTAG DT
WDT DT NEXTBIGRAM VBZ ,
NNP RB RBIGRAM First ,
DT NNP WDNEXTTAG A VBZ
JJ RBR RBIGRAM further ,
CD PRP WDNEXTTAG one MD
POS '' PREV1OR2OR3TAG .
PRP NN PREVTAG -LRB-
VBN VBD SURROUNDTAG , PRP
VBN VBD SURROUNDTAG NN NNS
VBN VBD SURROUNDTAG NN RP
NNP NN LBIGRAM STAART Business
VBD VBN PREVTAG VBG
IN RB RBIGRAM before ,
IN RB WDAND2AFT As as
NNP JJ LBIGRAM New York-based
NNP JJ CURWD Mexican
NNP NNPS WDNEXTTAG Motors NNP
NNP NNPS WDPREVTAG NNP Enterprises
JJ RB WDNEXTTAG long IN
VBG JJ SURROUNDTAG DT NN
NN PRP PREVWD are mine
* IN CURWD with
* VB CURWD be
* JJ RBIGRAM such as
* IN LBIGRAM such as
* IN CURWD from

View File

@@ -0,0 +1,646 @@
50 Cent PERS
AIDS
AK-47
AT&T ORG
Abraham Lincoln PERS
Acropolis LOC
Adam Sandler PERS
Adolf Hitler PERS
Adriana Lima PERS
Afghanistan LOC
Africa LOC
Al Capone PERS
Al Pacino PERS
Alaska LOC
Albert Einstein PERS
Albert Hofmann PERS
Albert Schweitzer PERS
Alexander the Great PERS
Alfred Hitchcock PERS
Alice Cooper PERS
Alice in Wonderland
Amazon.com ORG
Amber Heard PERS
Amelia Earhart PERS
American Express
American Idol
Amsterdam LOC
Amy Adams PERS
Amy Winehouse PERS
Ancient Egypt LOC
Ancient Rome LOC
Android
Angelina Jolie PERS
Angry Birds
Anne Frank PERS
Anne Hathaway PERS
Antartica LOC
Apple Inc. ORG
Archimedes PERS
Aretha Franklin PERS
Argentina LOC
Aristotle PERS
Arnold Schwarzenegger PERS
Audi ORG
Audrey Hepburn PERS
Aung San Suu Kyi PERS
Australia LOC
Austria LOC
Avatar
Avril Lavigne PERS
Ayn Rand PERS
Aztec
BMW ORG
Babe Ruth PERS
Bacardi ORG
Backstreet Boys
Bangladesh LOC
Barack Obama PERS
Barbra Streisand PERS
Barcelona LOC
Batman PERS
Beethoven PERS
Belarus LOC
Belgium LOC
Ben Affleck PERS
Ben Folds PERS
Ben Stiller PERS
Benazir Bhutto PERS
Benjamin Franklin PERS
Benjamin Millepied PERS
Bernard Madoff PERS
Beyoncé Knowles PERS
Bill Clinton PERS
Bill Gates PERS
Billie Holiday PERS
Billie Jean King PERS
Bing Crosby PERS
Black Sabbath
Blake Edwards PERS
Blake Lively PERS
Bob Dylan PERS
Bob Geldof PERS
Bob Marley PERS
Brad Pitt PERS
Bradley Manning PERS
Brazil LOC
Brett Favre PERS
Britney Spears PERS
Bruce Lee PERS
Bruce Willis PERS
Bruno Mars PERS
Buddhism
Bulgaria LOC
Burger King
Burma LOC
C.S. Lewis PERS
Cadillac ORG
California LOC
Cameron Diaz PERS
Cameron Newton PERS
Canada LOC
Captain Beefheart PERS
Carl Lewis PERS
Charles Darwin PERS
Charles Dickens PERS
Charles Kindbergh PERS
Charles de Gaulle PERS
Charlie Sheen PERS
Che Guevara PERS
Cheryl Cole PERS
Chicago LOC
China LOC
Chopin PERS
Chris Colfer PERS
Christian Bale PERS
Christiano Ronaldo PERS
Christina Aguilera PERS
Christmas
Christopher Nolan PERS
Chuck Norris PERS
Clint Eastwood PERS
Coca Cola ORG
Coco Chanel ORG
Coldplay
Colombia LOC
Conan PERS
Cristiano Ronaldo PERS
Crystal Harris PERS
Cthulhu PERS
Cuba LOC
DNA
Daft Punk
Dalai Lama PERS
Daniel Radcliffe PERS
Darren Aronofsky PERS
Darren Criss PERS
Darth Vader PERS
David Beckham PERS
David Bowie PERS
David Cook PERS
Demi Lovato PERS
Demi Moore PERS
Denmark LOC
Desmond Tutu PERS
Dexter PERS
Diana PERS
Diego Maradona PERS
Disney ORG
Dmitry Medvedev PERS
Doctor Who PERS
Dr. Dre PERS
Dr. Seuss PERS
Dragon Ball
Dubai LOC
Dwayne Johnson PERS
Earth LOC
Ebenezer Scrooge PERS
Eddie Murphy PERS
Eduardo Saverin PERS
Egypt LOC
El Salvador LOC
Elizabeth Edwards PERS
Elizabeth Hurley PERS
Ellen Page PERS
Elton John PERS
Elvis Presley PERS
Emile Zatopek PERS
Eminem PERS
Emma Roberts PERS
Emma Stone PERS
Emma Watson PERS
Emmeline Pankhurst PERS
England LOC
Enrique Iglesias PERS
Ernest Hemingway PERS
Ernest Hemingway PERS
Europe LOC
Eva Peron PERS
Exxon Mobil PERS
FC Barcelona ORG
FIFA ORG
Facebook ORG
Fahrenheit
Family Guy
Faye Resnick PERS
FedEx ORG
Fidel Castro PERS
Finland LOC
Firefox ORG
Florence Nightingale PERS
Florida LOC
Fort Wayne LOC
France LOC
Frank Sinatra PERS
Franklin D. Roosevelt PERS
Freddie Mercury PERS
Frédéric Chopin PERS
Futurama
Garrett Hedlund PERS
Gene Simmons PERS
General Electric
Genghis Khan PERS
George Bush PERS
George Clooney PERS
George Harrison PERS
George Orwell PERS
George W. Bush PERS
George Washington PERS
Georges St-Pierre PERS
Germany LOC
Google ORG
Google Chrome
Gorillaz
Grand Theft Auto
Greece LOC
Gucci ORG
Gulf War
Gulliver's Travels
Guns N' Roses
Gwyneth Paltrow PERS
HIV
HSBC
Haile Selassie PERS
Haiti LOC
Halliburton ORG
Halloween
Hank Baskett PERS
Hannah Montana PERS
Hanukkah
Harrison Ford PERS
Harry Potter PERS
Hawaii LOC
He-Man PERS
Heath Ledger PERS
Helen Keller PERS
Helena Bonham Carter PERS
Henry Ford PERS
Henry IV PERS
Henry V PERS
Henry VIII PERS
Hilary Duff PERS
Hillary Clinton PERS
Honda ORG
Hong Kong LOC
Hotmail
Hugh Hefner PERS
Humphrey Bogart PERS
Hungary LOC
IBM ORG
IKEA ORG
Iceland LOC
India LOC
Indiana Jones PERS
Indira Gandhi PERS
Indonesia LOC
Internet Explorer
Iran LOC
Ireland LOC
Iron Man PERS
Isaac Newton PERS
Isabelle Caro PERS
Islam
Israel LOC
Italy LOC
Ivy League ORG
J. Robert Oppenheimer PERS
J.K. Rowling PERS
J.R.R. Tolkien PERS
JFK PERS
Jack the Ripper PERS
Jackie Chan PERS
Jacqueline Kennedy Onassis PERS
Jaden Smith PERS
Jake Gyllenhaal PERS
James Bond PERS
James Franco PERS
Jane Austen PERS
Janet Jackson PERS
Japan LOC
Jared Leto PERS
Jason Statham PERS
Jawaharlal Nehru PERS
Jay-Z PERS
Jeff Bridges PERS
Jeff Buckley PERS
Jenna Jameson PERS
Jennifer Aniston PERS
Jesse Owens PERS
Jessica Alba PERS
Jesus PERS
Jim Carrey PERS
Jim Morrisson PERS
Jimi Hendrix PERS
Jimmy Wales PERS
Joaquin Phoenix PERS
John Cena PERS
John Edwards PERS
John F. Kennedy PERS
John Lennon PERS
John M. Keynes PERS
John McCain PERS
John Wayne PERS
Johnnie Walker PERS
Johnny Cash PERS
Johnny Depp PERS
Joseph Stalin PERS
Judy Garland PERS
Julia Roberts PERS
Julian Assange PERS
Julie Andrews PERS
Julius Caesar PERS
Justin Bieber PERS
Justin Timberlake PERS
KFC ORG
KLM ORG
Kama Sutra
Kanye West PERS
Kate Middleton PERS
Katherine Hepburn PERS
Katrina Kaif PERS
Katy Perry PERS
Keira Knightley PERS
Ken Livingstone PERS
Keri Hilson PERS
Kesha PERS
Kevin Bacon PERS
Kid Cudi PERS
Kim Kardashian PERS
Kinect
King Arthur PERS
Kobe Bryant PERS
Kosovo LOC
Kristallnacht
Kristen Stewart PERS
Kurt Cobain PERS
L'Oreal ORG
L. Ron Hubbard PERS
Lady Gaga PERS
Lea Michele PERS
Lebanon LOC
Lech Walesa PERS
Led Zeppelin
Lego
Lenin PERS
Leo Tolstoy PERS
Leon Trotsky PERS
Leonardo DiCaprio PERS
Leonardo da Vinci PERS
Leslie Nielsen PERS
Lexus ORG
Liam Neeson PERS
Lil Wayne PERS
Lindsay Lohan PERS
Linkin Park PERS
Lionel Messi PERS
Loch Ness LOC
London LOC
Lord Baden Powell PERS
Los Angeles LOC
Louis Pasteur PERS
Louis Vuitton PERS
Louvre LOC
Ludwig van Beethoven PERS
Lyndon Johnson PERS
MDMA
Mac OS X
Macaulay Culkin PERS
Madagascar LOC
Madonna PERS
Mahatma Gandhi PERS
Malaysia LOC
Malcolm X PERS
Manchester LOC
Manchester United ORG
Margaret Thatcher PERS
Mariah Carey PERS
Marilyn Monroe PERS
Mario Gómez PERS
Mario Kart
Mark David Chapman PERS
Mark Wahlberg PERS
Mark Zuckerberg PERS
Martin Luther King PERS
Massachussetts LOC
Mata Hari PERS
Matt Damon PERS
Mattel ORG
Maya Angelou PERS
McDonald's ORG
McGill University ORG
Megan Fox PERS
Mercedes-Benz ORG
Merlin PERS
Metallica
Mexico LOC
Miami LOC
Miami Vice
Michael C. Hall PERS
Michael Jackson PERS
Michael Jordan PERS
Michael Vick PERS
Michelin ORG
Michigan LOC
Micky Ward PERS
Microsoft ORG
Microsoft Windows
Middle Ages
Mike Tyson PERS
Mila Kunis PERS
Miley Cyrus PERS
Minecraft
Mohammed Ali PERS
Mona Lisa PERS
Montreal LOC
Morocco LOC
Mother Teresa PERS
Mother's Day
Mozart PERS
Mozilla Firefox
Muhammad PERS
Muhammad Ali PERS
Myanmar LOC
Napoleon PERS
Narnia LOC
Natalie Portman PERS
Nazi Germany
Neil Armstrong PERS
Neil Patrick Harris PERS
Nelson Mandela PERS
Nepal LOC
Netherlands LOC
New York LOC
New York City LOC
New Zealand LOC
Nicki Minaj PERS
Nicolas Cage PERS
Nicole Scherzinger PERS
Nigeria LOC
Nike ORG
Nivea ORG
North America LOC
North Korea LOC
Norway LOC
Olivia Wilde PERS
Oprah Winfrey PERS
Osama Bin Laden PERS
Oscar Wilde PERS
Owen Wilson PERS
Ozzfest
Pablo Picasso PERS
Pakistan LOC
Panasonic ORG
Paris LOC
Paul McCartney PERS
Pele PERS
Pepsi ORG
Peter Sellers PERS
Philadelphia LOC
Philips ORG
Phillipines LOC
Pink Floyd PERS
PlayStation 3
Pocahontas PERS
Pokemon
Pokémon
Poland LOC
Pope John Paul II PERS
Premier League ORG
Prince Charles PERS
Priory of Sion LOC
Procter & Gamble
Puerto Rico LOC
Qatar LOC
Queen Elizabeth II PERS
Queen Victoria PERS
Rachmaninoff PERS
Raiders of the Lost Ark
Raisa Gorbachev PERS
Real Madrid ORG
Red Hot Chili Peppers PERS
Reese Witherspoon PERS
Resident Evil
Richard PERS
Richard Branson PERS
Richard Dawkins PERS
Richard Holbrooke PERS
Richard Nixon PERS
Rihanna PERS
Ringo Starr PERS
Robert De Niro PERS
Robert Pattinson PERS
Robin Hood PERS
Roger Federer PERS
Roman Empire ORG
Romania LOC
Rome LOC
Romeo and Juliet
Ronald Reagan PERS
Ronnie O'Sullivan PERS
Rosa Parks PERS
Russell Brand PERS
Russia LOC
Ryan Reynolds PERS
Saddam Hussein PERS
Sahara LOC
Saint Nicholas PERS
Salman Khan PERS
Samsung ORG
Sandra Bullock PERS
Santa Claus PERS
Sarah Palin PERS
Sasha Grey PERS
Saudi Arabia LOC
Scarlett Johansson PERS
Scientology ORG
Scotland LOC
Sean Combs PERS
Sean Parker PERS
Selena Gomez PERS
Serbia LOC
Sergei Rachmaninoff PERS
Shakira
Shaquille O'Neal PERS
Shaun Ryder PERS
Sherlock Holmes PERS
Shia LaBeouf PERS
Shirley Temple PERS
Siemens ORG
Sigmund Freud PERS
Silvio Berlusconi PERS
Singapore LOC
Skype
Smirnoff ORG
Snoop Dogg PERS
Snow White PERS
Socrates PERS
Somalia LOC
Sony ORG
South Africa LOC
South America LOC
South Korea LOC
South Park
Soviet Union
Spain LOC
Spider-Man PERS
Spiderman PERS
Sri Lanka LOC
Star Trek
Star Wars
Starbucks ORG
Stephen Hawking PERS
Stephen King PERS
Steve Jobs PERS
Steve Nash PERS
Steven Spielberg PERS
Sudan LOC
Super Bowl
Superman PERS
Sweden LOC
Switzerland LOC
Sylvester Stallone PERS
Taiwan LOC
Taj Mahal LOC
Take That
Taylor Lautner PERS
Taylor Momsem PERS
Taylor Swift PERS
Teena Marie PERS
Tennessee LOC
Texas LOC
Thailand LOC
The Beatles
The Chronicles of Narnia
The Godfather
The Green Hornet
The Lord of the Rings
The Rolling Stones
The Simpsons
The Sims
Theodore Roosevelt PERS
Thomas Jefferson PERS
Thor PERS
Tiger Woods PERS
Titanic
Tom Brady PERS
Tom Cruise PERS
Tom Hanks PERS
Toy Story
Toyota ORG
Transformers
Tron
Tupac Shakur PERS
Twin Peaks
Twitter
UEFA Champions League
Ubuntu
Ukraine LOC
United Kingdom LOC
United Nations
United States LOC
Usain Bolt PERS
Vanessa Hudgens PERS
Venus LOC
Vietnam LOC
Vin Diesel PERS
Virginia Woolf PERS
Vladimir Putin PERS
Vodafone ORG
Volkswagen ORG
Walmart ORG
Walt Disney PERS
Warren Buffet PERS
Washington LOC
Washington D.C. LOC
Wesley Snipes PERS
Wii
WikiLeaks ORG
Wikipedia ORG
Will Ferrell PERS
Will Smith PERS
William Shakespeare PERS
Willow Smith PERS
Windows 7
Windows 95
Windows Vista
Windows XP
Winona Ryder PERS
Winston Churchill PERS
Wiz Khalifa PERS
Wolfgang Amadeus Mozart PERS
Woodrow Wilson PERS
World War I
World War II
World of Warcraft
Wright Brothers PERS
X-Men
Xbox 360
Yoko Onen PERS
Yoko Ono PERS
YouTube ORG
amazon.com ORG
eBay ORG
iPad
iPhone
iPod
iPod touch

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,152 @@
;;;
;;; The morphological rules are based on Brill's rule based tagger v1.14,
;;; trained on Brown corpus and Penn Treebank.
;;;
NN s fhassuf 1 NNS x
NN . fchar CD x
NN - fchar JJ x
NN ed fhassuf 2 VBN x
NN ing fhassuf 3 VBG x
ly hassuf 2 RB x
ly addsuf 2 JJ x
NN $ fgoodright CD x
NN al fhassuf 2 JJ x
NN would fgoodright VB x
NN 0 fchar CD x
NN be fgoodright JJ x
NNS us fhassuf 2 JJ x
NNS it fgoodright VBZ x
NN ble fhassuf 3 JJ x
NN ic fhassuf 2 JJ x
NN 1 fchar CD x
NNS ss fhassuf 2 NN x
un deletepref 2 JJ x
NN ive fhassuf 3 JJ x
NNP ed fhassuf 2 JJ x
NN n't fgoodright VB x
VB the fgoodright NN x
NNS he fgoodright VBZ x
VBN he fgoodright VBD x
NN are fgoodright JJ x
JJ was fgoodleft NN x
NN est fhassuf 3 JJS x
VBZ The fgoodright NNS x
NNP ts fhassuf 2 NNS x
NN 4 fchar CD x
NN ize fhassuf 3 VB x
.. hassuf 2 : x
ful hassuf 3 JJ x
NN ate fhassuf 3 VB x
NNP ing fhassuf 3 VBG x
VBG is fgoodleft NN x
NN less fhassuf 4 JJ x
NN ary fhassuf 3 JJ x
Co. goodleft NNP x
NN ant fhassuf 3 JJ x
million goodleft CD x
JJ their fgoodleft IN x
NN he fgoodright VBD x
Mr. goodright NNP x
JJ of fgoodleft NN x
NN so fgoodright JJ x
NN y fdeletesuf 1 JJ x
VBN which fgoodright VBD x
VBD been fgoodright VBN x
VB a fgoodright NN x
NN economic fgoodleft JJ x
9 char CD x
CD t fchar JJ x
NN can fgoodright VB x
VB the fgoodright NN x
JJ S-T-A-R-T fgoodright VBN x
VBN - fchar JJ x
NN lar fhassuf 3 JJ x
NNP ans fhassuf 3 NNPS x
NN men fhassuf 3 NNS x
CD d fchar JJ x
JJ n fdeletesuf 1 VBN x
JJ 's fgoodleft NN x
NNS is fhassuf 2 NN x
ES hassuf 2 NNS x
JJ er fdeletesuf 2 JJR x
Inc. goodleft NNP x
NN 2 fchar CD x
VBD be fgoodleft MD x
ons hassuf 3 NNS x
RB - fchar JJ x
NN very fgoodright JJ x
ous hassuf 3 JJ x
NN a fdeletepref 1 RB x
NNP people fgoodleft JJ x
VB have fgoodleft RB x
NNS It fgoodright VBZ x
NN id fhassuf 2 JJ x
JJ may fgoodleft NN x
VBN but fgoodright VBD x
RS hassuf 2 NNS x
JJ stry fhassuf 4 NN x
NNS them fgoodleft VBZ x
VBZ were fgoodleft NNS x
NN ing faddsuf 3 VB x
JJ s faddsuf 1 NN x
NN 7 fchar CD x
NN d faddsuf 1 VB x
VB but fgoodleft NN x
NN 3 fchar CD x
NN est faddsuf 3 JJ x
NN en fhassuf 2 VBN x
NN costs fgoodright IN x
NN 8 fchar CD x
VB b fhaspref 1 NN x
zes hassuf 3 VBZ x
VBN s faddsuf 1 NN x
some hassuf 4 JJ x
NN ic fhassuf 2 JJ x
ly addsuf 2 JJ x
ness addsuf 4 JJ x
JJS s faddsuf 1 NN x
NN ier fhassuf 3 JJR x
NN ky fhassuf 2 JJ x
tyle hassuf 4 JJ x
NNS ates fhassuf 4 VBZ x
fy hassuf 2 VB x
body addsuf 4 DT x
NN ways fgoodleft JJ x
NNP ies fhassuf 3 NNPS x
VB negative fgoodright NN x
ders hassuf 4 NNS x
ds hassuf 2 NNS x
-day addsuf 4 CD x
nian hassuf 4 JJ x
JJR s faddsuf 1 NN x
ppy hassuf 3 JJ x
NN ish fhassuf 3 JJ x
tors hassuf 4 NNS x
oses hassuf 4 VBZ x
NNS oves fhassuf 4 VBZ x
VBN un fhaspref 2 JJ x
lent hassuf 4 JJ x
NN ward fdeletesuf 4 RB x
VB k fchar NN x
VB r fhassuf 1 NN x
VB e fdeletesuf 1 NN x
NNS Engelken fgoodright VBZ x
NN ient fhassuf 4 JJ x
ED hassuf 2 VBD x
VBG B fchar NNP x
VB le fhassuf 2 NN x
ment addsuf 4 VB x
ING hassuf 3 NN x
JJ ery fhassuf 3 NN x
JJ tus fhassuf 3 NN x
JJ car fhassuf 3 NN x
NN 6 fchar CD x
NNS 0 fchar CD x
JJ ing fdeletesuf 3 VBG x
here hassuf 4 RB x
VBN scr fhaspref 3 VBD x
uces hassuf 4 VBZ x
fies hassuf 4 VBZ x
self deletesuf 4 PRP x
NNP $ fchar $ x
VBN wa fhaspref 2 VBD x

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,472 @@
# -*- coding: utf-8 -*-
'''The pluralize and singular methods from the pattern library.
Licenced under the BSD.
See here https://github.com/clips/pattern/blob/master/LICENSE.txt for
complete license information.
'''
import re
# Part-of-speech tag prefixes used throughout this module.
VERB, NOUN, ADJECTIVE, ADVERB = "VB", "NN", "JJ", "RB"
#### PLURALIZE #####################################################################################
# Based on "An Algorithmic Approach to English Pluralization" by Damian Conway:
# http://www.csse.monash.edu.au/~damian/papers/HTML/Plurals.html
# Prepositions are used to solve things like
# "mother-in-law" or "man at arms"
# (pluralize() inflects the head word before the preposition).
plural_prepositions = [
    "about", "above", "across", "after", "among", "around", "at", "athwart", "before", "behind",
    "below", "beneath", "beside", "besides", "between", "betwixt", "beyond", "but", "by", "during",
    "except", "for", "from", "in", "into", "near", "of", "off", "on", "onto", "out", "over",
    "since", "till", "to", "under", "until", "unto", "upon", "with"
]
# Inflection rules that are either general,
# or apply to a certain category of words,
# or apply to a certain category of words only in classical mode,
# or apply only in classical mode.
# Each rule consists of:
# suffix, inflection, category and classic flag.
# NOTE: the suffix patterns are regex strings here; they are compiled
# in place (rule[0]) by the loop that follows this table.
plural_rules = [
    # 0) Indefinite articles and demonstratives.
    [["^a$|^an$", "some", None, False],
     ["^this$", "these", None, False],
     ["^that$", "those", None, False],
     ["^any$", "all", None, False]
    ],
    # 1) Possessive adjectives.
    # Overlaps with 1/ for "his" and "its".
    # Overlaps with 2/ for "her".
    [["^my$", "our", None, False],
     ["^your$|^thy$", "your", None, False],
     ["^her$|^his$|^its$|^their$", "their", None, False]
    ],
    # 2) Possessive pronouns.
    [["^mine$", "ours", None, False],
     ["^yours$|^thine$", "yours", None, False],
     ["^hers$|^his$|^its$|^theirs$", "theirs", None, False]
    ],
    # 3) Personal pronouns.
    [["^I$", "we", None, False],
     ["^me$", "us", None, False],
     ["^myself$", "ourselves", None, False],
     ["^you$", "you", None, False],
     ["^thou$|^thee$", "ye", None, False],
     ["^yourself$|^thyself$", "yourself", None, False],
     ["^she$|^he$|^it$|^they$", "they", None, False],
     ["^her$|^him$|^it$|^them$", "them", None, False],
     ["^herself$|^himself$|^itself$|^themself$", "themselves", None, False],
     ["^oneself$", "oneselves", None, False]
    ],
    # 4) Words that do not inflect.
    [["$", "", "uninflected", False],
     ["$", "", "uncountable", False],
     ["fish$", "fish", None, False],
     ["([- ])bass$", "\\1bass", None, False],
     ["ois$", "ois", None, False],
     ["sheep$", "sheep", None, False],
     ["deer$", "deer", None, False],
     ["pox$", "pox", None, False],
     ["([A-Z].*)ese$", "\\1ese", None, False],
     ["itis$", "itis", None, False],
     ["(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$", "\\1ose", None, False]
    ],
    # 5) Irregular plurals (mongoose, oxen).
    [["atlas$", "atlantes", None, True],
     ["atlas$", "atlases", None, False],
     ["beef$", "beeves", None, True],
     ["brother$", "brethren", None, True],
     ["child$", "children", None, False],
     ["corpus$", "corpora", None, True],
     ["corpus$", "corpuses", None, False],
     ["^cow$", "kine", None, True],
     ["ephemeris$", "ephemerides", None, False],
     ["ganglion$", "ganglia", None, True],
     ["genie$", "genii", None, True],
     ["genus$", "genera", None, False],
     ["graffito$", "graffiti", None, False],
     ["loaf$", "loaves", None, False],
     ["money$", "monies", None, True],
     ["mongoose$", "mongooses", None, False],
     ["mythos$", "mythoi", None, False],
     ["octopus$", "octopodes", None, True],
     ["opus$", "opera", None, True],
     ["opus$", "opuses", None, False],
     ["^ox$", "oxen", None, False],
     ["penis$", "penes", None, True],
     ["penis$", "penises", None, False],
     ["soliloquy$", "soliloquies", None, False],
     ["testis$", "testes", None, False],
     ["trilby$", "trilbys", None, False],
     ["turf$", "turves", None, True],
     ["numen$", "numena", None, False],
     ["occiput$", "occipita", None, True]
    ],
    # 6) Irregular inflections for common suffixes (synopses, mice, men).
    [["man$", "men", None, False],
     ["person$", "people", None, False],
     ["([lm])ouse$", "\\1ice", None, False],
     ["tooth$", "teeth", None, False],
     ["goose$", "geese", None, False],
     ["foot$", "feet", None, False],
     ["zoon$", "zoa", None, False],
     ["([csx])is$", "\\1es", None, False]
    ],
    # 7) Fully assimilated classical inflections (vertebrae, codices).
    [["ex$", "ices", "ex-ices", False],
     ["ex$", "ices", "ex-ices-classical", True],
     ["um$", "a", "um-a", False],
     ["um$", "a", "um-a-classical", True],
     ["on$", "a", "on-a", False],
     ["a$", "ae", "a-ae", False],
     ["a$", "ae", "a-ae-classical", True]
    ],
    # 8) Classical variants of modern inflections (stigmata, soprani).
    [["trix$", "trices", None, True],
     ["eau$", "eaux", None, True],
     ["ieu$", "ieu", None, True],
     ["([iay])nx$", "\\1nges", None, True],
     ["en$", "ina", "en-ina-classical", True],
     ["a$", "ata", "a-ata-classical", True],
     ["is$", "ides", "is-ides-classical", True],
     ["us$", "i", "us-i-classical", True],
     ["us$", "us", "us-us-classical", True],
     ["o$", "i", "o-i-classical", True],
     ["$", "i", "-i-classical", True],
     ["$", "im", "-im-classical", True]
    ],
    # 9) -ch, -sh and -ss and the s-singular group take -es in the plural (churches, classes, lenses).
    [["([cs])h$", "\\1hes", None, False],
     ["ss$", "sses", None, False],
     ["x$", "xes", None, False],
     ["s$", "ses", "s-singular", False]
    ],
    # 10) Certain words ending in -f or -fe take -ves in the plural (lives, wolves).
    [["([aeo]l)f$", "\\1ves", None, False],
     ["([^d]ea)f$", "\\1ves", None, False],
     ["arf$", "arves", None, False],
     ["([nlw]i)fe$", "\\1ves", None, False],
    ],
    # 11) -y takes -ys if preceded by a vowel or when a proper noun,
    # but -ies if preceded by a consonant (storeys, Marys, stories).
    [["([aeiou])y$", "\\1ys", None, False],
     ["([A-Z].*)y$", "\\1ys", None, False],
     ["y$", "ies", None, False]
    ],
    # 12) Some words ending in -o take -os, the rest take -oes.
    # Words in which the -o is preceded by a vowel always take -os (lassos, potatoes, bamboos).
    [["o$", "os", "o-os", False],
     ["([aeiou])o$", "\\1os", None, False],
     ["o$", "oes", None, False]
    ],
    # 13) Miltary stuff (Major Generals).
    [["l$", "ls", "general-generals", False]
    ],
    # 14) Otherwise, assume that the plural just adds -s (cats, programmes).
    [["$", "s", None, False]
    ],
]
# For performance, compile the regular expressions only once:
for ruleset in plural_rules:
    for rule in ruleset:
        rule[0] = re.compile(rule[0])  # replace the suffix string in place
# Suffix categories.
# Keys correspond to the category names referenced in plural_rules above;
# a category rule only fires when the word appears in its list.
plural_categories = {
    "uninflected": [
        "aircraft", "antelope", "bison", "bream", "breeches", "britches", "carp", "cattle", "chassis",
        "clippers", "cod", "contretemps", "corps", "debris", "diabetes", "djinn", "eland", "elk",
        "flounder", "gallows", "graffiti", "headquarters", "herpes", "high-jinks", "homework", "innings",
        "jackanapes", "mackerel", "measles", "mews", "moose", "mumps", "offspring", "news", "pincers",
        "pliers", "proceedings", "rabies", "salmon", "scissors", "series", "shears", "species", "swine",
        "trout", "tuna", "whiting", "wildebeest"],
    "uncountable": [
        "advice", "bread", "butter", "cannabis", "cheese", "electricity", "equipment", "fruit", "furniture",
        "garbage", "gravel", "happiness", "information", "ketchup", "knowledge", "love", "luggage",
        "mathematics", "mayonnaise", "meat", "mustard", "news", "progress", "research", "rice",
        "sand", "software", "understanding", "water"],
    "s-singular": [
        "acropolis", "aegis", "alias", "asbestos", "bathos", "bias", "bus", "caddis", "canvas",
        "chaos", "christmas", "cosmos", "dais", "digitalis", "epidermis", "ethos", "gas", "glottis",
        "ibis", "lens", "mantis", "marquis", "metropolis", "pathos", "pelvis", "polis", "rhinoceros",
        "sassafras", "trellis"],
    "ex-ices": ["codex", "murex", "silex"],
    "ex-ices-classical": [
        "apex", "cortex", "index", "latex", "pontifex", "simplex", "vertex", "vortex"],
    "um-a": [
        "agendum", "bacterium", "candelabrum", "datum", "desideratum", "erratum", "extremum",
        "ovum", "stratum"],
    "um-a-classical": [
        "aquarium", "compendium", "consortium", "cranium", "curriculum", "dictum", "emporium",
        "enconium", "gymnasium", "honorarium", "interregnum", "lustrum", "maximum", "medium",
        "memorandum", "millenium", "minimum", "momentum", "optimum", "phylum", "quantum", "rostrum",
        "spectrum", "speculum", "stadium", "trapezium", "ultimatum", "vacuum", "velum"],
    "on-a": [
        "aphelion", "asyndeton", "criterion", "hyperbaton", "noumenon", "organon", "perihelion",
        "phenomenon", "prolegomenon"],
    "a-ae": ["alga", "alumna", "vertebra"],
    "a-ae-classical": [
        "abscissa", "amoeba", "antenna", "aurora", "formula", "hydra", "hyperbola", "lacuna",
        "medusa", "nebula", "nova", "parabola"],
    "en-ina-classical": ["foramen", "lumen", "stamen"],
    "a-ata-classical": [
        "anathema", "bema", "carcinoma", "charisma", "diploma", "dogma", "drama", "edema", "enema",
        "enigma", "gumma", "lemma", "lymphoma", "magma", "melisma", "miasma", "oedema", "sarcoma",
        "schema", "soma", "stigma", "stoma", "trauma"],
    "is-ides-classical": ["clitoris", "iris"],
    "us-i-classical": [
        "focus", "fungus", "genius", "incubus", "nimbus", "nucleolus", "radius", "stylus", "succubus",
        "torus", "umbilicus", "uterus"],
    "us-us-classical": [
        "apparatus", "cantus", "coitus", "hiatus", "impetus", "nexus", "plexus", "prospectus",
        "sinus", "status"],
    "o-i-classical": ["alto", "basso", "canto", "contralto", "crescendo", "solo", "soprano", "tempo"],
    "-i-classical": ["afreet", "afrit", "efreet"],
    "-im-classical": ["cherub", "goy", "seraph"],
    "o-os": [
        "albino", "archipelago", "armadillo", "commando", "ditto", "dynamo", "embryo", "fiasco",
        "generalissimo", "ghetto", "guano", "inferno", "jumbo", "lingo", "lumbago", "magneto",
        "manifesto", "medico", "octavo", "photo", "pro", "quarto", "rhino", "stylo"],
    "general-generals": [
        "Adjutant", "Brigadier", "Lieutenant", "Major", "Quartermaster",
        "adjutant", "brigadier", "lieutenant", "major", "quartermaster"],
}
def pluralize(word, pos=NOUN, custom={}, classical=True):
    """ Returns the plural of a given word.
        For example: child -> children.
        Handles nouns and adjectives, using classical inflection by default
        (e.g. where "matrix" pluralizes to "matrices" instead of "matrixes").
        The custom dictionary is for user-defined replacements.
        Returns None if no rule matches (the catch-all rules make this rare).
    """
    # `custom` is only read, never mutated, so the shared default dict is safe.
    if word in custom:
        return custom[word]
    # Recursion of genitives.
    # Remove the apostrophe and any trailing -s,
    # form the plural of the resultant noun, and then append an apostrophe (dog's -> dogs').
    if word.endswith("'") or word.endswith("'s"):
        # NOTE: str.rstrip("'s") strips *characters*, not a suffix, and would
        # mangle words such as "class's" -> "cla"; slice the suffix off instead.
        owner = word[:-2] if word.endswith("'s") else word[:-1]
        owners = pluralize(owner, pos, custom, classical)
        if owners.endswith("s"):
            return owners + "'"
        else:
            return owners + "'s"
    # Recursion of compound words
    # (Postmasters General, mothers-in-law, Roman deities).
    words = word.replace("-", " ").split(" ")
    if len(words) > 1:
        # Parenthesized: without them, "and" bound tighter than "or" and the
        # general-generals exclusion was never applied to "... general".
        if (words[1] == "general" or words[1] == "General") and \
            words[0] not in plural_categories["general-generals"]:
            return word.replace(words[0], pluralize(words[0], pos, custom, classical))
        elif words[1] in plural_prepositions:
            return word.replace(words[0], pluralize(words[0], pos, custom, classical))
        else:
            return word.replace(words[-1], pluralize(words[-1], pos, custom, classical))
    # Only a very few number of adjectives inflect.
    n = list(range(len(plural_rules)))
    if pos.startswith(ADJECTIVE):
        n = [0, 1]
    # Apply pluralization rules.
    for i in n:
        for rule in plural_rules[i]:
            suffix, inflection, category, classic = rule
            # Classical rules only apply in classical mode.
            if classic and not classical:
                continue
            if category is None:
                # A general rule.
                if suffix.search(word) is not None:
                    return suffix.sub(inflection, word)
            elif word in plural_categories[category]:
                # A rule relating to a specific category of words.
                if suffix.search(word) is not None:
                    return suffix.sub(inflection, word)
#### SINGULARIZE ###################################################################################
# Adapted from Bermi Ferrer's Inflector for Python:
# http://www.bermi.org/inflector/
# Copyright (c) 2006 Bermi Ferrer Martinez
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software to deal in this software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of this software, and to permit
# persons to whom this software is furnished to do so, subject to the following
# condition:
#
# THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THIS SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THIS SOFTWARE.
# Pattern/replacement pairs for singularization, applied in order: the first
# pattern that matches a word wins (see singularize() below). Identity rules
# such as "(.)itis$" protect words from the broader rules that follow them.
singular_rules = [
    ['(?i)(.)ae$', '\\1a'],
    ['(?i)(.)itis$', '\\1itis'],
    ['(?i)(.)eaux$', '\\1eau'],
    ['(?i)(quiz)zes$', '\\1'],
    ['(?i)(matr)ices$', '\\1ix'],
    ['(?i)(ap|vert|ind)ices$', '\\1ex'],
    ['(?i)^(ox)en', '\\1'],
    ['(?i)(alias|status)es$', '\\1'],
    # Fixed: "([octop|vir])i$" was a *character class* matching any one of
    # {o,c,t,p,|,v,i,r} before a final "i", which corrupted unrelated words
    # (e.g. "safari" -> "safarus"). Use a real alternation group.
    ['(?i)(octop|vir)i$', '\\1us'],
    ['(?i)(cris|ax|test)es$', '\\1is'],
    ['(?i)(shoe)s$', '\\1'],
    ['(?i)(o)es$', '\\1'],
    ['(?i)(bus)es$', '\\1'],
    # Fixed: "([m|l])ice$" needlessly included "|" in the character class.
    ['(?i)([ml])ice$', '\\1ouse'],
    ['(?i)(x|ch|ss|sh)es$', '\\1'],
    ['(?i)(m)ovies$', '\\1ovie'],
    ['(?i)(.)ombies$', '\\1ombie'],
    ['(?i)(s)eries$', '\\1eries'],
    ['(?i)([^aeiouy]|qu)ies$', '\\1y'],
    # Certain words ending in -f or -fe take -ves in the plural (lives, wolves).
    ["([aeo]l)ves$", "\\1f"],
    ["([^d]ea)ves$", "\\1f"],
    ["arves$", "arf"],
    ["erves$", "erve"],
    ["([nlw]i)ves$", "\\1fe"],
    ['(?i)([lr])ves$', '\\1f'],
    ["([aeo])ves$", "\\1ve"],
    ['(?i)(sive)s$', '\\1'],
    ['(?i)(tive)s$', '\\1'],
    ['(?i)(hive)s$', '\\1'],
    ['(?i)([^f])ves$', '\\1fe'],
    # -es suffix.
    ['(?i)(^analy)ses$', '\\1sis'],
    ['(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$', '\\1\\2sis'],
    ['(?i)(.)opses$', '\\1opsis'],
    ['(?i)(.)yses$', '\\1ysis'],
    ['(?i)(h|d|r|o|n|b|cl|p)oses$', '\\1ose'],
    # Fixed: the sugar-stem guard used "ose$", which could never match a
    # plural; with "oses$" it now shields e.g. "glucoses" from the -osis
    # rule below ("glucoses" -> "glucose", not "glucosis").
    ['(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)oses$', '\\1ose'],
    ['(?i)(.)oses$', '\\1osis'],
    # -a
    ['(?i)([ti])a$', '\\1um'],
    ['(?i)(n)ews$', '\\1ews'],
    ['(?i)s$', ''],
]
# For performance, compile the regular expressions only once:
for rule in singular_rules:
    rule[0] = re.compile(rule[0])
# Nouns whose singular and plural forms are identical; singularize() returns
# such words unchanged.
singular_uninflected = [
    "aircraft", "antelope", "bison", "bream", "breeches", "britches", "carp", "cattle", "chassis",
    "clippers", "cod", "contretemps", "corps", "debris", "diabetes", "djinn", "eland",
    "elk", "flounder", "gallows", "georgia", "graffiti", "headquarters", "herpes", "high-jinks",
    "homework", "innings", "jackanapes", "mackerel", "measles", "mews", "moose", "mumps", "news",
    "offspring", "pincers", "pliers", "proceedings", "rabies", "salmon", "scissors", "series",
    "shears", "species", "swine", "swiss", "trout", "tuna", "whiting", "wildebeest"
]
# Mass (uncountable) nouns: these have no distinct plural form either.
singular_uncountable = [
    "advice", "bread", "butter", "cannabis", "cheese", "electricity", "equipment", "fruit", "furniture",
    "garbage", "gravel", "happiness", "information", "ketchup", "knowledge", "love", "luggage",
    "mathematics", "mayonnaise", "meat", "mustard", "news", "progress", "research", "rice", "sand",
    "software", "understanding", "water"
]
# Words ending in -ie whose plural simply appends -s (doggie -> doggies).
# NOTE(review): the "^"-prefixed entries look like regex anchors, but they are
# compared with str.endswith() in singularize(), where they can never match --
# confirm the intended matching semantics.
singular_ie = [
    "algerie", "auntie", "beanie", "birdie", "bogie", "bombie", "bookie", "collie", "cookie", "cutie",
    "doggie", "eyrie", "freebie", "goonie", "groupie", "hankie", "hippie", "hoagie", "hottie",
    "indie", "junkie", "laddie", "laramie", "lingerie", "meanie", "nightie", "oldie", "^pie",
    "pixie", "quickie", "reverie", "rookie", "softie", "sortie", "stoolie", "sweetie", "techie",
    "^tie", "toughie", "valkyrie", "veggie", "weenie", "yuppie", "zombie"
]
# Singular words that already end in -s; their plural appends -es.
singular_s = plural_categories['s-singular']
# Irregular plural -> singular mapping (key is the plural, value the singular).
# singularize() matches these by suffix, so e.g. "women" is covered by "men".
singular_irregular = {
    "men": "man",
    "people": "person",
    "children": "child",
    "sexes": "sex",
    "axes": "axe",
    "moves": "move",
    "teeth": "tooth",
    "geese": "goose",
    "feet": "foot",
    "zoa": "zoon",
    "atlantes": "atlas",
    "atlases": "atlas",
    "beeves": "beef",
    "brethren": "brother",
    # Removed a duplicate "children": "child" entry that shadowed the one above.
    "corpora": "corpus",
    "corpuses": "corpus",
    "kine": "cow",
    "ephemerides": "ephemeris",
    "ganglia": "ganglion",
    "genii": "genie",
    "genera": "genus",
    "graffiti": "graffito",
    "helves": "helve",
    "leaves": "leaf",
    "loaves": "loaf",
    "monies": "money",
    "mongooses": "mongoose",
    "mythoi": "mythos",
    "octopodes": "octopus",
    "opera": "opus",
    "opuses": "opus",
    "oxen": "ox",
    "penes": "penis",
    "penises": "penis",
    "soliloquies": "soliloquy",
    "testes": "testis",
    "trilbys": "trilby",
    "turves": "turf",
    "numena": "numen",
    "occipita": "occiput",
    # NOTE(review): possessive-pronoun entry inherited from the upstream
    # inflector; confirm it is still wanted here.
    "our": "my",
}
def singularize(word, pos=NOUN, custom={}):
    """ Returns the singular of a given word (e.g. dogs -> dog).
        The custom dictionary is for user-defined replacements.
    """
    # `custom` is only read, never mutated, so the shared default dict is safe.
    # (was: `word in list(custom.keys())` -- a needless list copy per call)
    if word in custom:
        return custom[word]
    # Recursion of compound words (e.g. mothers-in-law).
    if "-" in word:
        words = word.split("-")
        if len(words) > 1 and words[1] in plural_prepositions:
            return singularize(words[0], pos, custom) + "-" + "-".join(words[1:])
    # dogs' => dog's
    if word.endswith("'"):
        return singularize(word[:-1]) + "'s"
    lower = word.lower()
    # Uninflected and uncountable words are returned unchanged.
    for w in singular_uninflected:
        if w.endswith(lower):
            return word
    for w in singular_uncountable:
        if w.endswith(lower):
            return word
    # doggies => doggie (not "doggy").
    for w in singular_ie:
        if lower.endswith(w + "s"):
            return w
    # Singular words already ending in -s (atlases => atlas).
    for w in singular_s:
        if lower.endswith(w + "es"):
            return w
    # Irregular plurals are matched by suffix so "women" falls under "men".
    for w in singular_irregular:
        if lower.endswith(w):
            return re.sub("(?i)" + w + "$", singular_irregular[w], word)
    # Fall back to the ordered regex rules; first match wins.
    for suffix, inflection in singular_rules:
        match = suffix.search(word)
        if match:
            # Remove backreferences to groups that did not participate in
            # the match, so the substitution stays well-formed.
            for k, group in enumerate(match.groups()):
                if group is None:
                    inflection = inflection.replace("\\" + str(k + 1), "")
            return suffix.sub(inflection, word)
    return word

View File

@@ -0,0 +1,204 @@
# -*- coding: utf-8 -*-
'''Various noun phrase extractors.'''
from __future__ import unicode_literals, absolute_import
import nltk
from textblob.taggers import PatternTagger
from textblob.decorators import requires_nltk_corpus
from textblob.utils import tree2str, filter_insignificant
from textblob.base import BaseNPExtractor
class ChunkParser(nltk.ChunkParserI):
    """Noun-phrase chunker backed by a bigram tagger over POS-tag sequences,
    trained lazily on the CoNLL-2000 chunking corpus.
    """

    def __init__(self):
        self._trained = False

    @requires_nltk_corpus
    def train(self):
        '''Train the Chunker on the ConLL-2000 corpus.'''
        chunked_sents = nltk.corpus.conll2000.chunked_sents(
            'train.txt', chunk_types=['NP'])
        # Training data: sequences of (POS tag, IOB chunk tag) pairs.
        train_data = [
            [(pos, iob) for _, pos, iob in nltk.chunk.tree2conlltags(tree)]
            for tree in chunked_sents
        ]
        backoff = nltk.UnigramTagger(train_data)
        self.tagger = nltk.BigramTagger(train_data, backoff=backoff)
        self._trained = True

    def parse(self, sentence):
        '''Return the parse tree for the sentence.'''
        if not self._trained:
            self.train()
        tags = [pos for _, pos in sentence]
        iob_tags = [iob for _, iob in self.tagger.tag(tags)]
        conll = [(token, pos, iob)
                 for (token, pos), iob in zip(sentence, iob_tags)]
        return nltk.chunk.util.conlltags2tree(conll)
class ConllExtractor(BaseNPExtractor):
    '''A noun phrase extractor that uses chunk parsing trained with the
    ConLL-2000 training corpus.
    '''

    POS_TAGGER = PatternTagger()

    # The context-free grammar with which to filter the noun phrases
    CFG = {
        ('NNP', 'NNP'): 'NNP',
        ('NN', 'NN'): 'NNI',
        ('NNI', 'NN'): 'NNI',
        ('JJ', 'JJ'): 'JJ',
        ('JJ', 'NN'): 'NNI',
    }

    # POS suffixes that will be ignored
    INSIGNIFICANT_SUFFIXES = ['DT', 'CC', 'PRP$', 'PRP']

    def __init__(self, parser=None):
        self.parser = parser if parser else ChunkParser()

    def extract(self, text):
        '''Return a list of noun phrases (strings) for body of text.'''
        noun_phrases = []
        for sentence in nltk.tokenize.sent_tokenize(text):
            parsed = self._parse_sentence(sentence)
            # Keep only NP subtrees that survive the insignificance filter
            # and match the grammar, then flatten each into a string.
            for subtree in parsed:
                if not isinstance(subtree, nltk.tree.Tree):
                    continue
                if subtree.label() != 'NP':
                    continue
                if len(filter_insignificant(subtree)) < 1:
                    continue
                if not _is_match(subtree, cfg=self.CFG):
                    continue
                phrase = _normalize_tags(
                    filter_insignificant(subtree, self.INSIGNIFICANT_SUFFIXES))
                noun_phrases.append(tree2str(phrase))
        return noun_phrases

    def _parse_sentence(self, sentence):
        '''Tag and parse a sentence (a plain, untagged string).'''
        return self.parser.parse(self.POS_TAGGER.tag(sentence))
class FastNPExtractor(BaseNPExtractor):
    '''A fast and simple noun phrase extractor.
    Credit to Shlomi Babluk. Link to original blog post:
    http://thetokenizer.com/2013/05/09/efficient-way-to-extract-the-main-topics-of-a-sentence/
    '''

    # Merge rules: adjacent (left tag, right tag) pairs collapse to one tag.
    CFG = {
        ('NNP', 'NNP'): 'NNP',
        ('NN', 'NN'): 'NNI',
        ('NNI', 'NN'): 'NNI',
        ('JJ', 'JJ'): 'JJ',
        ('JJ', 'NN'): 'NNI',
    }

    def __init__(self):
        self._trained = False

    @requires_nltk_corpus
    def train(self):
        '''Lazily train a bigram tagger (with unigram and regexp backoffs)
        on the Brown news corpus.'''
        train_data = nltk.corpus.brown.tagged_sents(categories='news')
        regexp_tagger = nltk.RegexpTagger([
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
            (r'(-|:|;)$', ':'),
            (r'\'*$', 'MD'),
            (r'(The|the|A|a|An|an)$', 'AT'),
            (r'.*able$', 'JJ'),
            (r'^[A-Z].*$', 'NNP'),
            (r'.*ness$', 'NN'),
            (r'.*ly$', 'RB'),
            (r'.*s$', 'NNS'),
            (r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'),
            (r'.*', 'NN'),
        ])
        unigram = nltk.UnigramTagger(train_data, backoff=regexp_tagger)
        self.tagger = nltk.BigramTagger(train_data, backoff=unigram)
        self._trained = True
        return None

    def _tokenize_sentence(self, sentence):
        '''Split the sentence into single words/tokens'''
        return nltk.word_tokenize(sentence)

    def extract(self, sentence):
        '''Return a list of noun phrases (strings) for body of text.'''
        if not self._trained:
            self.train()
        tagged = self.tagger.tag(self._tokenize_sentence(sentence))
        tags = _normalize_tags(tagged)
        # Repeatedly merge the leftmost adjacent pair allowed by the grammar
        # until no pair can be merged.
        merged = True
        while merged:
            merged = False
            for i in range(len(tags) - 1):
                left, right = tags[i], tags[i + 1]
                pos = self.CFG.get((left[1], right[1]), '')
                if pos:
                    merged = True
                    tags[i:i + 2] = [('%s %s' % (left[0], right[0]), pos)]
                    break
        return [token for token, tag in tags if tag in ('NNP', 'NNI')]
### Utility methods ###
def _normalize_tags(chunk):
'''Normalize the corpus tags.
("NN", "NN-PL", "NNS") -> "NN"
'''
ret = []
for word, tag in chunk:
if tag == 'NP-TL' or tag == 'NP':
ret.append((word, 'NNP'))
continue
if tag.endswith('-TL'):
ret.append((word, tag[:-3]))
continue
if tag.endswith('S'):
ret.append((word, tag[:-1]))
continue
ret.append((word, tag))
return ret
def _is_match(tagged_phrase, cfg):
'''Return whether or not a tagged phrases matches a context-free grammar.
'''
copy = list(tagged_phrase) # A copy of the list
merge = True
while merge:
merge = False
for i in range(len(copy) - 1):
first, second = copy[i], copy[i + 1]
key = first[1], second[1] # Tuple of tags e.g. ('NN', 'JJ')
value = cfg.get(key, None)
if value:
merge = True
copy.pop(i)
copy.pop(i)
match = '{0} {1}'.format(first[0], second[0])
pos = value
copy.insert(i, (match, pos))
break
match = any([t[1] in ('NNP', 'NNI') for t in copy])
return match

View File

@@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
"""Various parser implementations.
.. versionadded:: 0.6.0
"""
from __future__ import absolute_import
from textblob.en import parse as pattern_parse
from textblob.base import BaseParser
class PatternParser(BaseParser):
    """Parser that uses the implementation in Tom de Smedt's pattern library.
    http://www.clips.ua.ac.be/pages/pattern-en#parser
    """

    def parse(self, text):
        """Parse *text* by delegating to ``textblob.en.parse``."""
        result = pattern_parse(text)
        return result

View File

@@ -0,0 +1,97 @@
# -*- coding: utf-8 -*-
"""Sentiment analysis implementations.
.. versionadded:: 0.5.0
"""
from __future__ import absolute_import
from collections import namedtuple
import nltk
from textblob.en import sentiment as pattern_sentiment
from textblob.tokenizers import word_tokenize
from textblob.decorators import requires_nltk_corpus
from textblob.base import BaseSentimentAnalyzer, DISCRETE, CONTINUOUS
class PatternAnalyzer(BaseSentimentAnalyzer):
    """Sentiment analyzer that uses the same implementation as the
    pattern library. Returns results as a named tuple of the form:

    ``Sentiment(polarity, subjectivity, [assessments])``

    where [assessments] is a list of the assessed tokens and their
    polarity and subjectivity scores
    """

    kind = CONTINUOUS
    # This is only here for backwards-compatibility.
    # The return type is actually determined upon calling analyze()
    RETURN_TYPE = namedtuple('Sentiment', ['polarity', 'subjectivity'])

    def analyze(self, text, keep_assessments=False):
        """Return the sentiment as a named tuple of the form:
        ``Sentiment(polarity, subjectivity, [assessments])``.

        :param text: The string to analyze.
        :param keep_assessments: If True, include the per-token assessments
            in the returned tuple.
        """
        #: Return type declaration
        if keep_assessments:
            Sentiment = namedtuple('Sentiment', ['polarity', 'subjectivity', 'assessments'])
            # Run the pattern analyzer once and reuse the result; the previous
            # implementation called pattern_sentiment(text) twice, doing the
            # full sentiment computation two times per call.
            scored = pattern_sentiment(text)
            polarity, subjectivity = scored
            return Sentiment(polarity, subjectivity, scored.assessments)
        else:
            Sentiment = namedtuple('Sentiment', ['polarity', 'subjectivity'])
            return Sentiment(*pattern_sentiment(text))
def _default_feature_extractor(words):
"""Default feature extractor for the NaiveBayesAnalyzer."""
return dict(((word, True) for word in words))
class NaiveBayesAnalyzer(BaseSentimentAnalyzer):
    """Naive Bayes analyzer that is trained on a dataset of movie reviews.
    Returns results as a named tuple of the form:

    ``Sentiment(classification, p_pos, p_neg)``

    :param callable feature_extractor: Function that returns a dictionary of
        features, given a list of words.
    """

    kind = DISCRETE
    #: Return type declaration
    RETURN_TYPE = namedtuple('Sentiment', ['classification', 'p_pos', 'p_neg'])

    def __init__(self, feature_extractor=_default_feature_extractor):
        super(NaiveBayesAnalyzer, self).__init__()
        self._classifier = None
        self.feature_extractor = feature_extractor

    @requires_nltk_corpus
    def train(self):
        """Train the Naive Bayes classifier on the movie review corpus."""
        super(NaiveBayesAnalyzer, self).train()
        reviews = nltk.corpus.movie_reviews
        # Build (features, label) pairs: negative reviews first, then positive.
        labeled_feats = []
        for label in ('neg', 'pos'):
            for fileid in reviews.fileids(label):
                words = reviews.words(fileids=[fileid])
                labeled_feats.append((self.feature_extractor(words), label))
        self._classifier = nltk.classify.NaiveBayesClassifier.train(labeled_feats)

    def analyze(self, text):
        """Return the sentiment as a named tuple of the form:
        ``Sentiment(classification, p_pos, p_neg)``
        """
        # Lazily train the classifier
        super(NaiveBayesAnalyzer, self).analyze(text)
        # Lower-case tokens of length >= 3 become the feature words.
        words = (t.lower()
                 for t in word_tokenize(text, include_punc=False)
                 if len(t) >= 3)
        feats = self.feature_extractor(words)
        prob_dist = self._classifier.prob_classify(feats)
        return self.RETURN_TYPE(
            classification=prob_dist.max(),
            p_pos=prob_dist.prob('pos'),
            p_neg=prob_dist.prob('neg'),
        )

View File

@@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
"""Parts-of-speech tagger implementations."""
from __future__ import absolute_import
import nltk
import textblob.compat
import textblob as tb
from textblob.en import tag as pattern_tag
from textblob.decorators import requires_nltk_corpus
from textblob.base import BaseTagger
class PatternTagger(BaseTagger):
    """Tagger that uses the implementation in
    Tom de Smedt's pattern library
    (http://www.clips.ua.ac.be/pattern).
    """

    def tag(self, text, tokenize=True):
        """Tag a string or BaseBlob."""
        if isinstance(text, textblob.compat.text_type):
            raw = text
        else:
            # A blob-like object was passed in; tag its raw string.
            raw = text.raw
        return pattern_tag(raw, tokenize)
class NLTKTagger(BaseTagger):
    """Tagger that uses NLTK's standard TreeBank tagger.
    NOTE: Requires numpy. Not yet supported with PyPy.
    """
    @requires_nltk_corpus
    def tag(self, text):
        """Tag a string or BaseBlob."""
        # Plain strings are wrapped in a TextBlob so that .tokens is available.
        if isinstance(text, textblob.compat.text_type):
            text = tb.TextBlob(text)
        return nltk.tag.pos_tag(text.tokens)