Initial commit

This commit is contained in:
2026-02-01 09:31:38 +01:00
commit e02db93960
4396 changed files with 1511612 additions and 0 deletions

View File

@@ -0,0 +1,16 @@
import os
from .blob import TextBlob, Word, Sentence, Blobber, WordList

# Package metadata.
__version__ = '0.17.1'
__license__ = 'MIT'
__author__ = 'Steven Loria'

# Absolute path to the directory this package is installed in.
PACKAGE_DIR = os.path.dirname(os.path.abspath(__file__))

# Public API re-exported at the package level.
__all__ = [
    'TextBlob',
    'Word',
    'Sentence',
    'Blobber',
    'WordList',
]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,107 @@
# -*- coding: utf-8 -*-
"""Abstract base classes for models (taggers, noun phrase extractors, etc.)
which define the interface for descendant classes.
.. versionchanged:: 0.7.0
All base classes are defined in the same module, ``textblob.base``.
"""
from __future__ import absolute_import
from abc import ABCMeta, abstractmethod
import nltk
from textblob.compat import with_metaclass
##### POS TAGGERS #####
class BaseTagger(with_metaclass(ABCMeta)):
    """Base class for all part-of-speech taggers.

    Concrete taggers must provide a ``tag()`` method.
    """

    @abstractmethod
    def tag(self, text, tokenize=True):
        """Return a list of ``(word, tag)`` tuples for the given text
        or BaseBlob instance.
        """
        pass
##### NOUN PHRASE EXTRACTORS #####
class BaseNPExtractor(with_metaclass(ABCMeta)):
    """Base class for all noun-phrase extractors.

    Concrete extractors must provide an ``extract(text)`` method that
    returns a list of noun phrases as strings.
    """

    @abstractmethod
    def extract(self, text):
        """Return a list of noun phrases (strings) for a body of text."""
        pass
##### TOKENIZERS #####
class BaseTokenizer(with_metaclass(ABCMeta), nltk.tokenize.api.TokenizerI):
    """Abstract base class from which all Tokenizer classes inherit.

    Descendant classes must implement a ``tokenize(text)`` method
    that returns a list of tokens as strings.
    """

    @abstractmethod
    def tokenize(self, text):
        """Return a list of tokens (strings) for a body of text.

        :rtype: list
        """
        return

    def itokenize(self, text, *args, **kwargs):
        """Return a generator that yields tokens "on-demand" instead of
        materializing the full token list up front.

        .. versionadded:: 0.6.0

        :rtype: generator
        """
        return (t for t in self.tokenize(text, *args, **kwargs))
##### SENTIMENT ANALYZERS ####
# Kinds of sentiment-analyzer output: discrete (e.g. class labels) vs.
# continuous (e.g. a float score).
DISCRETE = 'ds'
CONTINUOUS = 'co'
class BaseSentimentAnalyzer(with_metaclass(ABCMeta)):
    """Abstract base class from which all sentiment analyzers inherit.

    Descendant classes must implement an ``analyze(text)`` method that
    returns the results of the analysis.
    """

    #: Whether this analyzer produces discrete or continuous output.
    kind = DISCRETE

    def __init__(self):
        self._trained = False

    def train(self):
        # Train me
        self._trained = True

    @abstractmethod
    def analyze(self, text):
        """Return the result of analysis. Typically returns either a
        tuple, float, or dictionary.
        """
        # Lazily train the classifier
        if not self._trained:
            self.train()
        # Analyze text
        return None
##### PARSERS #####
class BaseParser(with_metaclass(ABCMeta)):
    """Base class for all parsers.

    Concrete parsers must provide a ``parse()`` method.
    """

    @abstractmethod
    def parse(self, text):
        """Parses the text."""
        pass

View File

@@ -0,0 +1,824 @@
# -*- coding: utf-8 -*-
"""Wrappers for various units of text, including the main
:class:`TextBlob <textblob.blob.TextBlob>`, :class:`Word <textblob.blob.Word>`,
and :class:`WordList <textblob.blob.WordList>` classes.
Example usage: ::
>>> from textblob import TextBlob
>>> b = TextBlob("Simple is better than complex.")
>>> b.tags
[(u'Simple', u'NN'), (u'is', u'VBZ'), (u'better', u'JJR'), (u'than', u'IN'), (u'complex', u'NN')]
>>> b.noun_phrases
WordList([u'simple'])
>>> b.words
WordList([u'Simple', u'is', u'better', u'than', u'complex'])
>>> b.sentiment
(0.06666666666666667, 0.41904761904761906)
>>> b.words[0].synsets()[0]
Synset('simple.n.01')
.. versionchanged:: 0.8.0
These classes are now imported from ``textblob`` rather than ``text.blob``.
"""
from __future__ import unicode_literals, absolute_import
import sys
import json
import warnings
from collections import defaultdict
import nltk
from textblob.decorators import cached_property, requires_nltk_corpus
from textblob.utils import lowerstrip, PUNCTUATION_REGEX
from textblob.inflect import singularize as _singularize, pluralize as _pluralize
from textblob.mixins import BlobComparableMixin, StringlikeMixin
from textblob.compat import unicode, basestring
from textblob.base import (BaseNPExtractor, BaseTagger, BaseTokenizer,
BaseSentimentAnalyzer, BaseParser)
from textblob.np_extractors import FastNPExtractor
from textblob.taggers import NLTKTagger
from textblob.tokenizers import WordTokenizer, sent_tokenize, word_tokenize
from textblob.sentiments import PatternAnalyzer
from textblob.parsers import PatternParser
from textblob.translate import Translator
from textblob.en import suggest
# Wordnet interface
# NOTE: textblob.wordnet is not imported so that the wordnet corpus can be lazy-loaded
_wordnet = nltk.corpus.wordnet
def _penn_to_wordnet(tag):
    """Map a Penn corpus tag to the corresponding Wordnet tag.

    Returns ``None`` when the tag has no Wordnet equivalent.
    """
    conversions = (
        (("NN", "NNS", "NNP", "NNPS"), _wordnet.NOUN),
        (("JJ", "JJR", "JJS"), _wordnet.ADJ),
        (("VB", "VBD", "VBG", "VBN", "VBP", "VBZ"), _wordnet.VERB),
        (("RB", "RBR", "RBS"), _wordnet.ADV),
    )
    for penn_tags, wordnet_tag in conversions:
        if tag in penn_tags:
            return wordnet_tag
    return None
class Word(unicode):
    """A simple word representation. Includes methods for inflection,
    translation, and WordNet integration.
    """

    translator = Translator()

    def __new__(cls, string, pos_tag=None):
        """Return a new instance of the class. It is necessary to override
        this method in order to handle the extra pos_tag argument in the
        constructor.
        """
        return super(Word, cls).__new__(cls, string)

    def __init__(self, string, pos_tag=None):
        self.string = string
        self.pos_tag = pos_tag

    def __repr__(self):
        return repr(self.string)

    def __str__(self):
        return self.string

    def singularize(self):
        """Return the singular version of the word as a string."""
        return Word(_singularize(self.string))

    def pluralize(self):
        '''Return the plural version of the word as a string.'''
        return Word(_pluralize(self.string))

    def translate(self, from_lang='auto', to="en"):
        '''Translate the word to another language using Google's
        Translate API.

        .. deprecated:: 0.16.0
            Use the official Google Translate API instead.
        .. versionadded:: 0.5.0
        '''
        warnings.warn(
            'Word.translate is deprecated and will be removed in a future release. '
            'Use the official Google Translate API instead.',
            DeprecationWarning
        )
        return self.translator.translate(self.string,
                                         from_lang=from_lang, to_lang=to)

    def detect_language(self):
        '''Detect the word's language using Google's Translate API.

        .. deprecated:: 0.16.0
            Use the official Google Translate API instead.
        .. versionadded:: 0.5.0
        '''
        warnings.warn(
            'Word.detect_language is deprecated and will be removed in a future release. '
            'Use the official Google Translate API instead.',
            DeprecationWarning
        )
        return self.translator.detect(self.string)

    def spellcheck(self):
        '''Return a list of (word, confidence) tuples of spelling corrections.

        Based on: Peter Norvig, "How to Write a Spelling Corrector"
        (http://norvig.com/spell-correct.html) as implemented in the pattern
        library.

        .. versionadded:: 0.6.0
        '''
        return suggest(self.string)

    def correct(self):
        '''Correct the spelling of the word. Returns the word with the highest
        confidence using the spelling corrector.

        .. versionadded:: 0.6.0
        '''
        return Word(self.spellcheck()[0][0])

    @cached_property
    @requires_nltk_corpus
    def lemma(self):
        """Return the lemma of this word using Wordnet's morphy function.
        """
        return self.lemmatize(pos=self.pos_tag)

    @requires_nltk_corpus
    def lemmatize(self, pos=None):
        """Return the lemma for a word using WordNet's morphy function.

        :param pos: Part of speech to filter upon. If `None`, defaults to
            ``_wordnet.NOUN``.

        .. versionadded:: 0.8.1
        """
        if pos is None:
            tag = _wordnet.NOUN
        elif pos in _wordnet._FILEMAP:
            # ``pos`` is already a Wordnet tag (e.g. wordnet.VERB).
            tag = pos
        else:
            tag = _penn_to_wordnet(pos)
        lemmatizer = nltk.stem.WordNetLemmatizer()
        return lemmatizer.lemmatize(self.string, tag)

    # Shared stemmer instances, used as defaults/options for ``stem()``.
    PorterStemmer = nltk.stem.porter.PorterStemmer()
    LancasterStemmer = nltk.stem.lancaster.LancasterStemmer()
    SnowballStemmer = nltk.stem.snowball.SnowballStemmer("english")

    def stem(self, stemmer=PorterStemmer):
        """Stem a word using various NLTK stemmers. (Default: Porter Stemmer)

        .. versionadded:: 0.12.0
        """
        return stemmer.stem(self.string)

    @cached_property
    def synsets(self):
        """The list of Synset objects for this Word.

        :rtype: list of Synsets

        .. versionadded:: 0.7.0
        """
        return self.get_synsets(pos=None)

    @cached_property
    def definitions(self):
        """The list of definitions for this word. Each definition corresponds
        to a synset.

        .. versionadded:: 0.7.0
        """
        return self.define(pos=None)

    def get_synsets(self, pos=None):
        """Return a list of Synset objects for this word.

        :param pos: A part-of-speech tag to filter upon. If ``None``, all
            synsets for all parts of speech will be loaded.

        :rtype: list of Synsets

        .. versionadded:: 0.7.0
        """
        return _wordnet.synsets(self.string, pos)

    def define(self, pos=None):
        """Return a list of definitions for this word. Each definition
        corresponds to a synset for this word.

        :param pos: A part-of-speech tag to filter upon. If ``None``, definitions
            for all parts of speech will be loaded.

        :rtype: List of strings

        .. versionadded:: 0.7.0
        """
        return [syn.definition() for syn in self.get_synsets(pos=pos)]
class WordList(list):
    """A list-like collection of :class:`Word <Word>` objects."""

    def __init__(self, collection):
        """Initialize a WordList from a collection of strings, coercing
        each element to a :class:`Word <Word>`.
        """
        super(WordList, self).__init__(Word(w) for w in collection)

    def __str__(self):
        """Returns a string representation for printing."""
        return super(WordList, self).__repr__()

    def __repr__(self):
        """Returns a string representation for debugging."""
        return '{cls}({lst})'.format(cls=self.__class__.__name__,
                                     lst=super(WordList, self).__repr__())

    def __getitem__(self, key):
        """Return the item at the given index; slices come back as a new
        instance of the same class.
        """
        result = super(WordList, self).__getitem__(key)
        return self.__class__(result) if isinstance(key, slice) else result

    def __getslice__(self, i, j):
        # This is included for Python 2.* compatibility
        return self.__class__(super(WordList, self).__getslice__(i, j))

    def __setitem__(self, index, obj):
        """Places object at given index, replacing existing item. If the object
        is a string, inserts a :class:`Word <Word>` object.
        """
        value = Word(obj) if isinstance(obj, basestring) else obj
        super(WordList, self).__setitem__(index, value)

    def count(self, strg, case_sensitive=False, *args, **kwargs):
        """Get the count of a word or phrase `s` within this WordList.

        :param strg: The string to count.
        :param case_sensitive: A boolean, whether or not the search is case-sensitive.
        """
        if case_sensitive:
            return super(WordList, self).count(strg, *args, **kwargs)
        # Compare in a case-insensitive fashion by lowering both sides.
        lowered = [word.lower() for word in self]
        return lowered.count(strg.lower(), *args, **kwargs)

    def append(self, obj):
        """Append an object to end. If the object is a string, appends a
        :class:`Word <Word>` object.
        """
        value = Word(obj) if isinstance(obj, basestring) else obj
        super(WordList, self).append(value)

    def extend(self, iterable):
        """Extend WordList by appending elements from ``iterable``. If an element
        is a string, appends a :class:`Word <Word>` object.
        """
        for element in iterable:
            self.append(element)

    def upper(self):
        """Return a new WordList with each word upper-cased."""
        return self.__class__(word.upper() for word in self)

    def lower(self):
        """Return a new WordList with each word lower-cased."""
        return self.__class__(word.lower() for word in self)

    def singularize(self):
        """Return the single version of each word in this WordList."""
        return self.__class__(word.singularize() for word in self)

    def pluralize(self):
        """Return the plural version of each word in this WordList."""
        return self.__class__(word.pluralize() for word in self)

    def lemmatize(self):
        """Return the lemma of each word in this WordList."""
        return self.__class__(word.lemmatize() for word in self)

    def stem(self, *args, **kwargs):
        """Return the stem for each word in this WordList."""
        return self.__class__(word.stem(*args, **kwargs) for word in self)
def _validated_param(obj, name, base_class, default, base_class_name=None):
"""Validates a parameter passed to __init__. Makes sure that obj is
the correct class. Return obj if it's not None or falls back to default
:param obj: The object passed in.
:param name: The name of the parameter.
:param base_class: The class that obj must inherit from.
:param default: The default object to fall back upon if obj is None.
"""
base_class_name = base_class_name if base_class_name else base_class.__name__
if obj is not None and not isinstance(obj, base_class):
raise ValueError('{name} must be an instance of {cls}'
.format(name=name, cls=base_class_name))
return obj or default
def _initialize_models(obj, tokenizer, pos_tagger,
                       np_extractor, analyzer, parser, classifier):
    """Shared model setup used by both the BaseBlob and Blobber classes.

    Each model argument is validated against its base class and replaced
    by the corresponding ``BaseBlob`` class default when ``None``.
    """
    # A tokenizer may be either a textblob tokenizer or an NLTK tokenizer.
    obj.tokenizer = _validated_param(
        tokenizer, "tokenizer",
        base_class=(BaseTokenizer, nltk.tokenize.api.TokenizerI),
        default=BaseBlob.tokenizer,
        base_class_name="BaseTokenizer")
    obj.np_extractor = _validated_param(
        np_extractor, "np_extractor",
        base_class=BaseNPExtractor,
        default=BaseBlob.np_extractor)
    obj.pos_tagger = _validated_param(
        pos_tagger, "pos_tagger", BaseTagger, BaseBlob.pos_tagger)
    obj.analyzer = _validated_param(
        analyzer, "analyzer", BaseSentimentAnalyzer, BaseBlob.analyzer)
    obj.parser = _validated_param(parser, "parser", BaseParser, BaseBlob.parser)
    # The classifier is optional and unvalidated (any object with a
    # ``classify`` method works).
    obj.classifier = classifier
class BaseBlob(StringlikeMixin, BlobComparableMixin):
    """An abstract base class that all textblob classes will inherit from.
    Includes words, POS tag, NP, and word count properties. Also includes
    basic dunder and string methods for making objects like Python strings.

    :param text: A string.
    :param tokenizer: (optional) A tokenizer instance. If ``None``,
        defaults to :class:`WordTokenizer() <textblob.tokenizers.WordTokenizer>`.
    :param np_extractor: (optional) An NPExtractor instance. If ``None``,
        defaults to :class:`FastNPExtractor() <textblob.en.np_extractors.FastNPExtractor>`.
    :param pos_tagger: (optional) A Tagger instance. If ``None``,
        defaults to :class:`NLTKTagger <textblob.en.taggers.NLTKTagger>`.
    :param analyzer: (optional) A sentiment analyzer. If ``None``,
        defaults to :class:`PatternAnalyzer <textblob.en.sentiments.PatternAnalyzer>`.
    :param parser: A parser. If ``None``, defaults to
        :class:`PatternParser <textblob.en.parsers.PatternParser>`.
    :param classifier: A classifier.

    .. versionchanged:: 0.6.0
        ``clean_html`` parameter deprecated, as it was in NLTK.
    """

    # Default models, shared by all blobs unless overridden per-instance.
    np_extractor = FastNPExtractor()
    pos_tagger = NLTKTagger()
    tokenizer = WordTokenizer()
    translator = Translator()
    analyzer = PatternAnalyzer()
    parser = PatternParser()

    def __init__(self, text, tokenizer=None,
                 pos_tagger=None, np_extractor=None, analyzer=None,
                 parser=None, classifier=None, clean_html=False):
        if not isinstance(text, basestring):
            raise TypeError('The `text` argument passed to `__init__(text)` '
                            'must be a string, not {0}'.format(type(text)))
        if clean_html:
            raise NotImplementedError("clean_html has been deprecated. "
                                      "To remove HTML markup, use BeautifulSoup's "
                                      "get_text() function")
        self.raw = self.string = text
        self.stripped = lowerstrip(self.raw, all=True)
        _initialize_models(self, tokenizer, pos_tagger, np_extractor, analyzer,
                           parser, classifier)

    @cached_property
    def words(self):
        """Return a list of word tokens. This excludes punctuation characters.
        If you want to include punctuation characters, access the ``tokens``
        property.

        :returns: A :class:`WordList <WordList>` of word tokens.
        """
        return WordList(word_tokenize(self.raw, include_punc=False))

    @cached_property
    def tokens(self):
        """Return a list of tokens, using this blob's tokenizer object
        (defaults to :class:`WordTokenizer <textblob.tokenizers.WordTokenizer>`).
        """
        return WordList(self.tokenizer.tokenize(self.raw))

    def tokenize(self, tokenizer=None):
        """Return a list of tokens, using ``tokenizer``.

        :param tokenizer: (optional) A tokenizer object. If None, defaults to
            this blob's default tokenizer.
        """
        t = tokenizer if tokenizer is not None else self.tokenizer
        return WordList(t.tokenize(self.raw))

    def parse(self, parser=None):
        """Parse the text.

        :param parser: (optional) A parser instance. If ``None``, defaults to
            this blob's default parser.

        .. versionadded:: 0.6.0
        """
        p = parser if parser is not None else self.parser
        return p.parse(self.raw)

    def classify(self):
        """Classify the blob using the blob's ``classifier``."""
        if self.classifier is None:
            raise NameError("This blob has no classifier. Train one first!")
        return self.classifier.classify(self.raw)

    @cached_property
    def sentiment(self):
        """Return a tuple of form (polarity, subjectivity) where polarity
        is a float within the range [-1.0, 1.0] and subjectivity is a float
        within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is
        very subjective.

        :rtype: namedtuple of the form ``Sentiment(polarity, subjectivity)``
        """
        return self.analyzer.analyze(self.raw)

    @cached_property
    def sentiment_assessments(self):
        """Return a tuple of form (polarity, subjectivity, assessments) where
        polarity is a float within the range [-1.0, 1.0], subjectivity is a
        float within the range [0.0, 1.0] where 0.0 is very objective and 1.0
        is very subjective, and assessments is a list of polarity and
        subjectivity scores for the assessed tokens.

        :rtype: namedtuple of the form ``Sentiment(polarity, subjectivity,
            assessments)``
        """
        return self.analyzer.analyze(self.raw, keep_assessments=True)

    @cached_property
    def polarity(self):
        """Return the polarity score as a float within the range [-1.0, 1.0]

        :rtype: float
        """
        return PatternAnalyzer().analyze(self.raw)[0]

    @cached_property
    def subjectivity(self):
        """Return the subjectivity score as a float within the range [0.0, 1.0]
        where 0.0 is very objective and 1.0 is very subjective.

        :rtype: float
        """
        return PatternAnalyzer().analyze(self.raw)[1]

    @cached_property
    def noun_phrases(self):
        """Returns a list of noun phrases for this blob."""
        return WordList([phrase.strip().lower()
                         for phrase in self.np_extractor.extract(self.raw)
                         if len(phrase) > 1])

    @cached_property
    def pos_tags(self):
        """Returns a list of tuples of the form (word, POS tag).

        Example:
        ::

            [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'),
            ('Thursday', 'NNP'), ('morning', 'NN')]

        :rtype: list of tuples
        """
        if isinstance(self, TextBlob):
            return [val for sublist in [s.pos_tags for s in self.sentences] for val in sublist]
        else:
            return [(Word(unicode(word), pos_tag=t), unicode(t))
                    for word, t in self.pos_tagger.tag(self)
                    if not PUNCTUATION_REGEX.match(unicode(t))]

    tags = pos_tags

    @cached_property
    def word_counts(self):
        """Dictionary of word frequencies in this text.
        """
        counts = defaultdict(int)
        stripped_words = [lowerstrip(word) for word in self.words]
        for word in stripped_words:
            counts[word] += 1
        return counts

    @cached_property
    def np_counts(self):
        """Dictionary of noun phrase frequencies in this text.
        """
        counts = defaultdict(int)
        for phrase in self.noun_phrases:
            counts[phrase] += 1
        return counts

    def ngrams(self, n=3):
        """Return a list of n-grams (tuples of n successive words) for this
        blob.

        :rtype: List of :class:`WordLists <WordList>`
        """
        if n <= 0:
            return []
        grams = [WordList(self.words[i:i + n])
                 for i in range(len(self.words) - n + 1)]
        return grams

    def translate(self, from_lang="auto", to="en"):
        """Translate the blob to another language.
        Uses the Google Translate API. Returns a new TextBlob.

        Requires an internet connection.

        Usage:
        ::

            >>> b = TextBlob("Simple is better than complex")
            >>> b.translate(to="es")
            TextBlob('Lo simple es mejor que complejo')

        Language code reference:
            https://developers.google.com/translate/v2/using_rest#language-params

        .. deprecated:: 0.16.0
            Use the official Google Translate API instead.
        .. versionadded:: 0.5.0.

        :param str from_lang: Language to translate from. If ``None``, will attempt
            to detect the language.
        :param str to: Language to translate to.
        :rtype: :class:`BaseBlob <BaseBlob>`
        """
        warnings.warn(
            'TextBlob.translate is deprecated and will be removed in a future release. '
            'Use the official Google Translate API instead.',
            DeprecationWarning
        )
        return self.__class__(self.translator.translate(self.raw,
                              from_lang=from_lang, to_lang=to))

    def detect_language(self):
        """Detect the blob's language using the Google Translate API.

        Requires an internet connection.

        Usage:
        ::

            >>> b = TextBlob("bonjour")
            >>> b.detect_language()
            u'fr'

        Language code reference:
            https://developers.google.com/translate/v2/using_rest#language-params

        .. deprecated:: 0.16.0
            Use the official Google Translate API instead.
        .. versionadded:: 0.5.0

        :rtype: str
        """
        # NOTE: the warning previously named a nonexistent method
        # ('TextBlob.detext_translate'); fixed to name this method.
        warnings.warn(
            'TextBlob.detect_language is deprecated and will be removed in a future release. '
            'Use the official Google Translate API instead.',
            DeprecationWarning
        )
        return self.translator.detect(self.raw)

    def correct(self):
        """Attempt to correct the spelling of a blob.

        .. versionadded:: 0.6.0

        :rtype: :class:`BaseBlob <BaseBlob>`
        """
        # regex matches: word or punctuation or whitespace
        tokens = nltk.tokenize.regexp_tokenize(self.raw, r"\w+|[^\w\s]|\s")
        corrected = (Word(w).correct() for w in tokens)
        ret = ''.join(corrected)
        return self.__class__(ret)

    def _cmpkey(self):
        """Key used by ComparableMixin to implement all rich comparison
        operators.
        """
        return self.raw

    def _strkey(self):
        """Key used by StringlikeMixin to implement string methods."""
        return self.raw

    def __hash__(self):
        return hash(self._cmpkey())

    def __add__(self, other):
        '''Concatenates two text objects the same way Python strings are
        concatenated.

        Arguments:
        - `other`: a string or a text object
        '''
        if isinstance(other, basestring):
            return self.__class__(self.raw + other)
        elif isinstance(other, BaseBlob):
            return self.__class__(self.raw + other.raw)
        else:
            raise TypeError('Operands must be either strings or {0} objects'
                            .format(self.__class__.__name__))

    def split(self, sep=None, maxsplit=sys.maxsize):
        """Behaves like the built-in str.split() except returns a
        WordList.

        :rtype: :class:`WordList <WordList>`
        """
        return WordList(self._strkey().split(sep, maxsplit))
class TextBlob(BaseBlob):
    """A general text block, meant for larger bodies of text (esp. those
    containing sentences). Inherits from :class:`BaseBlob <BaseBlob>`.

    :param str text: A string.
    :param tokenizer: (optional) A tokenizer instance. If ``None``, defaults to
        :class:`WordTokenizer() <textblob.tokenizers.WordTokenizer>`.
    :param np_extractor: (optional) An NPExtractor instance. If ``None``,
        defaults to :class:`FastNPExtractor() <textblob.en.np_extractors.FastNPExtractor>`.
    :param pos_tagger: (optional) A Tagger instance. If ``None``, defaults to
        :class:`NLTKTagger <textblob.en.taggers.NLTKTagger>`.
    :param analyzer: (optional) A sentiment analyzer. If ``None``, defaults to
        :class:`PatternAnalyzer <textblob.en.sentiments.PatternAnalyzer>`.
    :param classifier: (optional) A classifier.
    """

    @cached_property
    def sentences(self):
        """Return list of :class:`Sentence <Sentence>` objects."""
        return self._create_sentence_objects()

    @cached_property
    def words(self):
        """Return a list of word tokens. This excludes punctuation characters.
        If you want to include punctuation characters, access the ``tokens``
        property.

        :returns: A :class:`WordList <WordList>` of word tokens.
        """
        return WordList(word_tokenize(self.raw, include_punc=False))

    @property
    def raw_sentences(self):
        """List of strings, the raw sentences in the blob."""
        return [s.raw for s in self.sentences]

    @property
    def serialized(self):
        """Returns a list of each sentence's dict representation."""
        return [s.dict for s in self.sentences]

    def to_json(self, *args, **kwargs):
        '''Return a json representation (str) of this blob.
        Takes the same arguments as json.dumps.

        .. versionadded:: 0.5.1
        '''
        return json.dumps(self.serialized, *args, **kwargs)

    @property
    def json(self):
        '''The json representation of this blob.

        .. versionchanged:: 0.5.1
            Made ``json`` a property instead of a method to restore backwards
            compatibility that was broken after version 0.4.0.
        '''
        return self.to_json()

    def _create_sentence_objects(self):
        '''Build and return the list of Sentence objects for the raw text,
        recording each sentence's character offsets within the blob.
        '''
        sentence_objects = []
        search_from = 0  # Character offset where the next index() search begins.
        for raw_sentence in sent_tokenize(self.raw):
            # Locate this sentence within the blob text.
            start_index = self.raw.index(raw_sentence, search_from)
            search_from += len(raw_sentence)
            end_index = start_index + len(raw_sentence)
            # Sentences share the same models as their parent blob.
            sentence_objects.append(Sentence(
                raw_sentence, start_index=start_index, end_index=end_index,
                tokenizer=self.tokenizer, np_extractor=self.np_extractor,
                pos_tagger=self.pos_tagger, analyzer=self.analyzer,
                parser=self.parser, classifier=self.classifier))
        return sentence_objects
class Sentence(BaseBlob):
    """A sentence within a TextBlob. Inherits from :class:`BaseBlob <BaseBlob>`.

    :param sentence: A string, the raw sentence.
    :param start_index: An int, the index where this sentence begins
        in a TextBlob. If not given, defaults to 0.
    :param end_index: An int, the index where this sentence ends in
        a TextBlob. If not given, defaults to the
        length of the sentence - 1.
    """

    def __init__(self, sentence, start_index=0, end_index=None, *args, **kwargs):
        super(Sentence, self).__init__(sentence, *args, **kwargs)
        #: The start index within a TextBlob
        self.start = self.start_index = start_index
        #: The end index within a TextBlob
        self.end = self.end_index = end_index or len(sentence) - 1

    @property
    def dict(self):
        '''The dict representation of this sentence.'''
        return dict(
            raw=self.raw,
            start_index=self.start_index,
            end_index=self.end_index,
            stripped=self.stripped,
            noun_phrases=self.noun_phrases,
            polarity=self.polarity,
            subjectivity=self.subjectivity,
        )
class Blobber(object):
    """A factory for TextBlobs that all share the same tagger,
    tokenizer, parser, classifier, and np_extractor.

    Usage:

        >>> from textblob import Blobber
        >>> from textblob.taggers import NLTKTagger
        >>> from textblob.tokenizers import SentenceTokenizer
        >>> tb = Blobber(pos_tagger=NLTKTagger(), tokenizer=SentenceTokenizer())
        >>> blob1 = tb("This is one blob.")
        >>> blob2 = tb("This blob has the same tagger and tokenizer.")
        >>> blob1.pos_tagger is blob2.pos_tagger
        True

    :param tokenizer: (optional) A tokenizer instance. If ``None``,
        defaults to :class:`WordTokenizer() <textblob.tokenizers.WordTokenizer>`.
    :param np_extractor: (optional) An NPExtractor instance. If ``None``,
        defaults to :class:`FastNPExtractor() <textblob.en.np_extractors.FastNPExtractor>`.
    :param pos_tagger: (optional) A Tagger instance. If ``None``,
        defaults to :class:`NLTKTagger <textblob.en.taggers.NLTKTagger>`.
    :param analyzer: (optional) A sentiment analyzer. If ``None``,
        defaults to :class:`PatternAnalyzer <textblob.en.sentiments.PatternAnalyzer>`.
    :param parser: A parser. If ``None``, defaults to
        :class:`PatternParser <textblob.en.parsers.PatternParser>`.
    :param classifier: A classifier.

    .. versionadded:: 0.4.0
    """

    # Default models, shared across all blobs produced by this factory.
    np_extractor = FastNPExtractor()
    pos_tagger = NLTKTagger()
    tokenizer = WordTokenizer()
    analyzer = PatternAnalyzer()
    parser = PatternParser()

    def __init__(self, tokenizer=None, pos_tagger=None, np_extractor=None,
                 analyzer=None, parser=None, classifier=None):
        _initialize_models(self, tokenizer, pos_tagger, np_extractor, analyzer,
                           parser, classifier)

    def __call__(self, text):
        """Return a new TextBlob object with this Blobber's ``np_extractor``,
        ``pos_tagger``, ``tokenizer``, ``analyzer``, and ``classifier``.

        :returns: A new :class:`TextBlob <TextBlob>`.
        """
        return TextBlob(text, tokenizer=self.tokenizer, pos_tagger=self.pos_tagger,
                        np_extractor=self.np_extractor, analyzer=self.analyzer,
                        parser=self.parser,
                        classifier=self.classifier)

    def __repr__(self):
        if self.classifier:
            classifier_name = self.classifier.__class__.__name__ + "()"
        else:
            classifier_name = "None"
        template = ("Blobber(tokenizer={0}(), pos_tagger={1}(), "
                    "np_extractor={2}(), analyzer={3}(), parser={4}(), classifier={5})")
        return template.format(self.tokenizer.__class__.__name__,
                               self.pos_tagger.__class__.__name__,
                               self.np_extractor.__class__.__name__,
                               self.analyzer.__class__.__name__,
                               self.parser.__class__.__name__,
                               classifier_name)

    __str__ = __repr__

View File

@@ -0,0 +1,503 @@
# -*- coding: utf-8 -*-
"""Various classifier implementations. Also includes basic feature extractor
methods.
Example Usage:
::
>>> from textblob import TextBlob
>>> from textblob.classifiers import NaiveBayesClassifier
>>> train = [
... ('I love this sandwich.', 'pos'),
... ('This is an amazing place!', 'pos'),
... ('I feel very good about these beers.', 'pos'),
... ('I do not like this restaurant', 'neg'),
... ('I am tired of this stuff.', 'neg'),
... ("I can't deal with this", 'neg'),
... ("My boss is horrible.", "neg")
... ]
>>> cl = NaiveBayesClassifier(train)
>>> cl.classify("I feel amazing!")
'pos'
>>> blob = TextBlob("The beer is good. But the hangover is horrible.", classifier=cl)
>>> for s in blob.sentences:
... print(s)
... print(s.classify())
...
The beer is good.
pos
But the hangover is horrible.
neg
.. versionadded:: 0.6.0
"""
from __future__ import absolute_import
from itertools import chain
import nltk
from textblob.compat import basestring
from textblob.decorators import cached_property
from textblob.exceptions import FormatError
from textblob.tokenizers import word_tokenize
from textblob.utils import strip_punc, is_filelike
import textblob.formats as formats
### Basic feature extractors ###
def _get_words_from_dataset(dataset):
    """Return a set of all words in a dataset.

    :param dataset: A list of tuples of the form ``(words, label)`` where
        ``words`` is either a string or a list of tokens.
    """
    def tokenize(words):
        # Accept both raw strings and pre-tokenized sequences.
        if isinstance(words, basestring):
            return word_tokenize(words, include_punc=False)
        return words
    return set(chain.from_iterable(tokenize(words) for words, _ in dataset))
def _get_document_tokens(document):
    """Normalize a document (a string or an iterable of tokens) into a
    set of punctuation-stripped tokens.
    """
    if isinstance(document, basestring):
        words = word_tokenize(document, include_punc=False)
    else:
        words = document
    return set(strip_punc(w, all=False) for w in words)
def basic_extractor(document, train_set):
    """A basic document feature extractor that returns a dict indicating
    what words in ``train_set`` are contained in ``document``.

    :param document: The text to extract features from. Can be a string or an iterable.
    :param list train_set: Training data set, a list of tuples of the form
        ``(words, label)`` OR an iterable of strings.
    """
    try:
        el_zero = next(iter(train_set))  # Infer input from first element.
    except StopIteration:
        return {}
    if isinstance(el_zero, basestring):
        # train_set is an iterable of strings.
        word_features = [w for w in chain([el_zero], train_set)]
    else:
        try:
            # Explicit check rather than ``assert`` so the validation still
            # runs under ``python -O`` (asserts are stripped there).
            if not isinstance(el_zero[0], basestring):
                raise TypeError('expected (words, label) tuples')
            word_features = _get_words_from_dataset(chain([el_zero], train_set))
        except Exception:
            raise ValueError('train_set is probably malformed.')
    tokens = _get_document_tokens(document)
    features = dict(((u'contains({0})'.format(word), (word in tokens))
                     for word in word_features))
    return features
def contains_extractor(document):
    """A basic document feature extractor that returns a dict of words that
    the document contains.
    """
    return dict((u'contains({0})'.format(token), True)
                for token in _get_document_tokens(document))
##### CLASSIFIERS #####
class BaseClassifier(object):
    """Abstract classifier class from which all classifers inherit. At a
    minimum, descendant classes must implement a ``classify`` method and have
    a ``classifier`` property.

    :param train_set: The training set, either a list of tuples of the form
        ``(text, classification)`` or a file-like object. ``text`` may be either
        a string or an iterable.
    :param callable feature_extractor: A feature extractor function that takes one or
        two arguments: ``document`` and ``train_set``.
    :param str format: If ``train_set`` is a filename, the file format, e.g.
        ``"csv"`` or ``"json"``. If ``None``, will attempt to detect the
        file format.
    :param kwargs: Additional keyword arguments are passed to the constructor
        of the :class:`Format <textblob.formats.BaseFormat>` class used to
        read the data. Only applies when a file-like object is passed as
        ``train_set``.

    .. versionadded:: 0.6.0
    """

    def __init__(self, train_set, feature_extractor=basic_extractor, format=None, **kwargs):
        self.format_kwargs = kwargs
        self.feature_extractor = feature_extractor
        if is_filelike(train_set):
            self.train_set = self._read_data(train_set, format)
        else:
            # Already an in-memory list of (text, label) tuples.
            self.train_set = train_set
        # Hidden set of every unique word seen during training.
        self._word_set = _get_words_from_dataset(self.train_set)
        self.train_features = None

    def _read_data(self, dataset, format=None):
        """Read a data file and return an iterable usable as testing or
        training data, auto-detecting the format when none is given.
        """
        if format:
            registry = formats.get_registry()
            if format not in registry:
                raise ValueError("'{0}' format not supported.".format(format))
            format_class = registry[format]
        else:
            format_class = formats.detect(dataset)
            if not format_class:
                raise FormatError('Could not automatically detect format for the given '
                                  'data source.')
        return format_class(dataset, **self.format_kwargs).to_iterable()

    @cached_property
    def classifier(self):
        """The classifier object."""
        raise NotImplementedError('Must implement the "classifier" property.')

    def classify(self, text):
        """Classifies a string of text."""
        raise NotImplementedError('Must implement a "classify" method.')

    def train(self, labeled_featureset):
        """Trains the classifier."""
        raise NotImplementedError('Must implement a "train" method.')

    def labels(self):
        """Returns an iterable containing the possible labels."""
        raise NotImplementedError('Must implement a "labels" method.')

    def extract_features(self, text):
        '''Extracts features from a body of text.

        :rtype: dictionary of features
        '''
        # The configured extractor may take (document, train_set) or
        # just (document); try the two-argument form first.
        try:
            return self.feature_extractor(text, self._word_set)
        except (TypeError, AttributeError):
            return self.feature_extractor(text)
class NLTKClassifier(BaseClassifier):
    """An abstract class that wraps around the nltk.classify module.

    Expects that descendant classes include a class variable ``nltk_class``
    which is the class in the nltk.classify module to be wrapped.

    Example: ::

        class MyClassifier(NLTKClassifier):
            nltk_class = nltk.classify.svm.SvmClassifier
    """

    #: The NLTK class to be wrapped. Must be a class within nltk.classify
    nltk_class = None

    def __init__(self, train_set,
                 feature_extractor=basic_extractor, format=None, **kwargs):
        super(NLTKClassifier, self).__init__(train_set, feature_extractor, format, **kwargs)
        # Pre-compute the (feature-dict, label) pairs consumed by train().
        self.train_features = [(self.extract_features(d), c) for d, c in self.train_set]

    def __repr__(self):
        class_name = self.__class__.__name__
        return "<{cls} trained on {n} instances>".format(cls=class_name,
                                                         n=len(self.train_set))

    @cached_property
    def classifier(self):
        """The classifier.

        Training happens lazily on first access. ``cached_property`` is a
        non-data descriptor, so ``train()``/``update()`` can overwrite the
        cached value simply by assigning ``self.classifier``.
        """
        try:
            return self.train()
        except AttributeError:  # nltk_class has not been defined
            raise ValueError("NLTKClassifier must have a nltk_class"
                             " variable that is not None.")

    def train(self, *args, **kwargs):
        """Train the classifier with a labeled feature set and return
        the classifier. Takes the same arguments as the wrapped NLTK class.
        This method is implicitly called when calling ``classify`` or
        ``accuracy`` methods and is included only to allow passing in arguments
        to the ``train`` method of the wrapped NLTK class.

        .. versionadded:: 0.6.2

        :rtype: A classifier
        """
        try:
            # Overwrites the cached ``classifier`` property value (see above).
            self.classifier = self.nltk_class.train(self.train_features,
                                                    *args, **kwargs)
            return self.classifier
        except AttributeError:
            # ``self.nltk_class.train`` fails when nltk_class is None.
            raise ValueError("NLTKClassifier must have a nltk_class"
                             " variable that is not None.")

    def labels(self):
        """Return an iterable of possible labels."""
        return self.classifier.labels()

    def classify(self, text):
        """Classifies the text.

        :param str text: A string of text.
        """
        text_features = self.extract_features(text)
        return self.classifier.classify(text_features)

    def accuracy(self, test_set, format=None):
        """Compute the accuracy on a test set.

        :param test_set: A list of tuples of the form ``(text, label)``, or a
            file pointer.
        :param format: If ``test_set`` is a filename, the file format, e.g.
            ``"csv"`` or ``"json"``. If ``None``, will attempt to detect the
            file format.
        """
        if is_filelike(test_set):
            test_data = self._read_data(test_set, format)
        else:  # test_set is a list of tuples
            test_data = test_set
        # Convert raw (text, label) pairs to (feature-dict, label) pairs.
        test_features = [(self.extract_features(d), c) for d, c in test_data]
        return nltk.classify.accuracy(self.classifier, test_features)

    def update(self, new_data, *args, **kwargs):
        """Update the classifier with new training data and re-trains the
        classifier.

        :param new_data: New data as a list of tuples of the form
            ``(text, label)``.
        """
        self.train_set += new_data
        self._word_set.update(_get_words_from_dataset(new_data))
        # Recompute every feature dict: the vocabulary may have grown,
        # which changes the features of previously seen documents too.
        self.train_features = [(self.extract_features(d), c)
                               for d, c in self.train_set]
        try:
            # Re-train from scratch and overwrite the cached classifier.
            self.classifier = self.nltk_class.train(self.train_features,
                                                    *args, **kwargs)
        except AttributeError:  # Descendant has not defined nltk_class
            raise ValueError("NLTKClassifier must have a nltk_class"
                             " variable that is not None.")
        return True
class NaiveBayesClassifier(NLTKClassifier):
    """A classifier based on the Naive Bayes algorithm, as implemented in
    NLTK.

    :param train_set: The training set, either a list of tuples of the form
        ``(text, classification)`` or a filename. ``text`` may be either
        a string or an iterable.
    :param feature_extractor: A feature extractor function that takes one or
        two arguments: ``document`` and ``train_set``.
    :param format: If ``train_set`` is a filename, the file format, e.g.
        ``"csv"`` or ``"json"``. If ``None``, will attempt to detect the
        file format.

    .. versionadded:: 0.6.0
    """

    nltk_class = nltk.classify.NaiveBayesClassifier

    def prob_classify(self, text):
        """Return the label probability distribution for classifying a string
        of text.

        Example:
        ::

            >>> classifier = NaiveBayesClassifier(train_data)
            >>> prob_dist = classifier.prob_classify("I feel happy this morning.")
            >>> prob_dist.max()
            'positive'
            >>> prob_dist.prob("positive")
            0.7

        :rtype: nltk.probability.DictionaryProbDist
        """
        return self.classifier.prob_classify(self.extract_features(text))

    def informative_features(self, *args, **kwargs):
        """Return the most informative features as a list of tuples of the
        form ``(feature_name, feature_value)``.

        :rtype: list
        """
        # Delegate straight to the wrapped NLTK classifier.
        return self.classifier.most_informative_features(*args, **kwargs)

    def show_informative_features(self, *args, **kwargs):
        """Displays a listing of the most informative features for this
        classifier.

        :rtype: None
        """
        return self.classifier.show_most_informative_features(*args, **kwargs)
class DecisionTreeClassifier(NLTKClassifier):
    """A classifier based on the decision tree algorithm, as implemented in
    NLTK.

    :param train_set: The training set, either a list of tuples of the form
        ``(text, classification)`` or a filename. ``text`` may be either
        a string or an iterable.
    :param feature_extractor: A feature extractor function that takes one or
        two arguments: ``document`` and ``train_set``.
    :param format: If ``train_set`` is a filename, the file format, e.g.
        ``"csv"`` or ``"json"``. If ``None``, will attempt to detect the
        file format.

    .. versionadded:: 0.6.2
    """

    nltk_class = nltk.classify.decisiontree.DecisionTreeClassifier

    def pretty_format(self, *args, **kwargs):
        """Return a string containing a pretty-printed version of this decision
        tree. Each line in the string corresponds to a single decision tree node
        or leaf, and indentation is used to display the structure of the tree.

        :rtype: str
        """
        return self.classifier.pretty_format(*args, **kwargs)

    # Backwards-compat alias for the old method name.
    pprint = pretty_format

    def pseudocode(self, *args, **kwargs):
        """Return a string representation of this decision tree that expresses
        the decisions it makes as a nested set of pseudocode if statements.

        :rtype: str
        """
        return self.classifier.pseudocode(*args, **kwargs)
class PositiveNaiveBayesClassifier(NLTKClassifier):
    """A variant of the Naive Bayes Classifier that performs binary
    classification with partially-labeled training sets, i.e. when only
    one class is labeled and the other is not. Assuming a prior distribution
    on the two labels, uses the unlabeled set to estimate the frequencies of
    the features.

    Example usage:
    ::

        >>> from text.classifiers import PositiveNaiveBayesClassifier
        >>> sports_sentences = ['The team dominated the game',
        ...                     'They lost the ball',
        ...                     'The game was intense',
        ...                     'The goalkeeper catched the ball',
        ...                     'The other team controlled the ball']
        >>> various_sentences = ['The President did not comment',
        ...                      'I lost the keys',
        ...                      'The team won the game',
        ...                      'Sara has two kids',
        ...                      'The ball went off the court',
        ...                      'They had the ball for the whole game',
        ...                      'The show is over']
        >>> classifier = PositiveNaiveBayesClassifier(positive_set=sports_sentences,
        ...                                           unlabeled_set=various_sentences)
        >>> classifier.classify("My team lost the game")
        True
        >>> classifier.classify("And now for something completely different.")
        False

    :param positive_set: A collection of strings that have the positive label.
    :param unlabeled_set: A collection of unlabeled strings.
    :param feature_extractor: A feature extractor function.
    :param positive_prob_prior: A prior estimate of the probability of the
        label ``True``.

    .. versionadded:: 0.7.0
    """

    nltk_class = nltk.classify.PositiveNaiveBayesClassifier

    def __init__(self, positive_set, unlabeled_set,
                 feature_extractor=contains_extractor,
                 positive_prob_prior=0.5, **kwargs):
        # Deliberately does NOT call BaseClassifier.__init__: there is no
        # single labeled train_set here, only positive + unlabeled sets.
        self.feature_extractor = feature_extractor
        self.positive_set = positive_set
        self.unlabeled_set = unlabeled_set
        # extract_features (inherited) falls back to the one-argument
        # extractor form because self._word_set is never set on this class.
        self.positive_features = [self.extract_features(d)
                                  for d in self.positive_set]
        self.unlabeled_features = [self.extract_features(d)
                                   for d in self.unlabeled_set]
        self.positive_prob_prior = positive_prob_prior

    def __repr__(self):
        class_name = self.__class__.__name__
        return "<{cls} trained on {n_pos} labeled and {n_unlabeled} unlabeled instances>"\
            .format(cls=class_name, n_pos=len(self.positive_set),
                    n_unlabeled=len(self.unlabeled_set))

    # Override
    def train(self, *args, **kwargs):
        """Train the classifier with a labeled and unlabeled feature sets and return
        the classifier. Takes the same arguments as the wrapped NLTK class.
        This method is implicitly called when calling ``classify`` or
        ``accuracy`` methods and is included only to allow passing in arguments
        to the ``train`` method of the wrapped NLTK class.

        :rtype: A classifier
        """
        # Bug fix: *args/**kwargs were previously accepted but silently
        # dropped; forward them to the wrapped NLTK ``train`` as the
        # docstring promises (and as ``update`` already does).
        self.classifier = self.nltk_class.train(self.positive_features,
                                                self.unlabeled_features,
                                                self.positive_prob_prior,
                                                *args, **kwargs)
        return self.classifier

    def update(self, new_positive_data=None,
               new_unlabeled_data=None, positive_prob_prior=0.5,
               *args, **kwargs):
        """Update the classifier with new data and re-trains the
        classifier.

        :param new_positive_data: List of new, labeled strings.
        :param new_unlabeled_data: List of new, unlabeled strings.
        :param positive_prob_prior: New prior for the label ``True``.
            NOTE: defaults to 0.5, which resets any prior previously
            passed to ``__init__`` unless re-supplied here.
        """
        self.positive_prob_prior = positive_prob_prior
        if new_positive_data:
            self.positive_set += new_positive_data
            self.positive_features += [self.extract_features(d)
                                       for d in new_positive_data]
        if new_unlabeled_data:
            self.unlabeled_set += new_unlabeled_data
            self.unlabeled_features += [self.extract_features(d)
                                        for d in new_unlabeled_data]
        # Overwrites the cached ``classifier`` property value.
        self.classifier = self.nltk_class.train(self.positive_features,
                                                self.unlabeled_features,
                                                self.positive_prob_prior,
                                                *args, **kwargs)
        return True
class MaxEntClassifier(NLTKClassifier):
    # Reuse NLTK's own documentation for the wrapped maxent classifier.
    __doc__ = nltk.classify.maxent.MaxentClassifier.__doc__

    nltk_class = nltk.classify.maxent.MaxentClassifier

    def prob_classify(self, text):
        """Return the label probability distribution for classifying a string
        of text.

        Example:
        ::

            >>> classifier = MaxEntClassifier(train_data)
            >>> prob_dist = classifier.prob_classify("I feel happy this morning.")
            >>> prob_dist.max()
            'positive'
            >>> prob_dist.prob("positive")
            0.7

        :rtype: nltk.probability.DictionaryProbDist
        """
        features = self.extract_features(text)
        return self.classifier.prob_classify(features)

View File

@@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
import sys
# True on any 2.x interpreter. Use sys.version_info rather than parsing
# the first character of sys.version, which breaks for double-digit
# major versions (e.g. "10.0" -> '1').
PY2 = sys.version_info[0] == 2

if PY2:
    from itertools import imap, izip
    import urllib2 as request
    from urllib import quote as urlquote
    from urllib import urlencode
    text_type = unicode
    binary_type = str
    string_types = (str, unicode)
    unicode = unicode
    basestring = basestring
    imap = imap
    izip = izip
    import unicodecsv as csv

    def implements_to_string(cls):
        """Class decorator that renames __str__ to __unicode__ and
        modifies __str__ that returns utf-8.
        """
        cls.__unicode__ = cls.__str__
        cls.__str__ = lambda x: x.__unicode__().encode('utf-8')
        return cls

else:  # PY3
    from urllib import request
    from urllib.parse import quote as urlquote
    from urllib.parse import urlencode
    text_type = str
    binary_type = bytes
    string_types = (str,)
    unicode = str
    basestring = (str, bytes)
    imap = map
    izip = zip
    import csv

    # No-op on Python 3, where __str__ already returns text.
    implements_to_string = lambda x: x
# From six
def with_metaclass(meta, *bases):
    """Create a base class with a metaclass."""
    # Trick borrowed from six: return a temporary class built with a
    # throwaway metaclass. When a user subclasses the result, the dummy
    # metaclass's __new__ fires once and replaces everything with a real
    # class created by the requested metaclass and bases.
    class _TempMeta(meta):  # noqa
        def __new__(cls, name, this_bases, d):
            return meta(name, bases, d)
    return type.__new__(_TempMeta, 'temporary_class', (), {})

View File

@@ -0,0 +1,39 @@
# -*- coding: utf-8 -*-
"""Custom decorators."""
from __future__ import absolute_import
from functools import wraps
from textblob.exceptions import MissingCorpusError
class cached_property(object):
    """A property that is only computed once per instance and then replaces
    itself with an ordinary attribute. Deleting the attribute resets the
    property.

    Implemented as a non-data descriptor: after the first access the value
    lives in the instance ``__dict__``, which shadows the descriptor on
    every later lookup. Credit to Marcel Hellkamp, author of bottle.py.
    """

    def __init__(self, func):
        self.func = func
        self.__doc__ = getattr(func, '__doc__')

    def __get__(self, obj, cls):
        if obj is None:
            # Accessed on the class itself -- return the descriptor.
            return self
        result = self.func(obj)
        obj.__dict__[self.func.__name__] = result
        return result
def requires_nltk_corpus(func):
    """Wraps a function that requires an NLTK corpus. If the corpus isn't found,
    raise a :exc:`MissingCorpusError`.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except LookupError as err:
            # Surface NLTK's download hint before raising our own error.
            print(err)
            raise MissingCorpusError()
    return wrapper

View File

@@ -0,0 +1,51 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Downloads the necessary NLTK corpora for TextBlob.
Usage: ::
$ python -m textblob.download_corpora
If you only intend to use TextBlob's default models, you can use the "lite"
option: ::
$ python -m textblob.download_corpora lite
"""
import sys
import nltk
# Corpora needed by TextBlob's default models.
MIN_CORPORA = [
    'brown',  # Required for FastNPExtractor
    'punkt',  # Required for WordTokenizer
    'wordnet',  # Required for lemmatization
    'averaged_perceptron_tagger',  # Required for NLTKTagger
]

# Extra corpora needed only by optional components.
ADDITIONAL_CORPORA = [
    'conll2000',  # Required for ConllExtractor
    'movie_reviews',  # Required for NaiveBayesAnalyzer
]

ALL_CORPORA = MIN_CORPORA + ADDITIONAL_CORPORA


def download_lite():
    """Download only the corpora required by the default models."""
    for corpus in MIN_CORPORA:
        nltk.download(corpus)


def download_all():
    """Download every corpus TextBlob can use, optional ones included."""
    for corpus in ALL_CORPORA:
        nltk.download(corpus)


def main():
    """Entry point; pass ``lite`` on the command line for the minimal set."""
    downloader = download_lite if 'lite' in sys.argv else download_all
    downloader()
    print("Finished.")


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,139 @@
# -*- coding: utf-8 -*-
'''This file is based on pattern.en. See the bundled NOTICE file for
license information.
'''
from __future__ import absolute_import
import os
from textblob._text import (Parser as _Parser, Sentiment as _Sentiment, Lexicon,
WORD, POS, CHUNK, PNP, PENN, UNIVERSAL, Spelling)
from textblob.compat import text_type, unicode
try:
    MODULE = os.path.dirname(os.path.abspath(__file__))
except Exception:
    # ``__file__`` can be undefined (e.g. frozen/embedded interpreters).
    # Narrowed from a bare ``except:`` so SystemExit/KeyboardInterrupt
    # still propagate.
    MODULE = ""

# English spell checker backed by the bundled word-frequency table.
spelling = Spelling(
        path = os.path.join(MODULE, "en-spelling.txt")
)
#--- ENGLISH PARSER --------------------------------------------------------------------------------
def find_lemmata(tokens):
    """ Annotates the tokens with lemmata for plural nouns and conjugated verbs,
    where each token is a [word, part-of-speech] list.
    """
    # NOTE(review): ``singularize``, ``conjugate`` and ``INFINITIVE`` are not
    # imported in this module -- presumably supplied by a bundled inflection
    # helper; confirm, otherwise NNS/VB tokens raise NameError at runtime.
    for token in tokens:
        # Default lemma is the word itself.
        word, pos, lemma = token[0], token[1], token[0]
        # cats => cat
        if pos == "NNS":
            lemma = singularize(word)
        # sat => sit
        if pos.startswith(("VB", "MD")):
            lemma = conjugate(word, INFINITIVE) or word
        # The lowercased lemma is appended in place: [word, pos, lemma].
        token.append(lemma.lower())
    return tokens
class Parser(_Parser):
    # Thin subclass wiring the English lemmatizer and tagset mapping into
    # the generic pattern parser.

    def find_lemmata(self, tokens, **kwargs):
        # Delegate to the module-level helper; extra kwargs are ignored.
        return find_lemmata(tokens)

    def find_tags(self, tokens, **kwargs):
        # Penn Treebank (the default) uses an identity tag mapping.
        if kwargs.get("tagset") in (PENN, None):
            kwargs.setdefault("map", lambda token, tag: (token, tag))
        # Universal tagset maps each Penn tag to its universal equivalent.
        # NOTE(review): ``penntreebank2universal`` is not defined or imported
        # in this module -- confirm it is provided elsewhere.
        if kwargs.get("tagset") == UNIVERSAL:
            kwargs.setdefault("map", lambda token, tag: penntreebank2universal(token, tag))
        return _Parser.find_tags(self, tokens, **kwargs)
class Sentiment(_Sentiment):

    def load(self, path=None):
        """Load the sentiment lexicon, then derive adverb entries from the
        adjective entries (pattern reports this is worth about +1% accuracy).
        """
        _Sentiment.load(self, path)
        # Map "terrible" to adverb "terribly" (+1% accuracy)
        if not path:
            # Iterate over a snapshot: self.annotate() mutates this dict.
            for w, pos in list(dict.items(self)):
                if "JJ" in pos:
                    # happy -> happi(ly), terrible -> terrib(ly)
                    if w.endswith("y"):
                        w = w[:-1] + "i"
                    if w.endswith("le"):
                        w = w[:-2]
                    # (polarity, subjectivity, intensity) for the JJ reading.
                    p, s, i = pos["JJ"]
                    self.annotate(w + "ly", "RB", p, s, i)
# Part-of-speech lexicon plus morphological/contextual Brill rules and the
# named-entity list, loaded from the data files bundled next to this module.
lexicon = Lexicon(
    path = os.path.join(MODULE, "en-lexicon.txt"),
    morphology = os.path.join(MODULE, "en-morphology.txt"),
    context = os.path.join(MODULE, "en-context.txt"),
    entities = os.path.join(MODULE, "en-entities.txt"),
    language = "en"
)

# Shallow parser; unknown words default to NN / NNP / CD heuristics.
parser = Parser(
    lexicon = lexicon,
    default = ("NN", "NNP", "CD"),
    language = "en"
)

# Sentiment analyzer driven by the bundled en-sentiment.xml lexicon.
sentiment = Sentiment(
    path = os.path.join(MODULE, "en-sentiment.xml"),
    synset = "wordnet_id",
    negations = ("no", "not", "n't", "never"),
    modifiers = ("RB",),
    # Treat "-ly" words as intensity modifiers (e.g. "very", "really").
    modifier = lambda w: w.endswith("ly"),
    tokenizer = parser.find_tokens,
    language = "en"
)
def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split from words.
    """
    text = text_type(s)
    return parser.find_tokens(text, *args, **kwargs)
def parse(s, *args, **kwargs):
    """ Returns a tagged Unicode string.
    """
    text = unicode(s)
    return parser.parse(text, *args, **kwargs)
def parsetree(s, *args, **kwargs):
    """ Returns a parsed Text from the given string.
    """
    # NOTE(review): ``Text`` is not defined or imported in this module --
    # confirm it is provided elsewhere (pattern exposes pattern.text.Text).
    return Text(parse(unicode(s), *args, **kwargs))
def split(s, token=[WORD, POS, CHUNK, PNP]):
    """ Returns a parsed Text from the given parsed string.
    """
    # NOTE(review): mutable default argument -- appears to be read-only
    # downstream, and ``Text`` is not defined in this module; confirm both.
    return Text(text_type(s), token)
def tag(s, tokenize=True, encoding="utf-8"):
    """ Returns a list of (token, tag)-tuples from the given string.
    """
    parsed = parse(s, tokenize, True, False, False, False, encoding)
    return [(token[0], token[1])
            for sentence in parsed.split()
            for token in sentence]
def suggest(w):
    """ Returns a list of (word, confidence)-tuples of spelling corrections.
    """
    # Delegate to the module-level spell checker.
    return spelling.suggest(w)
def polarity(s, **kwargs):
    """ Returns the sentence polarity (positive/negative) between -1.0 and 1.0.
    """
    scores = sentiment(unicode(s), **kwargs)
    return scores[0]
def subjectivity(s, **kwargs):
    """ Returns the sentence subjectivity (objective/subjective) between 0.0 and 1.0.
    """
    scores = sentiment(unicode(s), **kwargs)
    return scores[1]
def positive(s, threshold=0.1, **kwargs):
    """ Returns True if the given sentence has a positive sentiment (polarity >= threshold).
    """
    score = polarity(unicode(s), **kwargs)
    return score >= threshold

View File

@@ -0,0 +1,294 @@
;;;
;;; The contextual rules are based on Brill's rule based tagger v1.14,
;;; trained on Brown corpus and Penn Treebank.
;;;
IN VB PREVTAG PRP
NN VB PREVTAG TO
VBP VB PREV1OR2OR3TAG MD
NN VB PREV1OR2TAG MD
VB NN PREV1OR2TAG DT
VBD VBN PREV1OR2OR3TAG VBZ
VBN VBD PREVTAG PRP
VBN VBD PREVTAG NNP
VBD VBN PREVTAG VBD
VBP VB PREVTAG TO
POS VBZ PREVTAG PRP
VB VBP PREVTAG NNS
IN RB WDAND2AFT as as
VBD VBN PREV1OR2WD have
IN WDT NEXT1OR2TAG VB
VB VBP PREVTAG PRP
VBP VB PREV1OR2WD n't
IN WDT NEXTTAG VBZ
JJ NNP NEXTTAG NNP
IN WDT NEXTTAG VBD
JJ NN NEXTWD of
VBD VBN PREV1OR2WD be
JJR RBR NEXTTAG JJ
IN WDT NEXTTAG VBP
JJS RBS WDNEXTTAG most JJ
VBN VBD SURROUNDTAG NN DT
NNS VBZ PREVTAG PRP
POS VBZ NEXT1OR2TAG DT
NNP NN SURROUNDTAG STAART NNS
VBD VBN NEXTWD by
VB NN PREV1OR2TAG IN
VB VBP PREVTAG WDT
VBG NN PREVTAG JJ
NNS VBZ NEXTTAG DT
VBN VBD PREVTAG WP
NN VBP PREVTAG NNS
VB NN PREVTAG NN
NN VB PREVWD n't
NN VBG NEXTTAG DT
RB JJ NEXTTAG NN
NN VBP PREVTAG PRP
VBN VBD SURROUNDTAG NNS DT
VB NN PREV1OR2TAG POS
JJ NN NEXTTAG VBD
RB RP WDNEXTTAG up DT
JJ VB PREVTAG TO
VBN VBD SURROUNDTAG , DT
VBN VBD PREVWD that
VB VBP PREVBIGRAM NNS RB
NNP JJ SURROUNDTAG STAART NN
VB VBN PREVTAG VBZ
NNP JJ WDNEXTTAG American NNS
JJ RB NEXTTAG JJR
NNS NN CURWD yen
IN WDT NEXTTAG VBD
DT IN WDAND2TAGAFT that NNS
POS VBZ PREVWD that
JJ VB PREVTAG MD
VB NN PREVTAG JJ
JJR RBR NEXTTAG RB
VBD VBN PREV1OR2WD are
NN JJ WDNEXTTAG executive NN
NNP JJ WDNEXTTAG American NN
VBN VBD PREVTAG WDT
VBD VBN PREVBIGRAM VBD RB
JJ NN SURROUNDTAG DT .
NNP JJ NEXTWD German
VBN VB PREVTAG TO
VBN VBD PREVBIGRAM NNP RB
RB IN RBIGRAM up to
VB VBP PREVTAG WP
JJ NN SURROUNDTAG DT IN
IN DT NEXTWD 's
VBD VBN WDNEXTTAG ended NNP
VBD VBN SURROUNDTAG DT NN
NNS NNP NEXTTAG NNP
NN NNP NEXTTAG NNP
VBG NN SURROUNDTAG DT IN
NNP JJ SURROUNDTAG STAART NNS
RB RP WDPREVTAG VB up
VBN VBD PREVBIGRAM PRP RB
JJ RB NEXTTAG VBN
NN VBP PREVTAG RB
NNS VBZ PREVTAG RB
POS VBZ PREVTAG WP
VB VBN PREVWD have
NN PDT WDNEXTTAG half DT
IN WDT NEXTTAG MD
POS VBZ PREVTAG DT
NN NNP CURWD Integrated
POS '' NEXT1OR2TAG ''
VBD VBN PREVTAG IN
JJR RBR NEXT1OR2TAG VBN
JJS RBS WDNEXTTAG most RB
JJ NN SURROUNDTAG JJ IN
VBZ NNS PREVTAG JJ
NNS VBZ WDPREVTAG JJ is
JJ NN NEXTTAG VBZ
VBP NN PREVTAG DT
JJ NN SURROUNDTAG JJ .
NNPS NNP NEXTTAG NNP
WDT DT PREVTAG CC
RB IN WDNEXTTAG so PRP
VBP NN PREVWD earnings
NN VBG PREVWD is
NNS VBZ PREV1OR2WD Mr.
VBZ NNS PREVWD the
RB RP WDPREVTAG VBN up
NNPS NNS PREVTAG STAART
VBN VBD SURROUNDTAG NN JJ
VBP VB PREV2TAG VB
RBR JJR NEXTTAG NNS
JJ NN SURROUNDTAG DT ,
JJ NN SURROUNDTAG IN .
NN VB PREVTAG TO
VB NN PREVTAG VB
NN VBP PREVWD who
RB RP WDPREVTAG VBG up
NN RB WDNEXTTAG right RB
VBZ POS WDPREVTAG NNP 's
JJ RP WDNEXTTAG up NN
VBN VBD SURROUNDTAG NN NN
VBN VBD SURROUNDTAG CC DT
JJ NN NEXTBIGRAM MD VB
JJ RB WDNEXTTAG early IN
JJ VBN SURROUNDTAG STAART IN
IN RB RBIGRAM though ,
VBD VBN PREV1OR2WD been
DT PDT WDNEXTTAG all DT
VBN VBD PREVBIGRAM NN RB
NN VB PREVWD help
VBP VB PREV1OR2WD not
VBP NN PREVTAG JJ
DT WDT PREVTAG NNS
NN VBP PREVTAG WDT
VB RB RBIGRAM close to
NNS VBZ PREVBIGRAM , WDT
IN RP WDNEXTTAG out DT
DT RB NEXTWD longer
IN JJ SURROUNDTAG DT NN
DT WDT SURROUNDTAG NN VBZ
IN VB NEXT2TAG VB
IN NN PREVTAG DT
VBN VBD SURROUNDTAG NNS NNS
IN RB RBIGRAM about $
EX RB NEXT1OR2TAG IN
NN VBG NEXTTAG PRP$
NN VBG CURWD living
VBZ NNS PREVTAG PRP$
RBR JJR NEXTTAG NN
RBR JJR CURWD higher
VB VBP PREVBIGRAM PRP RB
NN VB PREVTAG MD
VB NN PREV1OR2TAG PRP$
RP IN PREV1OR2TAG ,
VB JJ PREVTAG DT
DT IN PREVWD out
POS VBZ PREVTAG EX
JJ NN NEXTTAG POS
NN JJ CURWD first
VBD VBN PREVWD the
NNS VBZ WDPREVTAG NNP plans
NNP NNS SURROUNDTAG STAART IN
RB JJ NEXTTAG NNS
JJ RB CURWD just
VBP NN PREVWD sales
NNS NNPS PREVWD Orange
VB VBN PREVTAG VBD
WDT DT PREVTAG IN
NN JJ WDNEXTTAG right NN
NN VBG WDNEXTTAG operating IN
JJ VBN CURWD insured
JJ NNP LBIGRAM STAART U.S.
IN DT NEXTTAG STAART
POS '' PREV1OR2OR3TAG ``
NN JJ WDNEXTTAG official NN
NNP JJ CURWD Irish
JJ RB NEXTTAG RBR
VBG NN WDPREVTAG DT selling
VBP VB PREV1OR2OR3TAG MD
WDT IN NEXTTAG PRP
EX RB NEXTTAG .
VBN VBD SURROUNDTAG NNS PRP$
VBN VBD CURWD said
JJ RB PREVTAG MD
NN VBG NEXTBIGRAM JJ NNS
JJ RB WDNEXTTAG late IN
VBG NN PREVTAG PRP$
VBZ NNS NEXTTAG VBP
NN NNP WDPREVTAG DT CD
NN VBN PREVWD be
JJS RBS NEXTTAG VBN
VBN VBD SURROUNDTAG NN PRP$
VBN VBD SURROUNDTAG NNS JJ
VBN VBD SURROUNDTAG NNS NN
VBD VBN WDNEXTTAG increased NN
VBZ NNS NEXTWD of
IN RP WDAND2TAGAFT out NNS
JJ NNP NEXTTAG POS
RB RP WDNEXTTAG down DT
CD NNS CURWD 1970s
VBG NNP CURWD Working
VBN VB PREVTAG MD
JJ NN NEXTBIGRAM CC NN
NN JJ SURROUNDTAG STAART NNS
VBN VBD PREVBIGRAM , CC
IN RB NEXTBIGRAM . STAART
NN VBG PREVWD was
NNP NNPS CURWD Cowboys
VBZ NNS PREVWD phone
NNP NNS SURROUNDTAG STAART VBP
RBR JJR WDNEXTTAG lower JJ
PRP$ PRP NEXTTAG IN
VBD VB PREVTAG TO
JJ NN WDPREVTAG NN chief
JJ NN SURROUNDTAG JJ ,
NN JJ WDPREVTAG DT third
VBN VBD SURROUNDTAG NNS NNP
NNP NN SURROUNDTAG STAART NN
NNP NN CURWD HDTV
VBG NN SURROUNDTAG DT ,
VBG NN SURROUNDTAG DT .
NNS VBZ PREVTAG WP
NN VB SURROUNDTAG CC DT
NNPS NNP WDAND2TAGBFR IN Securities
RP IN PREVTAG NNS
VBP NN LBIGRAM funds rate
VBP NN WDPREVTAG NNS market
DT RB RBIGRAM either .
VBN NN SURROUNDTAG DT IN
VBD VB PREV1OR2OR3TAG MD
NN JJ NEXTWD oil
VBN VBD SURROUNDTAG , $
VBD VBN PREVBIGRAM DT RB
VBN JJ PREVWD by
NNP JJ WDNEXTTAG American JJ
NN VBG PREVTAG VBP
JJ RB LBIGRAM very much
NN VBG RBIGRAM operating officer
RB IN RBIGRAM up for
NNS VBZ NEXTBIGRAM JJ NNS
NNS VBZ SURROUNDTAG , IN
VB VBP PREVTAG NNPS
IN RP WDAND2TAGAFT out IN
NNPS NNP PREVBIGRAM CC NNP
NN RB RBIGRAM close to
RBR RB PREVWD no
JJ VBD NEXTTAG DT
RB NNP PREVTAG NNP
MD NN PREVWD good
JJ NN WDPREVTAG NN giant
NN JJ WDNEXTTAG official NNS
VBN VBD SURROUNDTAG , PRP$
VBN VBD SURROUNDTAG , RB
VBN VBD SURROUNDTAG NN PRP
NNP JJ WDNEXTTAG South JJ
NN VBG PREVTAG RB
NNS VBZ SURROUNDTAG , TO
VBZ NNS SURROUNDTAG NN .
NN VB NEXTTAG PRP$
VBP VB PREV1OR2WD do
VB JJ NEXTWD countries
IN WDT NEXTBIGRAM RB VBZ
JJ VB NEXTTAG DT
WDT DT NEXTBIGRAM VBZ ,
NNP RB RBIGRAM First ,
DT NNP WDNEXTTAG A VBZ
JJ RBR RBIGRAM further ,
CD PRP WDNEXTTAG one MD
POS '' PREV1OR2OR3TAG .
PRP NN PREVTAG -LRB-
VBN VBD SURROUNDTAG , PRP
VBN VBD SURROUNDTAG NN NNS
VBN VBD SURROUNDTAG NN RP
NNP NN LBIGRAM STAART Business
VBD VBN PREVTAG VBG
IN RB RBIGRAM before ,
IN RB WDAND2AFT As as
NNP JJ LBIGRAM New York-based
NNP JJ CURWD Mexican
NNP NNPS WDNEXTTAG Motors NNP
NNP NNPS WDPREVTAG NNP Enterprises
JJ RB WDNEXTTAG long IN
VBG JJ SURROUNDTAG DT NN
NN PRP PREVWD are mine
* IN CURWD with
* VB CURWD be
* JJ RBIGRAM such as
* IN LBIGRAM such as
* IN CURWD from

View File

@@ -0,0 +1,646 @@
50 Cent PERS
AIDS
AK-47
AT&T ORG
Abraham Lincoln PERS
Acropolis LOC
Adam Sandler PERS
Adolf Hitler PERS
Adriana Lima PERS
Afghanistan LOC
Africa LOC
Al Capone PERS
Al Pacino PERS
Alaska LOC
Albert Einstein PERS
Albert Hofmann PERS
Albert Schweitzer PERS
Alexander the Great PERS
Alfred Hitchcock PERS
Alice Cooper PERS
Alice in Wonderland
Amazon.com ORG
Amber Heard PERS
Amelia Earhart PERS
American Express
American Idol
Amsterdam LOC
Amy Adams PERS
Amy Winehouse PERS
Ancient Egypt LOC
Ancient Rome LOC
Android
Angelina Jolie PERS
Angry Birds
Anne Frank PERS
Anne Hathaway PERS
Antartica LOC
Apple Inc. ORG
Archimedes PERS
Aretha Franklin PERS
Argentina LOC
Aristotle PERS
Arnold Schwarzenegger PERS
Audi ORG
Audrey Hepburn PERS
Aung San Suu Kyi PERS
Australia LOC
Austria LOC
Avatar
Avril Lavigne PERS
Ayn Rand PERS
Aztec
BMW ORG
Babe Ruth PERS
Bacardi ORG
Backstreet Boys
Bangladesh LOC
Barack Obama PERS
Barbra Streisand PERS
Barcelona LOC
Batman PERS
Beethoven PERS
Belarus LOC
Belgium LOC
Ben Affleck PERS
Ben Folds PERS
Ben Stiller PERS
Benazir Bhutto PERS
Benjamin Franklin PERS
Benjamin Millepied PERS
Bernard Madoff PERS
Beyoncé Knowles PERS
Bill Clinton PERS
Bill Gates PERS
Billie Holiday PERS
Billie Jean King PERS
Bing Crosby PERS
Black Sabbath
Blake Edwards PERS
Blake Lively PERS
Bob Dylan PERS
Bob Geldof PERS
Bob Marley PERS
Brad Pitt PERS
Bradley Manning PERS
Brazil LOC
Brett Favre PERS
Britney Spears PERS
Bruce Lee PERS
Bruce Willis PERS
Bruno Mars PERS
Buddhism
Bulgaria LOC
Burger King
Burma LOC
C.S. Lewis PERS
Cadillac ORG
California LOC
Cameron Diaz PERS
Cameron Newton PERS
Canada LOC
Captain Beefheart PERS
Carl Lewis PERS
Charles Darwin PERS
Charles Dickens PERS
Charles Kindbergh PERS
Charles de Gaulle PERS
Charlie Sheen PERS
Che Guevara PERS
Cheryl Cole PERS
Chicago LOC
China LOC
Chopin PERS
Chris Colfer PERS
Christian Bale PERS
Christiano Ronaldo PERS
Christina Aguilera PERS
Christmas
Christopher Nolan PERS
Chuck Norris PERS
Clint Eastwood PERS
Coca Cola ORG
Coco Chanel ORG
Coldplay
Colombia LOC
Conan PERS
Cristiano Ronaldo PERS
Crystal Harris PERS
Cthulhu PERS
Cuba LOC
DNA
Daft Punk
Dalai Lama PERS
Daniel Radcliffe PERS
Darren Aronofsky PERS
Darren Criss PERS
Darth Vader PERS
David Beckham PERS
David Bowie PERS
David Cook PERS
Demi Lovato PERS
Demi Moore PERS
Denmark LOC
Desmond Tutu PERS
Dexter PERS
Diana PERS
Diego Maradona PERS
Disney ORG
Dmitry Medvedev PERS
Doctor Who PERS
Dr. Dre PERS
Dr. Seuss PERS
Dragon Ball
Dubai LOC
Dwayne Johnson PERS
Earth LOC
Ebenezer Scrooge PERS
Eddie Murphy PERS
Eduardo Saverin PERS
Egypt LOC
El Salvador LOC
Elizabeth Edwards PERS
Elizabeth Hurley PERS
Ellen Page PERS
Elton John PERS
Elvis Presley PERS
Emile Zatopek PERS
Eminem PERS
Emma Roberts PERS
Emma Stone PERS
Emma Watson PERS
Emmeline Pankhurst PERS
England LOC
Enrique Iglesias PERS
Ernest Hemingway PERS
Ernest Hemingway PERS
Europe LOC
Eva Peron PERS
Exxon Mobil PERS
FC Barcelona ORG
FIFA ORG
Facebook ORG
Fahrenheit
Family Guy
Faye Resnick PERS
FedEx ORG
Fidel Castro PERS
Finland LOC
Firefox ORG
Florence Nightingale PERS
Florida LOC
Fort Wayne LOC
France LOC
Frank Sinatra PERS
Franklin D. Roosevelt PERS
Freddie Mercury PERS
Frédéric Chopin PERS
Futurama
Garrett Hedlund PERS
Gene Simmons PERS
General Electric
Genghis Khan PERS
George Bush PERS
George Clooney PERS
George Harrison PERS
George Orwell PERS
George W. Bush PERS
George Washington PERS
Georges St-Pierre PERS
Germany LOC
Google ORG
Google Chrome
Gorillaz
Grand Theft Auto
Greece LOC
Gucci ORG
Gulf War
Gulliver's Travels
Guns N' Roses
Gwyneth Paltrow PERS
HIV
HSBC
Haile Selassie PERS
Haiti LOC
Halliburton ORG
Halloween
Hank Baskett PERS
Hannah Montana PERS
Hanukkah
Harrison Ford PERS
Harry Potter PERS
Hawaii LOC
He-Man PERS
Heath Ledger PERS
Helen Keller PERS
Helena Bonham Carter PERS
Henry Ford PERS
Henry IV PERS
Henry V PERS
Henry VIII PERS
Hilary Duff PERS
Hillary Clinton PERS
Honda ORG
Hong Kong LOC
Hotmail
Hugh Hefner PERS
Humphrey Bogart PERS
Hungary LOC
IBM ORG
IKEA ORG
Iceland LOC
India LOC
Indiana Jones PERS
Indira Gandhi PERS
Indonesia LOC
Internet Explorer
Iran LOC
Ireland LOC
Iron Man PERS
Isaac Newton PERS
Isabelle Caro PERS
Islam
Israel LOC
Italy LOC
Ivy League ORG
J. Robert Oppenheimer PERS
J.K. Rowling PERS
J.R.R. Tolkien PERS
JFK PERS
Jack the Ripper PERS
Jackie Chan PERS
Jacqueline Kennedy Onassis PERS
Jaden Smith PERS
Jake Gyllenhaal PERS
James Bond PERS
James Franco PERS
Jane Austen PERS
Janet Jackson PERS
Japan LOC
Jared Leto PERS
Jason Statham PERS
Jawaharlal Nehru PERS
Jay-Z PERS
Jeff Bridges PERS
Jeff Buckley PERS
Jenna Jameson PERS
Jennifer Aniston PERS
Jesse Owens PERS
Jessica Alba PERS
Jesus PERS
Jim Carrey PERS
Jim Morrisson PERS
Jimi Hendrix PERS
Jimmy Wales PERS
Joaquin Phoenix PERS
John Cena PERS
John Edwards PERS
John F. Kennedy PERS
John Lennon PERS
John M. Keynes PERS
John McCain PERS
John Wayne PERS
Johnnie Walker PERS
Johnny Cash PERS
Johnny Depp PERS
Joseph Stalin PERS
Judy Garland PERS
Julia Roberts PERS
Julian Assange PERS
Julie Andrews PERS
Julius Caesar PERS
Justin Bieber PERS
Justin Timberlake PERS
KFC ORG
KLM ORG
Kama Sutra
Kanye West PERS
Kate Middleton PERS
Katherine Hepburn PERS
Katrina Kaif PERS
Katy Perry PERS
Keira Knightley PERS
Ken Livingstone PERS
Keri Hilson PERS
Kesha PERS
Kevin Bacon PERS
Kid Cudi PERS
Kim Kardashian PERS
Kinect
King Arthur PERS
Kobe Bryant PERS
Kosovo LOC
Kristallnacht
Kristen Stewart PERS
Kurt Cobain PERS
L'Oreal ORG
L. Ron Hubbard PERS
Lady Gaga PERS
Lea Michele PERS
Lebanon LOC
Lech Walesa PERS
Led Zeppelin
Lego
Lenin PERS
Leo Tolstoy PERS
Leon Trotsky PERS
Leonardo DiCaprio PERS
Leonardo da Vinci PERS
Leslie Nielsen PERS
Lexus ORG
Liam Neeson PERS
Lil Wayne PERS
Lindsay Lohan PERS
Linkin Park PERS
Lionel Messi PERS
Loch Ness LOC
London LOC
Lord Baden Powell PERS
Los Angeles LOC
Louis Pasteur PERS
Louis Vuitton PERS
Louvre LOC
Ludwig van Beethoven PERS
Lyndon Johnson PERS
MDMA
Mac OS X
Macaulay Culkin PERS
Madagascar LOC
Madonna PERS
Mahatma Gandhi PERS
Malaysia LOC
Malcolm X PERS
Manchester LOC
Manchester United ORG
Margaret Thatcher PERS
Mariah Carey PERS
Marilyn Monroe PERS
Mario Gómez PERS
Mario Kart
Mark David Chapman PERS
Mark Wahlberg PERS
Mark Zuckerberg PERS
Martin Luther King PERS
Massachussetts LOC
Mata Hari PERS
Matt Damon PERS
Mattel ORG
Maya Angelou PERS
McDonald's ORG
McGill University ORG
Megan Fox PERS
Mercedes-Benz ORG
Merlin PERS
Metallica
Mexico LOC
Miami LOC
Miami Vice
Michael C. Hall PERS
Michael Jackson PERS
Michael Jordan PERS
Michael Vick PERS
Michelin ORG
Michigan LOC
Micky Ward PERS
Microsoft ORG
Microsoft Windows
Middle Ages
Mike Tyson PERS
Mila Kunis PERS
Miley Cyrus PERS
Minecraft
Mohammed Ali PERS
Mona Lisa PERS
Montreal LOC
Morocco LOC
Mother Teresa PERS
Mother's Day
Mozart PERS
Mozilla Firefox
Muhammad PERS
Muhammad Ali PERS
Myanmar LOC
Napoleon PERS
Narnia LOC
Natalie Portman PERS
Nazi Germany
Neil Armstrong PERS
Neil Patrick Harris PERS
Nelson Mandela PERS
Nepal LOC
Netherlands LOC
New York LOC
New York City LOC
New Zealand LOC
Nicki Minaj PERS
Nicolas Cage PERS
Nicole Scherzinger PERS
Nigeria LOC
Nike ORG
Nivea ORG
North America LOC
North Korea LOC
Norway LOC
Olivia Wilde PERS
Oprah Winfrey PERS
Osama Bin Laden PERS
Oscar Wilde PERS
Owen Wilson PERS
Ozzfest
Pablo Picasso PERS
Pakistan LOC
Panasonic ORG
Paris LOC
Paul McCartney PERS
Pele PERS
Pepsi ORG
Peter Sellers PERS
Philadelphia LOC
Philips ORG
Phillipines LOC
Pink Floyd PERS
PlayStation 3
Pocahontas PERS
Pokemon
Pokémon
Poland LOC
Pope John Paul II PERS
Premier League ORG
Prince Charles PERS
Priory of Sion LOC
Procter & Gamble
Puerto Rico LOC
Qatar LOC
Queen Elizabeth II PERS
Queen Victoria PERS
Rachmaninoff PERS
Raiders of the Lost Ark
Raisa Gorbachev PERS
Real Madrid ORG
Red Hot Chili Peppers PERS
Reese Witherspoon PERS
Resident Evil
Richard PERS
Richard Branson PERS
Richard Dawkins PERS
Richard Holbrooke PERS
Richard Nixon PERS
Rihanna PERS
Ringo Starr PERS
Robert De Niro PERS
Robert Pattinson PERS
Robin Hood PERS
Roger Federer PERS
Roman Empire ORG
Romania LOC
Rome LOC
Romeo and Juliet
Ronald Reagan PERS
Ronnie O'Sullivan PERS
Rosa Parks PERS
Russell Brand PERS
Russia LOC
Ryan Reynolds PERS
Saddam Hussein PERS
Sahara LOC
Saint Nicholas PERS
Salman Khan PERS
Samsung ORG
Sandra Bullock PERS
Santa Claus PERS
Sarah Palin PERS
Sasha Grey PERS
Saudi Arabia LOC
Scarlett Johansson PERS
Scientology ORG
Scotland LOC
Sean Combs PERS
Sean Parker PERS
Selena Gomez PERS
Serbia LOC
Sergei Rachmaninoff PERS
Shakira
Shaquille O'Neal PERS
Shaun Ryder PERS
Sherlock Holmes PERS
Shia LaBeouf PERS
Shirley Temple PERS
Siemens ORG
Sigmund Freud PERS
Silvio Berlusconi PERS
Singapore LOC
Skype
Smirnoff ORG
Snoop Dogg PERS
Snow White PERS
Socrates PERS
Somalia LOC
Sony ORG
South Africa LOC
South America LOC
South Korea LOC
South Park
Soviet Union
Spain LOC
Spider-Man PERS
Spiderman PERS
Sri Lanka LOC
Star Trek
Star Wars
Starbucks ORG
Stephen Hawking PERS
Stephen King PERS
Steve Jobs PERS
Steve Nash PERS
Steven Spielberg PERS
Sudan LOC
Super Bowl
Superman PERS
Sweden LOC
Switzerland LOC
Sylvester Stallone PERS
Taiwan LOC
Taj Mahal LOC
Take That
Taylor Lautner PERS
Taylor Momsem PERS
Taylor Swift PERS
Teena Marie PERS
Tennessee LOC
Texas LOC
Thailand LOC
The Beatles
The Chronicles of Narnia
The Godfather
The Green Hornet
The Lord of the Rings
The Rolling Stones
The Simpsons
The Sims
Theodore Roosevelt PERS
Thomas Jefferson PERS
Thor PERS
Tiger Woods PERS
Titanic
Tom Brady PERS
Tom Cruise PERS
Tom Hanks PERS
Toy Story
Toyota ORG
Transformers
Tron
Tupac Shakur PERS
Twin Peaks
Twitter
UEFA Champions League
Ubuntu
Ukraine LOC
United Kingdom LOC
United Nations
United States LOC
Usain Bolt PERS
Vanessa Hudgens PERS
Venus LOC
Vietnam LOC
Vin Diesel PERS
Virginia Woolf PERS
Vladimir Putin PERS
Vodafone ORG
Volkswagen ORG
Walmart ORG
Walt Disney PERS
Warren Buffet PERS
Washington LOC
Washington D.C. LOC
Wesley Snipes PERS
Wii
WikiLeaks ORG
Wikipedia ORG
Will Ferrell PERS
Will Smith PERS
William Shakespeare PERS
Willow Smith PERS
Windows 7
Windows 95
Windows Vista
Windows XP
Winona Ryder PERS
Winston Churchill PERS
Wiz Khalifa PERS
Wolfgang Amadeus Mozart PERS
Woodrow Wilson PERS
World War I
World War II
World of Warcraft
Wright Brothers PERS
X-Men
Xbox 360
Yoko Onen PERS
Yoko Ono PERS
YouTube ORG
amazon.com ORG
eBay ORG
iPad
iPhone
iPod
iPod touch

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,152 @@
;;;
;;; The morphological rules are based on Brill's rule based tagger v1.14,
;;; trained on Brown corpus and Penn Treebank.
;;;
NN s fhassuf 1 NNS x
NN . fchar CD x
NN - fchar JJ x
NN ed fhassuf 2 VBN x
NN ing fhassuf 3 VBG x
ly hassuf 2 RB x
ly addsuf 2 JJ x
NN $ fgoodright CD x
NN al fhassuf 2 JJ x
NN would fgoodright VB x
NN 0 fchar CD x
NN be fgoodright JJ x
NNS us fhassuf 2 JJ x
NNS it fgoodright VBZ x
NN ble fhassuf 3 JJ x
NN ic fhassuf 2 JJ x
NN 1 fchar CD x
NNS ss fhassuf 2 NN x
un deletepref 2 JJ x
NN ive fhassuf 3 JJ x
NNP ed fhassuf 2 JJ x
NN n't fgoodright VB x
VB the fgoodright NN x
NNS he fgoodright VBZ x
VBN he fgoodright VBD x
NN are fgoodright JJ x
JJ was fgoodleft NN x
NN est fhassuf 3 JJS x
VBZ The fgoodright NNS x
NNP ts fhassuf 2 NNS x
NN 4 fchar CD x
NN ize fhassuf 3 VB x
.. hassuf 2 : x
ful hassuf 3 JJ x
NN ate fhassuf 3 VB x
NNP ing fhassuf 3 VBG x
VBG is fgoodleft NN x
NN less fhassuf 4 JJ x
NN ary fhassuf 3 JJ x
Co. goodleft NNP x
NN ant fhassuf 3 JJ x
million goodleft CD x
JJ their fgoodleft IN x
NN he fgoodright VBD x
Mr. goodright NNP x
JJ of fgoodleft NN x
NN so fgoodright JJ x
NN y fdeletesuf 1 JJ x
VBN which fgoodright VBD x
VBD been fgoodright VBN x
VB a fgoodright NN x
NN economic fgoodleft JJ x
9 char CD x
CD t fchar JJ x
NN can fgoodright VB x
VB the fgoodright NN x
JJ S-T-A-R-T fgoodright VBN x
VBN - fchar JJ x
NN lar fhassuf 3 JJ x
NNP ans fhassuf 3 NNPS x
NN men fhassuf 3 NNS x
CD d fchar JJ x
JJ n fdeletesuf 1 VBN x
JJ 's fgoodleft NN x
NNS is fhassuf 2 NN x
ES hassuf 2 NNS x
JJ er fdeletesuf 2 JJR x
Inc. goodleft NNP x
NN 2 fchar CD x
VBD be fgoodleft MD x
ons hassuf 3 NNS x
RB - fchar JJ x
NN very fgoodright JJ x
ous hassuf 3 JJ x
NN a fdeletepref 1 RB x
NNP people fgoodleft JJ x
VB have fgoodleft RB x
NNS It fgoodright VBZ x
NN id fhassuf 2 JJ x
JJ may fgoodleft NN x
VBN but fgoodright VBD x
RS hassuf 2 NNS x
JJ stry fhassuf 4 NN x
NNS them fgoodleft VBZ x
VBZ were fgoodleft NNS x
NN ing faddsuf 3 VB x
JJ s faddsuf 1 NN x
NN 7 fchar CD x
NN d faddsuf 1 VB x
VB but fgoodleft NN x
NN 3 fchar CD x
NN est faddsuf 3 JJ x
NN en fhassuf 2 VBN x
NN costs fgoodright IN x
NN 8 fchar CD x
VB b fhaspref 1 NN x
zes hassuf 3 VBZ x
VBN s faddsuf 1 NN x
some hassuf 4 JJ x
NN ic fhassuf 2 JJ x
ly addsuf 2 JJ x
ness addsuf 4 JJ x
JJS s faddsuf 1 NN x
NN ier fhassuf 3 JJR x
NN ky fhassuf 2 JJ x
tyle hassuf 4 JJ x
NNS ates fhassuf 4 VBZ x
fy hassuf 2 VB x
body addsuf 4 DT x
NN ways fgoodleft JJ x
NNP ies fhassuf 3 NNPS x
VB negative fgoodright NN x
ders hassuf 4 NNS x
ds hassuf 2 NNS x
-day addsuf 4 CD x
nian hassuf 4 JJ x
JJR s faddsuf 1 NN x
ppy hassuf 3 JJ x
NN ish fhassuf 3 JJ x
tors hassuf 4 NNS x
oses hassuf 4 VBZ x
NNS oves fhassuf 4 VBZ x
VBN un fhaspref 2 JJ x
lent hassuf 4 JJ x
NN ward fdeletesuf 4 RB x
VB k fchar NN x
VB r fhassuf 1 NN x
VB e fdeletesuf 1 NN x
NNS Engelken fgoodright VBZ x
NN ient fhassuf 4 JJ x
ED hassuf 2 VBD x
VBG B fchar NNP x
VB le fhassuf 2 NN x
ment addsuf 4 VB x
ING hassuf 3 NN x
JJ ery fhassuf 3 NN x
JJ tus fhassuf 3 NN x
JJ car fhassuf 3 NN x
NN 6 fchar CD x
NNS 0 fchar CD x
JJ ing fdeletesuf 3 VBG x
here hassuf 4 RB x
VBN scr fhaspref 3 VBD x
uces hassuf 4 VBZ x
fies hassuf 4 VBZ x
self deletesuf 4 PRP x
NNP $ fchar $ x
VBN wa fhaspref 2 VBD x

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,472 @@
# -*- coding: utf-8 -*-
'''The pluralize and singular methods from the pattern library.
Licenced under the BSD.
See here https://github.com/clips/pattern/blob/master/LICENSE.txt for
complete license information.
'''
import re
# Part-of-speech tag prefixes used to select inflection behavior below.
VERB, NOUN, ADJECTIVE, ADVERB = "VB", "NN", "JJ", "RB"

#### PLURALIZE #####################################################################################
# Based on "An Algorithmic Approach to English Pluralization" by Damian Conway:
# http://www.csse.monash.edu.au/~damian/papers/HTML/Plurals.html

# Prepositions are used to solve things like
# "mother-in-law" or "man at arms": compounds whose second element is one
# of these prepositions are inflected on their *first* element.
plural_prepositions = [
    "about", "above", "across", "after", "among", "around", "at", "athwart", "before", "behind",
    "below", "beneath", "beside", "besides", "between", "betwixt", "beyond", "but", "by", "during",
    "except", "for", "from", "in", "into", "near", "of", "off", "on", "onto", "out", "over",
    "since", "till", "to", "under", "until", "unto", "upon", "with"
]
# Inflection rules that are either general,
# or apply to a certain category of words,
# or apply to a certain category of words only in classical mode,
# or apply only in classical mode.
# Each rule consists of:
# suffix (a regex, compiled in place below), inflection (the replacement),
# category (a key into plural_categories, or None for a general rule) and
# classic flag (True = rule applies only when classical=True).
# Rulesets are tried in order by pluralize(); within a ruleset, first
# matching rule wins.
plural_rules = [
    # 0) Indefinite articles and demonstratives.
    [
        ["^a$|^an$", "some", None, False],
        ["^this$", "these", None, False],
        ["^that$", "those", None, False],
        ["^any$", "all", None, False]
    ],
    # 1) Possessive adjectives.
    # Overlaps with 1/ for "his" and "its".
    # Overlaps with 2/ for "her".
    [
        ["^my$", "our", None, False],
        ["^your$|^thy$", "your", None, False],
        ["^her$|^his$|^its$|^their$", "their", None, False]
    ],
    # 2) Possessive pronouns.
    [
        ["^mine$", "ours", None, False],
        ["^yours$|^thine$", "yours", None, False],
        ["^hers$|^his$|^its$|^theirs$", "theirs", None, False]
    ],
    # 3) Personal pronouns.
    [
        ["^I$", "we", None, False],
        ["^me$", "us", None, False],
        ["^myself$", "ourselves", None, False],
        ["^you$", "you", None, False],
        ["^thou$|^thee$", "ye", None, False],
        ["^yourself$|^thyself$", "yourself", None, False],
        ["^she$|^he$|^it$|^they$", "they", None, False],
        ["^her$|^him$|^it$|^them$", "them", None, False],
        ["^herself$|^himself$|^itself$|^themself$", "themselves", None, False],
        ["^oneself$", "oneselves", None, False]
    ],
    # 4) Words that do not inflect.
    [
        ["$", "", "uninflected", False],
        ["$", "", "uncountable", False],
        ["fish$", "fish", None, False],
        ["([- ])bass$", "\\1bass", None, False],
        ["ois$", "ois", None, False],
        ["sheep$", "sheep", None, False],
        ["deer$", "deer", None, False],
        ["pox$", "pox", None, False],
        ["([A-Z].*)ese$", "\\1ese", None, False],
        ["itis$", "itis", None, False],
        ["(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$", "\\1ose", None, False]
    ],
    # 5) Irregular plurals (mongoose, oxen).
    # Classical variants (e.g. atlantes) are listed before their modern
    # counterparts (atlases) so they win when classical=True.
    [
        ["atlas$", "atlantes", None, True],
        ["atlas$", "atlases", None, False],
        ["beef$", "beeves", None, True],
        ["brother$", "brethren", None, True],
        ["child$", "children", None, False],
        ["corpus$", "corpora", None, True],
        ["corpus$", "corpuses", None, False],
        ["^cow$", "kine", None, True],
        ["ephemeris$", "ephemerides", None, False],
        ["ganglion$", "ganglia", None, True],
        ["genie$", "genii", None, True],
        ["genus$", "genera", None, False],
        ["graffito$", "graffiti", None, False],
        ["loaf$", "loaves", None, False],
        ["money$", "monies", None, True],
        ["mongoose$", "mongooses", None, False],
        ["mythos$", "mythoi", None, False],
        ["octopus$", "octopodes", None, True],
        ["opus$", "opera", None, True],
        ["opus$", "opuses", None, False],
        ["^ox$", "oxen", None, False],
        ["penis$", "penes", None, True],
        ["penis$", "penises", None, False],
        ["soliloquy$", "soliloquies", None, False],
        ["testis$", "testes", None, False],
        ["trilby$", "trilbys", None, False],
        ["turf$", "turves", None, True],
        ["numen$", "numena", None, False],
        ["occiput$", "occipita", None, True]
    ],
    # 6) Irregular inflections for common suffixes (synopses, mice, men).
    [
        ["man$", "men", None, False],
        ["person$", "people", None, False],
        ["([lm])ouse$", "\\1ice", None, False],
        ["tooth$", "teeth", None, False],
        ["goose$", "geese", None, False],
        ["foot$", "feet", None, False],
        ["zoon$", "zoa", None, False],
        ["([csx])is$", "\\1es", None, False]
    ],
    # 7) Fully assimilated classical inflections (vertebrae, codices).
    [
        ["ex$", "ices", "ex-ices", False],
        ["ex$", "ices", "ex-ices-classical", True],
        ["um$", "a", "um-a", False],
        ["um$", "a", "um-a-classical", True],
        ["on$", "a", "on-a", False],
        ["a$", "ae", "a-ae", False],
        ["a$", "ae", "a-ae-classical", True]
    ],
    # 8) Classical variants of modern inflections (stigmata, soprani).
    [
        ["trix$", "trices", None, True],
        ["eau$", "eaux", None, True],
        ["ieu$", "ieu", None, True],
        ["([iay])nx$", "\\1nges", None, True],
        ["en$", "ina", "en-ina-classical", True],
        ["a$", "ata", "a-ata-classical", True],
        ["is$", "ides", "is-ides-classical", True],
        ["us$", "i", "us-i-classical", True],
        ["us$", "us", "us-us-classical", True],
        ["o$", "i", "o-i-classical", True],
        ["$", "i", "-i-classical", True],
        ["$", "im", "-im-classical", True]
    ],
    # 9) -ch, -sh and -ss and the s-singular group take -es in the plural (churches, classes, lenses).
    [
        ["([cs])h$", "\\1hes", None, False],
        ["ss$", "sses", None, False],
        ["x$", "xes", None, False],
        ["s$", "ses", "s-singular", False]
    ],
    # 10) Certain words ending in -f or -fe take -ves in the plural (lives, wolves).
    [
        ["([aeo]l)f$", "\\1ves", None, False],
        ["([^d]ea)f$", "\\1ves", None, False],
        ["arf$", "arves", None, False],
        ["([nlw]i)fe$", "\\1ves", None, False],
    ],
    # 11) -y takes -ys if preceded by a vowel or when a proper noun,
    # but -ies if preceded by a consonant (storeys, Marys, stories).
    [
        ["([aeiou])y$", "\\1ys", None, False],
        ["([A-Z].*)y$", "\\1ys", None, False],
        ["y$", "ies", None, False]
    ],
    # 12) Some words ending in -o take -os, the rest take -oes.
    # Words in which the -o is preceded by a vowel always take -os (lassos, potatoes, bamboos).
    # The category rule comes first so "o-os" words win over the general rules.
    [
        ["o$", "os", "o-os", False],
        ["([aeiou])o$", "\\1os", None, False],
        ["o$", "oes", None, False]
    ],
    # 13) Miltary stuff (Major Generals).
    [
        ["l$", "ls", "general-generals", False]
    ],
    # 14) Otherwise, assume that the plural just adds -s (cats, programmes).
    [
        ["$", "s", None, False]
    ],
]
# For performance, compile the regular expressions only once:
# NOTE: mutates each rule in place -- rule[0] becomes a compiled pattern.
for ruleset in plural_rules:
    for rule in ruleset:
        rule[0] = re.compile(rule[0])
# Suffix categories.
# Maps each category name referenced by plural_rules (third rule field)
# to the closed set of words the category rule applies to.
plural_categories = {
    "uninflected": [
        "aircraft", "antelope", "bison", "bream", "breeches", "britches", "carp", "cattle", "chassis",
        "clippers", "cod", "contretemps", "corps", "debris", "diabetes", "djinn", "eland", "elk",
        "flounder", "gallows", "graffiti", "headquarters", "herpes", "high-jinks", "homework", "innings",
        "jackanapes", "mackerel", "measles", "mews", "moose", "mumps", "offspring", "news", "pincers",
        "pliers", "proceedings", "rabies", "salmon", "scissors", "series", "shears", "species", "swine",
        "trout", "tuna", "whiting", "wildebeest"],
    "uncountable": [
        "advice", "bread", "butter", "cannabis", "cheese", "electricity", "equipment", "fruit", "furniture",
        "garbage", "gravel", "happiness", "information", "ketchup", "knowledge", "love", "luggage",
        "mathematics", "mayonnaise", "meat", "mustard", "news", "progress", "research", "rice",
        "sand", "software", "understanding", "water"],
    "s-singular": [
        "acropolis", "aegis", "alias", "asbestos", "bathos", "bias", "bus", "caddis", "canvas",
        "chaos", "christmas", "cosmos", "dais", "digitalis", "epidermis", "ethos", "gas", "glottis",
        "ibis", "lens", "mantis", "marquis", "metropolis", "pathos", "pelvis", "polis", "rhinoceros",
        "sassafras", "trellis"],
    "ex-ices": ["codex", "murex", "silex"],
    "ex-ices-classical": [
        "apex", "cortex", "index", "latex", "pontifex", "simplex", "vertex", "vortex"],
    "um-a": [
        "agendum", "bacterium", "candelabrum", "datum", "desideratum", "erratum", "extremum",
        "ovum", "stratum"],
    "um-a-classical": [
        "aquarium", "compendium", "consortium", "cranium", "curriculum", "dictum", "emporium",
        "enconium", "gymnasium", "honorarium", "interregnum", "lustrum", "maximum", "medium",
        "memorandum", "millenium", "minimum", "momentum", "optimum", "phylum", "quantum", "rostrum",
        "spectrum", "speculum", "stadium", "trapezium", "ultimatum", "vacuum", "velum"],
    "on-a": [
        "aphelion", "asyndeton", "criterion", "hyperbaton", "noumenon", "organon", "perihelion",
        "phenomenon", "prolegomenon"],
    "a-ae": ["alga", "alumna", "vertebra"],
    "a-ae-classical": [
        "abscissa", "amoeba", "antenna", "aurora", "formula", "hydra", "hyperbola", "lacuna",
        "medusa", "nebula", "nova", "parabola"],
    "en-ina-classical": ["foramen", "lumen", "stamen"],
    "a-ata-classical": [
        "anathema", "bema", "carcinoma", "charisma", "diploma", "dogma", "drama", "edema", "enema",
        "enigma", "gumma", "lemma", "lymphoma", "magma", "melisma", "miasma", "oedema", "sarcoma",
        "schema", "soma", "stigma", "stoma", "trauma"],
    "is-ides-classical": ["clitoris", "iris"],
    "us-i-classical": [
        "focus", "fungus", "genius", "incubus", "nimbus", "nucleolus", "radius", "stylus", "succubus",
        "torus", "umbilicus", "uterus"],
    "us-us-classical": [
        "apparatus", "cantus", "coitus", "hiatus", "impetus", "nexus", "plexus", "prospectus",
        "sinus", "status"],
    "o-i-classical": ["alto", "basso", "canto", "contralto", "crescendo", "solo", "soprano", "tempo"],
    "-i-classical": ["afreet", "afrit", "efreet"],
    "-im-classical": ["cherub", "goy", "seraph"],
    "o-os": [
        "albino", "archipelago", "armadillo", "commando", "ditto", "dynamo", "embryo", "fiasco",
        "generalissimo", "ghetto", "guano", "inferno", "jumbo", "lingo", "lumbago", "magneto",
        "manifesto", "medico", "octavo", "photo", "pro", "quarto", "rhino", "stylo"],
    # Both capitalizations are listed so "Major General" and
    # "major general" are handled by rule 13.
    "general-generals": [
        "Adjutant", "Brigadier", "Lieutenant", "Major", "Quartermaster",
        "adjutant", "brigadier", "lieutenant", "major", "quartermaster"],
}
def pluralize(word, pos=NOUN, custom={}, classical=True):
    """Return the plural of a given word. For example: child -> children.

    Handles nouns and adjectives, using classical inflection by default
    (e.g. where "matrix" pluralizes to "matrices" instead of "matrixes").

    :param word: The word to pluralize.
    :param pos: Part-of-speech tag; only nouns and adjectives inflect.
    :param custom: Dict of user-defined replacements (read-only here,
        so the mutable default is safe).
    :param classical: Prefer classical Latin/Greek plurals when True.
    """
    if word in custom:
        return custom[word]
    # Recursion of genitives.
    # Remove the apostrophe and any trailing -s,
    # form the plural of the resultant noun, and then append an apostrophe (dog's -> dogs').
    # NOTE: the original used word.rstrip("'s"), which strips *any* trailing
    # run of "'" and "s" characters (e.g. "boss's" -> "bo"); slice instead.
    if word.endswith("'s") or word.endswith("'"):
        owner = word[:-2] if word.endswith("'s") else word[:-1]
        owners = pluralize(owner, pos, custom, classical)
        if owners.endswith("s"):
            return owners + "'"
        else:
            return owners + "'s"
    # Recursion of compound words
    # (Postmasters General, mothers-in-law, Roman deities).
    words = word.replace("-", " ").split(" ")
    if len(words) > 1:
        # Parenthesized explicitly: the original relied on or/and precedence
        # ("A or B and C"), which inflected the wrong half of compounds like
        # "major general" (yielding "majors general").
        if (words[1] in ("general", "General")
                and words[0] not in plural_categories["general-generals"]):
            return word.replace(words[0], pluralize(words[0], pos, custom, classical))
        elif words[1] in plural_prepositions:
            return word.replace(words[0], pluralize(words[0], pos, custom, classical))
        else:
            return word.replace(words[-1], pluralize(words[-1], pos, custom, classical))
    # Only a very few number of adjectives inflect.
    n = list(range(len(plural_rules)))
    if pos.startswith(ADJECTIVE):
        n = [0, 1]
    # Apply pluralization rules.
    for i in n:
        for rule in plural_rules[i]:
            suffix, inflection, category, classic = rule
            # A general rule, or a classic rule in classical mode.
            if category is None:
                if not classic or classical:
                    if suffix.search(word) is not None:
                        return suffix.sub(inflection, word)
            # A rule relating to a specific category of words.
            elif word in plural_categories[category] and (not classic or classical):
                if suffix.search(word) is not None:
                    return suffix.sub(inflection, word)
    # No rule matched (possible for non-inflecting adjectives when only
    # rulesets 0-1 are tried): return the word unchanged instead of the
    # implicit None the original fell through to.
    return word
#### SINGULARIZE ###################################################################################
# Adapted from Bermi Ferrer's Inflector for Python:
# http://www.bermi.org/inflector/
# Copyright (c) 2006 Bermi Ferrer Martinez
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software to deal in this software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of this software, and to permit
# persons to whom this software is furnished to do so, subject to the following
# condition:
#
# THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THIS SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THIS SOFTWARE.
# Ordered [regex, replacement] pairs applied by singularize();
# the first pattern that matches the word wins.
singular_rules = [
    ['(?i)(.)ae$', '\\1a'],
    ['(?i)(.)itis$', '\\1itis'],
    ['(?i)(.)eaux$', '\\1eau'],
    ['(?i)(quiz)zes$', '\\1'],
    ['(?i)(matr)ices$', '\\1ix'],
    ['(?i)(ap|vert|ind)ices$', '\\1ex'],
    ['(?i)^(ox)en', '\\1'],
    ['(?i)(alias|status)es$', '\\1'],
    # NOTE(review): "[octop|vir]" is a character *class*, not alternation,
    # so this matches any single char from "octpvir|"; quirk inherited from
    # the Rails/Bermi Ferrer inflector -- confirm before changing.
    ['(?i)([octop|vir])i$', '\\1us'],
    ['(?i)(cris|ax|test)es$', '\\1is'],
    ['(?i)(shoe)s$', '\\1'],
    ['(?i)(o)es$', '\\1'],
    ['(?i)(bus)es$', '\\1'],
    # NOTE(review): "[m|l]" is likewise a character class including "|".
    ['(?i)([m|l])ice$', '\\1ouse'],
    ['(?i)(x|ch|ss|sh)es$', '\\1'],
    ['(?i)(m)ovies$', '\\1ovie'],
    ['(?i)(.)ombies$', '\\1ombie'],
    ['(?i)(s)eries$', '\\1eries'],
    ['(?i)([^aeiouy]|qu)ies$', '\\1y'],
    # Certain words ending in -f or -fe take -ves in the plural (lives, wolves).
    ["([aeo]l)ves$", "\\1f"],
    ["([^d]ea)ves$", "\\1f"],
    ["arves$", "arf"],
    ["erves$", "erve"],
    ["([nlw]i)ves$", "\\1fe"],
    ['(?i)([lr])ves$', '\\1f'],
    ["([aeo])ves$", "\\1ve"],
    ['(?i)(sive)s$', '\\1'],
    ['(?i)(tive)s$', '\\1'],
    ['(?i)(hive)s$', '\\1'],
    ['(?i)([^f])ves$', '\\1fe'],
    # -es suffix.
    ['(?i)(^analy)ses$', '\\1sis'],
    ['(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$', '\\1\\2sis'],
    ['(?i)(.)opses$', '\\1opsis'],
    ['(?i)(.)yses$', '\\1ysis'],
    ['(?i)(h|d|r|o|n|b|cl|p)oses$', '\\1ose'],
    ['(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$', '\\1ose'],
    ['(?i)(.)oses$', '\\1osis'],
    # -a
    ['(?i)([ti])a$', '\\1um'],
    ['(?i)(n)ews$', '\\1ews'],
    # Catch-all: strip a trailing -s.
    ['(?i)s$', ''],
]
# For performance, compile the regular expressions only once:
# NOTE: mutates each rule in place -- rule[0] becomes a compiled pattern.
for rule in singular_rules:
    rule[0] = re.compile(rule[0])
# Words whose singular equals their plural; returned unchanged.
singular_uninflected = [
    "aircraft", "antelope", "bison", "bream", "breeches", "britches", "carp", "cattle", "chassis",
    "clippers", "cod", "contretemps", "corps", "debris", "diabetes", "djinn", "eland",
    "elk", "flounder", "gallows", "georgia", "graffiti", "headquarters", "herpes", "high-jinks",
    "homework", "innings", "jackanapes", "mackerel", "measles", "mews", "moose", "mumps", "news",
    "offspring", "pincers", "pliers", "proceedings", "rabies", "salmon", "scissors", "series",
    "shears", "species", "swine", "swiss", "trout", "tuna", "whiting", "wildebeest"
]
# Mass nouns with no distinct plural form; returned unchanged.
singular_uncountable = [
    "advice", "bread", "butter", "cannabis", "cheese", "electricity", "equipment", "fruit", "furniture",
    "garbage", "gravel", "happiness", "information", "ketchup", "knowledge", "love", "luggage",
    "mathematics", "mayonnaise", "meat", "mustard", "news", "progress", "research", "rice", "sand",
    "software", "understanding", "water"
]
# Words ending in -ie whose plural simply appends -s (zombie -> zombies).
# NOTE(review): "^pie" and "^tie" look like regex anchors, but singularize()
# compares these entries with str.endswith, where "^" never matches --
# confirm intent before changing.
singular_ie = [
    "algerie", "auntie", "beanie", "birdie", "bogie", "bombie", "bookie", "collie", "cookie", "cutie",
    "doggie", "eyrie", "freebie", "goonie", "groupie", "hankie", "hippie", "hoagie", "hottie",
    "indie", "junkie", "laddie", "laramie", "lingerie", "meanie", "nightie", "oldie", "^pie",
    "pixie", "quickie", "reverie", "rookie", "softie", "sortie", "stoolie", "sweetie", "techie",
    "^tie", "toughie", "valkyrie", "veggie", "weenie", "yuppie", "zombie"
]
# Words whose singular already ends in -s (bus -> buses); shared with the
# pluralization table above.
singular_s = plural_categories['s-singular']
# key plural, value singular
singular_irregular = {
    "men": "man",
    "people": "person",
    "children": "child",
    "sexes": "sex",
    "axes": "axe",
    "moves": "move",
    "teeth": "tooth",
    "geese": "goose",
    "feet": "foot",
    "zoa": "zoon",
    "atlantes": "atlas",
    "atlases": "atlas",
    "beeves": "beef",
    "brethren": "brother",
    # Duplicate of the entry above; harmless in a dict literal.
    "children": "child",
    "corpora": "corpus",
    "corpuses": "corpus",
    "kine": "cow",
    "ephemerides": "ephemeris",
    "ganglia": "ganglion",
    "genii": "genie",
    "genera": "genus",
    "graffiti": "graffito",
    "helves": "helve",
    "leaves": "leaf",
    "loaves": "loaf",
    "monies": "money",
    "mongooses": "mongoose",
    "mythoi": "mythos",
    "octopodes": "octopus",
    "opera": "opus",
    "opuses": "opus",
    "oxen": "ox",
    "penes": "penis",
    "penises": "penis",
    "soliloquies": "soliloquy",
    "testes": "testis",
    "trilbys": "trilby",
    "turves": "turf",
    "numena": "numen",
    "occipita": "occiput",
    # NOTE(review): maps the possessive adjective, not a noun.
    "our": "my",
}
def singularize(word, pos=NOUN, custom={}):
    """Return the singular of a given word. For example: children -> child.

    :param word: The word to singularize.
    :param pos: Part-of-speech tag (passed through on recursion).
    :param custom: Dict of user-defined replacements (read-only here,
        so the mutable default is safe).
    """
    if word in custom:
        return custom[word]
    # Recursion of compound words (e.g. mothers-in-law -> mother-in-law).
    if "-" in word:
        words = word.split("-")
        if len(words) > 1 and words[1] in plural_prepositions:
            return singularize(words[0], pos, custom) + "-" + "-".join(words[1:])
    # dogs' => dog's
    if word.endswith("'"):
        return singularize(word[:-1]) + "'s"
    lower = word.lower()
    # Uninflected / uncountable words are returned unchanged.
    # The original tested w.endswith(lower), which is backwards: it returned
    # the input unchanged whenever the input happened to be a *suffix* of a
    # list entry (e.g. "ings" matched "innings"). Test the input against the
    # entry instead, consistent with the -ie and s-singular checks below.
    for w in singular_uninflected:
        if lower.endswith(w):
            return word
    for w in singular_uncountable:
        if lower.endswith(w):
            return word
    # -ies plurals of -ie words (zombies -> zombie).
    for w in singular_ie:
        if lower.endswith(w + "s"):
            return w
    # -es plurals of words whose singular ends in -s (buses -> bus).
    for w in singular_s:
        if lower.endswith(w + "es"):
            return w
    # Irregular plural -> singular mappings (children -> child).
    for w in singular_irregular:
        if lower.endswith(w):
            return re.sub('(?i)' + w + '$', singular_irregular[w], word)
    # General suffix rules, in order; first matching pattern wins.
    for suffix, inflection in singular_rules:
        match = suffix.search(word)
        if match:
            groups = match.groups()
            # Drop backreferences to groups that did not participate in
            # the match, so re.sub does not raise on them.
            for k in range(len(groups)):
                if groups[k] is None:
                    inflection = inflection.replace('\\' + str(k + 1), '')
            return suffix.sub(inflection, word)
    return word

View File

@@ -0,0 +1,204 @@
# -*- coding: utf-8 -*-
'''Various noun phrase extractors.'''
from __future__ import unicode_literals, absolute_import
import nltk
from textblob.taggers import PatternTagger
from textblob.decorators import requires_nltk_corpus
from textblob.utils import tree2str, filter_insignificant
from textblob.base import BaseNPExtractor
class ChunkParser(nltk.ChunkParserI):
    """NP chunker: tags POS-tag sequences with IOB chunk tags using a
    bigram tagger (unigram backoff) trained on the ConLL-2000 corpus.
    Training is deferred until the first call to ``parse()``.
    """

    def __init__(self):
        # Set to True once train() has built self.tagger.
        self._trained = False

    @requires_nltk_corpus
    def train(self):
        '''Train the Chunker on the ConLL-2000 corpus.'''
        # Training sequences are (POS tag, IOB chunk tag) pairs -- the word
        # itself is discarded, so chunks are predicted from POS tags alone.
        train_data = [[(t, c) for _, t, c in nltk.chunk.tree2conlltags(sent)]
                      for sent in
                      nltk.corpus.conll2000.chunked_sents('train.txt',
                                                          chunk_types=['NP'])]
        unigram_tagger = nltk.UnigramTagger(train_data)
        self.tagger = nltk.BigramTagger(train_data, backoff=unigram_tagger)
        self._trained = True

    def parse(self, sentence):
        '''Return the parse tree for the sentence.

        :param sentence: A list of (word, pos_tag) tuples.
        '''
        # Lazily train on first use.
        if not self._trained:
            self.train()
        pos_tags = [pos for (word, pos) in sentence]
        # Predict an IOB chunk tag for each POS tag.
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        # Reassemble (word, pos, chunktag) triples and convert to a tree.
        conlltags = [(word, pos, chunktag) for ((word, pos), chunktag) in
                     zip(sentence, chunktags)]
        return nltk.chunk.util.conlltags2tree(conlltags)
class ConllExtractor(BaseNPExtractor):
    '''A noun phrase extractor that uses chunk parsing trained with the
    ConLL-2000 training corpus.
    '''

    # Tagger used to POS-tag sentences before chunk parsing.
    POS_TAGGER = PatternTagger()

    # The context-free grammar with which to filter the noun phrases:
    # adjacent tag pairs that may merge, and the merged chunk's tag.
    CFG = {
        ('NNP', 'NNP'): 'NNP',
        ('NN', 'NN'): 'NNI',
        ('NNI', 'NN'): 'NNI',
        ('JJ', 'JJ'): 'JJ',
        ('JJ', 'NN'): 'NNI',
    }

    # POS suffixes that will be ignored
    INSIGNIFICANT_SUFFIXES = ['DT', 'CC', 'PRP$', 'PRP']

    def __init__(self, parser=None):
        # parser: any object with a parse(tagged_sentence) method;
        # defaults to the ConLL-trained ChunkParser.
        self.parser = ChunkParser() if not parser else parser

    def extract(self, text):
        '''Return a list of noun phrases (strings) for body of text.'''
        sentences = nltk.tokenize.sent_tokenize(text)
        noun_phrases = []
        for sentence in sentences:
            parsed = self._parse_sentence(sentence)
            # Get the string representation of each subtree that is a
            # noun phrase tree
            phrases = [_normalize_tags(filter_insignificant(each,
                       self.INSIGNIFICANT_SUFFIXES)) for each in parsed
                       if isinstance(each, nltk.tree.Tree) and each.label()
                       == 'NP' and len(filter_insignificant(each)) >= 1
                       and _is_match(each, cfg=self.CFG)]
            nps = [tree2str(phrase) for phrase in phrases]
            noun_phrases.extend(nps)
        return noun_phrases

    def _parse_sentence(self, sentence):
        '''Tag and parse a sentence (a plain, untagged string).'''
        tagged = self.POS_TAGGER.tag(sentence)
        return self.parser.parse(tagged)
class FastNPExtractor(BaseNPExtractor):
    '''A fast and simple noun phrase extractor.

    Credit to Shlomi Babluk. Link to original blog post:

        http://thetokenizer.com/2013/05/09/efficient-way-to-extract-the-main-topics-of-a-sentence/
    '''

    # Context-free grammar: adjacent tag pairs that merge into one chunk,
    # and the tag the merged chunk receives.
    CFG = {
        ('NNP', 'NNP'): 'NNP',
        ('NN', 'NN'): 'NNI',
        ('NNI', 'NN'): 'NNI',
        ('JJ', 'JJ'): 'JJ',
        ('JJ', 'NN'): 'NNI',
    }

    def __init__(self):
        # Training is deferred until the first call to extract().
        self._trained = False

    @requires_nltk_corpus
    def train(self):
        """Train the POS tagger on the Brown corpus (news category),
        with a small regexp-based tagger as the final backoff.
        """
        train_data = nltk.corpus.brown.tagged_sents(categories='news')
        # Fallback tagger: guesses a tag from the token's shape/suffix;
        # consulted only when the bigram and unigram taggers both fail.
        regexp_tagger = nltk.RegexpTagger([
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
            (r'(-|:|;)$', ':'),
            (r'\'*$', 'MD'),
            (r'(The|the|A|a|An|an)$', 'AT'),
            (r'.*able$', 'JJ'),
            (r'^[A-Z].*$', 'NNP'),
            (r'.*ness$', 'NN'),
            (r'.*ly$', 'RB'),
            (r'.*s$', 'NNS'),
            (r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'),
            (r'.*', 'NN'),
        ])
        unigram_tagger = nltk.UnigramTagger(train_data, backoff=regexp_tagger)
        self.tagger = nltk.BigramTagger(train_data, backoff=unigram_tagger)
        self._trained = True
        return None

    def _tokenize_sentence(self, sentence):
        '''Split the sentence into single words/tokens'''
        tokens = nltk.word_tokenize(sentence)
        return tokens

    def extract(self, sentence):
        '''Return a list of noun phrases (strings) for body of text.'''
        # Lazily train on first use.
        if not self._trained:
            self.train()
        tokens = self._tokenize_sentence(sentence)
        tagged = self.tagger.tag(tokens)
        tags = _normalize_tags(tagged)
        merge = True
        # Repeatedly merge adjacent tag pairs found in CFG until no merge
        # applies; scanning restarts from the left after each merge.
        while merge:
            merge = False
            for x in range(0, len(tags) - 1):
                t1 = tags[x]
                t2 = tags[x + 1]
                key = t1[1], t2[1]
                value = self.CFG.get(key, '')
                if value:
                    merge = True
                    tags.pop(x)
                    tags.pop(x)
                    match = '%s %s' % (t1[0], t2[0])
                    pos = value
                    tags.insert(x, (match, pos))
                    break
        # Keep only proper-noun (NNP) and noun-chain (NNI) chunks.
        matches = [t[0] for t in tags if t[1] in ['NNP', 'NNI']]
        return matches
### Utility methods ###
def _normalize_tags(chunk):
'''Normalize the corpus tags.
("NN", "NN-PL", "NNS") -> "NN"
'''
ret = []
for word, tag in chunk:
if tag == 'NP-TL' or tag == 'NP':
ret.append((word, 'NNP'))
continue
if tag.endswith('-TL'):
ret.append((word, tag[:-3]))
continue
if tag.endswith('S'):
ret.append((word, tag[:-1]))
continue
ret.append((word, tag))
return ret
def _is_match(tagged_phrase, cfg):
'''Return whether or not a tagged phrases matches a context-free grammar.
'''
copy = list(tagged_phrase) # A copy of the list
merge = True
while merge:
merge = False
for i in range(len(copy) - 1):
first, second = copy[i], copy[i + 1]
key = first[1], second[1] # Tuple of tags e.g. ('NN', 'JJ')
value = cfg.get(key, None)
if value:
merge = True
copy.pop(i)
copy.pop(i)
match = '{0} {1}'.format(first[0], second[0])
pos = value
copy.insert(i, (match, pos))
break
match = any([t[1] in ('NNP', 'NNI') for t in copy])
return match

View File

@@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
"""Various parser implementations.
.. versionadded:: 0.6.0
"""
from __future__ import absolute_import
from textblob.en import parse as pattern_parse
from textblob.base import BaseParser
class PatternParser(BaseParser):
    """Parser that uses the implementation in Tom de Smedt's pattern library.
    http://www.clips.ua.ac.be/pages/pattern-en#parser
    """

    def parse(self, text):
        """Parses the text.

        Delegates directly to ``textblob.en.parse``.

        :param text: The raw text to parse.
        """
        return pattern_parse(text)

View File

@@ -0,0 +1,97 @@
# -*- coding: utf-8 -*-
"""Sentiment analysis implementations.
.. versionadded:: 0.5.0
"""
from __future__ import absolute_import
from collections import namedtuple
import nltk
from textblob.en import sentiment as pattern_sentiment
from textblob.tokenizers import word_tokenize
from textblob.decorators import requires_nltk_corpus
from textblob.base import BaseSentimentAnalyzer, DISCRETE, CONTINUOUS
class PatternAnalyzer(BaseSentimentAnalyzer):
    """Sentiment analyzer that uses the same implementation as the
    pattern library. Returns results as a named tuple of the form:

    ``Sentiment(polarity, subjectivity, [assessments])``

    where [assessments] is a list of the assessed tokens and their
    polarity and subjectivity scores
    """

    kind = CONTINUOUS
    # This is only here for backwards-compatibility.
    # The return type is actually determined upon calling analyze()
    RETURN_TYPE = namedtuple('Sentiment', ['polarity', 'subjectivity'])

    def analyze(self, text, keep_assessments=False):
        """Return the sentiment as a named tuple of the form:
        ``Sentiment(polarity, subjectivity[, assessments])``.

        :param text: The text to analyze.
        :param bool keep_assessments: If True, also return the per-token
            assessments produced by the pattern library.
        """
        #: Return type declaration
        if keep_assessments:
            Sentiment = namedtuple('Sentiment', ['polarity', 'subjectivity', 'assessments'])
            # BUGFIX: run the (potentially expensive) pattern analysis once
            # and reuse the result; previously pattern_sentiment() was
            # called twice for the same text.
            scored = pattern_sentiment(text)
            polarity, subjectivity = scored
            return Sentiment(polarity, subjectivity, scored.assessments)
        else:
            Sentiment = namedtuple('Sentiment', ['polarity', 'subjectivity'])
            return Sentiment(*pattern_sentiment(text))
def _default_feature_extractor(words):
"""Default feature extractor for the NaiveBayesAnalyzer."""
return dict(((word, True) for word in words))
class NaiveBayesAnalyzer(BaseSentimentAnalyzer):
    """Naive Bayes analyzer that is trained on a dataset of movie reviews.

    Returns results as a named tuple of the form:

    ``Sentiment(classification, p_pos, p_neg)``

    :param callable feature_extractor: Function that returns a dictionary of
        features, given a list of words.
    """

    kind = DISCRETE
    #: Return type declaration
    RETURN_TYPE = namedtuple('Sentiment', ['classification', 'p_pos', 'p_neg'])

    def __init__(self, feature_extractor=_default_feature_extractor):
        super(NaiveBayesAnalyzer, self).__init__()
        # The classifier is built lazily: presumably the base class's
        # analyze() triggers train() on first use — TODO confirm against
        # BaseSentimentAnalyzer.
        self._classifier = None
        self.feature_extractor = feature_extractor

    @requires_nltk_corpus
    def train(self):
        """Train the Naive Bayes classifier on the movie review corpus.

        Requires the NLTK ``movie_reviews`` corpus to be available
        (enforced by the ``requires_nltk_corpus`` decorator).
        """
        super(NaiveBayesAnalyzer, self).train()
        neg_ids = nltk.corpus.movie_reviews.fileids('neg')
        pos_ids = nltk.corpus.movie_reviews.fileids('pos')
        # Each training example is (feature_dict, label).
        neg_feats = [(self.feature_extractor(
            nltk.corpus.movie_reviews.words(fileids=[f])), 'neg') for f in neg_ids]
        pos_feats = [(self.feature_extractor(
            nltk.corpus.movie_reviews.words(fileids=[f])), 'pos') for f in pos_ids]
        train_data = neg_feats + pos_feats
        self._classifier = nltk.classify.NaiveBayesClassifier.train(train_data)

    def analyze(self, text):
        """Return the sentiment as a named tuple of the form:
        ``Sentiment(classification, p_pos, p_neg)``
        """
        # Lazily train the classifier
        super(NaiveBayesAnalyzer, self).analyze(text)
        tokens = word_tokenize(text, include_punc=False)
        # Lower-case and drop very short tokens before feature extraction.
        filtered = (t.lower() for t in tokens if len(t) >= 3)
        feats = self.feature_extractor(filtered)
        prob_dist = self._classifier.prob_classify(feats)
        return self.RETURN_TYPE(
            classification=prob_dist.max(),
            p_pos=prob_dist.prob('pos'),
            p_neg=prob_dist.prob("neg")
        )

View File

@@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
"""Parts-of-speech tagger implementations."""
from __future__ import absolute_import
import nltk
import textblob.compat
import textblob as tb
from textblob.en import tag as pattern_tag
from textblob.decorators import requires_nltk_corpus
from textblob.base import BaseTagger
class PatternTagger(BaseTagger):
    """POS tagger backed by Tom de Smedt's pattern library
    (http://www.clips.ua.ac.be/pattern).
    """

    def tag(self, text, tokenize=True):
        """Tag a string or BaseBlob; returns a list of (word, tag) pairs."""
        # Accept either a plain string or a BaseBlob-like object.
        raw = text if isinstance(text, textblob.compat.text_type) else text.raw
        return pattern_tag(raw, tokenize)
class NLTKTagger(BaseTagger):
    """Tagger that uses NLTK's standard TreeBank tagger.

    NOTE: Requires numpy. Not yet supported with PyPy.
    """

    @requires_nltk_corpus
    def tag(self, text, tokenize=True):
        """Tag a string or BaseBlob; returns a list of (word, tag) pairs.

        :param text: A string or ``BaseBlob``.
        :param tokenize: Accepted for signature compatibility with
            ``BaseTagger.tag`` (the abstract interface declares it);
            the NLTK tagger always works on the blob's tokens, so the
            flag has no effect here.
        """
        if isinstance(text, textblob.compat.text_type):
            text = tb.TextBlob(text)
        return nltk.tag.pos_tag(text.tokens)

View File

@@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
#: User-facing help text raised with :class:`MissingCorpusError` when a
#: required NLTK dataset or model has not been downloaded.
MISSING_CORPUS_MESSAGE = """
Looks like you are missing some required data for this feature.
To download the necessary data, simply run
python -m textblob.download_corpora
or use the NLTK downloader to download the missing data: http://nltk.org/data.html
If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
"""
class TextBlobError(Exception):
    """Base class for all TextBlob-related errors."""


#: Old name, kept for backwards compatibility.
TextBlobException = TextBlobError
class MissingCorpusError(TextBlobError):
    """Raised when a feature needs a dataset or model that is not
    installed on the user's system.
    """

    def __init__(self, message=MISSING_CORPUS_MESSAGE, *args, **kwargs):
        # Default to the canned download instructions.
        super(MissingCorpusError, self).__init__(message, *args, **kwargs)


#: Old name, kept for backwards compatibility.
MissingCorpusException = MissingCorpusError
class DeprecationError(TextBlobError):
    """Raised when user uses a deprecated feature."""
class TranslatorError(TextBlobError):
    """Raised when an error occurs during language translation or detection."""
class NotTranslated(TranslatorError):
    """Raised when text is unchanged after translation. This may be due to
    the language being unsupported by the translator.
    """
class FormatError(TextBlobError):
    """Raised if a data file with an unsupported format is passed to a classifier."""

View File

@@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-
"""File formats for training and testing data.
Includes a registry of valid file formats. New file formats can be added to the
registry like so: ::
from textblob import formats
class PipeDelimitedFormat(formats.DelimitedFormat):
delimiter = '|'
formats.register('psv', PipeDelimitedFormat)
Once a format has been registered, classifiers will be able to read data files with
that format. ::
from textblob.classifiers import NaiveBayesAnalyzer
with open('training_data.psv', 'r') as fp:
cl = NaiveBayesAnalyzer(fp, format='psv')
"""
from __future__ import absolute_import
import json
from collections import OrderedDict
from textblob.compat import PY2, csv
from textblob.utils import is_filelike
DEFAULT_ENCODING = 'utf-8'
class BaseFormat(object):
    """Interface that all data-file format classes implement. Each concrete
    format decides on the composition and meaning of ``**kwargs``.

    :param File fp: A file-like object.

    .. versionchanged:: 0.9.0
        Constructor receives a file pointer rather than a file path.
    """

    def __init__(self, fp, **kwargs):
        # The base class holds no state; subclasses read from fp.
        pass

    def to_iterable(self):
        """Return an iterable object from the data."""
        raise NotImplementedError('Must implement a "to_iterable" method.')

    @classmethod
    def detect(cls, stream):
        """Return True if ``stream`` looks like this file format.

        .. versionchanged:: 0.9.0
            Changed from a static method to a class method.
        """
        raise NotImplementedError('Must implement a "detect" class method.')
class DelimitedFormat(BaseFormat):
    """A general character-delimited format."""

    delimiter = ","

    def __init__(self, fp, **kwargs):
        BaseFormat.__init__(self, fp, **kwargs)
        if PY2:
            # Python 2 needs the unicode-aware csv reader from compat.
            rows = csv.reader(fp, delimiter=self.delimiter,
                              encoding=DEFAULT_ENCODING)
        else:
            rows = csv.reader(fp, delimiter=self.delimiter)
        # Materialize the whole file up front.
        self.data = list(rows)

    def to_iterable(self):
        """Return an iterable object from the data."""
        return self.data

    @classmethod
    def detect(cls, stream):
        """Return True if stream is valid."""
        try:
            csv.Sniffer().sniff(stream, delimiters=cls.delimiter)
        except (csv.Error, TypeError):
            return False
        return True
class CSV(DelimitedFormat):
    """Comma-separated values. Each row has the form ``text,label``. ::

        Today is a good day,pos
        I hate this car.,pos
    """

    delimiter = ","
class TSV(DelimitedFormat):
    """Tab-separated values. Each row has the form ``text\tlabel``."""

    delimiter = "\t"
class JSON(BaseFormat):
    """JSON format.

    Expects an array of objects, each carrying ``text`` and ``label``
    properties. ::

        [
            {"text": "Today is a good day.", "label": "pos"},
            {"text": "I hate this car.", "label": "neg"}
        ]
    """

    def __init__(self, fp, **kwargs):
        BaseFormat.__init__(self, fp, **kwargs)
        # Parse the entire document eagerly.
        self.dict = json.load(fp)

    def to_iterable(self):
        """Return an iterable object from the JSON data."""
        return [(entry['text'], entry['label']) for entry in self.dict]

    @classmethod
    def detect(cls, stream):
        """Return True if stream is valid JSON."""
        try:
            json.loads(stream)
        except ValueError:
            return False
        return True
#: Registry mapping format names to format classes. Ordered so that
#: detect() probes formats in a deterministic sequence.
_registry = OrderedDict([
    ('csv', CSV),
    ('json', JSON),
    ('tsv', TSV),
])
def detect(fp, max_read=1024):
    """Attempt to detect a file's format, trying each of the supported
    formats. Return the format class that was detected. If no format is
    detected, return ``None``.

    :param fp: A file-like object supporting ``read`` and ``seek``.
    :param int max_read: Maximum amount of data to sample per probe.
    """
    if not is_filelike(fp):
        return None
    for Format in _registry.values():
        chunk = fp.read(max_read)
        # BUGFIX: rewind after *every* probe, not just a successful one;
        # otherwise each subsequent format sniffs from offset max_read
        # onward, and the caller receives a mid-file stream on failure.
        fp.seek(0)
        if Format.detect(chunk):
            return Format
    return None
def get_registry():
    """Return the mutable dictionary of registered formats."""
    return _registry
def register(name, format_class):
    """Register a new format.

    :param str name: The name that will be used to refer to the format, e.g. 'csv'
    :param type format_class: The format class to register.
    """
    registry = get_registry()
    registry[name] = format_class

View File

@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
'''Make word inflection default to English. This allows for backwards
compatibility so you can still import text.inflect.
>>> from textblob.inflect import singularize
is equivalent to
>>> from textblob.en.inflect import singularize
'''
from __future__ import absolute_import
from textblob.en.inflect import singularize, pluralize
__all__ = [
'singularize',
'pluralize',
]

View File

@@ -0,0 +1,179 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import sys
from textblob.compat import basestring, implements_to_string, PY2, binary_type
class ComparableMixin(object):
    """Provides all six rich-comparison operators for classes that define
    a ``_cmpkey()`` method returning a comparable key.
    """

    def _compare(self, other, method):
        try:
            return method(self._cmpkey(), other._cmpkey())
        except (AttributeError, TypeError):
            # _cmpkey is missing on one side, or the keys are of
            # incompatible types; let Python try the reflected operation.
            return NotImplemented

    def __eq__(self, other):
        return self._compare(other, lambda lhs, rhs: lhs == rhs)

    def __ne__(self, other):
        return self._compare(other, lambda lhs, rhs: lhs != rhs)

    def __lt__(self, other):
        return self._compare(other, lambda lhs, rhs: lhs < rhs)

    def __le__(self, other):
        return self._compare(other, lambda lhs, rhs: lhs <= rhs)

    def __gt__(self, other):
        return self._compare(other, lambda lhs, rhs: lhs > rhs)

    def __ge__(self, other):
        return self._compare(other, lambda lhs, rhs: lhs >= rhs)
class BlobComparableMixin(ComparableMixin):
    """Allow blob objects to be compared with plain strings as well as
    with other blobs.
    """

    def _compare(self, other, method):
        if isinstance(other, basestring):
            # Plain string: compare our key directly against it.
            return method(self._cmpkey(), other)
        # Otherwise defer to the generic key-vs-key comparison.
        return super(BlobComparableMixin, self)._compare(other, method)
@implements_to_string
class StringlikeMixin(object):
    '''Make blob objects behave like Python strings.

    Expects that classes that use this mixin to have a _strkey() method that
    returns the string to apply string methods to. Using _strkey() instead
    of __str__ ensures consistent behavior between Python 2 and 3.
    '''

    def __repr__(self):
        '''Returns a string representation for debugging.'''
        class_name = self.__class__.__name__
        # Python 2 repr() must return bytes, so encode the unicode form.
        text = self.__unicode__().encode("utf-8") if PY2 else str(self)
        ret = '{cls}("{text}")'.format(cls=class_name,
                                       text=text)
        return binary_type(ret) if PY2 else ret

    def __str__(self):
        '''Returns a string representation used in print statements
        or str(my_blob).'''
        return self._strkey()

    def __len__(self):
        '''Returns the length of the raw text.'''
        return len(self._strkey())

    def __iter__(self):
        '''Makes the object iterable as if it were a string,
        iterating through the raw string's characters.
        '''
        return iter(self._strkey())

    def __contains__(self, sub):
        '''Implements the `in` keyword like a Python string.'''
        return sub in self._strkey()

    def __getitem__(self, index):
        '''Returns a substring. If index is an integer, returns a Python
        string of a single character. If a range is given, e.g. `blob[3:5]`,
        a new instance of the class is returned.
        '''
        if isinstance(index, int):
            return self._strkey()[index]  # Just return a single character
        else:
            # Return a new blob object
            return self.__class__(self._strkey()[index])

    def find(self, sub, start=0, end=sys.maxsize):
        '''Behaves like the built-in str.find() method. Returns an integer,
        the index of the first occurrence of the substring argument sub in the
        sub-string given by [start:end].
        '''
        return self._strkey().find(sub, start, end)

    def rfind(self, sub, start=0, end=sys.maxsize):
        '''Behaves like the built-in str.rfind() method. Returns an integer,
        the index of he last (right-most) occurence of the substring argument
        sub in the sub-sequence given by [start:end].
        '''
        return self._strkey().rfind(sub, start, end)

    def index(self, sub, start=0, end=sys.maxsize):
        '''Like blob.find() but raise ValueError when the substring
        is not found.
        '''
        return self._strkey().index(sub, start, end)

    def rindex(self, sub, start=0, end=sys.maxsize):
        '''Like blob.rfind() but raise ValueError when substring is not
        found.
        '''
        return self._strkey().rindex(sub, start, end)

    def startswith(self, prefix, start=0, end=sys.maxsize):
        """Returns True if the blob starts with the given prefix."""
        return self._strkey().startswith(prefix, start, end)

    def endswith(self, suffix, start=0, end=sys.maxsize):
        """Returns True if the blob ends with the given suffix."""
        return self._strkey().endswith(suffix, start, end)

    # PEP8 aliases
    starts_with = startswith
    ends_with = endswith

    def title(self):
        """Returns a blob object with the text in title-case."""
        return self.__class__(self._strkey().title())

    def format(self, *args, **kwargs):
        """Perform a string formatting operation, like the built-in
        `str.format(*args, **kwargs)`. Returns a blob object.
        """
        return self.__class__(self._strkey().format(*args, **kwargs))

    def split(self, sep=None, maxsplit=sys.maxsize):
        """Behaves like the built-in str.split().

        Note: returns a plain list of strings, not a blob.
        """
        return self._strkey().split(sep, maxsplit)

    def strip(self, chars=None):
        """Behaves like the built-in str.strip([chars]) method. Returns
        an object with leading and trailing whitespace removed.
        """
        return self.__class__(self._strkey().strip(chars))

    def upper(self):
        """Like str.upper(), returns new object with all upper-cased characters.
        """
        return self.__class__(self._strkey().upper())

    def lower(self):
        """Like str.lower(), returns new object with all lower-cased characters.
        """
        return self.__class__(self._strkey().lower())

    def join(self, iterable):
        """Behaves like the built-in `str.join(iterable)` method, except
        returns a blob object.

        Returns a blob which is the concatenation of the strings or blobs
        in the iterable.
        """
        return self.__class__(self._strkey().join(iterable))

    def replace(self, old, new, count=sys.maxsize):
        """Return a new blob object with all the occurence of `old` replaced
        by `new`.
        """
        return self.__class__(self._strkey().replace(old, new, count))

View File

@@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
"""Default noun phrase extractors are for English to maintain backwards
compatibility, so you can still do
>>> from textblob.np_extractors import ConllExtractor
which is equivalent to
>>> from textblob.en.np_extractors import ConllExtractor
"""
from __future__ import absolute_import
from textblob.base import BaseNPExtractor
from textblob.en.np_extractors import ConllExtractor, FastNPExtractor
__all__ = [
'BaseNPExtractor',
'ConllExtractor',
'FastNPExtractor',
]

View File

@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
'''Default parsers to English for backwards compatibility so you can still do
>>> from textblob.parsers import PatternParser
which is equivalent to
>>> from textblob.en.parsers import PatternParser
'''
from __future__ import absolute_import
from textblob.base import BaseParser
from textblob.en.parsers import PatternParser
__all__ = [
'BaseParser',
'PatternParser',
]

View File

@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
'''Default sentiment analyzers are English for backwards compatibility, so
you can still do
>>> from textblob.sentiments import PatternAnalyzer
which is equivalent to
>>> from textblob.en.sentiments import PatternAnalyzer
'''
from __future__ import absolute_import
from textblob.base import BaseSentimentAnalyzer
from textblob.en.sentiments import (DISCRETE, CONTINUOUS,
PatternAnalyzer, NaiveBayesAnalyzer)
__all__ = [
'BaseSentimentAnalyzer',
'DISCRETE',
'CONTINUOUS',
'PatternAnalyzer',
'NaiveBayesAnalyzer',
]

View File

@@ -0,0 +1,18 @@
'''Default taggers to the English taggers for backwards incompatibility, so you
can still do
>>> from textblob.taggers import NLTKTagger
which is equivalent to
>>> from textblob.en.taggers import NLTKTagger
'''
from __future__ import absolute_import
from textblob.base import BaseTagger
from textblob.en.taggers import PatternTagger, NLTKTagger
__all__ = [
'BaseTagger',
'PatternTagger',
'NLTKTagger',
]

View File

@@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
'''Various tokenizer implementations.
.. versionadded:: 0.4.0
'''
from __future__ import absolute_import
from itertools import chain
import nltk
from textblob.utils import strip_punc
from textblob.base import BaseTokenizer
from textblob.decorators import requires_nltk_corpus
class WordTokenizer(BaseTokenizer):
    """NLTK's recommended word tokenizer (currently the TreeBankTokenizer).

    Uses regular expressions to tokenize text. Assumes text has already been
    segmented into sentences.

    Performs the following steps:

    * split standard contractions, e.g. don't -> do n't
    * split commas and single quotes
    * separate periods that appear at the end of line
    """

    def tokenize(self, text, include_punc=True):
        '''Return a list of word tokens.

        :param text: string of text.
        :param include_punc: (optional) whether to include punctuation as
            separate tokens. Default to True.
        '''
        words = nltk.tokenize.word_tokenize(text)
        if include_punc:
            return words
        # Strip punctuation unless the word comes from a contraction,
        # e.g. "Let's" => ["Let", "'s"], "Can't" => ["Ca", "n't"],
        # "home." => ['home']. Tokens that are nothing but punctuation
        # are dropped entirely.
        cleaned = []
        for token in words:
            if not strip_punc(token, all=False):
                continue
            if token.startswith("'"):
                cleaned.append(token)
            else:
                cleaned.append(strip_punc(token, all=False))
        return cleaned
class SentenceTokenizer(BaseTokenizer):
    """NLTK's sentence tokenizer (currently PunktSentenceTokenizer).

    Uses an unsupervised algorithm to build a model for abbreviation words,
    collocations, and words that start sentences,
    then uses that to find sentence boundaries.
    """

    @requires_nltk_corpus
    def tokenize(self, text):
        '''Return a list of sentences.'''
        sentences = nltk.tokenize.sent_tokenize(text)
        return sentences
#: Convenience function for tokenizing sentences
sent_tokenize = SentenceTokenizer().itokenize

# Module-level singleton so repeated word_tokenize() calls don't build a
# new tokenizer each time.
_word_tokenizer = WordTokenizer()  # Singleton word tokenizer
def word_tokenize(text, include_punc=True, *args, **kwargs):
    """Convenience function for tokenizing text into words.

    NOTE: NLTK's word tokenizer expects sentences as input, so the text will be
    tokenized to sentences before being tokenized to words.

    Returns a lazy iterator over the word tokens.
    """
    # Tokenize sentence-by-sentence, then flatten into a single stream.
    return chain.from_iterable(
        _word_tokenizer.itokenize(sentence, include_punc=include_punc,
                                  *args, **kwargs)
        for sentence in sent_tokenize(text)
    )

View File

@@ -0,0 +1,149 @@
# -*- coding: utf-8 -*-
"""
Translator module that uses the Google Translate API.
Adapted from Terry Yin's google-translate-python.
Language detection added by Steven Loria.
"""
from __future__ import absolute_import
import codecs
import json
import re
from textblob.compat import PY2, request, urlencode
from textblob.exceptions import TranslatorError, NotTranslated
class Translator(object):
    """A language translator and detector.

    Usage:
    ::
        >>> from textblob.translate import Translator
        >>> t = Translator()
        >>> t.translate('hello', from_lang='en', to_lang='fr')
        u'bonjour'
        >>> t.detect("hola")
        u'es'
    """

    #: Base endpoint of the (undocumented) Google Translate web API.
    url = "http://translate.google.com/translate_a/t?client=webapp&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&dt=at&ie=UTF-8&oe=UTF-8&otf=2&ssel=0&tsel=0&kc=1"

    #: Browser-like headers; the endpoint rejects obviously non-browser clients.
    headers = {
        'Accept': '*/*',
        'Connection': 'keep-alive',
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) '
            'AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19')
    }

    def translate(self, source, from_lang='auto', to_lang='en', host=None, type_=None):
        """Translate the source text from one language to another.

        :param source: The text to translate.
        :param from_lang: Source language code, or 'auto' to let the API guess.
        :param to_lang: Target language code.
        :param host: (optional) proxy host.
        :param type_: (optional) proxy type.
        :raises NotTranslated: If the API returns an empty response or the
            input string unchanged.
        """
        if PY2:
            source = source.encode('utf-8')
        data = {"q": source}
        url = u'{url}&sl={from_lang}&tl={to_lang}&hl={to_lang}&tk={tk}&client={client}'.format(
            url=self.url,
            from_lang=from_lang,
            to_lang=to_lang,
            tk=_calculate_tk(source),
            client="te",
        )
        response = self._request(url, host=host, type_=type_, data=data)
        result = json.loads(response)
        if isinstance(result, list):
            try:
                result = result[0]  # ignore detected language
            except IndexError:
                pass
        self._validate_translation(source, result)
        return result

    def detect(self, source, host=None, type_=None):
        """Detect the source text's language and return its language code.

        :raises TranslatorError: If ``source`` has fewer than 3 characters.
        """
        if PY2:
            source = source.encode('utf-8')
        if len(source) < 3:
            raise TranslatorError('Must provide a string with at least 3 characters.')
        data = {"q": source}
        url = u'{url}&sl=auto&tk={tk}&client={client}'.format(
            url=self.url,
            tk=_calculate_tk(source),
            client="te",
        )
        response = self._request(url, host=host, type_=type_, data=data)
        result, language = json.loads(response)
        return language

    def _validate_translation(self, source, result):
        """Validate API returned expected schema, and that the translated text
        is different than the original string.

        :raises NotTranslated: On an empty or unchanged result.
        """
        if not result:
            # BUGFIX: message previously read "returned and empty response".
            raise NotTranslated('Translation API returned an empty response.')
        if PY2:
            result = result.encode('utf-8')
        if result.strip() == source.strip():
            raise NotTranslated('Translation API returned the input string unchanged.')

    def _request(self, url, host=None, type_=None, data=None):
        # POST the query; urlencode handles escaping of the source text.
        encoded_data = urlencode(data).encode('utf-8')
        req = request.Request(url=url, headers=self.headers, data=encoded_data)
        if host or type_:
            req.set_proxy(host=host, type=type_)
        resp = request.urlopen(req)
        content = resp.read()
        return content.decode('utf-8')
def _unescape(text):
"""Unescape unicode character codes within a string.
"""
pattern = r'\\{1,2}u[0-9a-fA-F]{4}'
return re.sub(pattern, lambda x: codecs.getdecoder('unicode_escape')(x.group())[0], text)
def _calculate_tk(source):
    """Reverse engineered cross-site request protection."""
    # Source: https://github.com/soimort/translate-shell/issues/94#issuecomment-165433715
    # Source: http://www.liuxiatool.com/t.php

    def c_int(x, nbits=32):
        """ C cast to int32, int16, int8... """
        return (x & ((1 << (nbits - 1)) - 1)) - (x & (1 << (nbits - 1)))

    def c_uint(x, nbits=32):
        """ C cast to uint32, uint16, uint8... """
        return x & ((1 << nbits) - 1)

    # Fixed seed pair used by the web client's token algorithm.
    tkk = [406398, 561666268 + 1526272306]
    b = tkk[0]

    if PY2:
        d = map(ord, source)
    else:
        d = source.encode('utf-8')

    def RL(a, b):
        # Interpret the mini "program" string b (e.g. "+-a^+6") in triplets:
        # (combine op, shift direction, shift amount) applied to a.
        for c in range(0, len(b) - 2, 3):
            d = b[c + 2]
            d = ord(d) - 87 if d >= 'a' else int(d)
            xa = c_uint(a)
            d = xa >> d if b[c + 1] == '+' else xa << d
            a = a + d & 4294967295 if b[c] == '+' else a ^ d
        return c_int(a)

    # Fold every input byte into the accumulator, then finalize.
    a = b
    for di in d:
        a = RL(a + di, "+-a^+6")
    a = RL(a, "+-3^+b+-f")
    a ^= tkk[1]
    # Normalize to an unsigned 32-bit value, then reduce mod 10^6.
    a = a if a >= 0 else ((a & 2147483647) + 2147483648)
    a %= pow(10, 6)
    tk = '{0:d}.{1:d}'.format(a, a ^ b)
    return tk

View File

@@ -0,0 +1,194 @@
# -*- coding: utf-8 -*-
import csv
from textblob.compat import izip
# Version follows semver: http://semver.org/
VERSION = (0, 9, 4)
__version__ = ".".join(map(str,VERSION))

# Names re-exported unchanged from the stdlib csv module.
pass_throughs = [
    'register_dialect',
    'unregister_dialect',
    'get_dialect',
    'list_dialects',
    'field_size_limit',
    'Dialect',
    'excel',
    'excel_tab',
    'Sniffer',
    'QUOTE_ALL',
    'QUOTE_MINIMAL',
    'QUOTE_NONNUMERIC',
    'QUOTE_NONE',
    'Error'
]
__all__ = [
    'reader',
    'writer',
    'DictReader',
    'DictWriter',
] + pass_throughs

# Copy each pass-through attribute from the stdlib csv module into this
# module's namespace so it can act as a drop-in replacement.
for prop in pass_throughs:
    globals()[prop]=getattr(csv, prop)
def _stringify(s, encoding, errors):
    """Coerce *s* to a byte string for CSV writing (Python 2 semantics).

    None becomes the empty string; unicode is encoded with *encoding*;
    ints/floats pass through unchanged so csv.QUOTE_NONNUMERIC can act on
    them; anything else is coerced with str().
    """
    if s is None:
        return ''
    if isinstance(s, unicode):
        return s.encode(encoding, errors)
    elif isinstance(s, (int , float)):
        pass #let csv.QUOTE_NONNUMERIC do its thing.
    elif not isinstance(s, str):
        s=str(s)
    return s
def _stringify_list(l, encoding, errors='strict'):
    """Apply _stringify to every element of *l*.

    A non-iterable argument is reported as a csv.Error (matching how the
    stdlib csv module surfaces bad row data).
    """
    try:
        return [_stringify(s, encoding, errors) for s in iter(l)]
    except TypeError as e:
        raise csv.Error(str(e))
def _unicodify(s, encoding):
    """Decode *s* to unicode using *encoding* (Python 2 semantics).

    None, unicode, and numbers pass through unchanged; byte strings are
    decoded.
    """
    if s is None:
        return None
    if isinstance(s, (unicode, int, float)):
        return s
    elif isinstance(s, str):
        return s.decode(encoding)
    return s
class UnicodeWriter(object):
    """CSV writer that encodes unicode rows to byte strings (Python 2).

    >>> import unicodecsv
    >>> from cStringIO import StringIO
    >>> f = StringIO()
    >>> w = unicodecsv.writer(f, encoding='utf-8')
    >>> w.writerow((u'é', u'ñ'))
    >>> f.seek(0)
    >>> r = unicodecsv.reader(f, encoding='utf-8')
    >>> row = r.next()
    >>> row[0] == u'é'
    True
    >>> row[1] == u'ñ'
    True
    """

    def __init__(self, f, dialect=csv.excel, encoding='utf-8', errors='strict',
                 *args, **kwds):
        self.encoding = encoding
        # Delegate the actual CSV formatting to the stdlib writer.
        self.writer = csv.writer(f, dialect, *args, **kwds)
        self.encoding_errors = errors

    def writerow(self, row):
        # Encode every cell before handing the row to the stdlib writer.
        self.writer.writerow(_stringify_list(row, self.encoding, self.encoding_errors))

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)

    @property
    def dialect(self):
        # Expose the underlying writer's dialect.
        return self.writer.dialect


#: Module-level alias mirroring csv.writer.
writer = UnicodeWriter
class UnicodeReader(object):
    """CSV reader that decodes byte-string cells to unicode (Python 2)."""

    def __init__(self, f, dialect=None, encoding='utf-8', errors='strict',
                 **kwds):
        format_params = ['delimiter', 'doublequote', 'escapechar', 'lineterminator', 'quotechar', 'quoting', 'skipinitialspace']
        if dialect is None:
            # Default to excel only when no individual format params were given.
            if not any([kwd_name in format_params for kwd_name in kwds.keys()]):
                dialect = csv.excel
        self.reader = csv.reader(f, dialect, **kwds)
        self.encoding = encoding
        self.encoding_errors = errors

    def next(self):
        """Return the next row with every non-float cell decoded to unicode."""
        row = self.reader.next()
        # Bind to locals: avoids repeated attribute lookups in the
        # per-cell comprehension below.
        encoding = self.encoding
        encoding_errors = self.encoding_errors
        float_ = float
        unicode_ = unicode
        return [(value if isinstance(value, float_) else
                 unicode_(value, encoding, encoding_errors)) for value in row]

    def __iter__(self):
        return self

    @property
    def dialect(self):
        return self.reader.dialect

    @property
    def line_num(self):
        return self.reader.line_num


#: Module-level alias mirroring csv.reader.
reader = UnicodeReader
class DictWriter(csv.DictWriter):
    """Unicode-aware drop-in for csv.DictWriter (Python 2).

    >>> from cStringIO import StringIO
    >>> f = StringIO()
    >>> w = DictWriter(f, ['a', u'ñ', 'b'], restval=u'î')
    >>> w.writerow({'a':'1', u'ñ':'2'})
    >>> w.writerow({'a':'1', u'ñ':'2', 'b':u'ø'})
    >>> w.writerow({'a':u'é', u'ñ':'2'})
    >>> f.seek(0)
    >>> r = DictReader(f, fieldnames=['a', u'ñ'], restkey='r')
    >>> r.next() == {'a': u'1', u'ñ':'2', 'r': [u'î']}
    True
    >>> r.next() == {'a': u'1', u'ñ':'2', 'r': [u'\xc3\xb8']}
    True
    >>> r.next() == {'a': u'\xc3\xa9', u'ñ':'2', 'r': [u'\xc3\xae']}
    True
    """

    def __init__(self, csvfile, fieldnames, restval='', extrasaction='raise', dialect='excel', encoding='utf-8', errors='strict', *args, **kwds):
        self.encoding = encoding
        csv.DictWriter.__init__(self, csvfile, fieldnames, restval, extrasaction, dialect, *args, **kwds)
        # Replace the stdlib row writer with the encoding-aware one.
        self.writer = UnicodeWriter(csvfile, dialect, encoding=encoding, errors=errors, *args, **kwds)
        self.encoding_errors = errors

    def writeheader(self):
        """Write the field names as the first row, encoded like data rows."""
        fieldnames = _stringify_list(self.fieldnames, self.encoding, self.encoding_errors)
        header = dict(zip(self.fieldnames, self.fieldnames))
        self.writerow(header)
class DictReader(csv.DictReader):
    """Unicode-aware drop-in for csv.DictReader (Python 2).

    >>> from cStringIO import StringIO
    >>> f = StringIO()
    >>> w = DictWriter(f, fieldnames=['name', 'place'])
    >>> w.writerow({'name': 'Cary Grant', 'place': 'hollywood'})
    >>> w.writerow({'name': 'Nathan Brillstone', 'place': u'øLand'})
    >>> w.writerow({'name': u'Willam ø. Unicoder', 'place': u'éSpandland'})
    >>> f.seek(0)
    >>> r = DictReader(f, fieldnames=['name', 'place'])
    >>> print r.next() == {'name': 'Cary Grant', 'place': 'hollywood'}
    True
    >>> print r.next() == {'name': 'Nathan Brillstone', 'place': u'øLand'}
    True
    >>> print r.next() == {'name': u'Willam ø. Unicoder', 'place': u'éSpandland'}
    True
    """

    def __init__(self, csvfile, fieldnames=None, restkey=None, restval=None,
                 dialect='excel', encoding='utf-8', errors='strict', *args,
                 **kwds):
        if fieldnames is not None:
            fieldnames = _stringify_list(fieldnames, encoding)
        csv.DictReader.__init__(self, csvfile, fieldnames, restkey, restval, dialect, *args, **kwds)
        # Replace the stdlib row reader with the decoding-aware one.
        self.reader = UnicodeReader(csvfile, dialect, encoding=encoding,
                                    errors=errors, *args, **kwds)
        if fieldnames is None and not hasattr(csv.DictReader, 'fieldnames'):
            # Python 2.5 fieldnames workaround. (http://bugs.python.org/issue3436)
            reader = UnicodeReader(csvfile, dialect, encoding=encoding, *args, **kwds)
            self.fieldnames = _stringify_list(reader.next(), reader.encoding)
        # Keep a parallel unicode view of the field names for result keys.
        self.unicode_fieldnames = [_unicodify(f, encoding) for f in
                                   self.fieldnames]
        self.unicode_restkey = _unicodify(restkey, encoding)

    def next(self):
        """Return the next row as a dict keyed by unicode field names."""
        row = csv.DictReader.next(self)
        result = dict((uni_key, row[str_key]) for (str_key, uni_key) in
                      izip(self.fieldnames, self.unicode_fieldnames))
        # Extra (unnamed) columns collected by restkey are re-keyed too.
        rest = row.get(self.restkey)
        if rest:
            result[self.unicode_restkey] = rest
        return result

View File

@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-
import re
import string
#: Matches any single punctuation character from string.punctuation.
PUNCTUATION_REGEX = re.compile('[{0}]'.format(re.escape(string.punctuation)))
def strip_punc(s, all=False):
    """Removes punctuation from a string.

    :param s: The string.
    :param all: Remove all punctuation. If False, only removes punctuation from
        the ends of the string.
    """
    trimmed = s.strip()
    if all:
        # Delete every punctuation character, wherever it appears.
        return PUNCTUATION_REGEX.sub('', trimmed)
    # Only trim punctuation from both ends.
    return trimmed.strip(string.punctuation)
def lowerstrip(s, all=False):
    """Makes text all lowercase and strips punctuation and whitespace.

    :param s: The string.
    :param all: Remove all punctuation. If False, only removes punctuation from
        the ends of the string.
    """
    normalized = s.lower().strip()
    return strip_punc(normalized, all=all)
def tree2str(tree, concat=' '):
    """Convert a nltk.tree.Tree to a string.

    For example:
        (NP a/DT beautiful/JJ new/JJ dashboard/NN) -> "a beautiful dashboard"
    """
    # Each leaf is a (word, tag) pair; keep only the words.
    return concat.join(word for (word, tag) in tree)
def filter_insignificant(chunk, tag_suffixes=('DT', 'CC', 'PRP$', 'PRP')):
    """Filter out insignificant (word, tag) tuples from a chunk of text.

    A tuple is dropped when its tag ends with any of ``tag_suffixes``.
    """
    return [(word, tag) for word, tag in chunk
            if not any(tag.endswith(suffix) for suffix in tag_suffixes)]
def is_filelike(obj):
    """Return whether ``obj`` is a file-like object (i.e. has a ``read``
    attribute).
    """
    _missing = object()
    return getattr(obj, 'read', _missing) is not _missing

View File

@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
"""Wordnet interface. Contains classes for creating Synsets and Lemmas
directly.
.. versionadded:: 0.7.0
"""
import nltk

#: wordnet corpus reader from nltk
wordnet = nltk.corpus.wordnet

#: Synset constructor (alias of nltk's wordnet.synset)
Synset = nltk.corpus.wordnet.synset

#: Lemma constructor (alias of nltk's wordnet.lemma)
Lemma = nltk.corpus.wordnet.lemma

# Part of speech constants, re-exported from the wordnet corpus reader.
VERB, NOUN, ADJ, ADV = wordnet.VERB, wordnet.NOUN, wordnet.ADJ, wordnet.ADV