Initial commit
This commit is contained in:
16
backend/venv/Lib/site-packages/textblob/__init__.py
Normal file
16
backend/venv/Lib/site-packages/textblob/__init__.py
Normal file
@@ -0,0 +1,16 @@
|
||||
import os
|
||||
from .blob import TextBlob, Word, Sentence, Blobber, WordList
|
||||
|
||||
__version__ = '0.17.1'
|
||||
__license__ = 'MIT'
|
||||
__author__ = 'Steven Loria'
|
||||
|
||||
PACKAGE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
__all__ = [
|
||||
'TextBlob',
|
||||
'Word',
|
||||
'Sentence',
|
||||
'Blobber',
|
||||
'WordList',
|
||||
]
|
||||
1408
backend/venv/Lib/site-packages/textblob/_text.py
Normal file
1408
backend/venv/Lib/site-packages/textblob/_text.py
Normal file
File diff suppressed because it is too large
Load Diff
107
backend/venv/Lib/site-packages/textblob/base.py
Normal file
107
backend/venv/Lib/site-packages/textblob/base.py
Normal file
@@ -0,0 +1,107 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Abstract base classes for models (taggers, noun phrase extractors, etc.)
|
||||
which define the interface for descendant classes.
|
||||
|
||||
.. versionchanged:: 0.7.0
|
||||
All base classes are defined in the same module, ``textblob.base``.
|
||||
"""
|
||||
from __future__ import absolute_import
|
||||
from abc import ABCMeta, abstractmethod
|
||||
|
||||
import nltk
|
||||
|
||||
from textblob.compat import with_metaclass
|
||||
|
||||
##### POS TAGGERS #####
|
||||
|
||||
class BaseTagger(with_metaclass(ABCMeta)):
    """Common interface for part-of-speech taggers.

    Concrete subclasses must provide a ``tag()`` implementation that
    maps a body of text (or a ``BaseBlob``) to ``(word, tag)`` pairs.
    """

    @abstractmethod
    def tag(self, text, tokenize=True):
        """Tag ``text``, returning a list of ``(word, tag)`` tuples.

        :param text: A string or BaseBlob instance to tag.
        :param tokenize: Whether the tagger should tokenize first.
        """
        return
|
||||
##### NOUN PHRASE EXTRACTORS #####
|
||||
|
||||
class BaseNPExtractor(with_metaclass(ABCMeta)):
    """Common interface for noun-phrase extractors.

    Concrete subclasses must provide an ``extract(text)`` implementation
    that returns the noun phrases found in ``text`` as a list of strings.
    """

    @abstractmethod
    def extract(self, text):
        """Return a list of noun phrases (strings) for a body of text."""
        return
||||
|
||||
##### TOKENIZERS #####
|
||||
|
||||
class BaseTokenizer(with_metaclass(ABCMeta), nltk.tokenize.api.TokenizerI):
    """Common interface for tokenizers.

    Concrete subclasses must provide a ``tokenize(text)`` implementation
    that returns a list of tokens as strings.
    """

    @abstractmethod
    def tokenize(self, text):
        """Return a list of tokens (strings) for a body of text.

        :rtype: list
        """
        return

    def itokenize(self, text, *args, **kwargs):
        """Return a generator that yields tokens "on-demand".

        .. versionadded:: 0.6.0

        :rtype: generator
        """
        # NOTE: the underlying tokenize() call runs eagerly when the
        # generator expression is created; only iteration is deferred.
        return (token for token in self.tokenize(text, *args, **kwargs))
||||
|
||||
##### SENTIMENT ANALYZERS ####
|
||||
|
||||
|
||||
DISCRETE = 'ds'
|
||||
CONTINUOUS = 'co'
|
||||
|
||||
|
||||
class BaseSentimentAnalyzer(with_metaclass(ABCMeta)):
    """Common interface for sentiment analyzers.

    Concrete subclasses implement ``analyze(text)`` and return the
    result of the analysis (e.g. a tuple, float, or dict).
    """

    # Whether this analyzer produces discrete labels or continuous scores.
    kind = DISCRETE

    def __init__(self):
        self._trained = False

    def train(self):
        # Subclasses hook their real training logic in here; the base
        # implementation just records that training has happened.
        self._trained = True

    @abstractmethod
    def analyze(self, text):
        """Return the result of analysis. Typically returns either a
        tuple, float, or dictionary.
        """
        # Train lazily on first use so construction stays cheap.
        if not self._trained:
            self.train()
        # Subclasses produce the actual result.
        return None
||||
|
||||
##### PARSERS #####
|
||||
|
||||
class BaseParser(with_metaclass(ABCMeta)):
    """Common interface for parsers.

    Concrete subclasses must provide a ``parse()`` implementation.
    """

    @abstractmethod
    def parse(self, text):
        """Parses the text."""
        return
||||
824
backend/venv/Lib/site-packages/textblob/blob.py
Normal file
824
backend/venv/Lib/site-packages/textblob/blob.py
Normal file
@@ -0,0 +1,824 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Wrappers for various units of text, including the main
|
||||
:class:`TextBlob <textblob.blob.TextBlob>`, :class:`Word <textblob.blob.Word>`,
|
||||
and :class:`WordList <textblob.blob.WordList>` classes.
|
||||
Example usage: ::
|
||||
|
||||
>>> from textblob import TextBlob
|
||||
>>> b = TextBlob("Simple is better than complex.")
|
||||
>>> b.tags
|
||||
[(u'Simple', u'NN'), (u'is', u'VBZ'), (u'better', u'JJR'), (u'than', u'IN'), (u'complex', u'NN')]
|
||||
>>> b.noun_phrases
|
||||
WordList([u'simple'])
|
||||
>>> b.words
|
||||
WordList([u'Simple', u'is', u'better', u'than', u'complex'])
|
||||
>>> b.sentiment
|
||||
(0.06666666666666667, 0.41904761904761906)
|
||||
>>> b.words[0].synsets()[0]
|
||||
Synset('simple.n.01')
|
||||
|
||||
.. versionchanged:: 0.8.0
|
||||
These classes are now imported from ``textblob`` rather than ``text.blob``.
|
||||
"""
|
||||
from __future__ import unicode_literals, absolute_import
|
||||
import sys
|
||||
import json
|
||||
import warnings
|
||||
from collections import defaultdict
|
||||
|
||||
import nltk
|
||||
|
||||
from textblob.decorators import cached_property, requires_nltk_corpus
|
||||
from textblob.utils import lowerstrip, PUNCTUATION_REGEX
|
||||
from textblob.inflect import singularize as _singularize, pluralize as _pluralize
|
||||
from textblob.mixins import BlobComparableMixin, StringlikeMixin
|
||||
from textblob.compat import unicode, basestring
|
||||
from textblob.base import (BaseNPExtractor, BaseTagger, BaseTokenizer,
|
||||
BaseSentimentAnalyzer, BaseParser)
|
||||
from textblob.np_extractors import FastNPExtractor
|
||||
from textblob.taggers import NLTKTagger
|
||||
from textblob.tokenizers import WordTokenizer, sent_tokenize, word_tokenize
|
||||
from textblob.sentiments import PatternAnalyzer
|
||||
from textblob.parsers import PatternParser
|
||||
from textblob.translate import Translator
|
||||
from textblob.en import suggest
|
||||
|
||||
# Wordnet interface
|
||||
# NOTE: textblob.wordnet is not imported so that the wordnet corpus can be lazy-loaded
|
||||
_wordnet = nltk.corpus.wordnet
|
||||
|
||||
def _penn_to_wordnet(tag):
    """Convert a Penn corpus tag into a Wordnet tag.

    Returns ``None`` when the tag has no Wordnet counterpart.
    """
    conversions = (
        (("NN", "NNS", "NNP", "NNPS"), _wordnet.NOUN),
        (("JJ", "JJR", "JJS"), _wordnet.ADJ),
        (("VB", "VBD", "VBG", "VBN", "VBP", "VBZ"), _wordnet.VERB),
        (("RB", "RBR", "RBS"), _wordnet.ADV),
    )
    for penn_tags, wordnet_tag in conversions:
        if tag in penn_tags:
            return wordnet_tag
    return None
||||
|
||||
class Word(unicode):

    """A simple word representation. Includes methods for inflection,
    translation, and WordNet integration.
    """

    # Shared translator client; every Word instance reuses this object.
    translator = Translator()

    def __new__(cls, string, pos_tag=None):
        """Return a new instance of the class. It is necessary to override
        this method in order to handle the extra pos_tag argument in the
        constructor.
        """
        # unicode.__new__ accepts only the string; pos_tag is consumed
        # by __init__ below.
        return super(Word, cls).__new__(cls, string)

    def __init__(self, string, pos_tag=None):
        # Keep the plain string alongside the unicode value so helper
        # libraries receive a plain string rather than a Word subclass.
        self.string = string
        self.pos_tag = pos_tag

    def __repr__(self):
        return repr(self.string)

    def __str__(self):
        return self.string

    def singularize(self):
        """Return the singular version of the word as a string."""
        return Word(_singularize(self.string))

    def pluralize(self):
        '''Return the plural version of the word as a string.'''
        return Word(_pluralize(self.string))

    def translate(self, from_lang='auto', to="en"):
        '''Translate the word to another language using Google's
        Translate API.

        .. deprecated:: 0.16.0
            Use the official Google Translate API instead.
        .. versionadded:: 0.5.0
        '''
        warnings.warn(
            'Word.translate is deprecated and will be removed in a future release. '
            'Use the official Google Translate API instead.',
            DeprecationWarning
        )
        return self.translator.translate(self.string,
                                         from_lang=from_lang, to_lang=to)

    def detect_language(self):
        '''Detect the word's language using Google's Translate API.

        .. deprecated:: 0.16.0
            Use the official Google Translate API instead.
        .. versionadded:: 0.5.0
        '''
        warnings.warn(
            'Word.detect_language is deprecated and will be removed in a future release. '
            'Use the official Google Translate API instead.',
            DeprecationWarning
        )
        return self.translator.detect(self.string)

    def spellcheck(self):
        '''Return a list of (word, confidence) tuples of spelling corrections.

        Based on: Peter Norvig, "How to Write a Spelling Corrector"
        (http://norvig.com/spell-correct.html) as implemented in the pattern
        library.

        .. versionadded:: 0.6.0
        '''
        return suggest(self.string)

    def correct(self):
        '''Correct the spelling of the word. Returns the word with the highest
        confidence using the spelling corrector.

        .. versionadded:: 0.6.0
        '''
        # suggest() returns candidates sorted by confidence; take the best.
        return Word(self.spellcheck()[0][0])

    @cached_property
    @requires_nltk_corpus
    def lemma(self):
        """Return the lemma of this word using Wordnet's morphy function.
        """
        # Uses the POS tag supplied at construction time, if any.
        return self.lemmatize(pos=self.pos_tag)

    @requires_nltk_corpus
    def lemmatize(self, pos=None):
        """Return the lemma for a word using WordNet's morphy function.

        :param pos: Part of speech to filter upon. If `None`, defaults to
            ``_wordnet.NOUN``.

        .. versionadded:: 0.8.1
        """
        if pos is None:
            tag = _wordnet.NOUN
        elif pos in _wordnet._FILEMAP.keys():
            # pos is already a Wordnet tag (e.g. 'n', 'v'); use verbatim.
            tag = pos
        else:
            # Otherwise assume a Penn Treebank tag and convert it.
            tag = _penn_to_wordnet(pos)
        lemmatizer = nltk.stem.WordNetLemmatizer()
        return lemmatizer.lemmatize(self.string, tag)

    # Class-level stemmer singletons, shared by all Word instances.
    PorterStemmer = nltk.stem.porter.PorterStemmer()
    LancasterStemmer = nltk.stem.lancaster.LancasterStemmer()
    SnowballStemmer = nltk.stem.snowball.SnowballStemmer("english")

    #added 'stemmer' on lines of lemmatizer
    #based on nltk
    def stem(self, stemmer=PorterStemmer):
        """Stem a word using various NLTK stemmers. (Default: Porter Stemmer)

        .. versionadded:: 0.12.0
        """
        # NOTE(review): the default is the shared class-level stemmer
        # instance above — fine for stateless stemmers, but worth knowing.
        return stemmer.stem(self.string)

    @cached_property
    def synsets(self):
        """The list of Synset objects for this Word.

        :rtype: list of Synsets

        .. versionadded:: 0.7.0
        """
        return self.get_synsets(pos=None)

    @cached_property
    def definitions(self):
        """The list of definitions for this word. Each definition corresponds
        to a synset.

        .. versionadded:: 0.7.0
        """
        return self.define(pos=None)

    def get_synsets(self, pos=None):
        """Return a list of Synset objects for this word.

        :param pos: A part-of-speech tag to filter upon. If ``None``, all
            synsets for all parts of speech will be loaded.

        :rtype: list of Synsets

        .. versionadded:: 0.7.0
        """
        return _wordnet.synsets(self.string, pos)

    def define(self, pos=None):
        """Return a list of definitions for this word. Each definition
        corresponds to a synset for this word.

        :param pos: A part-of-speech tag to filter upon. If ``None``, definitions
            for all parts of speech will be loaded.
        :rtype: List of strings

        .. versionadded:: 0.7.0
        """
        return [syn.definition() for syn in self.get_synsets(pos=pos)]
||||
|
||||
|
||||
class WordList(list):
    """A list-like collection of words."""

    def __init__(self, collection):
        """Initialize a WordList. Takes a collection of strings as
        its only argument.
        """
        # Coerce every element to a Word up front.
        super(WordList, self).__init__([Word(w) for w in collection])

    def __str__(self):
        """Returns a string representation for printing."""
        return super(WordList, self).__repr__()

    def __repr__(self):
        """Returns a string representation for debugging."""
        class_name = self.__class__.__name__
        return '{cls}({lst})'.format(cls=class_name, lst=super(WordList, self).__repr__())

    def __getitem__(self, key):
        """Returns a string at the given index."""
        item = super(WordList, self).__getitem__(key)
        if isinstance(key, slice):
            # Slices stay WordLists; single lookups return the Word itself.
            return self.__class__(item)
        else:
            return item

    def __getslice__(self, i, j):
        # This is included for Python 2.* compatibility
        return self.__class__(super(WordList, self).__getslice__(i, j))

    def __setitem__(self, index, obj):
        """Places object at given index, replacing existing item. If the object
        is a string, inserts a :class:`Word <Word>` object.
        """
        if isinstance(obj, basestring):
            super(WordList, self).__setitem__(index, Word(obj))
        else:
            super(WordList, self).__setitem__(index, obj)

    def count(self, strg, case_sensitive=False, *args, **kwargs):
        """Get the count of a word or phrase ``strg`` within this WordList.

        :param strg: The string to count.
        :param case_sensitive: A boolean, whether or not the search is case-sensitive.
        """
        if not case_sensitive:
            # Compare on lower-cased copies for a case-insensitive count.
            return [word.lower() for word in self].count(strg.lower(), *args,
                    **kwargs)
        return super(WordList, self).count(strg, *args, **kwargs)

    def append(self, obj):
        """Append an object to end. If the object is a string, appends a
        :class:`Word <Word>` object.
        """
        if isinstance(obj, basestring):
            super(WordList, self).append(Word(obj))
        else:
            super(WordList, self).append(obj)

    def extend(self, iterable):
        """Extend WordList by appending elements from ``iterable``. If an element
        is a string, appends a :class:`Word <Word>` object.
        """
        # Route through append() so string coercion applies per element.
        for e in iterable:
            self.append(e)

    def upper(self):
        """Return a new WordList with each word upper-cased."""
        return self.__class__([word.upper() for word in self])

    def lower(self):
        """Return a new WordList with each word lower-cased."""
        return self.__class__([word.lower() for word in self])

    def singularize(self):
        """Return the single version of each word in this WordList."""
        return self.__class__([word.singularize() for word in self])

    def pluralize(self):
        """Return the plural version of each word in this WordList."""
        return self.__class__([word.pluralize() for word in self])

    def lemmatize(self):
        """Return the lemma of each word in this WordList."""
        return self.__class__([word.lemmatize() for word in self])

    def stem(self, *args, **kwargs):
        """Return the stem for each word in this WordList."""
        return self.__class__([word.stem(*args, **kwargs) for word in self])
||||
|
||||
|
||||
def _validated_param(obj, name, base_class, default, base_class_name=None):
|
||||
"""Validates a parameter passed to __init__. Makes sure that obj is
|
||||
the correct class. Return obj if it's not None or falls back to default
|
||||
|
||||
:param obj: The object passed in.
|
||||
:param name: The name of the parameter.
|
||||
:param base_class: The class that obj must inherit from.
|
||||
:param default: The default object to fall back upon if obj is None.
|
||||
"""
|
||||
base_class_name = base_class_name if base_class_name else base_class.__name__
|
||||
if obj is not None and not isinstance(obj, base_class):
|
||||
raise ValueError('{name} must be an instance of {cls}'
|
||||
.format(name=name, cls=base_class_name))
|
||||
return obj or default
|
||||
|
||||
|
||||
def _initialize_models(obj, tokenizer, pos_tagger,
                       np_extractor, analyzer, parser, classifier):
    """Attach the shared NLP models to *obj*.

    Common initialization between BaseBlob and Blobber: each model falls
    back to the class-level default on ``BaseBlob`` when ``None`` is given.
    """
    # tokenizer may be a textblob or an NLTK tokenizer
    obj.tokenizer = _validated_param(
        tokenizer, "tokenizer",
        base_class=(BaseTokenizer, nltk.tokenize.api.TokenizerI),
        default=BaseBlob.tokenizer,
        base_class_name="BaseTokenizer",
    )
    obj.np_extractor = _validated_param(
        np_extractor, "np_extractor",
        base_class=BaseNPExtractor,
        default=BaseBlob.np_extractor,
    )
    obj.pos_tagger = _validated_param(
        pos_tagger, "pos_tagger",
        base_class=BaseTagger,
        default=BaseBlob.pos_tagger,
    )
    obj.analyzer = _validated_param(
        analyzer, "analyzer",
        base_class=BaseSentimentAnalyzer,
        default=BaseBlob.analyzer,
    )
    obj.parser = _validated_param(
        parser, "parser",
        base_class=BaseParser,
        default=BaseBlob.parser,
    )
    # The classifier is optional and unvalidated.
    obj.classifier = classifier
||||
|
||||
|
||||
class BaseBlob(StringlikeMixin, BlobComparableMixin):
    """An abstract base class that all textblob classes will inherit from.
    Includes words, POS tag, NP, and word count properties. Also includes
    basic dunder and string methods for making objects like Python strings.

    :param text: A string.
    :param tokenizer: (optional) A tokenizer instance. If ``None``,
        defaults to :class:`WordTokenizer() <textblob.tokenizers.WordTokenizer>`.
    :param np_extractor: (optional) An NPExtractor instance. If ``None``,
        defaults to :class:`FastNPExtractor() <textblob.en.np_extractors.FastNPExtractor>`.
    :param pos_tagger: (optional) A Tagger instance. If ``None``,
        defaults to :class:`NLTKTagger <textblob.en.taggers.NLTKTagger>`.
    :param analyzer: (optional) A sentiment analyzer. If ``None``,
        defaults to :class:`PatternAnalyzer <textblob.en.sentiments.PatternAnalyzer>`.
    :param parser: A parser. If ``None``, defaults to
        :class:`PatternParser <textblob.en.parsers.PatternParser>`.
    :param classifier: A classifier.

    .. versionchanged:: 0.6.0
        ``clean_html`` parameter deprecated, as it was in NLTK.
    """

    # Class-level default models, shared by every blob unless overridden
    # per-instance via __init__.
    np_extractor = FastNPExtractor()
    pos_tagger = NLTKTagger()
    tokenizer = WordTokenizer()
    translator = Translator()
    analyzer = PatternAnalyzer()
    parser = PatternParser()

    def __init__(self, text, tokenizer=None,
                 pos_tagger=None, np_extractor=None, analyzer=None,
                 parser=None, classifier=None, clean_html=False):
        if not isinstance(text, basestring):
            raise TypeError('The `text` argument passed to `__init__(text)` '
                            'must be a string, not {0}'.format(type(text)))
        if clean_html:
            raise NotImplementedError("clean_html has been deprecated. "
                                      "To remove HTML markup, use BeautifulSoup's "
                                      "get_text() function")
        self.raw = self.string = text
        self.stripped = lowerstrip(self.raw, all=True)
        _initialize_models(self, tokenizer, pos_tagger, np_extractor, analyzer,
                           parser, classifier)

    @cached_property
    def words(self):
        """Return a list of word tokens. This excludes punctuation characters.
        If you want to include punctuation characters, access the ``tokens``
        property.

        :returns: A :class:`WordList <WordList>` of word tokens.
        """
        return WordList(word_tokenize(self.raw, include_punc=False))

    @cached_property
    def tokens(self):
        """Return a list of tokens, using this blob's tokenizer object
        (defaults to :class:`WordTokenizer <textblob.tokenizers.WordTokenizer>`).
        """
        return WordList(self.tokenizer.tokenize(self.raw))

    def tokenize(self, tokenizer=None):
        """Return a list of tokens, using ``tokenizer``.

        :param tokenizer: (optional) A tokenizer object. If None, defaults to
            this blob's default tokenizer.
        """
        t = tokenizer if tokenizer is not None else self.tokenizer
        return WordList(t.tokenize(self.raw))

    def parse(self, parser=None):
        """Parse the text.

        :param parser: (optional) A parser instance. If ``None``, defaults to
            this blob's default parser.

        .. versionadded:: 0.6.0
        """
        p = parser if parser is not None else self.parser
        return p.parse(self.raw)

    def classify(self):
        """Classify the blob using the blob's ``classifier``."""
        if self.classifier is None:
            raise NameError("This blob has no classifier. Train one first!")
        return self.classifier.classify(self.raw)

    @cached_property
    def sentiment(self):
        """Return a tuple of form (polarity, subjectivity ) where polarity
        is a float within the range [-1.0, 1.0] and subjectivity is a float
        within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is
        very subjective.

        :rtype: namedtuple of the form ``Sentiment(polarity, subjectivity)``
        """
        return self.analyzer.analyze(self.raw)

    @cached_property
    def sentiment_assessments(self):
        """Return a tuple of form (polarity, subjectivity, assessments ) where
        polarity is a float within the range [-1.0, 1.0], subjectivity is a
        float within the range [0.0, 1.0] where 0.0 is very objective and 1.0
        is very subjective, and assessments is a list of polarity and
        subjectivity scores for the assessed tokens.

        :rtype: namedtuple of the form ``Sentiment(polarity, subjectivity,
            assessments)``
        """
        return self.analyzer.analyze(self.raw, keep_assessments=True)

    @cached_property
    def polarity(self):
        """Return the polarity score as a float within the range [-1.0, 1.0]

        :rtype: float
        """
        # NOTE(review): uses a fresh PatternAnalyzer rather than
        # ``self.analyzer``, so a custom analyzer is ignored here.
        # Preserved as-is for backward compatibility.
        return PatternAnalyzer().analyze(self.raw)[0]

    @cached_property
    def subjectivity(self):
        """Return the subjectivity score as a float within the range [0.0, 1.0]
        where 0.0 is very objective and 1.0 is very subjective.

        :rtype: float
        """
        # NOTE(review): same caveat as ``polarity`` — bypasses self.analyzer.
        return PatternAnalyzer().analyze(self.raw)[1]

    @cached_property
    def noun_phrases(self):
        """Returns a list of noun phrases for this blob."""
        return WordList([phrase.strip().lower()
                         for phrase in self.np_extractor.extract(self.raw)
                         if len(phrase) > 1])

    @cached_property
    def pos_tags(self):
        """Returns an list of tuples of the form (word, POS tag).

        Example:
        ::

            [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'),
            ('Thursday', 'NNP'), ('morning', 'NN')]

        :rtype: list of tuples
        """
        if isinstance(self, TextBlob):
            # Tag sentence by sentence, then flatten.
            return [val for sublist in [s.pos_tags for s in self.sentences] for val in sublist]
        else:
            # Drop pure-punctuation tags.
            return [(Word(unicode(word), pos_tag=t), unicode(t))
                    for word, t in self.pos_tagger.tag(self)
                    if not PUNCTUATION_REGEX.match(unicode(t))]

    tags = pos_tags

    @cached_property
    def word_counts(self):
        """Dictionary of word frequencies in this text.
        """
        counts = defaultdict(int)
        stripped_words = [lowerstrip(word) for word in self.words]
        for word in stripped_words:
            counts[word] += 1
        return counts

    @cached_property
    def np_counts(self):
        """Dictionary of noun phrase frequencies in this text.
        """
        counts = defaultdict(int)
        for phrase in self.noun_phrases:
            counts[phrase] += 1
        return counts

    def ngrams(self, n=3):
        """Return a list of n-grams (tuples of n successive words) for this
        blob.

        :rtype: List of :class:`WordLists <WordList>`
        """
        if n <= 0:
            return []
        grams = [WordList(self.words[i:i + n])
                 for i in range(len(self.words) - n + 1)]
        return grams

    def translate(self, from_lang="auto", to="en"):
        """Translate the blob to another language.
        Uses the Google Translate API. Returns a new TextBlob.

        Requires an internet connection.

        Usage:
        ::

            >>> b = TextBlob("Simple is better than complex")
            >>> b.translate(to="es")
            TextBlob('Lo simple es mejor que complejo')

        Language code reference:
            https://developers.google.com/translate/v2/using_rest#language-params

        .. deprecated:: 0.16.0
            Use the official Google Translate API instead.
        .. versionadded:: 0.5.0.

        :param str from_lang: Language to translate from. If ``None``, will attempt
            to detect the language.
        :param str to: Language to translate to.
        :rtype: :class:`BaseBlob <BaseBlob>`
        """
        warnings.warn(
            'TextBlob.translate is deprecated and will be removed in a future release. '
            'Use the official Google Translate API instead.',
            DeprecationWarning
        )
        return self.__class__(self.translator.translate(self.raw,
                              from_lang=from_lang, to_lang=to))

    def detect_language(self):
        """Detect the blob's language using the Google Translate API.

        Requires an internet connection.

        Usage:
        ::

            >>> b = TextBlob("bonjour")
            >>> b.detect_language()
            u'fr'

        Language code reference:
            https://developers.google.com/translate/v2/using_rest#language-params

        .. deprecated:: 0.16.0
            Use the official Google Translate API instead.
        .. versionadded:: 0.5.0

        :rtype: str
        """
        # Fix: the warning previously named a nonexistent method
        # ('TextBlob.detext_translate'); it now names this method.
        warnings.warn(
            'TextBlob.detect_language is deprecated and will be removed in a future release. '
            'Use the official Google Translate API instead.',
            DeprecationWarning
        )
        return self.translator.detect(self.raw)

    def correct(self):
        """Attempt to correct the spelling of a blob.

        .. versionadded:: 0.6.0

        :rtype: :class:`BaseBlob <BaseBlob>`
        """
        # regex matches: word or punctuation or whitespace
        tokens = nltk.tokenize.regexp_tokenize(self.raw, r"\w+|[^\w\s]|\s")
        corrected = (Word(w).correct() for w in tokens)
        ret = ''.join(corrected)
        return self.__class__(ret)

    def _cmpkey(self):
        """Key used by ComparableMixin to implement all rich comparison
        operators.
        """
        return self.raw

    def _strkey(self):
        """Key used by StringlikeMixin to implement string methods."""
        return self.raw

    def __hash__(self):
        return hash(self._cmpkey())

    def __add__(self, other):
        '''Concatenates two text objects the same way Python strings are
        concatenated.

        Arguments:
        - `other`: a string or a text object
        '''
        if isinstance(other, basestring):
            return self.__class__(self.raw + other)
        elif isinstance(other, BaseBlob):
            return self.__class__(self.raw + other.raw)
        else:
            raise TypeError('Operands must be either strings or {0} objects'
                            .format(self.__class__.__name__))

    def split(self, sep=None, maxsplit=sys.maxsize):
        """Behaves like the built-in str.split() except returns a
        WordList.

        :rtype: :class:`WordList <WordList>`
        """
        return WordList(self._strkey().split(sep, maxsplit))
||||
|
||||
|
||||
class TextBlob(BaseBlob):
    """A general text block, meant for larger bodies of text (esp. those
    containing sentences). Inherits from :class:`BaseBlob <BaseBlob>`.

    :param str text: A string.
    :param tokenizer: (optional) A tokenizer instance. If ``None``, defaults to
        :class:`WordTokenizer() <textblob.tokenizers.WordTokenizer>`.
    :param np_extractor: (optional) An NPExtractor instance. If ``None``,
        defaults to :class:`FastNPExtractor() <textblob.en.np_extractors.FastNPExtractor>`.
    :param pos_tagger: (optional) A Tagger instance. If ``None``, defaults to
        :class:`NLTKTagger <textblob.en.taggers.NLTKTagger>`.
    :param analyzer: (optional) A sentiment analyzer. If ``None``, defaults to
        :class:`PatternAnalyzer <textblob.en.sentiments.PatternAnalyzer>`.
    :param classifier: (optional) A classifier.
    """

    @cached_property
    def sentences(self):
        """Return list of :class:`Sentence <Sentence>` objects."""
        return self._create_sentence_objects()

    @cached_property
    def words(self):
        """Return a list of word tokens. This excludes punctuation characters.
        If you want to include punctuation characters, access the ``tokens``
        property.

        :returns: A :class:`WordList <WordList>` of word tokens.
        """
        # NOTE(review): body duplicates BaseBlob.words — appears redundant;
        # confirm before removing.
        return WordList(word_tokenize(self.raw, include_punc=False))

    @property
    def raw_sentences(self):
        """List of strings, the raw sentences in the blob."""
        return [sentence.raw for sentence in self.sentences]

    @property
    def serialized(self):
        """Returns a list of each sentence's dict representation."""
        return [sentence.dict for sentence in self.sentences]

    def to_json(self, *args, **kwargs):
        '''Return a json representation (str) of this blob.
        Takes the same arguments as json.dumps.

        .. versionadded:: 0.5.1
        '''
        return json.dumps(self.serialized, *args, **kwargs)

    @property
    def json(self):
        '''The json representation of this blob.

        .. versionchanged:: 0.5.1
            Made ``json`` a property instead of a method to restore backwards
            compatibility that was broken after version 0.4.0.
        '''
        return self.to_json()

    def _create_sentence_objects(self):
        '''Returns a list of Sentence objects from the raw text.
        '''
        sentence_objects = []
        sentences = sent_tokenize(self.raw)
        char_index = 0  # Keeps track of character index within the blob
        for sent in sentences:
            # Compute the start and end indices of the sentence
            # within the blob
            # NOTE(review): str.index raises ValueError if the tokenizer
            # did not return a verbatim substring of self.raw — assumes
            # sent_tokenize preserves the text exactly; confirm.
            start_index = self.raw.index(sent, char_index)
            char_index += len(sent)
            end_index = start_index + len(sent)
            # Sentences share the same models as their parent blob
            s = Sentence(sent, start_index=start_index, end_index=end_index,
                tokenizer=self.tokenizer, np_extractor=self.np_extractor,
                pos_tagger=self.pos_tagger, analyzer=self.analyzer,
                parser=self.parser, classifier=self.classifier)
            sentence_objects.append(s)
        return sentence_objects
||||
|
||||
|
||||
class Sentence(BaseBlob):
    """A sentence within a TextBlob. Inherits from :class:`BaseBlob <BaseBlob>`.

    :param sentence: A string, the raw sentence.
    :param start_index: An int, the index where this sentence begins
        in a TextBlob. If not given, defaults to 0.
    :param end_index: An int, the index where this sentence ends in
        a TextBlob. If not given, defaults to the
        length of the sentence - 1.
    """

    def __init__(self, sentence, start_index=0, end_index=None, *args, **kwargs):
        super(Sentence, self).__init__(sentence, *args, **kwargs)
        # Falsy end_index (None) falls back to the sentence's own extent,
        # matching the original `end_index or len(sentence) - 1` logic.
        if not end_index:
            end_index = len(sentence) - 1
        #: The start index within a TextBlob
        self.start = self.start_index = start_index
        #: The end index within a TextBlob
        self.end = self.end_index = end_index

    @property
    def dict(self):
        '''The dict representation of this sentence.'''
        # Key order is kept identical to the original so serialized
        # JSON output is unchanged.
        return {
            'raw': self.raw,
            'start_index': self.start_index,
            'end_index': self.end_index,
            'stripped': self.stripped,
            'noun_phrases': self.noun_phrases,
            'polarity': self.polarity,
            'subjectivity': self.subjectivity,
        }
||||
|
||||
|
||||
class Blobber(object):
    """A factory for TextBlobs that all share the same tagger,
    tokenizer, parser, classifier, and np_extractor.

    Usage:

        >>> from textblob import Blobber
        >>> from textblob.taggers import NLTKTagger
        >>> from textblob.tokenizers import SentenceTokenizer
        >>> tb = Blobber(pos_tagger=NLTKTagger(), tokenizer=SentenceTokenizer())
        >>> blob1 = tb("This is one blob.")
        >>> blob2 = tb("This blob has the same tagger and tokenizer.")
        >>> blob1.pos_tagger is blob2.pos_tagger
        True

    :param tokenizer: (optional) A tokenizer instance. If ``None``,
        defaults to :class:`WordTokenizer() <textblob.tokenizers.WordTokenizer>`.
    :param np_extractor: (optional) An NPExtractor instance. If ``None``,
        defaults to :class:`FastNPExtractor() <textblob.en.np_extractors.FastNPExtractor>`.
    :param pos_tagger: (optional) A Tagger instance. If ``None``,
        defaults to :class:`NLTKTagger <textblob.en.taggers.NLTKTagger>`.
    :param analyzer: (optional) A sentiment analyzer. If ``None``,
        defaults to :class:`PatternAnalyzer <textblob.en.sentiments.PatternAnalyzer>`.
    :param parser: A parser. If ``None``, defaults to
        :class:`PatternParser <textblob.en.parsers.PatternParser>`.
    :param classifier: A classifier.

    .. versionadded:: 0.4.0
    """

    # Class-level defaults, instantiated once at import time and shared by
    # every Blobber (and every blob it produces) unless overridden below.
    np_extractor = FastNPExtractor()
    pos_tagger = NLTKTagger()
    tokenizer = WordTokenizer()
    analyzer = PatternAnalyzer()
    parser = PatternParser()

    def __init__(self, tokenizer=None, pos_tagger=None, np_extractor=None,
                 analyzer=None, parser=None, classifier=None):
        # Replaces each class-level default with the given instance, if any.
        _initialize_models(self, tokenizer, pos_tagger, np_extractor, analyzer,
                           parser, classifier)

    def __call__(self, text):
        """Return a new TextBlob object with this Blobber's ``np_extractor``,
        ``pos_tagger``, ``tokenizer``, ``analyzer``, and ``classifier``.

        :returns: A new :class:`TextBlob <TextBlob>`.
        """
        return TextBlob(text, tokenizer=self.tokenizer, pos_tagger=self.pos_tagger,
                        np_extractor=self.np_extractor, analyzer=self.analyzer,
                        parser=self.parser,
                        classifier=self.classifier)

    def __repr__(self):
        # Show "<Name>()" for a configured classifier, "None" otherwise.
        classifier_name = self.classifier.__class__.__name__ + "()" if self.classifier else "None"
        return ("Blobber(tokenizer={0}(), pos_tagger={1}(), "
                "np_extractor={2}(), analyzer={3}(), parser={4}(), classifier={5})")\
                .format(self.tokenizer.__class__.__name__,
                        self.pos_tagger.__class__.__name__,
                        self.np_extractor.__class__.__name__,
                        self.analyzer.__class__.__name__,
                        self.parser.__class__.__name__,
                        classifier_name)

    __str__ = __repr__
|
||||
503
backend/venv/Lib/site-packages/textblob/classifiers.py
Normal file
503
backend/venv/Lib/site-packages/textblob/classifiers.py
Normal file
@@ -0,0 +1,503 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Various classifier implementations. Also includes basic feature extractor
|
||||
methods.
|
||||
|
||||
Example Usage:
|
||||
::
|
||||
|
||||
>>> from textblob import TextBlob
|
||||
>>> from textblob.classifiers import NaiveBayesClassifier
|
||||
>>> train = [
|
||||
... ('I love this sandwich.', 'pos'),
|
||||
... ('This is an amazing place!', 'pos'),
|
||||
... ('I feel very good about these beers.', 'pos'),
|
||||
... ('I do not like this restaurant', 'neg'),
|
||||
... ('I am tired of this stuff.', 'neg'),
|
||||
... ("I can't deal with this", 'neg'),
|
||||
... ("My boss is horrible.", "neg")
|
||||
... ]
|
||||
>>> cl = NaiveBayesClassifier(train)
|
||||
>>> cl.classify("I feel amazing!")
|
||||
'pos'
|
||||
>>> blob = TextBlob("The beer is good. But the hangover is horrible.", classifier=cl)
|
||||
>>> for s in blob.sentences:
|
||||
... print(s)
|
||||
... print(s.classify())
|
||||
...
|
||||
The beer is good.
|
||||
pos
|
||||
But the hangover is horrible.
|
||||
neg
|
||||
|
||||
.. versionadded:: 0.6.0
|
||||
"""
|
||||
from __future__ import absolute_import
|
||||
from itertools import chain
|
||||
|
||||
import nltk
|
||||
|
||||
from textblob.compat import basestring
|
||||
from textblob.decorators import cached_property
|
||||
from textblob.exceptions import FormatError
|
||||
from textblob.tokenizers import word_tokenize
|
||||
from textblob.utils import strip_punc, is_filelike
|
||||
import textblob.formats as formats
|
||||
|
||||
### Basic feature extractors ###
|
||||
|
||||
|
||||
def _get_words_from_dataset(dataset):
    """Return a set of all words in a dataset.

    :param dataset: A list of tuples of the form ``(words, label)`` where
        ``words`` is either a string or a list of tokens.
    """
    def tokenize(words):
        # Accept either a raw string (tokenize it) or pre-tokenized words.
        if isinstance(words, basestring):
            return word_tokenize(words, include_punc=False)
        return words
    return set(chain.from_iterable(tokenize(words) for words, _ in dataset))
|
||||
|
||||
def _get_document_tokens(document):
    """Return the set of punctuation-stripped tokens in ``document``.

    ``document`` may be a raw string (tokenized here) or an iterable of
    already-tokenized words.
    """
    if isinstance(document, basestring):
        words = word_tokenize(document, include_punc=False)
    else:
        words = document
    return set(strip_punc(w, all=False) for w in words)
|
||||
|
||||
def basic_extractor(document, train_set):
    """A basic document feature extractor that returns a dict indicating
    what words in ``train_set`` are contained in ``document``.

    :param document: The text to extract features from. Can be a string or an iterable.
    :param list train_set: Training data set, a list of tuples of the form
        ``(words, label)`` OR an iterable of strings.
    :raises ValueError: If ``train_set`` is neither of the accepted shapes.
    """
    train_iter = iter(train_set)
    try:
        el_zero = next(train_iter)  # Infer input type from the first element.
    except StopIteration:
        return {}
    # Re-attach the consumed first element so one-shot iterators still yield
    # their full contents. (Chaining onto the partially consumed iterator,
    # rather than onto train_set itself, also avoids duplicating the first
    # element when train_set is a list.)
    full_set = chain([el_zero], train_iter)
    if isinstance(el_zero, basestring):
        # train_set is an iterable of feature words.
        word_features = list(full_set)
    else:
        # train_set should be (words, label) tuples. The original code used
        # ``assert`` for this validation, which is silently skipped under
        # ``python -O``; validate explicitly instead.
        try:
            first_word = el_zero[0]
        except Exception:
            raise ValueError('train_set is probably malformed.')
        if not isinstance(first_word, basestring):
            raise ValueError('train_set is probably malformed.')
        try:
            word_features = _get_words_from_dataset(full_set)
        except Exception:
            raise ValueError('train_set is probably malformed.')

    tokens = _get_document_tokens(document)
    return dict((u'contains({0})'.format(word), word in tokens)
                for word in word_features)
|
||||
|
||||
|
||||
def contains_extractor(document):
    """A basic document feature extractor that returns a dict of words that
    the document contains, keyed as ``contains(<word>)``.
    """
    return dict((u'contains({0})'.format(w), True)
                for w in _get_document_tokens(document))
|
||||
|
||||
##### CLASSIFIERS #####
|
||||
|
||||
class BaseClassifier(object):
    """Abstract classifier class from which all classifers inherit. At a
    minimum, descendant classes must implement a ``classify`` method and have
    a ``classifier`` property.

    :param train_set: The training set, either a list of tuples of the form
        ``(text, classification)`` or a file-like object. ``text`` may be either
        a string or an iterable.
    :param callable feature_extractor: A feature extractor function that takes one or
        two arguments: ``document`` and ``train_set``.
    :param str format: If ``train_set`` is a filename, the file format, e.g.
        ``"csv"`` or ``"json"``. If ``None``, will attempt to detect the
        file format.
    :param kwargs: Additional keyword arguments are passed to the constructor
        of the :class:`Format <textblob.formats.BaseFormat>` class used to
        read the data. Only applies when a file-like object is passed as
        ``train_set``.

    .. versionadded:: 0.6.0
    """

    def __init__(self, train_set, feature_extractor=basic_extractor, format=None, **kwargs):
        self.format_kwargs = kwargs
        self.feature_extractor = feature_extractor
        if is_filelike(train_set):
            self.train_set = self._read_data(train_set, format)
        else:  # train_set is a list of tuples
            self.train_set = train_set
        # NOTE(review): if _read_data returns a one-shot iterator, consuming
        # it here would exhaust it before training -- presumably
        # to_iterable() returns a re-iterable sequence; confirm.
        self._word_set = _get_words_from_dataset(self.train_set)  # Keep a hidden set of unique words.
        self.train_features = None

    def _read_data(self, dataset, format=None):
        """Reads a data file and returns an iterable that can be used
        as testing or training data.
        """
        # Attempt to detect file format if "format" isn't specified
        if not format:
            format_class = formats.detect(dataset)
            if not format_class:
                raise FormatError('Could not automatically detect format for the given '
                                  'data source.')
        else:
            registry = formats.get_registry()
            if format not in registry.keys():
                raise ValueError("'{0}' format not supported.".format(format))
            format_class = registry[format]
        return format_class(dataset, **self.format_kwargs).to_iterable()

    @cached_property
    def classifier(self):
        """The classifier object. Descendants must override this."""
        raise NotImplementedError('Must implement the "classifier" property.')

    def classify(self, text):
        """Classifies a string of text. Descendants must override this."""
        raise NotImplementedError('Must implement a "classify" method.')

    def train(self, labeled_featureset):
        """Trains the classifier. Descendants must override this."""
        raise NotImplementedError('Must implement a "train" method.')

    def labels(self):
        """Returns an iterable containing the possible labels.
        Descendants must override this.
        """
        raise NotImplementedError('Must implement a "labels" method.')

    def extract_features(self, text):
        '''Extracts features from a body of text.

        :rtype: dictionary of features
        '''
        # Feature extractor may take one or two arguments: the two-argument
        # form is tried first; extractors accepting only (document) raise
        # TypeError and are retried with a single argument.
        try:
            return self.feature_extractor(text, self._word_set)
        except (TypeError, AttributeError):
            return self.feature_extractor(text)
|
||||
|
||||
|
||||
class NLTKClassifier(BaseClassifier):
    """An abstract class that wraps around the nltk.classify module.

    Expects that descendant classes include a class variable ``nltk_class``
    which is the class in the nltk.classify module to be wrapped.

    Example: ::

        class MyClassifier(NLTKClassifier):
            nltk_class = nltk.classify.svm.SvmClassifier
    """

    #: The NLTK class to be wrapped. Must be a class within nltk.classify
    nltk_class = None

    def __init__(self, train_set,
                 feature_extractor=basic_extractor, format=None, **kwargs):
        super(NLTKClassifier, self).__init__(train_set, feature_extractor, format, **kwargs)
        # Pre-compute the (features, label) pairs consumed by nltk's train().
        self.train_features = [(self.extract_features(d), c) for d, c in self.train_set]

    def __repr__(self):
        class_name = self.__class__.__name__
        return "<{cls} trained on {n} instances>".format(cls=class_name,
                                                        n=len(self.train_set))

    @cached_property
    def classifier(self):
        """The classifier. Trained lazily on first access and then cached."""
        try:
            return self.train()
        except AttributeError:  # nltk_class has not been defined
            raise ValueError("NLTKClassifier must have a nltk_class"
                            " variable that is not None.")

    def train(self, *args, **kwargs):
        """Train the classifier with a labeled feature set and return
        the classifier. Takes the same arguments as the wrapped NLTK class.
        This method is implicitly called when calling ``classify`` or
        ``accuracy`` methods and is included only to allow passing in arguments
        to the ``train`` method of the wrapped NLTK class.

        .. versionadded:: 0.6.2

        :rtype: A classifier
        """
        try:
            # Assigning to self.classifier stores the trained classifier in
            # the instance __dict__, which is exactly where cached_property
            # caches its value -- later reads see this fresh classifier.
            self.classifier = self.nltk_class.train(self.train_features,
                                                    *args, **kwargs)
            return self.classifier
        except AttributeError:
            raise ValueError("NLTKClassifier must have a nltk_class"
                            " variable that is not None.")

    def labels(self):
        """Return an iterable of possible labels."""
        return self.classifier.labels()

    def classify(self, text):
        """Classifies the text.

        :param str text: A string of text.
        """
        text_features = self.extract_features(text)
        return self.classifier.classify(text_features)

    def accuracy(self, test_set, format=None):
        """Compute the accuracy on a test set.

        :param test_set: A list of tuples of the form ``(text, label)``, or a
            file pointer.
        :param format: If ``test_set`` is a filename, the file format, e.g.
            ``"csv"`` or ``"json"``. If ``None``, will attempt to detect the
            file format.
        """
        if is_filelike(test_set):
            test_data = self._read_data(test_set, format)
        else:  # test_set is a list of tuples
            test_data = test_set
        test_features = [(self.extract_features(d), c) for d, c in test_data]
        return nltk.classify.accuracy(self.classifier, test_features)

    def update(self, new_data, *args, **kwargs):
        """Update the classifier with new training data and re-trains the
        classifier.

        :param new_data: New data as a list of tuples of the form
            ``(text, label)``.
        """
        self.train_set += new_data
        self._word_set.update(_get_words_from_dataset(new_data))
        # Features must be recomputed from scratch: the enlarged word set
        # changes the feature dict produced for every document.
        self.train_features = [(self.extract_features(d), c)
                                for d, c in self.train_set]
        try:
            self.classifier = self.nltk_class.train(self.train_features,
                                                    *args, **kwargs)
        except AttributeError:  # Descendant has not defined nltk_class
            raise ValueError("NLTKClassifier must have a nltk_class"
                            " variable that is not None.")
        return True
|
||||
|
||||
|
||||
class NaiveBayesClassifier(NLTKClassifier):
    """Wrapper around NLTK's Naive Bayes classifier.

    :param train_set: The training set, either a list of tuples of the form
        ``(text, classification)`` or a filename. ``text`` may be either
        a string or an iterable.
    :param feature_extractor: A feature extractor function that takes one or
        two arguments: ``document`` and ``train_set``.
    :param format: If ``train_set`` is a filename, the file format, e.g.
        ``"csv"`` or ``"json"``. If ``None``, will attempt to detect the
        file format.

    .. versionadded:: 0.6.0
    """

    nltk_class = nltk.classify.NaiveBayesClassifier

    def prob_classify(self, text):
        """Return the label probability distribution for classifying a string
        of text.

        Example:
        ::

            >>> classifier = NaiveBayesClassifier(train_data)
            >>> prob_dist = classifier.prob_classify("I feel happy this morning.")
            >>> prob_dist.max()
            'positive'
            >>> prob_dist.prob("positive")
            0.7

        :rtype: nltk.probability.DictionaryProbDist
        """
        return self.classifier.prob_classify(self.extract_features(text))

    def informative_features(self, *args, **kwargs):
        """Return the most informative features as a list of
        ``(feature_name, feature_value)`` tuples.

        :rtype: list
        """
        return self.classifier.most_informative_features(*args, **kwargs)

    def show_informative_features(self, *args, **kwargs):
        """Print a listing of the most informative features for this
        classifier.

        :rtype: None
        """
        return self.classifier.show_most_informative_features(*args, **kwargs)
|
||||
|
||||
|
||||
class DecisionTreeClassifier(NLTKClassifier):
    """Wrapper around NLTK's decision tree classifier.

    :param train_set: The training set, either a list of tuples of the form
        ``(text, classification)`` or a filename. ``text`` may be either
        a string or an iterable.
    :param feature_extractor: A feature extractor function that takes one or
        two arguments: ``document`` and ``train_set``.
    :param format: If ``train_set`` is a filename, the file format, e.g.
        ``"csv"`` or ``"json"``. If ``None``, will attempt to detect the
        file format.

    .. versionadded:: 0.6.2
    """

    nltk_class = nltk.classify.decisiontree.DecisionTreeClassifier

    def pretty_format(self, *args, **kwargs):
        """Return a pretty-printed string of this decision tree, one line
        per node or leaf, indented to show the tree structure.

        :rtype: str
        """
        return self.classifier.pretty_format(*args, **kwargs)

    # Backwards-compat alias for the old method name.
    pprint = pretty_format

    def pseudocode(self, *args, **kwargs):
        """Return this decision tree's decisions expressed as a nested set
        of pseudocode if statements.

        :rtype: str
        """
        return self.classifier.pseudocode(*args, **kwargs)
|
||||
|
||||
|
||||
class PositiveNaiveBayesClassifier(NLTKClassifier):
    """A variant of the Naive Bayes Classifier that performs binary
    classification with partially-labeled training sets, i.e. when only
    one class is labeled and the other is not. Assuming a prior distribution
    on the two labels, uses the unlabeled set to estimate the frequencies of
    the features.

    Example usage:
    ::

        >>> from text.classifiers import PositiveNaiveBayesClassifier
        >>> sports_sentences = ['The team dominated the game',
        ...                   'They lost the ball',
        ...                   'The game was intense',
        ...                   'The goalkeeper catched the ball',
        ...                   'The other team controlled the ball']
        >>> various_sentences = ['The President did not comment',
        ...                        'I lost the keys',
        ...                        'The team won the game',
        ...                        'Sara has two kids',
        ...                        'The ball went off the court',
        ...                        'They had the ball for the whole game',
        ...                        'The show is over']
        >>> classifier = PositiveNaiveBayesClassifier(positive_set=sports_sentences,
        ...                                           unlabeled_set=various_sentences)
        >>> classifier.classify("My team lost the game")
        True
        >>> classifier.classify("And now for something completely different.")
        False

    :param positive_set: A collection of strings that have the positive label.
    :param unlabeled_set: A collection of unlabeled strings.
    :param feature_extractor: A feature extractor function.
    :param positive_prob_prior: A prior estimate of the probability of the
        label ``True``.

    .. versionadded:: 0.7.0
    """

    nltk_class = nltk.classify.PositiveNaiveBayesClassifier

    def __init__(self, positive_set, unlabeled_set,
                 feature_extractor=contains_extractor,
                 positive_prob_prior=0.5, **kwargs):
        # Deliberately does NOT call the BaseClassifier/NLTKClassifier
        # constructor: there is no (text, label) train_set here, only a
        # positive set and an unlabeled set with their feature dicts.
        self.feature_extractor = feature_extractor
        self.positive_set = positive_set
        self.unlabeled_set = unlabeled_set
        self.positive_features = [self.extract_features(d)
                                  for d in self.positive_set]
        self.unlabeled_features = [self.extract_features(d)
                                   for d in self.unlabeled_set]
        self.positive_prob_prior = positive_prob_prior

    def __repr__(self):
        class_name = self.__class__.__name__
        return "<{cls} trained on {n_pos} labeled and {n_unlabeled} unlabeled instances>"\
                .format(cls=class_name, n_pos=len(self.positive_set),
                        n_unlabeled=len(self.unlabeled_set))

    # Override
    def train(self, *args, **kwargs):
        """Train the classifier with a labeled and unlabeled feature sets and return
        the classifier. Takes the same arguments as the wrapped NLTK class.
        This method is implicitly called when calling ``classify`` or
        ``accuracy`` methods and is included only to allow passing in arguments
        to the ``train`` method of the wrapped NLTK class.

        :rtype: A classifier
        """
        # Assigning to self.classifier replaces the cached_property cache so
        # subsequent reads return this trained classifier.
        self.classifier = self.nltk_class.train(self.positive_features,
                                                self.unlabeled_features,
                                                self.positive_prob_prior)
        return self.classifier

    def update(self, new_positive_data=None,
               new_unlabeled_data=None, positive_prob_prior=0.5,
               *args, **kwargs):
        """Update the classifier with new data and re-trains the
        classifier.

        :param new_positive_data: List of new, labeled strings.
        :param new_unlabeled_data: List of new, unlabeled strings.
        :param positive_prob_prior: Replacement prior for the ``True`` label
            (note: resets to 0.5 if not passed explicitly).
        """
        self.positive_prob_prior = positive_prob_prior
        # Features for the new documents are appended incrementally; unlike
        # NLTKClassifier.update, nothing needs recomputing because each
        # document's features are independent of the rest of the sets.
        if new_positive_data:
            self.positive_set += new_positive_data
            self.positive_features += [self.extract_features(d)
                                       for d in new_positive_data]
        if new_unlabeled_data:
            self.unlabeled_set += new_unlabeled_data
            self.unlabeled_features += [self.extract_features(d)
                                        for d in new_unlabeled_data]
        self.classifier = self.nltk_class.train(self.positive_features,
                                                self.unlabeled_features,
                                                self.positive_prob_prior,
                                                *args, **kwargs)
        return True
|
||||
|
||||
|
||||
class MaxEntClassifier(NLTKClassifier):
    # Reuse the wrapped NLTK maxent classifier's own docstring.
    __doc__ = nltk.classify.maxent.MaxentClassifier.__doc__
    nltk_class = nltk.classify.maxent.MaxentClassifier

    def prob_classify(self, text):
        """Return the label probability distribution for classifying a string
        of text.

        Example:
        ::

            >>> classifier = MaxEntClassifier(train_data)
            >>> prob_dist = classifier.prob_classify("I feel happy this morning.")
            >>> prob_dist.max()
            'positive'
            >>> prob_dist.prob("positive")
            0.7

        :rtype: nltk.probability.DictionaryProbDist
        """
        features = self.extract_features(text)
        return self.classifier.prob_classify(features)
|
||||
53
backend/venv/Lib/site-packages/textblob/compat.py
Normal file
53
backend/venv/Lib/site-packages/textblob/compat.py
Normal file
@@ -0,0 +1,53 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import sys

# Use sys.version_info rather than parsing the first character of
# sys.version: string parsing is fragile (it breaks for any interpreter
# whose major version has more than one digit, and for vendor-modified
# version strings), while version_info is a structured tuple.
PY2 = sys.version_info[0] == 2

if PY2:
    from itertools import imap, izip
    import urllib2 as request
    from urllib import quote as urlquote
    from urllib import urlencode
    text_type = unicode
    binary_type = str
    string_types = (str, unicode)
    unicode = unicode
    basestring = basestring
    imap = imap
    izip = izip
    import unicodecsv as csv

    def implements_to_string(cls):
        """Class decorator that renames __str__ to __unicode__ and
        modifies __str__ that returns utf-8.
        """
        cls.__unicode__ = cls.__str__
        cls.__str__ = lambda x: x.__unicode__().encode('utf-8')
        return cls
else:  # PY3
    from urllib import request
    from urllib.parse import quote as urlquote
    from urllib.parse import urlencode
    text_type = str
    binary_type = bytes
    string_types = (str,)
    unicode = str
    basestring = (str, bytes)
    imap = map
    izip = zip
    import csv

    # On Python 3 __str__ already returns text, so the decorator is a no-op.
    implements_to_string = lambda x: x
|
||||
|
||||
|
||||
# From six
|
||||
def with_metaclass(meta, *bases):
    """Create a base class with a metaclass. (From six.)"""
    # Trick: return an instance of a throwaway metaclass. When the target
    # class body is executed with this as its base, the throwaway
    # metaclass's __new__ fires once and replaces everything by invoking
    # the real metaclass with the real bases.
    class _TemporaryMeta(meta):  # noqa

        def __new__(mcs, name, this_bases, namespace):
            return meta(name, bases, namespace)
    return type.__new__(_TemporaryMeta, 'temporary_class', (), {})
|
||||
39
backend/venv/Lib/site-packages/textblob/decorators.py
Normal file
39
backend/venv/Lib/site-packages/textblob/decorators.py
Normal file
@@ -0,0 +1,39 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Custom decorators."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from functools import wraps
|
||||
from textblob.exceptions import MissingCorpusError
|
||||
|
||||
|
||||
class cached_property(object):
    """A property that is computed once per instance and then stored as a
    plain attribute, so later reads bypass the descriptor entirely.
    Deleting the attribute resets the property.

    Credit to Marcel Hellkamp, author of bottle.py.
    """

    def __init__(self, func):
        self.__doc__ = getattr(func, '__doc__')
        self.func = func

    def __get__(self, obj, cls):
        # Class-level access returns the descriptor itself.
        if obj is None:
            return self
        result = self.func(obj)
        # Shadow the descriptor with the computed value; because this is a
        # non-data descriptor, the instance attribute wins from now on.
        obj.__dict__[self.func.__name__] = result
        return result
|
||||
|
||||
|
||||
def requires_nltk_corpus(func):
    """Wrap a function that requires an NLTK corpus. If the corpus isn't
    found, raise a :exc:`MissingCorpusError`.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except LookupError as err:
            # nltk signals a missing corpus/model with LookupError.
            print(err)
            raise MissingCorpusError()
    return wrapper
|
||||
51
backend/venv/Lib/site-packages/textblob/download_corpora.py
Normal file
51
backend/venv/Lib/site-packages/textblob/download_corpora.py
Normal file
@@ -0,0 +1,51 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Downloads the necessary NLTK corpora for TextBlob.
|
||||
|
||||
Usage: ::
|
||||
|
||||
$ python -m textblob.download_corpora
|
||||
|
||||
If you only intend to use TextBlob's default models, you can use the "lite"
|
||||
option: ::
|
||||
|
||||
$ python -m textblob.download_corpora lite
|
||||
|
||||
"""
|
||||
import sys
|
||||
import nltk
|
||||
|
||||
# NLTK corpora needed by TextBlob's default models.
MIN_CORPORA = [
    'brown',  # Required for FastNPExtractor
    'punkt',  # Required for WordTokenizer
    'wordnet',  # Required for lemmatization
    'averaged_perceptron_tagger',  # Required for NLTKTagger
]

# Extra corpora needed only by optional, non-default models.
ADDITIONAL_CORPORA = [
    'conll2000',  # Required for ConllExtractor
    'movie_reviews',  # Required for NaiveBayesAnalyzer
]

ALL_CORPORA = MIN_CORPORA + ADDITIONAL_CORPORA
|
||||
|
||||
def download_lite():
    """Download only the corpora required by TextBlob's default models."""
    for corpus in MIN_CORPORA:
        nltk.download(corpus)
|
||||
|
||||
|
||||
def download_all():
    """Download every corpus TextBlob can use, default and optional."""
    for corpus in ALL_CORPORA:
        nltk.download(corpus)
|
||||
|
||||
|
||||
def main():
    """Entry point: download all corpora, or only the minimal set when
    'lite' is given on the command line.
    """
    download_lite() if 'lite' in sys.argv else download_all()
    print("Finished.")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
139
backend/venv/Lib/site-packages/textblob/en/__init__.py
Normal file
139
backend/venv/Lib/site-packages/textblob/en/__init__.py
Normal file
@@ -0,0 +1,139 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
'''This file is based on pattern.en. See the bundled NOTICE file for
|
||||
license information.
|
||||
'''
|
||||
from __future__ import absolute_import
|
||||
import os
|
||||
|
||||
from textblob._text import (Parser as _Parser, Sentiment as _Sentiment, Lexicon,
|
||||
WORD, POS, CHUNK, PNP, PENN, UNIVERSAL, Spelling)
|
||||
|
||||
from textblob.compat import text_type, unicode
|
||||
|
||||
try:
|
||||
MODULE = os.path.dirname(os.path.abspath(__file__))
|
||||
except:
|
||||
MODULE = ""
|
||||
|
||||
spelling = Spelling(
|
||||
path = os.path.join(MODULE, "en-spelling.txt")
|
||||
)
|
||||
|
||||
#--- ENGLISH PARSER --------------------------------------------------------------------------------
|
||||
|
||||
def find_lemmata(tokens):
    """ Annotates the tokens with lemmata for plural nouns and conjugated verbs,
        where each token is a [word, part-of-speech] list.
    """
    # NOTE(review): ``singularize``, ``conjugate`` and ``INFINITIVE`` are not
    # imported in this module as shown; calling this function would raise
    # NameError unless they are supplied elsewhere -- TODO confirm.
    for token in tokens:
        word, pos, lemma = token[0], token[1], token[0]
        # cats => cat
        if pos == "NNS":
            lemma = singularize(word)
        # sat => sit
        if pos.startswith(("VB", "MD")):
            lemma = conjugate(word, INFINITIVE) or word
        # The lowercased lemma is appended as a third element of each token.
        token.append(lemma.lower())
    return tokens
|
||||
|
||||
class Parser(_Parser):
    # English-specific parser: delegates to the generic _Parser but plugs in
    # English lemmatization and tagset mapping.

    def find_lemmata(self, tokens, **kwargs):
        # Delegate to the module-level find_lemmata().
        return find_lemmata(tokens)

    def find_tags(self, tokens, **kwargs):
        # For the default (PENN) tagset, map each (token, tag) to itself.
        if kwargs.get("tagset") in (PENN, None):
            kwargs.setdefault("map", lambda token, tag: (token, tag))
        # NOTE(review): ``penntreebank2universal`` is not visibly imported in
        # this module -- confirm it is provided elsewhere before relying on
        # the UNIVERSAL tagset path.
        if kwargs.get("tagset") == UNIVERSAL:
            kwargs.setdefault("map", lambda token, tag: penntreebank2universal(token, tag))
        return _Parser.find_tags(self, tokens, **kwargs)
|
||||
|
||||
class Sentiment(_Sentiment):

    def load(self, path=None):
        # Load the base lexicon, then derive "-ly" adverb entries from the
        # adjectives to widen coverage.
        _Sentiment.load(self, path)
        # Map "terrible" to adverb "terribly" (+1% accuracy)
        if not path:
            for w, pos in list(dict.items(self)):
                if "JJ" in pos:
                    # happy => happi(ly): "y" becomes "i" before the suffix.
                    if w.endswith("y"):
                        w = w[:-1] + "i"
                    # terrible => terrib(ly): drop the trailing "le".
                    if w.endswith("le"):
                        w = w[:-2]
                    # p, s, i: presumably polarity, subjectivity, intensity
                    # -- confirm against _text.Sentiment.annotate.
                    p, s, i = pos["JJ"]
                    self.annotate(w + "ly", "RB", p, s, i)
|
||||
|
||||
|
||||
lexicon = Lexicon(
|
||||
path = os.path.join(MODULE, "en-lexicon.txt"),
|
||||
morphology = os.path.join(MODULE, "en-morphology.txt"),
|
||||
context = os.path.join(MODULE, "en-context.txt"),
|
||||
entities = os.path.join(MODULE, "en-entities.txt"),
|
||||
language = "en"
|
||||
)
|
||||
parser = Parser(
|
||||
lexicon = lexicon,
|
||||
default = ("NN", "NNP", "CD"),
|
||||
language = "en"
|
||||
)
|
||||
|
||||
sentiment = Sentiment(
|
||||
path = os.path.join(MODULE, "en-sentiment.xml"),
|
||||
synset = "wordnet_id",
|
||||
negations = ("no", "not", "n't", "never"),
|
||||
modifiers = ("RB",),
|
||||
modifier = lambda w: w.endswith("ly"),
|
||||
tokenizer = parser.find_tokens,
|
||||
language = "en"
|
||||
)
|
||||
|
||||
|
||||
def tokenize(s, *args, **kwargs):
    """Return a list of sentences in which punctuation marks have been
    split from the words.
    """
    text = text_type(s)
    return parser.find_tokens(text, *args, **kwargs)
|
||||
|
||||
def parse(s, *args, **kwargs):
    """Return a tagged Unicode string for the given input."""
    text = unicode(s)
    return parser.parse(text, *args, **kwargs)
|
||||
|
||||
def parsetree(s, *args, **kwargs):
    """ Returns a parsed Text from the given string.
    """
    # NOTE(review): ``Text`` is not imported in this module as shown; this
    # call would raise NameError -- TODO confirm Text is provided elsewhere.
    return Text(parse(unicode(s), *args, **kwargs))
|
||||
|
||||
def split(s, token=[WORD, POS, CHUNK, PNP]):
    """ Returns a parsed Text from the given parsed string.
    """
    # NOTE(review): mutable default argument; safe only as long as neither
    # callers nor Text() mutate it. ``Text`` is also not visibly imported in
    # this module -- TODO confirm it is provided elsewhere.
    return Text(text_type(s), token)
|
||||
|
||||
def tag(s, tokenize=True, encoding="utf-8"):
    """Return a list of (token, tag) tuples for the given string."""
    parsed = parse(s, tokenize, True, False, False, False, encoding)
    return [(token[0], token[1])
            for sentence in parsed.split()
            for token in sentence]
|
||||
|
||||
def suggest(w):
|
||||
""" Returns a list of (word, confidence)-tuples of spelling corrections.
|
||||
"""
|
||||
return spelling.suggest(w)
|
||||
|
||||
def polarity(s, **kwargs):
|
||||
""" Returns the sentence polarity (positive/negative) between -1.0 and 1.0.
|
||||
"""
|
||||
return sentiment(unicode(s), **kwargs)[0]
|
||||
|
||||
def subjectivity(s, **kwargs):
|
||||
""" Returns the sentence subjectivity (objective/subjective) between 0.0 and 1.0.
|
||||
"""
|
||||
return sentiment(unicode(s), **kwargs)[1]
|
||||
|
||||
def positive(s, threshold=0.1, **kwargs):
|
||||
""" Returns True if the given sentence has a positive sentiment (polarity >= threshold).
|
||||
"""
|
||||
return polarity(unicode(s), **kwargs) >= threshold
|
||||
|
||||
294
backend/venv/Lib/site-packages/textblob/en/en-context.txt
Normal file
294
backend/venv/Lib/site-packages/textblob/en/en-context.txt
Normal file
@@ -0,0 +1,294 @@
|
||||
;;;
|
||||
;;; The contextual rules are based on Brill's rule based tagger v1.14,
|
||||
;;; trained on Brown corpus and Penn Treebank.
|
||||
;;;
|
||||
IN VB PREVTAG PRP
|
||||
NN VB PREVTAG TO
|
||||
VBP VB PREV1OR2OR3TAG MD
|
||||
NN VB PREV1OR2TAG MD
|
||||
VB NN PREV1OR2TAG DT
|
||||
VBD VBN PREV1OR2OR3TAG VBZ
|
||||
VBN VBD PREVTAG PRP
|
||||
VBN VBD PREVTAG NNP
|
||||
VBD VBN PREVTAG VBD
|
||||
VBP VB PREVTAG TO
|
||||
POS VBZ PREVTAG PRP
|
||||
VB VBP PREVTAG NNS
|
||||
IN RB WDAND2AFT as as
|
||||
VBD VBN PREV1OR2WD have
|
||||
IN WDT NEXT1OR2TAG VB
|
||||
VB VBP PREVTAG PRP
|
||||
VBP VB PREV1OR2WD n't
|
||||
IN WDT NEXTTAG VBZ
|
||||
JJ NNP NEXTTAG NNP
|
||||
IN WDT NEXTTAG VBD
|
||||
JJ NN NEXTWD of
|
||||
VBD VBN PREV1OR2WD be
|
||||
JJR RBR NEXTTAG JJ
|
||||
IN WDT NEXTTAG VBP
|
||||
JJS RBS WDNEXTTAG most JJ
|
||||
VBN VBD SURROUNDTAG NN DT
|
||||
NNS VBZ PREVTAG PRP
|
||||
POS VBZ NEXT1OR2TAG DT
|
||||
NNP NN SURROUNDTAG STAART NNS
|
||||
VBD VBN NEXTWD by
|
||||
VB NN PREV1OR2TAG IN
|
||||
VB VBP PREVTAG WDT
|
||||
VBG NN PREVTAG JJ
|
||||
NNS VBZ NEXTTAG DT
|
||||
VBN VBD PREVTAG WP
|
||||
NN VBP PREVTAG NNS
|
||||
VB NN PREVTAG NN
|
||||
NN VB PREVWD n't
|
||||
NN VBG NEXTTAG DT
|
||||
RB JJ NEXTTAG NN
|
||||
NN VBP PREVTAG PRP
|
||||
VBN VBD SURROUNDTAG NNS DT
|
||||
VB NN PREV1OR2TAG POS
|
||||
JJ NN NEXTTAG VBD
|
||||
RB RP WDNEXTTAG up DT
|
||||
JJ VB PREVTAG TO
|
||||
VBN VBD SURROUNDTAG , DT
|
||||
VBN VBD PREVWD that
|
||||
VB VBP PREVBIGRAM NNS RB
|
||||
NNP JJ SURROUNDTAG STAART NN
|
||||
VB VBN PREVTAG VBZ
|
||||
NNP JJ WDNEXTTAG American NNS
|
||||
JJ RB NEXTTAG JJR
|
||||
NNS NN CURWD yen
|
||||
IN WDT NEXTTAG VBD
|
||||
DT IN WDAND2TAGAFT that NNS
|
||||
POS VBZ PREVWD that
|
||||
JJ VB PREVTAG MD
|
||||
VB NN PREVTAG JJ
|
||||
JJR RBR NEXTTAG RB
|
||||
VBD VBN PREV1OR2WD are
|
||||
NN JJ WDNEXTTAG executive NN
|
||||
NNP JJ WDNEXTTAG American NN
|
||||
VBN VBD PREVTAG WDT
|
||||
VBD VBN PREVBIGRAM VBD RB
|
||||
JJ NN SURROUNDTAG DT .
|
||||
NNP JJ NEXTWD German
|
||||
VBN VB PREVTAG TO
|
||||
VBN VBD PREVBIGRAM NNP RB
|
||||
RB IN RBIGRAM up to
|
||||
VB VBP PREVTAG WP
|
||||
JJ NN SURROUNDTAG DT IN
|
||||
IN DT NEXTWD 's
|
||||
VBD VBN WDNEXTTAG ended NNP
|
||||
VBD VBN SURROUNDTAG DT NN
|
||||
NNS NNP NEXTTAG NNP
|
||||
NN NNP NEXTTAG NNP
|
||||
VBG NN SURROUNDTAG DT IN
|
||||
NNP JJ SURROUNDTAG STAART NNS
|
||||
RB RP WDPREVTAG VB up
|
||||
VBN VBD PREVBIGRAM PRP RB
|
||||
JJ RB NEXTTAG VBN
|
||||
NN VBP PREVTAG RB
|
||||
NNS VBZ PREVTAG RB
|
||||
POS VBZ PREVTAG WP
|
||||
VB VBN PREVWD have
|
||||
NN PDT WDNEXTTAG half DT
|
||||
IN WDT NEXTTAG MD
|
||||
POS VBZ PREVTAG DT
|
||||
NN NNP CURWD Integrated
|
||||
POS '' NEXT1OR2TAG ''
|
||||
VBD VBN PREVTAG IN
|
||||
JJR RBR NEXT1OR2TAG VBN
|
||||
JJS RBS WDNEXTTAG most RB
|
||||
JJ NN SURROUNDTAG JJ IN
|
||||
VBZ NNS PREVTAG JJ
|
||||
NNS VBZ WDPREVTAG JJ is
|
||||
JJ NN NEXTTAG VBZ
|
||||
VBP NN PREVTAG DT
|
||||
JJ NN SURROUNDTAG JJ .
|
||||
NNPS NNP NEXTTAG NNP
|
||||
WDT DT PREVTAG CC
|
||||
RB IN WDNEXTTAG so PRP
|
||||
VBP NN PREVWD earnings
|
||||
NN VBG PREVWD is
|
||||
NNS VBZ PREV1OR2WD Mr.
|
||||
VBZ NNS PREVWD the
|
||||
RB RP WDPREVTAG VBN up
|
||||
NNPS NNS PREVTAG STAART
|
||||
VBN VBD SURROUNDTAG NN JJ
|
||||
VBP VB PREV2TAG VB
|
||||
RBR JJR NEXTTAG NNS
|
||||
JJ NN SURROUNDTAG DT ,
|
||||
JJ NN SURROUNDTAG IN .
|
||||
NN VB PREVTAG TO
|
||||
VB NN PREVTAG VB
|
||||
NN VBP PREVWD who
|
||||
RB RP WDPREVTAG VBG up
|
||||
NN RB WDNEXTTAG right RB
|
||||
VBZ POS WDPREVTAG NNP 's
|
||||
JJ RP WDNEXTTAG up NN
|
||||
VBN VBD SURROUNDTAG NN NN
|
||||
VBN VBD SURROUNDTAG CC DT
|
||||
JJ NN NEXTBIGRAM MD VB
|
||||
JJ RB WDNEXTTAG early IN
|
||||
JJ VBN SURROUNDTAG STAART IN
|
||||
IN RB RBIGRAM though ,
|
||||
VBD VBN PREV1OR2WD been
|
||||
DT PDT WDNEXTTAG all DT
|
||||
VBN VBD PREVBIGRAM NN RB
|
||||
NN VB PREVWD help
|
||||
VBP VB PREV1OR2WD not
|
||||
VBP NN PREVTAG JJ
|
||||
DT WDT PREVTAG NNS
|
||||
NN VBP PREVTAG WDT
|
||||
VB RB RBIGRAM close to
|
||||
NNS VBZ PREVBIGRAM , WDT
|
||||
IN RP WDNEXTTAG out DT
|
||||
DT RB NEXTWD longer
|
||||
IN JJ SURROUNDTAG DT NN
|
||||
DT WDT SURROUNDTAG NN VBZ
|
||||
IN VB NEXT2TAG VB
|
||||
IN NN PREVTAG DT
|
||||
VBN VBD SURROUNDTAG NNS NNS
|
||||
IN RB RBIGRAM about $
|
||||
EX RB NEXT1OR2TAG IN
|
||||
NN VBG NEXTTAG PRP$
|
||||
NN VBG CURWD living
|
||||
VBZ NNS PREVTAG PRP$
|
||||
RBR JJR NEXTTAG NN
|
||||
RBR JJR CURWD higher
|
||||
VB VBP PREVBIGRAM PRP RB
|
||||
NN VB PREVTAG MD
|
||||
VB NN PREV1OR2TAG PRP$
|
||||
RP IN PREV1OR2TAG ,
|
||||
VB JJ PREVTAG DT
|
||||
DT IN PREVWD out
|
||||
POS VBZ PREVTAG EX
|
||||
JJ NN NEXTTAG POS
|
||||
NN JJ CURWD first
|
||||
VBD VBN PREVWD the
|
||||
NNS VBZ WDPREVTAG NNP plans
|
||||
NNP NNS SURROUNDTAG STAART IN
|
||||
RB JJ NEXTTAG NNS
|
||||
JJ RB CURWD just
|
||||
VBP NN PREVWD sales
|
||||
NNS NNPS PREVWD Orange
|
||||
VB VBN PREVTAG VBD
|
||||
WDT DT PREVTAG IN
|
||||
NN JJ WDNEXTTAG right NN
|
||||
NN VBG WDNEXTTAG operating IN
|
||||
JJ VBN CURWD insured
|
||||
JJ NNP LBIGRAM STAART U.S.
|
||||
IN DT NEXTTAG STAART
|
||||
POS '' PREV1OR2OR3TAG ``
|
||||
NN JJ WDNEXTTAG official NN
|
||||
NNP JJ CURWD Irish
|
||||
JJ RB NEXTTAG RBR
|
||||
VBG NN WDPREVTAG DT selling
|
||||
VBP VB PREV1OR2OR3TAG MD
|
||||
WDT IN NEXTTAG PRP
|
||||
EX RB NEXTTAG .
|
||||
VBN VBD SURROUNDTAG NNS PRP$
|
||||
VBN VBD CURWD said
|
||||
JJ RB PREVTAG MD
|
||||
NN VBG NEXTBIGRAM JJ NNS
|
||||
JJ RB WDNEXTTAG late IN
|
||||
VBG NN PREVTAG PRP$
|
||||
VBZ NNS NEXTTAG VBP
|
||||
NN NNP WDPREVTAG DT CD
|
||||
NN VBN PREVWD be
|
||||
JJS RBS NEXTTAG VBN
|
||||
VBN VBD SURROUNDTAG NN PRP$
|
||||
VBN VBD SURROUNDTAG NNS JJ
|
||||
VBN VBD SURROUNDTAG NNS NN
|
||||
VBD VBN WDNEXTTAG increased NN
|
||||
VBZ NNS NEXTWD of
|
||||
IN RP WDAND2TAGAFT out NNS
|
||||
JJ NNP NEXTTAG POS
|
||||
RB RP WDNEXTTAG down DT
|
||||
CD NNS CURWD 1970s
|
||||
VBG NNP CURWD Working
|
||||
VBN VB PREVTAG MD
|
||||
JJ NN NEXTBIGRAM CC NN
|
||||
NN JJ SURROUNDTAG STAART NNS
|
||||
VBN VBD PREVBIGRAM , CC
|
||||
IN RB NEXTBIGRAM . STAART
|
||||
NN VBG PREVWD was
|
||||
NNP NNPS CURWD Cowboys
|
||||
VBZ NNS PREVWD phone
|
||||
NNP NNS SURROUNDTAG STAART VBP
|
||||
RBR JJR WDNEXTTAG lower JJ
|
||||
PRP$ PRP NEXTTAG IN
|
||||
VBD VB PREVTAG TO
|
||||
JJ NN WDPREVTAG NN chief
|
||||
JJ NN SURROUNDTAG JJ ,
|
||||
NN JJ WDPREVTAG DT third
|
||||
VBN VBD SURROUNDTAG NNS NNP
|
||||
NNP NN SURROUNDTAG STAART NN
|
||||
NNP NN CURWD HDTV
|
||||
VBG NN SURROUNDTAG DT ,
|
||||
VBG NN SURROUNDTAG DT .
|
||||
NNS VBZ PREVTAG WP
|
||||
NN VB SURROUNDTAG CC DT
|
||||
NNPS NNP WDAND2TAGBFR IN Securities
|
||||
RP IN PREVTAG NNS
|
||||
VBP NN LBIGRAM funds rate
|
||||
VBP NN WDPREVTAG NNS market
|
||||
DT RB RBIGRAM either .
|
||||
VBN NN SURROUNDTAG DT IN
|
||||
VBD VB PREV1OR2OR3TAG MD
|
||||
NN JJ NEXTWD oil
|
||||
VBN VBD SURROUNDTAG , $
|
||||
VBD VBN PREVBIGRAM DT RB
|
||||
VBN JJ PREVWD by
|
||||
NNP JJ WDNEXTTAG American JJ
|
||||
NN VBG PREVTAG VBP
|
||||
JJ RB LBIGRAM very much
|
||||
NN VBG RBIGRAM operating officer
|
||||
RB IN RBIGRAM up for
|
||||
NNS VBZ NEXTBIGRAM JJ NNS
|
||||
NNS VBZ SURROUNDTAG , IN
|
||||
VB VBP PREVTAG NNPS
|
||||
IN RP WDAND2TAGAFT out IN
|
||||
NNPS NNP PREVBIGRAM CC NNP
|
||||
NN RB RBIGRAM close to
|
||||
RBR RB PREVWD no
|
||||
JJ VBD NEXTTAG DT
|
||||
RB NNP PREVTAG NNP
|
||||
MD NN PREVWD good
|
||||
JJ NN WDPREVTAG NN giant
|
||||
NN JJ WDNEXTTAG official NNS
|
||||
VBN VBD SURROUNDTAG , PRP$
|
||||
VBN VBD SURROUNDTAG , RB
|
||||
VBN VBD SURROUNDTAG NN PRP
|
||||
NNP JJ WDNEXTTAG South JJ
|
||||
NN VBG PREVTAG RB
|
||||
NNS VBZ SURROUNDTAG , TO
|
||||
VBZ NNS SURROUNDTAG NN .
|
||||
NN VB NEXTTAG PRP$
|
||||
VBP VB PREV1OR2WD do
|
||||
VB JJ NEXTWD countries
|
||||
IN WDT NEXTBIGRAM RB VBZ
|
||||
JJ VB NEXTTAG DT
|
||||
WDT DT NEXTBIGRAM VBZ ,
|
||||
NNP RB RBIGRAM First ,
|
||||
DT NNP WDNEXTTAG A VBZ
|
||||
JJ RBR RBIGRAM further ,
|
||||
CD PRP WDNEXTTAG one MD
|
||||
POS '' PREV1OR2OR3TAG .
|
||||
PRP NN PREVTAG -LRB-
|
||||
VBN VBD SURROUNDTAG , PRP
|
||||
VBN VBD SURROUNDTAG NN NNS
|
||||
VBN VBD SURROUNDTAG NN RP
|
||||
NNP NN LBIGRAM STAART Business
|
||||
VBD VBN PREVTAG VBG
|
||||
IN RB RBIGRAM before ,
|
||||
IN RB WDAND2AFT As as
|
||||
NNP JJ LBIGRAM New York-based
|
||||
NNP JJ CURWD Mexican
|
||||
NNP NNPS WDNEXTTAG Motors NNP
|
||||
NNP NNPS WDPREVTAG NNP Enterprises
|
||||
JJ RB WDNEXTTAG long IN
|
||||
VBG JJ SURROUNDTAG DT NN
|
||||
NN PRP PREVWD are mine
|
||||
* IN CURWD with
|
||||
* VB CURWD be
|
||||
* JJ RBIGRAM such as
|
||||
* IN LBIGRAM such as
|
||||
* IN CURWD from
|
||||
646
backend/venv/Lib/site-packages/textblob/en/en-entities.txt
Normal file
646
backend/venv/Lib/site-packages/textblob/en/en-entities.txt
Normal file
@@ -0,0 +1,646 @@
|
||||
50 Cent PERS
|
||||
AIDS
|
||||
AK-47
|
||||
AT&T ORG
|
||||
Abraham Lincoln PERS
|
||||
Acropolis LOC
|
||||
Adam Sandler PERS
|
||||
Adolf Hitler PERS
|
||||
Adriana Lima PERS
|
||||
Afghanistan LOC
|
||||
Africa LOC
|
||||
Al Capone PERS
|
||||
Al Pacino PERS
|
||||
Alaska LOC
|
||||
Albert Einstein PERS
|
||||
Albert Hofmann PERS
|
||||
Albert Schweitzer PERS
|
||||
Alexander the Great PERS
|
||||
Alfred Hitchcock PERS
|
||||
Alice Cooper PERS
|
||||
Alice in Wonderland
|
||||
Amazon.com ORG
|
||||
Amber Heard PERS
|
||||
Amelia Earhart PERS
|
||||
American Express
|
||||
American Idol
|
||||
Amsterdam LOC
|
||||
Amy Adams PERS
|
||||
Amy Winehouse PERS
|
||||
Ancient Egypt LOC
|
||||
Ancient Rome LOC
|
||||
Android
|
||||
Angelina Jolie PERS
|
||||
Angry Birds
|
||||
Anne Frank PERS
|
||||
Anne Hathaway PERS
|
||||
Antartica LOC
|
||||
Apple Inc. ORG
|
||||
Archimedes PERS
|
||||
Aretha Franklin PERS
|
||||
Argentina LOC
|
||||
Aristotle PERS
|
||||
Arnold Schwarzenegger PERS
|
||||
Audi ORG
|
||||
Audrey Hepburn PERS
|
||||
Aung San Suu Kyi PERS
|
||||
Australia LOC
|
||||
Austria LOC
|
||||
Avatar
|
||||
Avril Lavigne PERS
|
||||
Ayn Rand PERS
|
||||
Aztec
|
||||
BMW ORG
|
||||
Babe Ruth PERS
|
||||
Bacardi ORG
|
||||
Backstreet Boys
|
||||
Bangladesh LOC
|
||||
Barack Obama PERS
|
||||
Barbra Streisand PERS
|
||||
Barcelona LOC
|
||||
Batman PERS
|
||||
Beethoven PERS
|
||||
Belarus LOC
|
||||
Belgium LOC
|
||||
Ben Affleck PERS
|
||||
Ben Folds PERS
|
||||
Ben Stiller PERS
|
||||
Benazir Bhutto PERS
|
||||
Benjamin Franklin PERS
|
||||
Benjamin Millepied PERS
|
||||
Bernard Madoff PERS
|
||||
Beyoncé Knowles PERS
|
||||
Bill Clinton PERS
|
||||
Bill Gates PERS
|
||||
Billie Holiday PERS
|
||||
Billie Jean King PERS
|
||||
Bing Crosby PERS
|
||||
Black Sabbath
|
||||
Blake Edwards PERS
|
||||
Blake Lively PERS
|
||||
Bob Dylan PERS
|
||||
Bob Geldof PERS
|
||||
Bob Marley PERS
|
||||
Brad Pitt PERS
|
||||
Bradley Manning PERS
|
||||
Brazil LOC
|
||||
Brett Favre PERS
|
||||
Britney Spears PERS
|
||||
Bruce Lee PERS
|
||||
Bruce Willis PERS
|
||||
Bruno Mars PERS
|
||||
Buddhism
|
||||
Bulgaria LOC
|
||||
Burger King
|
||||
Burma LOC
|
||||
C.S. Lewis PERS
|
||||
Cadillac ORG
|
||||
California LOC
|
||||
Cameron Diaz PERS
|
||||
Cameron Newton PERS
|
||||
Canada LOC
|
||||
Captain Beefheart PERS
|
||||
Carl Lewis PERS
|
||||
Charles Darwin PERS
|
||||
Charles Dickens PERS
|
||||
Charles Kindbergh PERS
|
||||
Charles de Gaulle PERS
|
||||
Charlie Sheen PERS
|
||||
Che Guevara PERS
|
||||
Cheryl Cole PERS
|
||||
Chicago LOC
|
||||
China LOC
|
||||
Chopin PERS
|
||||
Chris Colfer PERS
|
||||
Christian Bale PERS
|
||||
Christiano Ronaldo PERS
|
||||
Christina Aguilera PERS
|
||||
Christmas
|
||||
Christopher Nolan PERS
|
||||
Chuck Norris PERS
|
||||
Clint Eastwood PERS
|
||||
Coca Cola ORG
|
||||
Coco Chanel ORG
|
||||
Coldplay
|
||||
Colombia LOC
|
||||
Conan PERS
|
||||
Cristiano Ronaldo PERS
|
||||
Crystal Harris PERS
|
||||
Cthulhu PERS
|
||||
Cuba LOC
|
||||
DNA
|
||||
Daft Punk
|
||||
Dalai Lama PERS
|
||||
Daniel Radcliffe PERS
|
||||
Darren Aronofsky PERS
|
||||
Darren Criss PERS
|
||||
Darth Vader PERS
|
||||
David Beckham PERS
|
||||
David Bowie PERS
|
||||
David Cook PERS
|
||||
Demi Lovato PERS
|
||||
Demi Moore PERS
|
||||
Denmark LOC
|
||||
Desmond Tutu PERS
|
||||
Dexter PERS
|
||||
Diana PERS
|
||||
Diego Maradona PERS
|
||||
Disney ORG
|
||||
Dmitry Medvedev PERS
|
||||
Doctor Who PERS
|
||||
Dr. Dre PERS
|
||||
Dr. Seuss PERS
|
||||
Dragon Ball
|
||||
Dubai LOC
|
||||
Dwayne Johnson PERS
|
||||
Earth LOC
|
||||
Ebenezer Scrooge PERS
|
||||
Eddie Murphy PERS
|
||||
Eduardo Saverin PERS
|
||||
Egypt LOC
|
||||
El Salvador LOC
|
||||
Elizabeth Edwards PERS
|
||||
Elizabeth Hurley PERS
|
||||
Ellen Page PERS
|
||||
Elton John PERS
|
||||
Elvis Presley PERS
|
||||
Emile Zatopek PERS
|
||||
Eminem PERS
|
||||
Emma Roberts PERS
|
||||
Emma Stone PERS
|
||||
Emma Watson PERS
|
||||
Emmeline Pankhurst PERS
|
||||
England LOC
|
||||
Enrique Iglesias PERS
|
||||
Ernest Hemingway PERS
|
||||
Ernest Hemingway PERS
|
||||
Europe LOC
|
||||
Eva Peron PERS
|
||||
Exxon Mobil PERS
|
||||
FC Barcelona ORG
|
||||
FIFA ORG
|
||||
Facebook ORG
|
||||
Fahrenheit
|
||||
Family Guy
|
||||
Faye Resnick PERS
|
||||
FedEx ORG
|
||||
Fidel Castro PERS
|
||||
Finland LOC
|
||||
Firefox ORG
|
||||
Florence Nightingale PERS
|
||||
Florida LOC
|
||||
Fort Wayne LOC
|
||||
France LOC
|
||||
Frank Sinatra PERS
|
||||
Franklin D. Roosevelt PERS
|
||||
Freddie Mercury PERS
|
||||
Frédéric Chopin PERS
|
||||
Futurama
|
||||
Garrett Hedlund PERS
|
||||
Gene Simmons PERS
|
||||
General Electric
|
||||
Genghis Khan PERS
|
||||
George Bush PERS
|
||||
George Clooney PERS
|
||||
George Harrison PERS
|
||||
George Orwell PERS
|
||||
George W. Bush PERS
|
||||
George Washington PERS
|
||||
Georges St-Pierre PERS
|
||||
Germany LOC
|
||||
Google ORG
|
||||
Google Chrome
|
||||
Gorillaz
|
||||
Grand Theft Auto
|
||||
Greece LOC
|
||||
Gucci ORG
|
||||
Gulf War
|
||||
Gulliver's Travels
|
||||
Guns N' Roses
|
||||
Gwyneth Paltrow PERS
|
||||
HIV
|
||||
HSBC
|
||||
Haile Selassie PERS
|
||||
Haiti LOC
|
||||
Halliburton ORG
|
||||
Halloween
|
||||
Hank Baskett PERS
|
||||
Hannah Montana PERS
|
||||
Hanukkah
|
||||
Harrison Ford PERS
|
||||
Harry Potter PERS
|
||||
Hawaii LOC
|
||||
He-Man PERS
|
||||
Heath Ledger PERS
|
||||
Helen Keller PERS
|
||||
Helena Bonham Carter PERS
|
||||
Henry Ford PERS
|
||||
Henry IV PERS
|
||||
Henry V PERS
|
||||
Henry VIII PERS
|
||||
Hilary Duff PERS
|
||||
Hillary Clinton PERS
|
||||
Honda ORG
|
||||
Hong Kong LOC
|
||||
Hotmail
|
||||
Hugh Hefner PERS
|
||||
Humphrey Bogart PERS
|
||||
Hungary LOC
|
||||
IBM ORG
|
||||
IKEA ORG
|
||||
Iceland LOC
|
||||
India LOC
|
||||
Indiana Jones PERS
|
||||
Indira Gandhi PERS
|
||||
Indonesia LOC
|
||||
Internet Explorer
|
||||
Iran LOC
|
||||
Ireland LOC
|
||||
Iron Man PERS
|
||||
Isaac Newton PERS
|
||||
Isabelle Caro PERS
|
||||
Islam
|
||||
Israel LOC
|
||||
Italy LOC
|
||||
Ivy League ORG
|
||||
J. Robert Oppenheimer PERS
|
||||
J.K. Rowling PERS
|
||||
J.R.R. Tolkien PERS
|
||||
JFK PERS
|
||||
Jack the Ripper PERS
|
||||
Jackie Chan PERS
|
||||
Jacqueline Kennedy Onassis PERS
|
||||
Jaden Smith PERS
|
||||
Jake Gyllenhaal PERS
|
||||
James Bond PERS
|
||||
James Franco PERS
|
||||
Jane Austen PERS
|
||||
Janet Jackson PERS
|
||||
Japan LOC
|
||||
Jared Leto PERS
|
||||
Jason Statham PERS
|
||||
Jawaharlal Nehru PERS
|
||||
Jay-Z PERS
|
||||
Jeff Bridges PERS
|
||||
Jeff Buckley PERS
|
||||
Jenna Jameson PERS
|
||||
Jennifer Aniston PERS
|
||||
Jesse Owens PERS
|
||||
Jessica Alba PERS
|
||||
Jesus PERS
|
||||
Jim Carrey PERS
|
||||
Jim Morrisson PERS
|
||||
Jimi Hendrix PERS
|
||||
Jimmy Wales PERS
|
||||
Joaquin Phoenix PERS
|
||||
John Cena PERS
|
||||
John Edwards PERS
|
||||
John F. Kennedy PERS
|
||||
John Lennon PERS
|
||||
John M. Keynes PERS
|
||||
John McCain PERS
|
||||
John Wayne PERS
|
||||
Johnnie Walker PERS
|
||||
Johnny Cash PERS
|
||||
Johnny Depp PERS
|
||||
Joseph Stalin PERS
|
||||
Judy Garland PERS
|
||||
Julia Roberts PERS
|
||||
Julian Assange PERS
|
||||
Julie Andrews PERS
|
||||
Julius Caesar PERS
|
||||
Justin Bieber PERS
|
||||
Justin Timberlake PERS
|
||||
KFC ORG
|
||||
KLM ORG
|
||||
Kama Sutra
|
||||
Kanye West PERS
|
||||
Kate Middleton PERS
|
||||
Katherine Hepburn PERS
|
||||
Katrina Kaif PERS
|
||||
Katy Perry PERS
|
||||
Keira Knightley PERS
|
||||
Ken Livingstone PERS
|
||||
Keri Hilson PERS
|
||||
Kesha PERS
|
||||
Kevin Bacon PERS
|
||||
Kid Cudi PERS
|
||||
Kim Kardashian PERS
|
||||
Kinect
|
||||
King Arthur PERS
|
||||
Kobe Bryant PERS
|
||||
Kosovo LOC
|
||||
Kristallnacht
|
||||
Kristen Stewart PERS
|
||||
Kurt Cobain PERS
|
||||
L'Oreal ORG
|
||||
L. Ron Hubbard PERS
|
||||
Lady Gaga PERS
|
||||
Lea Michele PERS
|
||||
Lebanon LOC
|
||||
Lech Walesa PERS
|
||||
Led Zeppelin
|
||||
Lego
|
||||
Lenin PERS
|
||||
Leo Tolstoy PERS
|
||||
Leon Trotsky PERS
|
||||
Leonardo DiCaprio PERS
|
||||
Leonardo da Vinci PERS
|
||||
Leslie Nielsen PERS
|
||||
Lexus ORG
|
||||
Liam Neeson PERS
|
||||
Lil Wayne PERS
|
||||
Lindsay Lohan PERS
|
||||
Linkin Park PERS
|
||||
Lionel Messi PERS
|
||||
Loch Ness LOC
|
||||
London LOC
|
||||
Lord Baden Powell PERS
|
||||
Los Angeles LOC
|
||||
Louis Pasteur PERS
|
||||
Louis Vuitton PERS
|
||||
Louvre LOC
|
||||
Ludwig van Beethoven PERS
|
||||
Lyndon Johnson PERS
|
||||
MDMA
|
||||
Mac OS X
|
||||
Macaulay Culkin PERS
|
||||
Madagascar LOC
|
||||
Madonna PERS
|
||||
Mahatma Gandhi PERS
|
||||
Malaysia LOC
|
||||
Malcolm X PERS
|
||||
Manchester LOC
|
||||
Manchester United ORG
|
||||
Margaret Thatcher PERS
|
||||
Mariah Carey PERS
|
||||
Marilyn Monroe PERS
|
||||
Mario Gómez PERS
|
||||
Mario Kart
|
||||
Mark David Chapman PERS
|
||||
Mark Wahlberg PERS
|
||||
Mark Zuckerberg PERS
|
||||
Martin Luther King PERS
|
||||
Massachussetts LOC
|
||||
Mata Hari PERS
|
||||
Matt Damon PERS
|
||||
Mattel ORG
|
||||
Maya Angelou PERS
|
||||
McDonald's ORG
|
||||
McGill University ORG
|
||||
Megan Fox PERS
|
||||
Mercedes-Benz ORG
|
||||
Merlin PERS
|
||||
Metallica
|
||||
Mexico LOC
|
||||
Miami LOC
|
||||
Miami Vice
|
||||
Michael C. Hall PERS
|
||||
Michael Jackson PERS
|
||||
Michael Jordan PERS
|
||||
Michael Vick PERS
|
||||
Michelin ORG
|
||||
Michigan LOC
|
||||
Micky Ward PERS
|
||||
Microsoft ORG
|
||||
Microsoft Windows
|
||||
Middle Ages
|
||||
Mike Tyson PERS
|
||||
Mila Kunis PERS
|
||||
Miley Cyrus PERS
|
||||
Minecraft
|
||||
Mohammed Ali PERS
|
||||
Mona Lisa PERS
|
||||
Montreal LOC
|
||||
Morocco LOC
|
||||
Mother Teresa PERS
|
||||
Mother's Day
|
||||
Mozart PERS
|
||||
Mozilla Firefox
|
||||
Muhammad PERS
|
||||
Muhammad Ali PERS
|
||||
Myanmar LOC
|
||||
Napoleon PERS
|
||||
Narnia LOC
|
||||
Natalie Portman PERS
|
||||
Nazi Germany
|
||||
Neil Armstrong PERS
|
||||
Neil Patrick Harris PERS
|
||||
Nelson Mandela PERS
|
||||
Nepal LOC
|
||||
Netherlands LOC
|
||||
New York LOC
|
||||
New York City LOC
|
||||
New Zealand LOC
|
||||
Nicki Minaj PERS
|
||||
Nicolas Cage PERS
|
||||
Nicole Scherzinger PERS
|
||||
Nigeria LOC
|
||||
Nike ORG
|
||||
Nivea ORG
|
||||
North America LOC
|
||||
North Korea LOC
|
||||
Norway LOC
|
||||
Olivia Wilde PERS
|
||||
Oprah Winfrey PERS
|
||||
Osama Bin Laden PERS
|
||||
Oscar Wilde PERS
|
||||
Owen Wilson PERS
|
||||
Ozzfest
|
||||
Pablo Picasso PERS
|
||||
Pakistan LOC
|
||||
Panasonic ORG
|
||||
Paris LOC
|
||||
Paul McCartney PERS
|
||||
Pele PERS
|
||||
Pepsi ORG
|
||||
Peter Sellers PERS
|
||||
Philadelphia LOC
|
||||
Philips ORG
|
||||
Phillipines LOC
|
||||
Pink Floyd PERS
|
||||
PlayStation 3
|
||||
Pocahontas PERS
|
||||
Pokemon
|
||||
Pokémon
|
||||
Poland LOC
|
||||
Pope John Paul II PERS
|
||||
Premier League ORG
|
||||
Prince Charles PERS
|
||||
Priory of Sion LOC
|
||||
Procter & Gamble
|
||||
Puerto Rico LOC
|
||||
Qatar LOC
|
||||
Queen Elizabeth II PERS
|
||||
Queen Victoria PERS
|
||||
Rachmaninoff PERS
|
||||
Raiders of the Lost Ark
|
||||
Raisa Gorbachev PERS
|
||||
Real Madrid ORG
|
||||
Red Hot Chili Peppers PERS
|
||||
Reese Witherspoon PERS
|
||||
Resident Evil
|
||||
Richard PERS
|
||||
Richard Branson PERS
|
||||
Richard Dawkins PERS
|
||||
Richard Holbrooke PERS
|
||||
Richard Nixon PERS
|
||||
Rihanna PERS
|
||||
Ringo Starr PERS
|
||||
Robert De Niro PERS
|
||||
Robert Pattinson PERS
|
||||
Robin Hood PERS
|
||||
Roger Federer PERS
|
||||
Roman Empire ORG
|
||||
Romania LOC
|
||||
Rome LOC
|
||||
Romeo and Juliet
|
||||
Ronald Reagan PERS
|
||||
Ronnie O'Sullivan PERS
|
||||
Rosa Parks PERS
|
||||
Russell Brand PERS
|
||||
Russia LOC
|
||||
Ryan Reynolds PERS
|
||||
Saddam Hussein PERS
|
||||
Sahara LOC
|
||||
Saint Nicholas PERS
|
||||
Salman Khan PERS
|
||||
Samsung ORG
|
||||
Sandra Bullock PERS
|
||||
Santa Claus PERS
|
||||
Sarah Palin PERS
|
||||
Sasha Grey PERS
|
||||
Saudi Arabia LOC
|
||||
Scarlett Johansson PERS
|
||||
Scientology ORG
|
||||
Scotland LOC
|
||||
Sean Combs PERS
|
||||
Sean Parker PERS
|
||||
Selena Gomez PERS
|
||||
Serbia LOC
|
||||
Sergei Rachmaninoff PERS
|
||||
Shakira
|
||||
Shaquille O'Neal PERS
|
||||
Shaun Ryder PERS
|
||||
Sherlock Holmes PERS
|
||||
Shia LaBeouf PERS
|
||||
Shirley Temple PERS
|
||||
Siemens ORG
|
||||
Sigmund Freud PERS
|
||||
Silvio Berlusconi PERS
|
||||
Singapore LOC
|
||||
Skype
|
||||
Smirnoff ORG
|
||||
Snoop Dogg PERS
|
||||
Snow White PERS
|
||||
Socrates PERS
|
||||
Somalia LOC
|
||||
Sony ORG
|
||||
South Africa LOC
|
||||
South America LOC
|
||||
South Korea LOC
|
||||
South Park
|
||||
Soviet Union
|
||||
Spain LOC
|
||||
Spider-Man PERS
|
||||
Spiderman PERS
|
||||
Sri Lanka LOC
|
||||
Star Trek
|
||||
Star Wars
|
||||
Starbucks ORG
|
||||
Stephen Hawking PERS
|
||||
Stephen King PERS
|
||||
Steve Jobs PERS
|
||||
Steve Nash PERS
|
||||
Steven Spielberg PERS
|
||||
Sudan LOC
|
||||
Super Bowl
|
||||
Superman PERS
|
||||
Sweden LOC
|
||||
Switzerland LOC
|
||||
Sylvester Stallone PERS
|
||||
Taiwan LOC
|
||||
Taj Mahal LOC
|
||||
Take That
|
||||
Taylor Lautner PERS
|
||||
Taylor Momsem PERS
|
||||
Taylor Swift PERS
|
||||
Teena Marie PERS
|
||||
Tennessee LOC
|
||||
Texas LOC
|
||||
Thailand LOC
|
||||
The Beatles
|
||||
The Chronicles of Narnia
|
||||
The Godfather
|
||||
The Green Hornet
|
||||
The Lord of the Rings
|
||||
The Rolling Stones
|
||||
The Simpsons
|
||||
The Sims
|
||||
Theodore Roosevelt PERS
|
||||
Thomas Jefferson PERS
|
||||
Thor PERS
|
||||
Tiger Woods PERS
|
||||
Titanic
|
||||
Tom Brady PERS
|
||||
Tom Cruise PERS
|
||||
Tom Hanks PERS
|
||||
Toy Story
|
||||
Toyota ORG
|
||||
Transformers
|
||||
Tron
|
||||
Tupac Shakur PERS
|
||||
Twin Peaks
|
||||
Twitter
|
||||
UEFA Champions League
|
||||
Ubuntu
|
||||
Ukraine LOC
|
||||
United Kingdom LOC
|
||||
United Nations
|
||||
United States LOC
|
||||
Usain Bolt PERS
|
||||
Vanessa Hudgens PERS
|
||||
Venus LOC
|
||||
Vietnam LOC
|
||||
Vin Diesel PERS
|
||||
Virginia Woolf PERS
|
||||
Vladimir Putin PERS
|
||||
Vodafone ORG
|
||||
Volkswagen ORG
|
||||
Walmart ORG
|
||||
Walt Disney PERS
|
||||
Warren Buffet PERS
|
||||
Washington LOC
|
||||
Washington D.C. LOC
|
||||
Wesley Snipes PERS
|
||||
Wii
|
||||
WikiLeaks ORG
|
||||
Wikipedia ORG
|
||||
Will Ferrell PERS
|
||||
Will Smith PERS
|
||||
William Shakespeare PERS
|
||||
Willow Smith PERS
|
||||
Windows 7
|
||||
Windows 95
|
||||
Windows Vista
|
||||
Windows XP
|
||||
Winona Ryder PERS
|
||||
Winston Churchill PERS
|
||||
Wiz Khalifa PERS
|
||||
Wolfgang Amadeus Mozart PERS
|
||||
Woodrow Wilson PERS
|
||||
World War I
|
||||
World War II
|
||||
World of Warcraft
|
||||
Wright Brothers PERS
|
||||
X-Men
|
||||
Xbox 360
|
||||
Yoko Onen PERS
|
||||
Yoko Ono PERS
|
||||
YouTube ORG
|
||||
amazon.com ORG
|
||||
eBay ORG
|
||||
iPad
|
||||
iPhone
|
||||
iPod
|
||||
iPod touch
|
||||
94137
backend/venv/Lib/site-packages/textblob/en/en-lexicon.txt
Normal file
94137
backend/venv/Lib/site-packages/textblob/en/en-lexicon.txt
Normal file
File diff suppressed because it is too large
Load Diff
152
backend/venv/Lib/site-packages/textblob/en/en-morphology.txt
Normal file
152
backend/venv/Lib/site-packages/textblob/en/en-morphology.txt
Normal file
@@ -0,0 +1,152 @@
|
||||
;;;
|
||||
;;; The morphological rules are based on Brill's rule based tagger v1.14,
|
||||
;;; trained on Brown corpus and Penn Treebank.
|
||||
;;;
|
||||
NN s fhassuf 1 NNS x
|
||||
NN . fchar CD x
|
||||
NN - fchar JJ x
|
||||
NN ed fhassuf 2 VBN x
|
||||
NN ing fhassuf 3 VBG x
|
||||
ly hassuf 2 RB x
|
||||
ly addsuf 2 JJ x
|
||||
NN $ fgoodright CD x
|
||||
NN al fhassuf 2 JJ x
|
||||
NN would fgoodright VB x
|
||||
NN 0 fchar CD x
|
||||
NN be fgoodright JJ x
|
||||
NNS us fhassuf 2 JJ x
|
||||
NNS it fgoodright VBZ x
|
||||
NN ble fhassuf 3 JJ x
|
||||
NN ic fhassuf 2 JJ x
|
||||
NN 1 fchar CD x
|
||||
NNS ss fhassuf 2 NN x
|
||||
un deletepref 2 JJ x
|
||||
NN ive fhassuf 3 JJ x
|
||||
NNP ed fhassuf 2 JJ x
|
||||
NN n't fgoodright VB x
|
||||
VB the fgoodright NN x
|
||||
NNS he fgoodright VBZ x
|
||||
VBN he fgoodright VBD x
|
||||
NN are fgoodright JJ x
|
||||
JJ was fgoodleft NN x
|
||||
NN est fhassuf 3 JJS x
|
||||
VBZ The fgoodright NNS x
|
||||
NNP ts fhassuf 2 NNS x
|
||||
NN 4 fchar CD x
|
||||
NN ize fhassuf 3 VB x
|
||||
.. hassuf 2 : x
|
||||
ful hassuf 3 JJ x
|
||||
NN ate fhassuf 3 VB x
|
||||
NNP ing fhassuf 3 VBG x
|
||||
VBG is fgoodleft NN x
|
||||
NN less fhassuf 4 JJ x
|
||||
NN ary fhassuf 3 JJ x
|
||||
Co. goodleft NNP x
|
||||
NN ant fhassuf 3 JJ x
|
||||
million goodleft CD x
|
||||
JJ their fgoodleft IN x
|
||||
NN he fgoodright VBD x
|
||||
Mr. goodright NNP x
|
||||
JJ of fgoodleft NN x
|
||||
NN so fgoodright JJ x
|
||||
NN y fdeletesuf 1 JJ x
|
||||
VBN which fgoodright VBD x
|
||||
VBD been fgoodright VBN x
|
||||
VB a fgoodright NN x
|
||||
NN economic fgoodleft JJ x
|
||||
9 char CD x
|
||||
CD t fchar JJ x
|
||||
NN can fgoodright VB x
|
||||
VB the fgoodright NN x
|
||||
JJ S-T-A-R-T fgoodright VBN x
|
||||
VBN - fchar JJ x
|
||||
NN lar fhassuf 3 JJ x
|
||||
NNP ans fhassuf 3 NNPS x
|
||||
NN men fhassuf 3 NNS x
|
||||
CD d fchar JJ x
|
||||
JJ n fdeletesuf 1 VBN x
|
||||
JJ 's fgoodleft NN x
|
||||
NNS is fhassuf 2 NN x
|
||||
ES hassuf 2 NNS x
|
||||
JJ er fdeletesuf 2 JJR x
|
||||
Inc. goodleft NNP x
|
||||
NN 2 fchar CD x
|
||||
VBD be fgoodleft MD x
|
||||
ons hassuf 3 NNS x
|
||||
RB - fchar JJ x
|
||||
NN very fgoodright JJ x
|
||||
ous hassuf 3 JJ x
|
||||
NN a fdeletepref 1 RB x
|
||||
NNP people fgoodleft JJ x
|
||||
VB have fgoodleft RB x
|
||||
NNS It fgoodright VBZ x
|
||||
NN id fhassuf 2 JJ x
|
||||
JJ may fgoodleft NN x
|
||||
VBN but fgoodright VBD x
|
||||
RS hassuf 2 NNS x
|
||||
JJ stry fhassuf 4 NN x
|
||||
NNS them fgoodleft VBZ x
|
||||
VBZ were fgoodleft NNS x
|
||||
NN ing faddsuf 3 VB x
|
||||
JJ s faddsuf 1 NN x
|
||||
NN 7 fchar CD x
|
||||
NN d faddsuf 1 VB x
|
||||
VB but fgoodleft NN x
|
||||
NN 3 fchar CD x
|
||||
NN est faddsuf 3 JJ x
|
||||
NN en fhassuf 2 VBN x
|
||||
NN costs fgoodright IN x
|
||||
NN 8 fchar CD x
|
||||
VB b fhaspref 1 NN x
|
||||
zes hassuf 3 VBZ x
|
||||
VBN s faddsuf 1 NN x
|
||||
some hassuf 4 JJ x
|
||||
NN ic fhassuf 2 JJ x
|
||||
ly addsuf 2 JJ x
|
||||
ness addsuf 4 JJ x
|
||||
JJS s faddsuf 1 NN x
|
||||
NN ier fhassuf 3 JJR x
|
||||
NN ky fhassuf 2 JJ x
|
||||
tyle hassuf 4 JJ x
|
||||
NNS ates fhassuf 4 VBZ x
|
||||
fy hassuf 2 VB x
|
||||
body addsuf 4 DT x
|
||||
NN ways fgoodleft JJ x
|
||||
NNP ies fhassuf 3 NNPS x
|
||||
VB negative fgoodright NN x
|
||||
ders hassuf 4 NNS x
|
||||
ds hassuf 2 NNS x
|
||||
-day addsuf 4 CD x
|
||||
nian hassuf 4 JJ x
|
||||
JJR s faddsuf 1 NN x
|
||||
ppy hassuf 3 JJ x
|
||||
NN ish fhassuf 3 JJ x
|
||||
tors hassuf 4 NNS x
|
||||
oses hassuf 4 VBZ x
|
||||
NNS oves fhassuf 4 VBZ x
|
||||
VBN un fhaspref 2 JJ x
|
||||
lent hassuf 4 JJ x
|
||||
NN ward fdeletesuf 4 RB x
|
||||
VB k fchar NN x
|
||||
VB r fhassuf 1 NN x
|
||||
VB e fdeletesuf 1 NN x
|
||||
NNS Engelken fgoodright VBZ x
|
||||
NN ient fhassuf 4 JJ x
|
||||
ED hassuf 2 VBD x
|
||||
VBG B fchar NNP x
|
||||
VB le fhassuf 2 NN x
|
||||
ment addsuf 4 VB x
|
||||
ING hassuf 3 NN x
|
||||
JJ ery fhassuf 3 NN x
|
||||
JJ tus fhassuf 3 NN x
|
||||
JJ car fhassuf 3 NN x
|
||||
NN 6 fchar CD x
|
||||
NNS 0 fchar CD x
|
||||
JJ ing fdeletesuf 3 VBG x
|
||||
here hassuf 4 RB x
|
||||
VBN scr fhaspref 3 VBD x
|
||||
uces hassuf 4 VBZ x
|
||||
fies hassuf 4 VBZ x
|
||||
self deletesuf 4 PRP x
|
||||
NNP $ fchar $ x
|
||||
VBN wa fhaspref 2 VBD x
|
||||
2932
backend/venv/Lib/site-packages/textblob/en/en-sentiment.xml
Normal file
2932
backend/venv/Lib/site-packages/textblob/en/en-sentiment.xml
Normal file
File diff suppressed because it is too large
Load Diff
29162
backend/venv/Lib/site-packages/textblob/en/en-spelling.txt
Normal file
29162
backend/venv/Lib/site-packages/textblob/en/en-spelling.txt
Normal file
File diff suppressed because it is too large
Load Diff
472
backend/venv/Lib/site-packages/textblob/en/inflect.py
Normal file
472
backend/venv/Lib/site-packages/textblob/en/inflect.py
Normal file
@@ -0,0 +1,472 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
'''The pluralize and singular methods from the pattern library.
|
||||
|
||||
Licenced under the BSD.
|
||||
See here https://github.com/clips/pattern/blob/master/LICENSE.txt for
|
||||
complete license information.
|
||||
'''
|
||||
import re
|
||||
|
||||
VERB, NOUN, ADJECTIVE, ADVERB = "VB", "NN", "JJ", "RB"
|
||||
|
||||
#### PLURALIZE #####################################################################################
|
||||
# Based on "An Algorithmic Approach to English Pluralization" by Damian Conway:
|
||||
# http://www.csse.monash.edu.au/~damian/papers/HTML/Plurals.html
|
||||
|
||||
# Prepositions are used to solve things like
|
||||
# "mother-in-law" or "man at arms"
|
||||
plural_prepositions = [
|
||||
"about", "above", "across", "after", "among", "around", "at", "athwart", "before", "behind",
|
||||
"below", "beneath", "beside", "besides", "between", "betwixt", "beyond", "but", "by", "during",
|
||||
"except", "for", "from", "in", "into", "near", "of", "off", "on", "onto", "out", "over",
|
||||
"since", "till", "to", "under", "until", "unto", "upon", "with"
|
||||
]
|
||||
|
||||
# Inflection rules that are either general,
|
||||
# or apply to a certain category of words,
|
||||
# or apply to a certain category of words only in classical mode,
|
||||
# or apply only in classical mode.
|
||||
# Each rule consists of:
|
||||
# suffix, inflection, category and classic flag.
|
||||
plural_rules = [
|
||||
# 0) Indefinite articles and demonstratives.
|
||||
[["^a$|^an$", "some", None, False],
|
||||
["^this$", "these", None, False],
|
||||
["^that$", "those", None, False],
|
||||
["^any$", "all", None, False]
|
||||
],
|
||||
# 1) Possessive adjectives.
|
||||
# Overlaps with 1/ for "his" and "its".
|
||||
# Overlaps with 2/ for "her".
|
||||
[["^my$", "our", None, False],
|
||||
["^your$|^thy$", "your", None, False],
|
||||
["^her$|^his$|^its$|^their$", "their", None, False]
|
||||
],
|
||||
# 2) Possessive pronouns.
|
||||
[["^mine$", "ours", None, False],
|
||||
["^yours$|^thine$", "yours", None, False],
|
||||
["^hers$|^his$|^its$|^theirs$", "theirs", None, False]
|
||||
],
|
||||
# 3) Personal pronouns.
|
||||
[["^I$", "we", None, False],
|
||||
["^me$", "us", None, False],
|
||||
["^myself$", "ourselves", None, False],
|
||||
["^you$", "you", None, False],
|
||||
["^thou$|^thee$", "ye", None, False],
|
||||
["^yourself$|^thyself$", "yourself", None, False],
|
||||
["^she$|^he$|^it$|^they$", "they", None, False],
|
||||
["^her$|^him$|^it$|^them$", "them", None, False],
|
||||
["^herself$|^himself$|^itself$|^themself$", "themselves", None, False],
|
||||
["^oneself$", "oneselves", None, False]
|
||||
],
|
||||
# 4) Words that do not inflect.
|
||||
[["$", "", "uninflected", False],
|
||||
["$", "", "uncountable", False],
|
||||
["fish$", "fish", None, False],
|
||||
["([- ])bass$", "\\1bass", None, False],
|
||||
["ois$", "ois", None, False],
|
||||
["sheep$", "sheep", None, False],
|
||||
["deer$", "deer", None, False],
|
||||
["pox$", "pox", None, False],
|
||||
["([A-Z].*)ese$", "\\1ese", None, False],
|
||||
["itis$", "itis", None, False],
|
||||
["(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$", "\\1ose", None, False]
|
||||
],
|
||||
# 5) Irregular plurals (mongoose, oxen).
|
||||
[["atlas$", "atlantes", None, True],
|
||||
["atlas$", "atlases", None, False],
|
||||
["beef$", "beeves", None, True],
|
||||
["brother$", "brethren", None, True],
|
||||
["child$", "children", None, False],
|
||||
["corpus$", "corpora", None, True],
|
||||
["corpus$", "corpuses", None, False],
|
||||
["^cow$", "kine", None, True],
|
||||
["ephemeris$", "ephemerides", None, False],
|
||||
["ganglion$", "ganglia", None, True],
|
||||
["genie$", "genii", None, True],
|
||||
["genus$", "genera", None, False],
|
||||
["graffito$", "graffiti", None, False],
|
||||
["loaf$", "loaves", None, False],
|
||||
["money$", "monies", None, True],
|
||||
["mongoose$", "mongooses", None, False],
|
||||
["mythos$", "mythoi", None, False],
|
||||
["octopus$", "octopodes", None, True],
|
||||
["opus$", "opera", None, True],
|
||||
["opus$", "opuses", None, False],
|
||||
["^ox$", "oxen", None, False],
|
||||
["penis$", "penes", None, True],
|
||||
["penis$", "penises", None, False],
|
||||
["soliloquy$", "soliloquies", None, False],
|
||||
["testis$", "testes", None, False],
|
||||
["trilby$", "trilbys", None, False],
|
||||
["turf$", "turves", None, True],
|
||||
["numen$", "numena", None, False],
|
||||
["occiput$", "occipita", None, True]
|
||||
],
|
||||
# 6) Irregular inflections for common suffixes (synopses, mice, men).
|
||||
[["man$", "men", None, False],
|
||||
["person$", "people", None, False],
|
||||
["([lm])ouse$", "\\1ice", None, False],
|
||||
["tooth$", "teeth", None, False],
|
||||
["goose$", "geese", None, False],
|
||||
["foot$", "feet", None, False],
|
||||
["zoon$", "zoa", None, False],
|
||||
["([csx])is$", "\\1es", None, False]
|
||||
],
|
||||
# 7) Fully assimilated classical inflections (vertebrae, codices).
|
||||
[["ex$", "ices", "ex-ices", False],
|
||||
["ex$", "ices", "ex-ices-classical", True],
|
||||
["um$", "a", "um-a", False],
|
||||
["um$", "a", "um-a-classical", True],
|
||||
["on$", "a", "on-a", False],
|
||||
["a$", "ae", "a-ae", False],
|
||||
["a$", "ae", "a-ae-classical", True]
|
||||
],
|
||||
# 8) Classical variants of modern inflections (stigmata, soprani).
|
||||
[["trix$", "trices", None, True],
|
||||
["eau$", "eaux", None, True],
|
||||
["ieu$", "ieu", None, True],
|
||||
["([iay])nx$", "\\1nges", None, True],
|
||||
["en$", "ina", "en-ina-classical", True],
|
||||
["a$", "ata", "a-ata-classical", True],
|
||||
["is$", "ides", "is-ides-classical", True],
|
||||
["us$", "i", "us-i-classical", True],
|
||||
["us$", "us", "us-us-classical", True],
|
||||
["o$", "i", "o-i-classical", True],
|
||||
["$", "i", "-i-classical", True],
|
||||
["$", "im", "-im-classical", True]
|
||||
],
|
||||
# 9) -ch, -sh and -ss and the s-singular group take -es in the plural (churches, classes, lenses).
|
||||
[["([cs])h$", "\\1hes", None, False],
|
||||
["ss$", "sses", None, False],
|
||||
["x$", "xes", None, False],
|
||||
["s$", "ses", "s-singular", False]
|
||||
],
|
||||
# 10) Certain words ending in -f or -fe take -ves in the plural (lives, wolves).
|
||||
[["([aeo]l)f$", "\\1ves", None, False],
|
||||
["([^d]ea)f$", "\\1ves", None, False],
|
||||
["arf$", "arves", None, False],
|
||||
["([nlw]i)fe$", "\\1ves", None, False],
|
||||
],
|
||||
# 11) -y takes -ys if preceded by a vowel or when a proper noun,
|
||||
# but -ies if preceded by a consonant (storeys, Marys, stories).
|
||||
[["([aeiou])y$", "\\1ys", None, False],
|
||||
["([A-Z].*)y$", "\\1ys", None, False],
|
||||
["y$", "ies", None, False]
|
||||
],
|
||||
# 12) Some words ending in -o take -os, the rest take -oes.
|
||||
# Words in which the -o is preceded by a vowel always take -os (lassos, potatoes, bamboos).
|
||||
[["o$", "os", "o-os", False],
|
||||
["([aeiou])o$", "\\1os", None, False],
|
||||
["o$", "oes", None, False]
|
||||
],
|
||||
# 13) Miltary stuff (Major Generals).
|
||||
[["l$", "ls", "general-generals", False]
|
||||
],
|
||||
# 14) Otherwise, assume that the plural just adds -s (cats, programmes).
|
||||
[["$", "s", None, False]
|
||||
],
|
||||
]
|
||||
|
||||
# For performance, compile the regular expressions only once:
|
||||
for ruleset in plural_rules:
|
||||
for rule in ruleset:
|
||||
rule[0] = re.compile(rule[0])
|
||||
|
||||
# Suffix categories.
|
||||
plural_categories = {
|
||||
"uninflected": [
|
||||
"aircraft", "antelope", "bison", "bream", "breeches", "britches", "carp", "cattle", "chassis",
|
||||
"clippers", "cod", "contretemps", "corps", "debris", "diabetes", "djinn", "eland", "elk",
|
||||
"flounder", "gallows", "graffiti", "headquarters", "herpes", "high-jinks", "homework", "innings",
|
||||
"jackanapes", "mackerel", "measles", "mews", "moose", "mumps", "offspring", "news", "pincers",
|
||||
"pliers", "proceedings", "rabies", "salmon", "scissors", "series", "shears", "species", "swine",
|
||||
"trout", "tuna", "whiting", "wildebeest"],
|
||||
"uncountable": [
|
||||
"advice", "bread", "butter", "cannabis", "cheese", "electricity", "equipment", "fruit", "furniture",
|
||||
"garbage", "gravel", "happiness", "information", "ketchup", "knowledge", "love", "luggage",
|
||||
"mathematics", "mayonnaise", "meat", "mustard", "news", "progress", "research", "rice",
|
||||
"sand", "software", "understanding", "water"],
|
||||
"s-singular": [
|
||||
"acropolis", "aegis", "alias", "asbestos", "bathos", "bias", "bus", "caddis", "canvas",
|
||||
"chaos", "christmas", "cosmos", "dais", "digitalis", "epidermis", "ethos", "gas", "glottis",
|
||||
"ibis", "lens", "mantis", "marquis", "metropolis", "pathos", "pelvis", "polis", "rhinoceros",
|
||||
"sassafras", "trellis"],
|
||||
"ex-ices": ["codex", "murex", "silex"],
|
||||
"ex-ices-classical": [
|
||||
"apex", "cortex", "index", "latex", "pontifex", "simplex", "vertex", "vortex"],
|
||||
"um-a": [
|
||||
"agendum", "bacterium", "candelabrum", "datum", "desideratum", "erratum", "extremum",
|
||||
"ovum", "stratum"],
|
||||
"um-a-classical": [
|
||||
"aquarium", "compendium", "consortium", "cranium", "curriculum", "dictum", "emporium",
|
||||
"enconium", "gymnasium", "honorarium", "interregnum", "lustrum", "maximum", "medium",
|
||||
"memorandum", "millenium", "minimum", "momentum", "optimum", "phylum", "quantum", "rostrum",
|
||||
"spectrum", "speculum", "stadium", "trapezium", "ultimatum", "vacuum", "velum"],
|
||||
"on-a": [
|
||||
"aphelion", "asyndeton", "criterion", "hyperbaton", "noumenon", "organon", "perihelion",
|
||||
"phenomenon", "prolegomenon"],
|
||||
"a-ae": ["alga", "alumna", "vertebra"],
|
||||
"a-ae-classical": [
|
||||
"abscissa", "amoeba", "antenna", "aurora", "formula", "hydra", "hyperbola", "lacuna",
|
||||
"medusa", "nebula", "nova", "parabola"],
|
||||
"en-ina-classical": ["foramen", "lumen", "stamen"],
|
||||
"a-ata-classical": [
|
||||
"anathema", "bema", "carcinoma", "charisma", "diploma", "dogma", "drama", "edema", "enema",
|
||||
"enigma", "gumma", "lemma", "lymphoma", "magma", "melisma", "miasma", "oedema", "sarcoma",
|
||||
"schema", "soma", "stigma", "stoma", "trauma"],
|
||||
"is-ides-classical": ["clitoris", "iris"],
|
||||
"us-i-classical": [
|
||||
"focus", "fungus", "genius", "incubus", "nimbus", "nucleolus", "radius", "stylus", "succubus",
|
||||
"torus", "umbilicus", "uterus"],
|
||||
"us-us-classical": [
|
||||
"apparatus", "cantus", "coitus", "hiatus", "impetus", "nexus", "plexus", "prospectus",
|
||||
"sinus", "status"],
|
||||
"o-i-classical": ["alto", "basso", "canto", "contralto", "crescendo", "solo", "soprano", "tempo"],
|
||||
"-i-classical": ["afreet", "afrit", "efreet"],
|
||||
"-im-classical": ["cherub", "goy", "seraph"],
|
||||
"o-os": [
|
||||
"albino", "archipelago", "armadillo", "commando", "ditto", "dynamo", "embryo", "fiasco",
|
||||
"generalissimo", "ghetto", "guano", "inferno", "jumbo", "lingo", "lumbago", "magneto",
|
||||
"manifesto", "medico", "octavo", "photo", "pro", "quarto", "rhino", "stylo"],
|
||||
"general-generals": [
|
||||
"Adjutant", "Brigadier", "Lieutenant", "Major", "Quartermaster",
|
||||
"adjutant", "brigadier", "lieutenant", "major", "quartermaster"],
|
||||
}
|
||||
|
||||
def pluralize(word, pos=NOUN, custom=None, classical=True):
    """Return the plural of a given word.

    For example: child -> children.
    Handles nouns and adjectives, using classical inflection by default
    (e.g. where "matrix" pluralizes to "matrices" instead of "matrixes").

    :param word: the word to pluralize.
    :param pos: part-of-speech tag; only ``NOUN`` and ``ADJECTIVE`` inflect.
    :param custom: optional dict of user-defined replacements.
    :param classical: use classical inflection rules when True.
    """
    # Avoid a shared mutable default argument.
    custom = custom or {}
    if word in custom:
        return custom[word]

    # Recursion of genitives.
    # Remove the apostrophe and any trailing -s,
    # form the plural of the resultant noun, and then append an apostrophe
    # (dog's -> dogs').
    if word.endswith("'") or word.endswith("'s"):
        # NOTE: was word.rstrip("'s"), which strips ANY run of trailing
        # "'" and "s" characters (e.g. "class's" -> "cla"); slice instead.
        owner = word[:-2] if word.endswith("'s") else word[:-1]
        owners = pluralize(owner, pos, custom, classical)
        if owners.endswith("s"):
            return owners + "'"
        else:
            return owners + "'s"

    # Recursion of compound words
    # (Postmasters General, mothers-in-law, Roman deities).
    words = word.replace("-", " ").split(" ")
    if len(words) > 1:
        # Parenthesized to match the rule-13 intent ("Major Generals"):
        # previously `A or B and C` parsed as `A or (B and C)`, so
        # lowercase "major general" wrongly became "majors general".
        if (words[1] == "general" or words[1] == "General") and \
                words[0] not in plural_categories["general-generals"]:
            return word.replace(words[0], pluralize(words[0], pos, custom, classical))
        elif words[1] in plural_prepositions:
            return word.replace(words[0], pluralize(words[0], pos, custom, classical))
        else:
            return word.replace(words[-1], pluralize(words[-1], pos, custom, classical))

    # Only a very few number of adjectives inflect.
    n = list(range(len(plural_rules)))
    if pos.startswith(ADJECTIVE):
        n = [0, 1]

    # Apply pluralization rules.
    for i in n:
        for rule in plural_rules[i]:
            suffix, inflection, category, classic = rule
            # A general rule, or a classic rule in classical mode.
            if category is None:
                if not classic or classical:
                    if suffix.search(word) is not None:
                        return suffix.sub(inflection, word)
            # A rule relating to a specific category of words.
            else:
                if word in plural_categories[category] and (not classic or classical):
                    if suffix.search(word) is not None:
                        return suffix.sub(inflection, word)

    # Robustness: never return None when no rule matched
    # (possible for the adjective rule subset).
    return word
|
||||
|
||||
#### SINGULARIZE ###################################################################################
|
||||
# Adapted from Bermi Ferrer's Inflector for Python:
|
||||
# http://www.bermi.org/inflector/
|
||||
|
||||
# Copyright (c) 2006 Bermi Ferrer Martinez
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software to deal in this software without restriction, including
|
||||
# without limitation the rights to use, copy, modify, merge, publish,
|
||||
# distribute, sublicense, and/or sell copies of this software, and to permit
|
||||
# persons to whom this software is furnished to do so, subject to the following
|
||||
# condition:
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THIS SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THIS SOFTWARE.
|
||||
|
||||
singular_rules = [
|
||||
['(?i)(.)ae$', '\\1a'],
|
||||
['(?i)(.)itis$', '\\1itis'],
|
||||
['(?i)(.)eaux$', '\\1eau'],
|
||||
['(?i)(quiz)zes$', '\\1'],
|
||||
['(?i)(matr)ices$', '\\1ix'],
|
||||
['(?i)(ap|vert|ind)ices$', '\\1ex'],
|
||||
['(?i)^(ox)en', '\\1'],
|
||||
['(?i)(alias|status)es$', '\\1'],
|
||||
['(?i)([octop|vir])i$', '\\1us'],
|
||||
['(?i)(cris|ax|test)es$', '\\1is'],
|
||||
['(?i)(shoe)s$', '\\1'],
|
||||
['(?i)(o)es$', '\\1'],
|
||||
['(?i)(bus)es$', '\\1'],
|
||||
['(?i)([m|l])ice$', '\\1ouse'],
|
||||
['(?i)(x|ch|ss|sh)es$', '\\1'],
|
||||
['(?i)(m)ovies$', '\\1ovie'],
|
||||
['(?i)(.)ombies$', '\\1ombie'],
|
||||
['(?i)(s)eries$', '\\1eries'],
|
||||
['(?i)([^aeiouy]|qu)ies$', '\\1y'],
|
||||
# Certain words ending in -f or -fe take -ves in the plural (lives, wolves).
|
||||
["([aeo]l)ves$", "\\1f"],
|
||||
["([^d]ea)ves$", "\\1f"],
|
||||
["arves$", "arf"],
|
||||
["erves$", "erve"],
|
||||
["([nlw]i)ves$", "\\1fe"],
|
||||
['(?i)([lr])ves$', '\\1f'],
|
||||
["([aeo])ves$", "\\1ve"],
|
||||
['(?i)(sive)s$', '\\1'],
|
||||
['(?i)(tive)s$', '\\1'],
|
||||
['(?i)(hive)s$', '\\1'],
|
||||
['(?i)([^f])ves$', '\\1fe'],
|
||||
# -es suffix.
|
||||
['(?i)(^analy)ses$', '\\1sis'],
|
||||
['(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$', '\\1\\2sis'],
|
||||
['(?i)(.)opses$', '\\1opsis'],
|
||||
['(?i)(.)yses$', '\\1ysis'],
|
||||
['(?i)(h|d|r|o|n|b|cl|p)oses$', '\\1ose'],
|
||||
['(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$', '\\1ose'],
|
||||
['(?i)(.)oses$', '\\1osis'],
|
||||
# -a
|
||||
['(?i)([ti])a$', '\\1um'],
|
||||
['(?i)(n)ews$', '\\1ews'],
|
||||
['(?i)s$', ''],
|
||||
]
|
||||
|
||||
# For performance, compile the regular expressions only once:
|
||||
for rule in singular_rules:
|
||||
rule[0] = re.compile(rule[0])
|
||||
|
||||
singular_uninflected = [
|
||||
"aircraft", "antelope", "bison", "bream", "breeches", "britches", "carp", "cattle", "chassis",
|
||||
"clippers", "cod", "contretemps", "corps", "debris", "diabetes", "djinn", "eland",
|
||||
"elk", "flounder", "gallows", "georgia", "graffiti", "headquarters", "herpes", "high-jinks",
|
||||
"homework", "innings", "jackanapes", "mackerel", "measles", "mews", "moose", "mumps", "news",
|
||||
"offspring", "pincers", "pliers", "proceedings", "rabies", "salmon", "scissors", "series",
|
||||
"shears", "species", "swine", "swiss", "trout", "tuna", "whiting", "wildebeest"
|
||||
]
|
||||
singular_uncountable = [
|
||||
"advice", "bread", "butter", "cannabis", "cheese", "electricity", "equipment", "fruit", "furniture",
|
||||
"garbage", "gravel", "happiness", "information", "ketchup", "knowledge", "love", "luggage",
|
||||
"mathematics", "mayonnaise", "meat", "mustard", "news", "progress", "research", "rice", "sand",
|
||||
"software", "understanding", "water"
|
||||
]
|
||||
singular_ie = [
|
||||
"algerie", "auntie", "beanie", "birdie", "bogie", "bombie", "bookie", "collie", "cookie", "cutie",
|
||||
"doggie", "eyrie", "freebie", "goonie", "groupie", "hankie", "hippie", "hoagie", "hottie",
|
||||
"indie", "junkie", "laddie", "laramie", "lingerie", "meanie", "nightie", "oldie", "^pie",
|
||||
"pixie", "quickie", "reverie", "rookie", "softie", "sortie", "stoolie", "sweetie", "techie",
|
||||
"^tie", "toughie", "valkyrie", "veggie", "weenie", "yuppie", "zombie"
|
||||
]
|
||||
singular_s = plural_categories['s-singular']
|
||||
|
||||
# key plural, value singular
|
||||
singular_irregular = {
|
||||
"men": "man",
|
||||
"people": "person",
|
||||
"children": "child",
|
||||
"sexes": "sex",
|
||||
"axes": "axe",
|
||||
"moves": "move",
|
||||
"teeth": "tooth",
|
||||
"geese": "goose",
|
||||
"feet": "foot",
|
||||
"zoa": "zoon",
|
||||
"atlantes": "atlas",
|
||||
"atlases": "atlas",
|
||||
"beeves": "beef",
|
||||
"brethren": "brother",
|
||||
"children": "child",
|
||||
"corpora": "corpus",
|
||||
"corpuses": "corpus",
|
||||
"kine": "cow",
|
||||
"ephemerides": "ephemeris",
|
||||
"ganglia": "ganglion",
|
||||
"genii": "genie",
|
||||
"genera": "genus",
|
||||
"graffiti": "graffito",
|
||||
"helves": "helve",
|
||||
"leaves": "leaf",
|
||||
"loaves": "loaf",
|
||||
"monies": "money",
|
||||
"mongooses": "mongoose",
|
||||
"mythoi": "mythos",
|
||||
"octopodes": "octopus",
|
||||
"opera": "opus",
|
||||
"opuses": "opus",
|
||||
"oxen": "ox",
|
||||
"penes": "penis",
|
||||
"penises": "penis",
|
||||
"soliloquies": "soliloquy",
|
||||
"testes": "testis",
|
||||
"trilbys": "trilby",
|
||||
"turves": "turf",
|
||||
"numena": "numen",
|
||||
"occipita": "occiput",
|
||||
"our": "my",
|
||||
}
|
||||
|
||||
def singularize(word, pos=NOUN, custom={}):
    """Return the singular of a given word (e.g. children -> child).

    :param word: the word to singularize.
    :param pos: part-of-speech tag (passed through when recursing).
    :param custom: dict of user-defined plural -> singular replacements.
    """
    if word in list(custom.keys()):
        return custom[word]

    # Recursion of compound words (e.g. mothers-in-law).
    if "-" in word:
        words = word.split("-")
        if len(words) > 1 and words[1] in plural_prepositions:
            return singularize(words[0], pos, custom)+"-"+"-".join(words[1:])
    # dogs' => dog's
    if word.endswith("'"):
        return singularize(word[:-1]) + "'s"

    lower = word.lower()
    # NOTE(review): the two loops below test ``w.endswith(lower)`` — i.e.
    # whether the *dictionary* word ends with the input. This mirrors the
    # upstream pattern library but looks inverted; confirm before changing.
    for w in singular_uninflected:
        if w.endswith(lower):
            return word
    for w in singular_uncountable:
        if w.endswith(lower):
            return word
    # Words like "cookies" whose singular keeps the -ie ending.
    for w in singular_ie:
        if lower.endswith(w+"s"):
            return w
    # Words whose singular already ends in -s (e.g. "buses" -> "bus").
    for w in singular_s:
        if lower.endswith(w + 'es'):
            return w
    # Irregular plurals; substitute the suffix case-insensitively.
    for w in list(singular_irregular.keys()):
        if lower.endswith(w):
            return re.sub('(?i)'+w+'$', singular_irregular[w], word)

    # Fall back to the regex suffix rules. Capture groups that did not
    # participate in the match are blanked out of the replacement string.
    for rule in singular_rules:
        suffix, inflection = rule
        match = suffix.search(word)
        if match:
            groups = match.groups()
            for k in range(0, len(groups)):
                if groups[k] == None:
                    inflection = inflection.replace('\\'+str(k+1), '')
            return suffix.sub(inflection, word)

    # No rule applied: return the word unchanged.
    return word
|
||||
204
backend/venv/Lib/site-packages/textblob/en/np_extractors.py
Normal file
204
backend/venv/Lib/site-packages/textblob/en/np_extractors.py
Normal file
@@ -0,0 +1,204 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
'''Various noun phrase extractors.'''
|
||||
from __future__ import unicode_literals, absolute_import
|
||||
|
||||
import nltk
|
||||
|
||||
from textblob.taggers import PatternTagger
|
||||
from textblob.decorators import requires_nltk_corpus
|
||||
from textblob.utils import tree2str, filter_insignificant
|
||||
from textblob.base import BaseNPExtractor
|
||||
|
||||
|
||||
class ChunkParser(nltk.ChunkParserI):
    """NP chunk parser whose tagger is trained lazily on the
    ConLL-2000 corpus the first time :meth:`parse` is called.
    """

    def __init__(self):
        # Training is deferred until the first parse() call.
        self._trained = False

    @requires_nltk_corpus
    def train(self):
        """Train the Chunker on the ConLL-2000 corpus."""
        chunked = nltk.corpus.conll2000.chunked_sents('train.txt',
                                                      chunk_types=['NP'])
        # Train on (POS tag, IOB chunk tag) pairs; the word itself is unused.
        train_data = [
            [(pos, iob) for _, pos, iob in nltk.chunk.tree2conlltags(sent)]
            for sent in chunked
        ]
        unigram = nltk.UnigramTagger(train_data)
        self.tagger = nltk.BigramTagger(train_data, backoff=unigram)
        self._trained = True

    def parse(self, sentence):
        """Return the parse tree for *sentence* (a list of (word, pos) pairs)."""
        if not self._trained:
            self.train()
        pos_only = [pos for _, pos in sentence]
        iob_tags = [iob for _, iob in self.tagger.tag(pos_only)]
        conll = [(word, pos, iob)
                 for (word, pos), iob in zip(sentence, iob_tags)]
        return nltk.chunk.util.conlltags2tree(conll)
|
||||
|
||||
|
||||
class ConllExtractor(BaseNPExtractor):

    '''A noun phrase extractor that uses chunk parsing trained with the
    ConLL-2000 training corpus.
    '''

    # Tagger used to POS-tag each sentence before chunk parsing.
    POS_TAGGER = PatternTagger()

    # The context-free grammar with which to filter the noun phrases:
    # adjacent (tag1, tag2) pairs merge into the mapped tag.
    CFG = {
        ('NNP', 'NNP'): 'NNP',
        ('NN', 'NN'): 'NNI',
        ('NNI', 'NN'): 'NNI',
        ('JJ', 'JJ'): 'JJ',
        ('JJ', 'NN'): 'NNI',
    }

    # POS suffixes that will be ignored when filtering phrase tokens.
    INSIGNIFICANT_SUFFIXES = ['DT', 'CC', 'PRP$', 'PRP']

    def __init__(self, parser=None):
        # Default to the lazily-trained ConLL-2000 chunk parser.
        self.parser = ChunkParser() if not parser else parser

    def extract(self, text):
        '''Return a list of noun phrases (strings) for body of text.'''
        sentences = nltk.tokenize.sent_tokenize(text)
        noun_phrases = []
        for sentence in sentences:
            parsed = self._parse_sentence(sentence)
            # Get the string representation of each subtree that is a
            # noun phrase tree; keep only NP subtrees that still have at
            # least one significant token and match the CFG merge rules.
            phrases = [_normalize_tags(filter_insignificant(each,
                       self.INSIGNIFICANT_SUFFIXES)) for each in parsed
                       if isinstance(each, nltk.tree.Tree) and each.label()
                       == 'NP' and len(filter_insignificant(each)) >= 1
                       and _is_match(each, cfg=self.CFG)]
            nps = [tree2str(phrase) for phrase in phrases]
            noun_phrases.extend(nps)
        return noun_phrases

    def _parse_sentence(self, sentence):
        '''Tag and parse a sentence (a plain, untagged string).'''
        tagged = self.POS_TAGGER.tag(sentence)
        return self.parser.parse(tagged)
|
||||
|
||||
|
||||
class FastNPExtractor(BaseNPExtractor):

    '''A fast and simple noun phrase extractor.

    Credit to Shlomi Babluk. Link to original blog post:

    http://thetokenizer.com/2013/05/09/efficient-way-to-extract-the-main-topics-of-a-sentence/
    '''

    # Merge rules: adjacent (tag1, tag2) pairs collapse into the mapped tag.
    CFG = {
        ('NNP', 'NNP'): 'NNP',
        ('NN', 'NN'): 'NNI',
        ('NNI', 'NN'): 'NNI',
        ('JJ', 'JJ'): 'JJ',
        ('JJ', 'NN'): 'NNI',
    }

    def __init__(self):
        # Tagger is built lazily on the first extract() call.
        self._trained = False

    @requires_nltk_corpus
    def train(self):
        # Bigram tagger backed by a unigram tagger, backed by regexp rules.
        train_data = nltk.corpus.brown.tagged_sents(categories='news')
        regexp_tagger = nltk.RegexpTagger([
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'(-|:|;)$', ':'),
            # NOTE(review): r"\'*$" matches ANY string (zero or more
            # quotes at end), so every token reaching this pattern is
            # tagged MD and the rules below are unreachable as regexp
            # fallbacks — confirm intent before changing.
            (r'\'*$', 'MD'),
            (r'(The|the|A|a|An|an)$', 'AT'),
            (r'.*able$', 'JJ'),
            (r'^[A-Z].*$', 'NNP'),
            (r'.*ness$', 'NN'),
            (r'.*ly$', 'RB'),
            (r'.*s$', 'NNS'),
            (r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'),
            (r'.*', 'NN'),  # catch-all default
        ])
        unigram_tagger = nltk.UnigramTagger(train_data, backoff=regexp_tagger)
        self.tagger = nltk.BigramTagger(train_data, backoff=unigram_tagger)
        self._trained = True
        return None


    def _tokenize_sentence(self, sentence):
        '''Split the sentence into single words/tokens'''
        tokens = nltk.word_tokenize(sentence)
        return tokens

    def extract(self, sentence):
        '''Return a list of noun phrases (strings) for body of text.'''
        if not self._trained:
            self.train()
        tokens = self._tokenize_sentence(sentence)
        tagged = self.tagger.tag(tokens)
        tags = _normalize_tags(tagged)
        # Repeatedly merge the leftmost adjacent pair that matches a CFG
        # rule until no rule applies.
        merge = True
        while merge:
            merge = False
            for x in range(0, len(tags) - 1):
                t1 = tags[x]
                t2 = tags[x + 1]
                key = t1[1], t2[1]
                value = self.CFG.get(key, '')
                if value:
                    merge = True
                    tags.pop(x)
                    tags.pop(x)
                    match = '%s %s' % (t1[0], t2[0])
                    pos = value
                    tags.insert(x, (match, pos))
                    break

        # Merged tokens tagged NNP/NNI are the extracted noun phrases.
        matches = [t[0] for t in tags if t[1] in ['NNP', 'NNI']]
        return matches
|
||||
|
||||
|
||||
### Utility methods ###
|
||||
|
||||
def _normalize_tags(chunk):
|
||||
'''Normalize the corpus tags.
|
||||
("NN", "NN-PL", "NNS") -> "NN"
|
||||
'''
|
||||
ret = []
|
||||
for word, tag in chunk:
|
||||
if tag == 'NP-TL' or tag == 'NP':
|
||||
ret.append((word, 'NNP'))
|
||||
continue
|
||||
if tag.endswith('-TL'):
|
||||
ret.append((word, tag[:-3]))
|
||||
continue
|
||||
if tag.endswith('S'):
|
||||
ret.append((word, tag[:-1]))
|
||||
continue
|
||||
ret.append((word, tag))
|
||||
return ret
|
||||
|
||||
|
||||
def _is_match(tagged_phrase, cfg):
|
||||
'''Return whether or not a tagged phrases matches a context-free grammar.
|
||||
'''
|
||||
copy = list(tagged_phrase) # A copy of the list
|
||||
merge = True
|
||||
while merge:
|
||||
merge = False
|
||||
for i in range(len(copy) - 1):
|
||||
first, second = copy[i], copy[i + 1]
|
||||
key = first[1], second[1] # Tuple of tags e.g. ('NN', 'JJ')
|
||||
value = cfg.get(key, None)
|
||||
if value:
|
||||
merge = True
|
||||
copy.pop(i)
|
||||
copy.pop(i)
|
||||
match = '{0} {1}'.format(first[0], second[0])
|
||||
pos = value
|
||||
copy.insert(i, (match, pos))
|
||||
break
|
||||
match = any([t[1] in ('NNP', 'NNI') for t in copy])
|
||||
return match
|
||||
18
backend/venv/Lib/site-packages/textblob/en/parsers.py
Normal file
18
backend/venv/Lib/site-packages/textblob/en/parsers.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Various parser implementations.
|
||||
|
||||
.. versionadded:: 0.6.0
|
||||
"""
|
||||
from __future__ import absolute_import
|
||||
from textblob.en import parse as pattern_parse
|
||||
from textblob.base import BaseParser
|
||||
|
||||
|
||||
class PatternParser(BaseParser):
    """Parser that uses the implementation in Tom de Smedt's pattern library.
    http://www.clips.ua.ac.be/pages/pattern-en#parser
    """

    def parse(self, text):
        """Parses the text.

        :param text: the raw string to parse.
        :returns: whatever ``textblob.en.parse`` returns for *text*.
        """
        return pattern_parse(text)
|
||||
97
backend/venv/Lib/site-packages/textblob/en/sentiments.py
Normal file
97
backend/venv/Lib/site-packages/textblob/en/sentiments.py
Normal file
@@ -0,0 +1,97 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Sentiment analysis implementations.
|
||||
|
||||
.. versionadded:: 0.5.0
|
||||
"""
|
||||
from __future__ import absolute_import
|
||||
from collections import namedtuple
|
||||
|
||||
import nltk
|
||||
|
||||
from textblob.en import sentiment as pattern_sentiment
|
||||
from textblob.tokenizers import word_tokenize
|
||||
from textblob.decorators import requires_nltk_corpus
|
||||
from textblob.base import BaseSentimentAnalyzer, DISCRETE, CONTINUOUS
|
||||
|
||||
|
||||
class PatternAnalyzer(BaseSentimentAnalyzer):
    """Sentiment analyzer that uses the same implementation as the
    pattern library. Returns results as a named tuple of the form:

    ``Sentiment(polarity, subjectivity, [assessments])``

    where [assessments] is a list of the assessed tokens and their
    polarity and subjectivity scores
    """
    kind = CONTINUOUS
    # This is only here for backwards-compatibility.
    # The return type is actually determined upon calling analyze()
    RETURN_TYPE = namedtuple('Sentiment', ['polarity', 'subjectivity'])

    def analyze(self, text, keep_assessments=False):
        """Return the sentiment as a named tuple of the form:
        ``Sentiment(polarity, subjectivity, [assessments])``.

        :param text: the text to analyze.
        :param keep_assessments: include the per-token assessments
            (adds an ``assessments`` field to the returned tuple).
        """
        # Compute the pattern sentiment once; the previous implementation
        # called pattern_sentiment(text) twice in the assessments branch.
        result = pattern_sentiment(text)
        #: Return type declaration
        if keep_assessments:
            Sentiment = namedtuple('Sentiment', ['polarity', 'subjectivity', 'assessments'])
            polarity, subjectivity = result
            return Sentiment(polarity, subjectivity, result.assessments)
        else:
            Sentiment = namedtuple('Sentiment', ['polarity', 'subjectivity'])
            return Sentiment(*result)
|
||||
|
||||
|
||||
def _default_feature_extractor(words):
|
||||
"""Default feature extractor for the NaiveBayesAnalyzer."""
|
||||
return dict(((word, True) for word in words))
|
||||
|
||||
|
||||
class NaiveBayesAnalyzer(BaseSentimentAnalyzer):
    """Naive Bayes analyzer that is trained on a dataset of movie reviews.
    Returns results as a named tuple of the form:
    ``Sentiment(classification, p_pos, p_neg)``

    :param callable feature_extractor: Function that returns a dictionary of
        features, given a list of words.
    """

    kind = DISCRETE
    #: Return type declaration
    RETURN_TYPE = namedtuple('Sentiment', ['classification', 'p_pos', 'p_neg'])

    def __init__(self, feature_extractor=_default_feature_extractor):
        super(NaiveBayesAnalyzer, self).__init__()
        # None until train() runs; training is triggered lazily by analyze().
        self._classifier = None
        self.feature_extractor = feature_extractor

    @requires_nltk_corpus
    def train(self):
        """Train the Naive Bayes classifier on the movie review corpus."""
        super(NaiveBayesAnalyzer, self).train()
        neg_ids = nltk.corpus.movie_reviews.fileids('neg')
        pos_ids = nltk.corpus.movie_reviews.fileids('pos')
        # Build (features, label) training pairs, one per review file.
        neg_feats = [(self.feature_extractor(
            nltk.corpus.movie_reviews.words(fileids=[f])), 'neg') for f in neg_ids]
        pos_feats = [(self.feature_extractor(
            nltk.corpus.movie_reviews.words(fileids=[f])), 'pos') for f in pos_ids]
        train_data = neg_feats + pos_feats
        self._classifier = nltk.classify.NaiveBayesClassifier.train(train_data)

    def analyze(self, text):
        """Return the sentiment as a named tuple of the form:
        ``Sentiment(classification, p_pos, p_neg)``
        """
        # Lazily train the classifier
        super(NaiveBayesAnalyzer, self).analyze(text)
        tokens = word_tokenize(text, include_punc=False)
        # Lowercase and drop very short tokens (< 3 characters).
        filtered = (t.lower() for t in tokens if len(t) >= 3)
        feats = self.feature_extractor(filtered)
        prob_dist = self._classifier.prob_classify(feats)
        return self.RETURN_TYPE(
            classification=prob_dist.max(),
            p_pos=prob_dist.prob('pos'),
            p_neg=prob_dist.prob("neg")
        )
|
||||
38
backend/venv/Lib/site-packages/textblob/en/taggers.py
Normal file
38
backend/venv/Lib/site-packages/textblob/en/taggers.py
Normal file
@@ -0,0 +1,38 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Parts-of-speech tagger implementations."""
|
||||
from __future__ import absolute_import
|
||||
|
||||
import nltk
|
||||
import textblob.compat
|
||||
|
||||
import textblob as tb
|
||||
from textblob.en import tag as pattern_tag
|
||||
from textblob.decorators import requires_nltk_corpus
|
||||
from textblob.base import BaseTagger
|
||||
|
||||
|
||||
class PatternTagger(BaseTagger):
    """POS tagger backed by the implementation in Tom de Smedt's pattern
    library (http://www.clips.ua.ac.be/pattern).
    """

    def tag(self, text, tokenize=True):
        """Tag a string or BaseBlob."""
        # Plain strings pass through; blob objects are unwrapped to raw text.
        raw = text if isinstance(text, textblob.compat.text_type) else text.raw
        return pattern_tag(raw, tokenize)
|
||||
|
||||
|
||||
class NLTKTagger(BaseTagger):
    """POS tagger that delegates to NLTK's standard TreeBank tagger.

    NOTE: Requires numpy. Not yet supported with PyPy.
    """

    @requires_nltk_corpus
    def tag(self, text):
        """Tag a string or BaseBlob."""
        # A bare string is wrapped in a TextBlob so the blob's tokenizer
        # provides the ``tokens`` attribute consumed below.
        blob = tb.TextBlob(text) if isinstance(text, textblob.compat.text_type) else text
        return nltk.tag.pos_tag(blob.tokens)
|
||||
48
backend/venv/Lib/site-packages/textblob/exceptions.py
Normal file
48
backend/venv/Lib/site-packages/textblob/exceptions.py
Normal file
@@ -0,0 +1,48 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
MISSING_CORPUS_MESSAGE = """
|
||||
Looks like you are missing some required data for this feature.
|
||||
|
||||
To download the necessary data, simply run
|
||||
|
||||
python -m textblob.download_corpora
|
||||
|
||||
or use the NLTK downloader to download the missing data: http://nltk.org/data.html
|
||||
If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
|
||||
"""
|
||||
|
||||
class TextBlobError(Exception):
    """A TextBlob-related error.

    Root of the package's exception hierarchy; catch this to handle any
    error raised by textblob.
    """
    pass


TextBlobException = TextBlobError  # Backwards compat


class MissingCorpusError(TextBlobError):
    """Exception thrown when a user tries to use a feature that requires a
    dataset or model that the user does not have on their system.
    """

    def __init__(self, message=MISSING_CORPUS_MESSAGE, *args, **kwargs):
        # Default message tells the user how to download the missing NLTK data.
        super(MissingCorpusError, self).__init__(message, *args, **kwargs)


MissingCorpusException = MissingCorpusError  # Backwards compat


class DeprecationError(TextBlobError):
    """Raised when user uses a deprecated feature."""
    pass


class TranslatorError(TextBlobError):
    """Raised when an error occurs during language translation or detection."""
    pass


class NotTranslated(TranslatorError):
    """Raised when text is unchanged after translation. This may be due to the language
    being unsupported by the translator.
    """
    pass


class FormatError(TextBlobError):
    """Raised if a data file with an unsupported format is passed to a classifier."""
    pass
|
||||
161
backend/venv/Lib/site-packages/textblob/formats.py
Normal file
161
backend/venv/Lib/site-packages/textblob/formats.py
Normal file
@@ -0,0 +1,161 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""File formats for training and testing data.
|
||||
|
||||
Includes a registry of valid file formats. New file formats can be added to the
|
||||
registry like so: ::
|
||||
|
||||
from textblob import formats
|
||||
|
||||
class PipeDelimitedFormat(formats.DelimitedFormat):
|
||||
delimiter = '|'
|
||||
|
||||
formats.register('psv', PipeDelimitedFormat)
|
||||
|
||||
Once a format has been registered, classifiers will be able to read data files with
|
||||
that format. ::
|
||||
|
||||
from textblob.classifiers import NaiveBayesAnalyzer
|
||||
|
||||
with open('training_data.psv', 'r') as fp:
|
||||
cl = NaiveBayesAnalyzer(fp, format='psv')
|
||||
"""
|
||||
from __future__ import absolute_import
|
||||
import json
|
||||
from collections import OrderedDict
|
||||
|
||||
from textblob.compat import PY2, csv
|
||||
from textblob.utils import is_filelike
|
||||
|
||||
DEFAULT_ENCODING = 'utf-8'
|
||||
|
||||
class BaseFormat(object):
    """Interface for format classes. Individual formats can decide on the
    composition and meaning of ``**kwargs``.

    :param File fp: A file-like object.

    .. versionchanged:: 0.9.0
        Constructor receives a file pointer rather than a file path.
    """
    def __init__(self, fp, **kwargs):
        # Base constructor intentionally does nothing; subclasses parse fp.
        pass

    def to_iterable(self):
        """Return an iterable object from the data."""
        raise NotImplementedError('Must implement a "to_iterable" method.')

    @classmethod
    def detect(cls, stream):
        """Detect the file format given a sample of its content.
        Return True if ``stream`` is this file format.

        .. versionchanged:: 0.9.0
            Changed from a static method to a class method.
        """
        raise NotImplementedError('Must implement a "detect" class method.')
|
||||
|
||||
class DelimitedFormat(BaseFormat):
    """A general character-delimited format.

    Subclasses define the concrete format by setting :attr:`delimiter`.
    """

    #: Field separator character.
    delimiter = ","

    def __init__(self, fp, **kwargs):
        BaseFormat.__init__(self, fp, **kwargs)
        reader_kwargs = {'delimiter': self.delimiter}
        if PY2:
            # On Python 2, the compat csv reader takes an explicit encoding.
            reader_kwargs['encoding'] = DEFAULT_ENCODING
        self.data = list(csv.reader(fp, **reader_kwargs))

    def to_iterable(self):
        """Return an iterable object from the data."""
        return self.data

    @classmethod
    def detect(cls, stream):
        """Return True if stream is valid."""
        try:
            csv.Sniffer().sniff(stream, delimiters=cls.delimiter)
        except (csv.Error, TypeError):
            return False
        return True
|
||||
|
||||
|
||||
class CSV(DelimitedFormat):
    """CSV format. Assumes each row is of the form ``text,label``.
    ::

        Today is a good day,pos
        I hate this car.,neg
    """
    delimiter = ","
|
||||
|
||||
|
||||
class TSV(DelimitedFormat):
    r"""TSV format. Assumes each row is of the form ``text\tlabel``."""
    delimiter = "\t"
|
||||
|
||||
|
||||
class JSON(BaseFormat):
    """JSON format.

    Assumes that JSON is formatted as an array of objects with ``text`` and
    ``label`` properties.
    ::

        [
            {"text": "Today is a good day.", "label": "pos"},
            {"text": "I hate this car.", "label": "neg"}
        ]
    """

    def __init__(self, fp, **kwargs):
        BaseFormat.__init__(self, fp, **kwargs)
        # Parsed top-level array of {"text": ..., "label": ...} objects.
        self.dict = json.load(fp)

    def to_iterable(self):
        """Return an iterable object from the JSON data."""
        return [(entry['text'], entry['label']) for entry in self.dict]

    @classmethod
    def detect(cls, stream):
        """Return True if stream is valid JSON."""
        try:
            json.loads(stream)
        except ValueError:
            return False
        return True
|
||||
|
||||
|
||||
#: Registered formats, in the order ``detect`` tries them.
_registry = OrderedDict([
    ('csv', CSV),
    ('json', JSON),
    ('tsv', TSV),
])


def detect(fp, max_read=1024):
    """Attempt to detect a file's format, trying each of the supported
    formats. Return the format class that was detected. If no format is
    detected, return ``None``.

    :param fp: A file-like object positioned at the start of the data.
    :param int max_read: Maximum amount of data sampled for detection.
    """
    if not is_filelike(fp):
        return None
    # Read the detection sample once and rewind immediately. The previous
    # implementation called fp.read(max_read) inside the loop, so every
    # format after the first was probed against the *next* chunk of the
    # stream instead of its beginning.
    sample = fp.read(max_read)
    fp.seek(0)
    for Format in _registry.values():
        if Format.detect(sample):
            return Format
    return None
|
||||
|
||||
def get_registry():
    """Return a dictionary of registered formats."""
    return _registry


def register(name, format_class):
    """Register a new format.

    :param str name: The name that will be used to refer to the format, e.g. 'csv'
    :param type format_class: The format class to register.
    """
    registry = get_registry()
    registry[name] = format_class
|
||||
17
backend/venv/Lib/site-packages/textblob/inflect.py
Normal file
17
backend/venv/Lib/site-packages/textblob/inflect.py
Normal file
@@ -0,0 +1,17 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
'''Make word inflection default to English. This allows for backwards
|
||||
compatibility so you can still import text.inflect.
|
||||
|
||||
>>> from textblob.inflect import singularize
|
||||
|
||||
is equivalent to
|
||||
|
||||
>>> from textblob.en.inflect import singularize
|
||||
'''
|
||||
from __future__ import absolute_import
|
||||
from textblob.en.inflect import singularize, pluralize
|
||||
|
||||
__all__ = [
|
||||
'singularize',
|
||||
'pluralize',
|
||||
]
|
||||
179
backend/venv/Lib/site-packages/textblob/mixins.py
Normal file
179
backend/venv/Lib/site-packages/textblob/mixins.py
Normal file
@@ -0,0 +1,179 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import
|
||||
import sys
|
||||
from textblob.compat import basestring, implements_to_string, PY2, binary_type
|
||||
|
||||
|
||||
class ComparableMixin(object):

    '''Implements rich operators for an object.

    Subclasses must provide a ``_cmpkey()`` method returning the value used
    for all comparisons.

    NOTE(review): defining ``__eq__`` without ``__hash__`` makes subclasses
    unhashable on Python 3 unless they define ``__hash__`` themselves --
    confirm this is intended.
    '''

    def _compare(self, other, method):
        # ``method`` is a two-argument callable implementing one specific
        # rich comparison over the two objects' _cmpkey() values.
        try:
            return method(self._cmpkey(), other._cmpkey())
        except (AttributeError, TypeError):
            # _cmpkey not implemented, or return different type,
            # so I can't compare with "other". Try the reverse comparison
            return NotImplemented

    def __lt__(self, other):
        return self._compare(other, lambda s, o: s < o)

    def __le__(self, other):
        return self._compare(other, lambda s, o: s <= o)

    def __eq__(self, other):
        return self._compare(other, lambda s, o: s == o)

    def __ge__(self, other):
        return self._compare(other, lambda s, o: s >= o)

    def __gt__(self, other):
        return self._compare(other, lambda s, o: s > o)

    def __ne__(self, other):
        return self._compare(other, lambda s, o: s != o)
|
||||
|
||||
|
||||
class BlobComparableMixin(ComparableMixin):

    '''Allow blob objects to be comparable with both strings and blobs.'''

    def _compare(self, other, method):
        if not isinstance(other, basestring):
            # Not a plain string: defer to the generic _cmpkey comparison.
            return super(BlobComparableMixin, self)._compare(other, method)
        # Compare the blob's comparison key directly against the string.
        return method(self._cmpkey(), other)
|
||||
|
||||
|
||||
@implements_to_string
class StringlikeMixin(object):

    '''Make blob objects behave like Python strings.

    Expects that classes that use this mixin to have a _strkey() method that
    returns the string to apply string methods to. Using _strkey() instead
    of __str__ ensures consistent behavior between Python 2 and 3.
    '''

    def __repr__(self):
        '''Returns a string representation for debugging.'''
        class_name = self.__class__.__name__
        # On Python 2 the repr must be a byte string, hence the encode.
        text = self.__unicode__().encode("utf-8") if PY2 else str(self)
        ret = '{cls}("{text}")'.format(cls=class_name,
                                       text=text)
        return binary_type(ret) if PY2 else ret

    def __str__(self):
        '''Returns a string representation used in print statements
        or str(my_blob).'''
        return self._strkey()

    def __len__(self):
        '''Returns the length of the raw text.'''
        return len(self._strkey())

    def __iter__(self):
        '''Makes the object iterable as if it were a string,
        iterating through the raw string's characters.
        '''
        return iter(self._strkey())

    def __contains__(self, sub):
        '''Implements the `in` keyword like a Python string.'''
        return sub in self._strkey()

    def __getitem__(self, index):
        '''Returns a substring. If index is an integer, returns a Python
        string of a single character. If a range is given, e.g. `blob[3:5]`,
        a new instance of the class is returned.
        '''
        if isinstance(index, int):
            return self._strkey()[index]  # Just return a single character
        else:
            # Return a new blob object
            return self.__class__(self._strkey()[index])

    def find(self, sub, start=0, end=sys.maxsize):
        '''Behaves like the built-in str.find() method. Returns an integer,
        the index of the first occurrence of the substring argument sub in the
        sub-string given by [start:end].
        '''
        return self._strkey().find(sub, start, end)

    def rfind(self, sub, start=0, end=sys.maxsize):
        '''Behaves like the built-in str.rfind() method. Returns an integer,
        the index of he last (right-most) occurence of the substring argument
        sub in the sub-sequence given by [start:end].
        '''
        return self._strkey().rfind(sub, start, end)

    def index(self, sub, start=0, end=sys.maxsize):
        '''Like blob.find() but raise ValueError when the substring
        is not found.
        '''
        return self._strkey().index(sub, start, end)

    def rindex(self, sub, start=0, end=sys.maxsize):
        '''Like blob.rfind() but raise ValueError when substring is not
        found.
        '''
        return self._strkey().rindex(sub, start, end)

    def startswith(self, prefix, start=0, end=sys.maxsize):
        """Returns True if the blob starts with the given prefix."""
        return self._strkey().startswith(prefix, start, end)

    def endswith(self, suffix, start=0, end=sys.maxsize):
        """Returns True if the blob ends with the given suffix."""
        return self._strkey().endswith(suffix, start, end)

    # PEP8 aliases
    starts_with = startswith
    ends_with = endswith

    def title(self):
        """Returns a blob object with the text in title-case."""
        return self.__class__(self._strkey().title())

    def format(self, *args, **kwargs):
        """Perform a string formatting operation, like the built-in
        `str.format(*args, **kwargs)`. Returns a blob object.
        """
        return self.__class__(self._strkey().format(*args, **kwargs))

    def split(self, sep=None, maxsplit=sys.maxsize):
        """Behaves like the built-in str.split().

        Note: returns a plain list of strings, not blob objects.
        """
        return self._strkey().split(sep, maxsplit)

    def strip(self, chars=None):
        """Behaves like the built-in str.strip([chars]) method. Returns
        an object with leading and trailing whitespace removed.
        """
        return self.__class__(self._strkey().strip(chars))

    def upper(self):
        """Like str.upper(), returns new object with all upper-cased characters.
        """
        return self.__class__(self._strkey().upper())

    def lower(self):
        """Like str.lower(), returns new object with all lower-cased characters.
        """
        return self.__class__(self._strkey().lower())

    def join(self, iterable):
        """Behaves like the built-in `str.join(iterable)` method, except
        returns a blob object.

        Returns a blob which is the concatenation of the strings or blobs
        in the iterable.
        """
        return self.__class__(self._strkey().join(iterable))

    def replace(self, old, new, count=sys.maxsize):
        """Return a new blob object with all the occurence of `old` replaced
        by `new`.
        """
        return self.__class__(self._strkey().replace(old, new, count))
|
||||
19
backend/venv/Lib/site-packages/textblob/np_extractors.py
Normal file
19
backend/venv/Lib/site-packages/textblob/np_extractors.py
Normal file
@@ -0,0 +1,19 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Default noun phrase extractors are for English to maintain backwards
|
||||
compatibility, so you can still do
|
||||
|
||||
>>> from textblob.np_extractors import ConllExtractor
|
||||
|
||||
which is equivalent to
|
||||
|
||||
>>> from textblob.en.np_extractors import ConllExtractor
|
||||
"""
|
||||
from __future__ import absolute_import
|
||||
from textblob.base import BaseNPExtractor
|
||||
from textblob.en.np_extractors import ConllExtractor, FastNPExtractor
|
||||
|
||||
__all__ = [
|
||||
'BaseNPExtractor',
|
||||
'ConllExtractor',
|
||||
'FastNPExtractor',
|
||||
]
|
||||
17
backend/venv/Lib/site-packages/textblob/parsers.py
Normal file
17
backend/venv/Lib/site-packages/textblob/parsers.py
Normal file
@@ -0,0 +1,17 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
'''Default parsers to English for backwards compatibility so you can still do
|
||||
|
||||
>>> from textblob.parsers import PatternParser
|
||||
|
||||
which is equivalent to
|
||||
|
||||
>>> from textblob.en.parsers import PatternParser
|
||||
'''
|
||||
from __future__ import absolute_import
|
||||
from textblob.base import BaseParser
|
||||
from textblob.en.parsers import PatternParser
|
||||
|
||||
__all__ = [
|
||||
'BaseParser',
|
||||
'PatternParser',
|
||||
]
|
||||
22
backend/venv/Lib/site-packages/textblob/sentiments.py
Normal file
22
backend/venv/Lib/site-packages/textblob/sentiments.py
Normal file
@@ -0,0 +1,22 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
'''Default sentiment analyzers are English for backwards compatibility, so
|
||||
you can still do
|
||||
|
||||
>>> from textblob.sentiments import PatternAnalyzer
|
||||
|
||||
which is equivalent to
|
||||
|
||||
>>> from textblob.en.sentiments import PatternAnalyzer
|
||||
'''
|
||||
from __future__ import absolute_import
|
||||
from textblob.base import BaseSentimentAnalyzer
|
||||
from textblob.en.sentiments import (DISCRETE, CONTINUOUS,
|
||||
PatternAnalyzer, NaiveBayesAnalyzer)
|
||||
|
||||
__all__ = [
|
||||
'BaseSentimentAnalyzer',
|
||||
'DISCRETE',
|
||||
'CONTINUOUS',
|
||||
'PatternAnalyzer',
|
||||
'NaiveBayesAnalyzer',
|
||||
]
|
||||
18
backend/venv/Lib/site-packages/textblob/taggers.py
Normal file
18
backend/venv/Lib/site-packages/textblob/taggers.py
Normal file
@@ -0,0 +1,18 @@
|
||||
'''Default taggers to the English taggers for backwards incompatibility, so you
|
||||
can still do
|
||||
|
||||
>>> from textblob.taggers import NLTKTagger
|
||||
|
||||
which is equivalent to
|
||||
|
||||
>>> from textblob.en.taggers import NLTKTagger
|
||||
'''
|
||||
from __future__ import absolute_import
|
||||
from textblob.base import BaseTagger
|
||||
from textblob.en.taggers import PatternTagger, NLTKTagger
|
||||
|
||||
__all__ = [
|
||||
'BaseTagger',
|
||||
'PatternTagger',
|
||||
'NLTKTagger',
|
||||
]
|
||||
74
backend/venv/Lib/site-packages/textblob/tokenizers.py
Normal file
74
backend/venv/Lib/site-packages/textblob/tokenizers.py
Normal file
@@ -0,0 +1,74 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
'''Various tokenizer implementations.
|
||||
|
||||
.. versionadded:: 0.4.0
|
||||
'''
|
||||
from __future__ import absolute_import
|
||||
from itertools import chain
|
||||
|
||||
import nltk
|
||||
|
||||
from textblob.utils import strip_punc
|
||||
from textblob.base import BaseTokenizer
|
||||
from textblob.decorators import requires_nltk_corpus
|
||||
|
||||
|
||||
class WordTokenizer(BaseTokenizer):
    """Word tokenizer delegating to NLTK's recommended tokenizer (currently
    the TreeBankTokenizer). Uses regular expressions to tokenize text and
    assumes the text has already been segmented into sentences.

    Performs the following steps:

    * split standard contractions, e.g. don't -> do n't
    * split commas and single quotes
    * separate periods that appear at the end of line
    """

    def tokenize(self, text, include_punc=True):
        '''Return a list of word tokens.

        :param text: string of text.
        :param include_punc: (optional) whether to include punctuation as separate tokens. Default to True.
        '''
        tokens = nltk.tokenize.word_tokenize(text)
        if include_punc:
            return tokens
        # Strip punctuation from each token unless it comes from a
        # contraction, e.g. "Let's" => ["Let", "'s"], "Can't" => ["Ca", "n't"],
        # "home." => ["home"]; tokens that strip to nothing are dropped.
        words = []
        for token in tokens:
            if not strip_punc(token, all=False):
                continue
            words.append(token if token.startswith("'") else strip_punc(token, all=False))
        return words
|
||||
|
||||
|
||||
class SentenceTokenizer(BaseTokenizer):
    """Sentence tokenizer backed by NLTK's PunktSentenceTokenizer.

    Punkt uses an unsupervised model of abbreviation words, collocations,
    and sentence-starting words to locate sentence boundaries.
    """

    @requires_nltk_corpus
    def tokenize(self, text):
        '''Return a list of sentences.'''
        return nltk.tokenize.sent_tokenize(text)
|
||||
|
||||
|
||||
#: Convenience function for tokenizing sentences
sent_tokenize = SentenceTokenizer().itokenize

_word_tokenizer = WordTokenizer()  # Singleton word tokenizer


def word_tokenize(text, include_punc=True, *args, **kwargs):
    """Convenience function for tokenizing text into words.

    NOTE: NLTK's word tokenizer expects sentences as input, so the text will be
    tokenized to sentences before being tokenized to words.

    Returns a lazy iterator over the words of all sentences.
    """
    return chain.from_iterable(
        _word_tokenizer.itokenize(sentence, include_punc=include_punc,
                                  *args, **kwargs)
        for sentence in sent_tokenize(text))
|
||||
149
backend/venv/Lib/site-packages/textblob/translate.py
Normal file
149
backend/venv/Lib/site-packages/textblob/translate.py
Normal file
@@ -0,0 +1,149 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Translator module that uses the Google Translate API.
|
||||
|
||||
Adapted from Terry Yin's google-translate-python.
|
||||
Language detection added by Steven Loria.
|
||||
"""
|
||||
from __future__ import absolute_import
|
||||
|
||||
import codecs
|
||||
import json
|
||||
import re
|
||||
|
||||
from textblob.compat import PY2, request, urlencode
|
||||
from textblob.exceptions import TranslatorError, NotTranslated
|
||||
|
||||
|
||||
class Translator(object):

    """A language translator and detector that calls the unofficial
    Google Translate web API.

    Usage:
    ::
        >>> from textblob.translate import Translator
        >>> t = Translator()
        >>> t.translate('hello', from_lang='en', to_lang='fr')
        u'bonjour'
        >>> t.detect("hola")
        u'es'
    """

    #: Base endpoint; language parameters and the ``tk`` token are appended
    #: per request.
    url = "http://translate.google.com/translate_a/t?client=webapp&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&dt=at&ie=UTF-8&oe=UTF-8&otf=2&ssel=0&tsel=0&kc=1"

    #: Browser-like request headers sent with every call.
    headers = {
        'Accept': '*/*',
        'Connection': 'keep-alive',
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) '
            'AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19')
    }

    def translate(self, source, from_lang='auto', to_lang='en', host=None, type_=None):
        """Translate the source text from one language to another.

        :param source: Text to translate.
        :param from_lang: Source language code; 'auto' lets the API detect it.
        :param to_lang: Target language code.
        :param host: (optional) Proxy host.
        :param type_: (optional) Proxy type.
        :raises NotTranslated: If the API returns an empty or unchanged result.
        """
        if PY2:
            source = source.encode('utf-8')
        data = {"q": source}
        url = u'{url}&sl={from_lang}&tl={to_lang}&hl={to_lang}&tk={tk}&client={client}'.format(
            url=self.url,
            from_lang=from_lang,
            to_lang=to_lang,
            tk=_calculate_tk(source),
            client="te",
        )
        response = self._request(url, host=host, type_=type_, data=data)
        result = json.loads(response)
        if isinstance(result, list):
            try:
                result = result[0]  # ignore detected language
            except IndexError:
                pass
        self._validate_translation(source, result)
        return result

    def detect(self, source, host=None, type_=None):
        """Detect the source text's language.

        :raises TranslatorError: If ``source`` has fewer than 3 characters.
        """
        if PY2:
            source = source.encode('utf-8')
        if len(source) < 3:
            raise TranslatorError('Must provide a string with at least 3 characters.')
        data = {"q": source}
        url = u'{url}&sl=auto&tk={tk}&client={client}'.format(
            url=self.url,
            tk=_calculate_tk(source),
            client="te",
        )
        response = self._request(url, host=host, type_=type_, data=data)
        result, language = json.loads(response)
        return language

    def _validate_translation(self, source, result):
        """Validate API returned expected schema, and that the translated text
        is different than the original string.

        :raises NotTranslated: On an empty or unchanged result.
        """
        if not result:
            # Fixed typo in the error message ("and empty" -> "an empty").
            raise NotTranslated('Translation API returned an empty response.')
        if PY2:
            result = result.encode('utf-8')
        if result.strip() == source.strip():
            raise NotTranslated('Translation API returned the input string unchanged.')

    def _request(self, url, host=None, type_=None, data=None):
        """POST ``data`` to ``url`` (optionally through a proxy) and return
        the decoded response body.
        """
        encoded_data = urlencode(data).encode('utf-8')
        req = request.Request(url=url, headers=self.headers, data=encoded_data)
        if host or type_:
            req.set_proxy(host=host, type=type_)
        resp = request.urlopen(req)
        content = resp.read()
        return content.decode('utf-8')
|
||||
|
||||
|
||||
def _unescape(text):
|
||||
"""Unescape unicode character codes within a string.
|
||||
"""
|
||||
pattern = r'\\{1,2}u[0-9a-fA-F]{4}'
|
||||
return re.sub(pattern, lambda x: codecs.getdecoder('unicode_escape')(x.group())[0], text)
|
||||
|
||||
|
||||
def _calculate_tk(source):
    """Reverse engineered cross-site request protection.

    Computes the ``tk`` token Google Translate expects, derived from the
    UTF-8 bytes of ``source``. Returns a string of the form "a.b".
    """
    # Source: https://github.com/soimort/translate-shell/issues/94#issuecomment-165433715
    # Source: http://www.liuxiatool.com/t.php

    def c_int(x, nbits=32):
        """ C cast to int32, int16, int8... """
        return (x & ((1 << (nbits - 1)) - 1)) - (x & (1 << (nbits - 1)))

    def c_uint(x, nbits=32):
        """ C cast to uint32, uint16, uint8... """
        return x & ((1 << nbits) - 1)

    tkk = [406398, 561666268 + 1526272306]
    b = tkk[0]

    # Iterate over the byte values of the input (ordinals on Python 2,
    # a bytes object -- which yields ints -- on Python 3).
    if PY2:
        d = map(ord, source)
    else:
        d = source.encode('utf-8')

    def RL(a, b):
        # Apply the obfuscated shift/xor schedule encoded in string ``b``:
        # each triple of characters selects (add-or-xor, shift direction,
        # shift amount) for one round.
        for c in range(0, len(b) - 2, 3):
            d = b[c + 2]
            d = ord(d) - 87 if d >= 'a' else int(d)
            xa = c_uint(a)
            d = xa >> d if b[c + 1] == '+' else xa << d
            a = a + d & 4294967295 if b[c] == '+' else a ^ d
        return c_int(a)

    a = b

    # Fold every input byte into the accumulator.
    for di in d:
        a = RL(a + di, "+-a^+6")

    a = RL(a, "+-3^+b+-f")
    a ^= tkk[1]
    # Normalize to a non-negative 32-bit value, then reduce modulo 10^6.
    a = a if a >= 0 else ((a & 2147483647) + 2147483648)
    a %= pow(10, 6)

    tk = '{0:d}.{1:d}'.format(a, a ^ b)
    return tk
|
||||
194
backend/venv/Lib/site-packages/textblob/unicodecsv/__init__.py
Normal file
194
backend/venv/Lib/site-packages/textblob/unicodecsv/__init__.py
Normal file
@@ -0,0 +1,194 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import csv
|
||||
from textblob.compat import izip
|
||||
|
||||
#http://semver.org/
|
||||
VERSION = (0, 9, 4)
|
||||
__version__ = ".".join(map(str,VERSION))
|
||||
|
||||
pass_throughs = [
|
||||
'register_dialect',
|
||||
'unregister_dialect',
|
||||
'get_dialect',
|
||||
'list_dialects',
|
||||
'field_size_limit',
|
||||
'Dialect',
|
||||
'excel',
|
||||
'excel_tab',
|
||||
'Sniffer',
|
||||
'QUOTE_ALL',
|
||||
'QUOTE_MINIMAL',
|
||||
'QUOTE_NONNUMERIC',
|
||||
'QUOTE_NONE',
|
||||
'Error'
|
||||
]
|
||||
__all__ = [
|
||||
'reader',
|
||||
'writer',
|
||||
'DictReader',
|
||||
'DictWriter',
|
||||
] + pass_throughs
|
||||
|
||||
for prop in pass_throughs:
|
||||
globals()[prop]=getattr(csv, prop)
|
||||
|
||||
def _stringify(s, encoding, errors):
    # Encode a single cell for the byte-oriented csv writer.
    # NOTE(review): Python 2-only -- relies on the `unicode` builtin, which
    # does not exist on Python 3.
    if s is None:
        # None is written as an empty field.
        return ''
    if isinstance(s, unicode):
        return s.encode(encoding, errors)
    elif isinstance(s, (int , float)):
        pass #let csv.QUOTE_NONNUMERIC do its thing.
    elif not isinstance(s, str):
        s=str(s)
    return s
|
||||
|
||||
def _stringify_list(l, encoding, errors='strict'):
    """Encode every element of iterable *l* via ``_stringify``.

    A non-iterable argument is reported as a ``csv.Error``.
    """
    try:
        return [_stringify(element, encoding, errors) for element in iter(l)]
    except TypeError as e:
        raise csv.Error(str(e))
|
||||
|
||||
def _unicodify(s, encoding):
    # Decode a single parsed cell back to unicode.
    # NOTE(review): Python 2-only -- uses the `unicode` type and str.decode.
    if s is None:
        return None
    if isinstance(s, (unicode, int, float)):
        # Already unicode, or numeric (left as-is for QUOTE_NONNUMERIC).
        return s
    elif isinstance(s, str):
        return s.decode(encoding)
    return s
|
||||
|
||||
class UnicodeWriter(object):
    """CSV writer that encodes unicode rows to bytes before delegating to
    ``csv.writer``.

    NOTE(review): Python 2-oriented (cStringIO doctest, ``r.next()``).

    >>> import unicodecsv
    >>> from cStringIO import StringIO
    >>> f = StringIO()
    >>> w = unicodecsv.writer(f, encoding='utf-8')
    >>> w.writerow((u'é', u'ñ'))
    >>> f.seek(0)
    >>> r = unicodecsv.reader(f, encoding='utf-8')
    >>> row = r.next()
    >>> row[0] == u'é'
    True
    >>> row[1] == u'ñ'
    True
    """
    def __init__(self, f, dialect=csv.excel, encoding='utf-8', errors='strict',
                 *args, **kwds):
        # The wrapped stdlib writer receives already-encoded byte strings.
        self.encoding = encoding
        self.writer = csv.writer(f, dialect, *args, **kwds)
        self.encoding_errors = errors

    def writerow(self, row):
        # Encode every cell with the configured codec, then write.
        self.writer.writerow(_stringify_list(row, self.encoding, self.encoding_errors))

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)

    @property
    def dialect(self):
        # Expose the wrapped writer's dialect for csv API parity.
        return self.writer.dialect
|
||||
writer = UnicodeWriter
|
||||
|
||||
class UnicodeReader(object):
    """CSV reader that decodes byte rows to unicode with a configurable codec.

    NOTE(review): Python 2-only -- uses the ``unicode`` builtin and the
    ``.next()`` iterator protocol removed in Python 3.
    """
    def __init__(self, f, dialect=None, encoding='utf-8', errors='strict',
                 **kwds):
        # csv format parameters whose presence means the caller is describing
        # a dialect inline; only default to excel when none are given.
        format_params = ['delimiter', 'doublequote', 'escapechar', 'lineterminator', 'quotechar', 'quoting', 'skipinitialspace']
        if dialect is None:
            if not any([kwd_name in format_params for kwd_name in kwds.keys()]):
                dialect = csv.excel
        self.reader = csv.reader(f, dialect, **kwds)
        self.encoding = encoding
        self.encoding_errors = errors

    def next(self):
        """Return the next row with every text cell decoded to unicode."""
        row = self.reader.next()
        # Hoist attribute/builtin lookups to locals for the per-cell loop.
        encoding = self.encoding
        encoding_errors = self.encoding_errors
        float_ = float
        unicode_ = unicode
        return [(value if isinstance(value, float_) else
                 unicode_(value, encoding, encoding_errors)) for value in row]

    def __iter__(self):
        return self

    @property
    def dialect(self):
        return self.reader.dialect

    @property
    def line_num(self):
        return self.reader.line_num
|
||||
reader = UnicodeReader
|
||||
|
||||
class DictWriter(csv.DictWriter):
    """Unicode-aware ``csv.DictWriter`` that routes rows through
    :class:`UnicodeWriter` for encoding.

    NOTE(review): Python 2-oriented doctest below.

    >>> from cStringIO import StringIO
    >>> f = StringIO()
    >>> w = DictWriter(f, ['a', u'ñ', 'b'], restval=u'î')
    >>> w.writerow({'a':'1', u'ñ':'2'})
    >>> w.writerow({'a':'1', u'ñ':'2', 'b':u'ø'})
    >>> w.writerow({'a':u'é', u'ñ':'2'})
    >>> f.seek(0)
    >>> r = DictReader(f, fieldnames=['a', u'ñ'], restkey='r')
    >>> r.next() == {'a': u'1', u'ñ':'2', 'r': [u'î']}
    True
    >>> r.next() == {'a': u'1', u'ñ':'2', 'r': [u'\xc3\xb8']}
    True
    >>> r.next() == {'a': u'\xc3\xa9', u'ñ':'2', 'r': [u'\xc3\xae']}
    True
    """
    def __init__(self, csvfile, fieldnames, restval='', extrasaction='raise', dialect='excel', encoding='utf-8', errors='strict', *args, **kwds):
        self.encoding = encoding
        csv.DictWriter.__init__(self, csvfile, fieldnames, restval, extrasaction, dialect, *args, **kwds)
        # Replace the plain writer with the encoding-aware one.
        self.writer = UnicodeWriter(csvfile, dialect, encoding=encoding, errors=errors, *args, **kwds)
        self.encoding_errors = errors

    def writeheader(self):
        """Write the field names as the first row."""
        # The header goes through self.writerow -> UnicodeWriter, which
        # encodes each cell itself; a previously computed (but never used)
        # stringified copy of self.fieldnames has been removed.
        header = dict(zip(self.fieldnames, self.fieldnames))
        self.writerow(header)
|
||||
|
||||
class DictReader(csv.DictReader):
    """Dict-based CSV reader that decodes cells to unicode.

    Wraps ``csv.DictReader`` with a ``UnicodeReader`` so rows come back
    keyed by unicode field names with decoded values. Python 2 only
    (``izip``, ``.next()``, ``cStringIO`` in the doctest).

    >>> from cStringIO import StringIO
    >>> f = StringIO()
    >>> w = DictWriter(f, fieldnames=['name', 'place'])
    >>> w.writerow({'name': 'Cary Grant', 'place': 'hollywood'})
    >>> w.writerow({'name': 'Nathan Brillstone', 'place': u'øLand'})
    >>> w.writerow({'name': u'Willam ø. Unicoder', 'place': u'éSpandland'})
    >>> f.seek(0)
    >>> r = DictReader(f, fieldnames=['name', 'place'])
    >>> print r.next() == {'name': 'Cary Grant', 'place': 'hollywood'}
    True
    >>> print r.next() == {'name': 'Nathan Brillstone', 'place': u'øLand'}
    True
    >>> print r.next() == {'name': u'Willam ø. Unicoder', 'place': u'éSpandland'}
    True
    """
    def __init__(self, csvfile, fieldnames=None, restkey=None, restval=None,
                 dialect='excel', encoding='utf-8', errors='strict', *args,
                 **kwds):
        # Byte-encode the caller-supplied field names so they match what
        # the underlying (byte-oriented) csv machinery produces.
        if fieldnames is not None:
            fieldnames = _stringify_list(fieldnames, encoding)
        csv.DictReader.__init__(self, csvfile, fieldnames, restkey, restval, dialect, *args, **kwds)
        # Replace the plain reader with a decoding one; inherited next()
        # pulls rows through it.
        self.reader = UnicodeReader(csvfile, dialect, encoding=encoding,
                                    errors=errors, *args, **kwds)
        if fieldnames is None and not hasattr(csv.DictReader, 'fieldnames'):
            # Python 2.5 fieldnames workaround. (http://bugs.python.org/issue3436)
            reader = UnicodeReader(csvfile, dialect, encoding=encoding, *args, **kwds)
            self.fieldnames = _stringify_list(reader.next(), reader.encoding)
        # Parallel unicode versions of the byte field names / restkey, used
        # in next() to re-key each row.
        self.unicode_fieldnames = [_unicodify(f, encoding) for f in
                                   self.fieldnames]
        self.unicode_restkey = _unicodify(restkey, encoding)

    def next(self):
        """Return the next row as a dict keyed by unicode field names."""
        row = csv.DictReader.next(self)
        # Re-key: the parent returns byte-string keys; map them 1:1 onto
        # their unicode counterparts.
        result = dict((uni_key, row[str_key]) for (str_key, uni_key) in
                      izip(self.fieldnames, self.unicode_fieldnames))
        # Extra (unnamed) columns land under restkey; carry them over
        # under the unicode restkey when present and non-empty.
        rest = row.get(self.restkey)
        if rest:
            result[self.unicode_restkey] = rest
        return result
|
||||
56
backend/venv/Lib/site-packages/textblob/utils.py
Normal file
56
backend/venv/Lib/site-packages/textblob/utils.py
Normal file
@@ -0,0 +1,56 @@
|
||||
# -*- coding: utf-8 -*-
"""Small text-processing utilities used across textblob."""
import re
import string
||||
|
||||
# Character class matching any single ASCII punctuation character.
PUNCTUATION_REGEX = re.compile('[{0}]'.format(re.escape(string.punctuation)))


def strip_punc(s, all=False):
    """Remove punctuation from a string.

    :param s: The string.
    :param all: If True, remove every punctuation character; otherwise only
        strip punctuation from the ends of the string.
    """
    trimmed = s.strip()
    if not all:
        return trimmed.strip(string.punctuation)
    return PUNCTUATION_REGEX.sub('', trimmed)
|
||||
|
||||
|
||||
def lowerstrip(s, all=False):
    """Lowercase *s* and strip punctuation and surrounding whitespace.

    :param s: The string.
    :param all: Remove all punctuation. If False, only removes punctuation
        from the ends of the string.
    """
    normalized = s.lower().strip()
    return strip_punc(normalized, all=all)
|
||||
|
||||
|
||||
def tree2str(tree, concat=' '):
    """Convert a nltk.tree.Tree to a string.

    For example:
    (NP a/DT beautiful/JJ new/JJ dashboard/NN) -> "a beautiful dashboard"

    :param tree: An iterable of ``(word, tag)`` pairs.
    :param concat: Separator placed between consecutive words.
    """
    words = (word for word, _tag in tree)
    return concat.join(words)
|
||||
|
||||
|
||||
def filter_insignificant(chunk, tag_suffixes=('DT', 'CC', 'PRP$', 'PRP')):
    """Filter out insignificant (word, tag) tuples from a chunk of text.

    :param chunk: An iterable of ``(word, tag)`` pairs.
    :param tag_suffixes: Pairs whose tag ends with any of these suffixes
        are dropped.
    :return: A list of the remaining ``(word, tag)`` pairs, in order.
    """
    # str.endswith accepts a tuple of suffixes, replacing the original
    # manual flag-and-break loop with one C-level check per tag. The
    # tuple conversion is hoisted out of the loop.
    suffixes = tuple(tag_suffixes)
    return [(word, tag) for word, tag in chunk if not tag.endswith(suffixes)]
|
||||
|
||||
|
||||
def is_filelike(obj):
    """Return whether ``obj`` is a file-like object (has a ``read`` attribute)."""
    try:
        obj.read
    except AttributeError:
        return False
    return True
|
||||
17
backend/venv/Lib/site-packages/textblob/wordnet.py
Normal file
17
backend/venv/Lib/site-packages/textblob/wordnet.py
Normal file
@@ -0,0 +1,17 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Wordnet interface. Contains classes for creating Synsets and Lemmas
|
||||
directly.
|
||||
|
||||
.. versionadded:: 0.7.0
|
||||
|
||||
"""
|
||||
import nltk

#: wordnet module from nltk
wordnet = nltk.corpus.wordnet
#: Synset constructor
Synset = nltk.corpus.wordnet.synset
#: Lemma constructor
Lemma = nltk.corpus.wordnet.lemma
# Part-of-speech constants re-exported from nltk's wordnet corpus reader.
VERB, NOUN, ADJ, ADV = wordnet.VERB, wordnet.NOUN, wordnet.ADJ, wordnet.ADV
|
||||
Reference in New Issue
Block a user