687 lines
33 KiB
Python
687 lines
33 KiB
Python
# coding: utf-8
|
|
# Author: C.J. Hutto
|
|
# Thanks to George Berry for reducing the time complexity from something like O(N^4) to O(N).
|
|
# Thanks to Ewan Klein and Pierpaolo Pantone for bringing VADER into NLTK. Those modifications were awesome.
|
|
# For license information, see LICENSE.TXT
|
|
|
|
"""
|
|
If you use the VADER sentiment analysis tools, please cite:
|
|
Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for
|
|
Sentiment Analysis of Social Media Text. Eighth International Conference on
|
|
Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
|
|
"""
|
|
import os
|
|
import re
|
|
import math
|
|
import string
|
|
import codecs
|
|
import json
|
|
from itertools import product
|
|
from inspect import getsourcefile
|
|
from io import open
|
|
|
|
# ## Constants ##

# Booster / dampener words: empirically derived mean change in sentiment
# intensity (the dampener value is the exact negative of the booster value).
B_INCR = 0.293
B_DECR = -B_INCR

# ALLCAPS emphasis: empirically derived mean increase in sentiment intensity.
C_INCR = 0.733
# Multiplier applied to a valence when it is negated.
N_SCALAR = -0.74

# Tokens treated as negations (with and without the apostrophe).
NEGATE = [
    "aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
    "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
    "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
    "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
    "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere",
    "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
    "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
    "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite",
]

# Booster/dampener 'intensifiers' or 'degree adverbs'
# http://en.wiktionary.org/wiki/Category:English_degree_adverbs
# Intensifiers map to B_INCR, dampeners map to B_DECR.
BOOSTER_DICT = dict.fromkeys(
    (
        "absolutely", "amazingly", "awfully",
        "completely", "considerable", "considerably",
        "decidedly", "deeply", "effing", "enormous", "enormously",
        "entirely", "especially", "exceptional", "exceptionally",
        "extreme", "extremely",
        "fabulously", "flipping", "flippin", "frackin", "fracking",
        "fricking", "frickin", "frigging", "friggin", "fully",
        "fuckin", "fucking", "fuggin", "fugging",
        "greatly", "hella", "highly", "hugely",
        "incredible", "incredibly", "intensely",
        "major", "majorly", "more", "most", "particularly",
        "purely", "quite", "really", "remarkably",
        "so", "substantially",
        "thoroughly", "total", "totally", "tremendous", "tremendously",
        "uber", "unbelievably", "unusually", "utter", "utterly",
        "very",
    ),
    B_INCR,
)
BOOSTER_DICT.update(dict.fromkeys(
    (
        "almost", "barely", "hardly", "just enough",
        "kind of", "kinda", "kindof", "kind-of",
        "less", "little", "marginal", "marginally",
        "occasional", "occasionally", "partly",
        "scarce", "scarcely", "slight", "slightly", "somewhat",
        "sort of", "sorta", "sortof", "sort-of",
    ),
    B_DECR,
))

# Sentiment laden idioms that do not contain lexicon words
# (future work, not yet implemented).
SENTIMENT_LADEN_IDIOMS = {
    "cut the mustard": 2,
    "hand to mouth": -2,
    "back handed": -2,
    "blow smoke": -2,
    "blowing smoke": -2,
    "upper hand": 1,
    "break a leg": 2,
    "cooking with gas": 2,
    "in the black": 2,
    "in the red": -2,
    "on the ball": 2,
    "under the weather": -2,
}

# Special case idioms and phrases that do contain lexicon words; the mapped
# value replaces the valence of the lexicon word.
SPECIAL_CASES = {
    "the shit": 3,
    "the bomb": 3,
    "bad ass": 1.5,
    "badass": 1.5,
    "bus stop": 0.0,
    "yeah right": -2,
    "kiss of death": -1.5,
    "to die for": 3,
    "beating heart": 3.5,
}
|
|
|
|
|
|
# #Static methods# #
|
|
|
|
def negated(input_words, include_nt=True):
    """
    Report whether the given words contain a negation.

    Every item is converted with ``str()`` and lower-cased before matching.

    :param input_words: iterable of tokens to inspect
    :param bool include_nt: when true, any token containing ``n't`` also
        counts as a negation
    :returns: ``True`` if an entry of ``NEGATE`` occurs among the tokens (or
        the ``n't`` rule fires), ``False`` otherwise
    """
    lowered = [str(token).lower() for token in input_words]
    if any(neg_word in lowered for neg_word in NEGATE):
        return True
    if include_nt and any("n't" in token for token in lowered):
        return True
    return False
|
|
|
|
|
|
def normalize(score, alpha=15):
    """
    Squash ``score`` into the range [-1, 1].

    The value returned is ``score / sqrt(score^2 + alpha)``, where ``alpha``
    approximates the maximum expected value; the result is clamped to
    ``-1.0`` / ``1.0`` should it ever fall outside that range.
    """
    squashed = score / math.sqrt((score * score) + alpha)
    if squashed > 1.0:
        return 1.0
    if squashed < -1.0:
        return -1.0
    return squashed
|
|
|
|
|
|
def allcap_differential(words):
    """
    Tell whether only a part of the words is written in ALL CAPS.

    :param list words: The words to inspect
    :returns: `True` if at least one, but not every, item in `words` is
        ALL CAPS; `False` otherwise (including for an empty list)
    """
    shouted = sum(1 for token in words if token.isupper())
    return 0 < shouted < len(words)
|
|
|
|
|
|
def scalar_inc_dec(word, valence, is_cap_diff):
    """
    Compute how much a preceding word boosts or dampens ``valence``.

    :param str word: the preceding token (original casing)
    :param valence: valence of the sentiment word being modified
    :param is_cap_diff: truthy when only some words of the text are ALL CAPS
    :returns: ``0.0`` when ``word`` is not a key of ``BOOSTER_DICT``;
        otherwise its booster value, sign-flipped for a negative valence and
        pushed further by ``C_INCR`` when the booster itself is ALL CAPS
    """
    key = word.lower()
    if key not in BOOSTER_DICT:
        return 0.0

    adjustment = BOOSTER_DICT[key]
    if valence < 0:
        adjustment *= -1

    # A booster/dampener written in ALLCAPS (while other words aren't) gets
    # extra weight in the direction of the valence.
    if word.isupper() and is_cap_diff:
        adjustment = adjustment + C_INCR if valence > 0 else adjustment - C_INCR
    return adjustment
|
|
|
|
|
|
class SentiText(object):
    """
    Identify sentiment-relevant string-level properties of input text.

    Attributes set by the constructor:
      text                -- the input as a text string
      words_and_emoticons -- whitespace-separated tokens with surrounding
                             punctuation removed from longer tokens
      is_cap_diff         -- result of allcap_differential() on those tokens
    """

    def __init__(self, text):
        """
        :param text: the text to analyse; non-string input is converted to a
            string first (bytes are decoded as UTF-8)
        """
        self.text = self._coerce_text(text)
        # Tokens are split on whitespace only, so contractions and emoticons
        # stay in one piece.
        self.words_and_emoticons = self._words_and_emoticons()
        self.is_cap_diff = allcap_differential(self.words_and_emoticons)

    @staticmethod
    def _coerce_text(text):
        """
        Return ``text`` as a ``str``.

        Encoding non-string input to bytes (as was done previously) makes the
        later ``token.strip(string.punctuation)`` call fail with a TypeError,
        so the value is kept as text instead.

        :raises UnicodeDecodeError: if bytes input is not valid UTF-8
        """
        if isinstance(text, str):
            return text
        if isinstance(text, (bytes, bytearray)):
            return bytes(text).decode('utf-8')
        return str(text)

    @staticmethod
    def _strip_punc_if_word(token):
        """
        Removes all trailing and leading punctuation.

        If the resulting string has two or fewer characters, then it was
        likely an emoticon, so the original token is returned unchanged
        (i.e. ":)" stripped would be "", so just return ":)").
        """
        stripped = token.strip(string.punctuation)
        if len(stripped) <= 2:
            return token
        return stripped

    def _words_and_emoticons(self):
        """
        Split the text on whitespace and remove leading and trailing
        punctuation from each token.

        Contractions are left intact, and so is every token whose stripped
        form has two or fewer characters (e.g. ":)" or ":D"), because
        _strip_punc_if_word() returns such tokens unchanged.
        """
        wes = self.text.split()
        return [self._strip_punc_if_word(token) for token in wes]
|
|
|
|
class SentimentIntensityAnalyzer(object):
    """
    Give a sentiment intensity score to sentences.
    """

    def __init__(self, lexicon_file="vader_lexicon.txt", emoji_lexicon="emoji_utf8_lexicon.txt"):
        """
        Load the word lexicon and the emoji lexicon.

        Both file names are resolved relative to the directory containing
        this module's source file and are read as UTF-8.
        """
        _this_module_file_path_ = os.path.abspath(getsourcefile(lambda: 0))
        module_dir = os.path.dirname(_this_module_file_path_)

        lexicon_full_filepath = os.path.join(module_dir, lexicon_file)
        with codecs.open(lexicon_full_filepath, encoding='utf-8') as f:
            # NOTE: despite its name this attribute holds the file *contents*;
            # the name is kept for backward compatibility.
            self.lexicon_full_filepath = f.read()
        self.lexicon = self.make_lex_dict()

        emoji_full_filepath = os.path.join(module_dir, emoji_lexicon)
        with codecs.open(emoji_full_filepath, encoding='utf-8') as f:
            # Same naming caveat as above: this is the file contents.
            self.emoji_full_filepath = f.read()
        self.emojis = self.make_emoji_dict()

    def make_lex_dict(self):
        """
        Convert lexicon file to a dictionary

        Each non-blank line is tab separated; the first field is the token
        and the second its valence, which is stored as a float.
        """
        lex_dict = {}
        for line in self.lexicon_full_filepath.rstrip('\n').split('\n'):
            if not line.strip():
                continue
            (word, measure) = line.strip().split('\t')[0:2]
            lex_dict[word] = float(measure)
        return lex_dict

    def make_emoji_dict(self):
        """
        Convert emoji lexicon file to a dictionary

        Each non-blank line is tab separated; the first field is the emoji
        and the second its textual description. Blank lines are skipped,
        as in make_lex_dict().
        """
        emoji_dict = {}
        for line in self.emoji_full_filepath.rstrip('\n').split('\n'):
            if not line.strip():
                continue
            (emoji, description) = line.strip().split('\t')[0:2]
            emoji_dict[emoji] = description
        return emoji_dict

    def polarity_scores(self, text):
        """
        Score the sentiment of the input text.

        :returns: dict with the keys "neg", "neu", "pos" (rounded to three
            decimals) and "compound" (rounded to four decimals), as built by
            score_valence()
        """
        # convert emojis to their textual descriptions
        pieces = []
        prev_space = True
        for char in text:
            if char in self.emojis:
                # get the textual description, separated from preceding text
                description = self.emojis[char]
                if not prev_space:
                    pieces.append(' ')
                pieces.append(description)
                prev_space = False
            else:
                pieces.append(char)
                prev_space = char == ' '
        text = "".join(pieces).strip()

        sentitext = SentiText(text)

        sentiments = []
        words_and_emoticons = sentitext.words_and_emoticons
        for i, item in enumerate(words_and_emoticons):
            valence = 0
            # check for vader_lexicon words that may be used as modifiers or negations
            if item.lower() in BOOSTER_DICT:
                sentiments.append(valence)
                continue
            if (i < len(words_and_emoticons) - 1 and item.lower() == "kind" and
                    words_and_emoticons[i + 1].lower() == "of"):
                sentiments.append(valence)
                continue

            sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments)

        sentiments = self._but_check(words_and_emoticons, sentiments)

        valence_dict = self.score_valence(sentiments, text)

        return valence_dict

    def sentiment_valence(self, valence, sentitext, item, i, sentiments):
        """
        Compute the rule-adjusted valence of one token and append it.

        :param valence: starting valence, used unchanged when ``item`` is not
            in the lexicon
        :param sentitext: SentiText holding the tokens and the caps flag
        :param item: the token at position ``i``
        :param i: index of ``item`` in ``sentitext.words_and_emoticons``
        :param sentiments: list of valences so far; appended to in place
        :returns: the same ``sentiments`` list
        """
        is_cap_diff = sentitext.is_cap_diff
        words_and_emoticons = sentitext.words_and_emoticons
        item_lowercase = item.lower()
        if item_lowercase in self.lexicon:
            # get the sentiment valence
            valence = self.lexicon[item_lowercase]

            # check for "no" as negation for an adjacent lexicon item vs "no" as its own stand-alone lexicon item
            if item_lowercase == "no" and i != len(words_and_emoticons) - 1 \
                    and words_and_emoticons[i + 1].lower() in self.lexicon:
                # don't use valence of "no" as a lexicon item. Instead set its valence to 0.0 and negate the next item
                valence = 0.0
            if (i > 0 and words_and_emoticons[i - 1].lower() == "no") \
                    or (i > 1 and words_and_emoticons[i - 2].lower() == "no") \
                    or (i > 2 and words_and_emoticons[i - 3].lower() == "no"
                        and words_and_emoticons[i - 1].lower() in ["or", "nor"]):
                valence = self.lexicon[item_lowercase] * N_SCALAR

            # check if sentiment laden word is in ALL CAPS (while others aren't)
            if item.isupper() and is_cap_diff:
                if valence > 0:
                    valence += C_INCR
                else:
                    valence -= C_INCR

            for start_i in range(0, 3):
                # dampen the scalar modifier of preceding words and emoticons
                # (excluding the ones that immediately precede the item) based
                # on their distance from the current item.
                if i > start_i and words_and_emoticons[i - (start_i + 1)].lower() not in self.lexicon:
                    s = scalar_inc_dec(words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff)
                    if start_i == 1 and s != 0:
                        s = s * 0.95
                    if start_i == 2 and s != 0:
                        s = s * 0.9
                    valence = valence + s
                    valence = self._negation_check(valence, words_and_emoticons, start_i, i)
                    if start_i == 2:
                        valence = self._special_idioms_check(valence, words_and_emoticons, i)

            valence = self._least_check(valence, words_and_emoticons, i)
        sentiments.append(valence)
        return sentiments

    def _least_check(self, valence, words_and_emoticons, i):
        """
        Negate ``valence`` when the word is preceded by "least", unless that
        "least" is itself preceded by "at" or "very". The rule only applies
        while "least" is not a lexicon entry.
        """
        # check for negation case using "least"
        if i > 1 and words_and_emoticons[i - 1].lower() not in self.lexicon \
                and words_and_emoticons[i - 1].lower() == "least":
            if words_and_emoticons[i - 2].lower() != "at" and words_and_emoticons[i - 2].lower() != "very":
                valence = valence * N_SCALAR
        elif i > 0 and words_and_emoticons[i - 1].lower() not in self.lexicon \
                and words_and_emoticons[i - 1].lower() == "least":
            valence = valence * N_SCALAR
        return valence

    @staticmethod
    def _but_check(words_and_emoticons, sentiments):
        """
        Shift weight towards the text after the first contrastive 'but'.

        Valences positioned before the first "but" are halved and those
        after it are multiplied by 1.5; ``sentiments`` is modified in place
        and returned.

        Positions are taken from enumerate(): looking a value up with
        ``list.index`` finds the first *equal* value, which rescales the
        wrong entry whenever two tokens share the same valence.
        """
        # check for modification in sentiment due to contrastive conjunction 'but'
        words_and_emoticons_lower = [str(w).lower() for w in words_and_emoticons]
        if 'but' in words_and_emoticons_lower:
            bi = words_and_emoticons_lower.index('but')
            for si, sentiment in enumerate(sentiments):
                if si < bi:
                    sentiments[si] = sentiment * 0.5
                elif si > bi:
                    sentiments[si] = sentiment * 1.5
        return sentiments

    @staticmethod
    def _special_idioms_check(valence, words_and_emoticons, i):
        """
        Replace or adjust ``valence`` for idioms around position ``i``.

        N-grams ending at or just before ``i`` and n-grams starting at ``i``
        are looked up in SPECIAL_CASES (a hit replaces the valence); n-grams
        preceding ``i`` are then looked up in BOOSTER_DICT (a hit is added).
        The caller is expected to pass ``i >= 3``; smaller values make the
        negative offsets wrap around to the end of the list.
        """
        def word_at(index):
            # lower-case only the handful of tokens that are inspected
            return str(words_and_emoticons[index]).lower()

        onezero = "{0} {1}".format(word_at(i - 1), word_at(i))

        twoonezero = "{0} {1} {2}".format(word_at(i - 2), word_at(i - 1), word_at(i))

        twoone = "{0} {1}".format(word_at(i - 2), word_at(i - 1))

        threetwoone = "{0} {1} {2}".format(word_at(i - 3), word_at(i - 2), word_at(i - 1))

        threetwo = "{0} {1}".format(word_at(i - 3), word_at(i - 2))

        sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]

        for seq in sequences:
            if seq in SPECIAL_CASES:
                valence = SPECIAL_CASES[seq]
                break

        last_index = len(words_and_emoticons) - 1
        if last_index > i:
            zeroone = "{0} {1}".format(word_at(i), word_at(i + 1))
            if zeroone in SPECIAL_CASES:
                valence = SPECIAL_CASES[zeroone]
        if last_index > i + 1:
            zeroonetwo = "{0} {1} {2}".format(word_at(i), word_at(i + 1), word_at(i + 2))
            if zeroonetwo in SPECIAL_CASES:
                valence = SPECIAL_CASES[zeroonetwo]

        # check for booster/dampener bi-grams such as 'sort of' or 'kind of'
        n_grams = [threetwoone, threetwo, twoone]
        for n_gram in n_grams:
            if n_gram in BOOSTER_DICT:
                valence = valence + BOOSTER_DICT[n_gram]
        return valence

    @staticmethod
    def _sentiment_laden_idioms_check(valence, senti_text_lower):
        """
        Future work: idioms that don't contain a lexicon word.

        Returns the mean value of every SENTIMENT_LADEN_IDIOMS key found as a
        substring of ``senti_text_lower``, or ``valence`` unchanged when none
        is found.
        """
        idioms_valences = [
            SENTIMENT_LADEN_IDIOMS[idiom]
            for idiom in SENTIMENT_LADEN_IDIOMS
            if idiom in senti_text_lower
        ]
        if len(idioms_valences) > 0:
            valence = sum(idioms_valences) / float(len(idioms_valences))
        return valence

    @staticmethod
    def _negation_check(valence, words_and_emoticons, start_i, i):
        """
        Apply negation rules for the word ``start_i + 1`` places before ``i``.

        * "never so/this <word>" style phrases multiply the valence by 1.25
        * "without (a) doubt" leaves the valence unchanged
        * any other negation (see negated()) multiplies it by N_SCALAR

        The caller must guarantee ``i > start_i`` so the offsets stay inside
        the list.
        """
        def word_at(index):
            # lower-case only the handful of tokens that are inspected
            return str(words_and_emoticons[index]).lower()

        if start_i == 0:
            if negated([word_at(i - 1)]):  # 1 word preceding lexicon word (w/o stopwords)
                valence = valence * N_SCALAR
        elif start_i == 1:
            if word_at(i - 2) == "never" and word_at(i - 1) in ("so", "this"):
                valence = valence * 1.25
            elif word_at(i - 2) == "without" and word_at(i - 1) == "doubt":
                pass  # "without doubt" is not a negation
            elif negated([word_at(i - 2)]):  # 2 words preceding the lexicon word position
                valence = valence * N_SCALAR
        elif start_i == 2:
            # "never" must be present for the boost; without the parentheses
            # `and` binds tighter than `or`, and a bare "so"/"this" right
            # before the word would trigger the boost by itself.
            if word_at(i - 3) == "never" and (
                    word_at(i - 2) in ("so", "this") or word_at(i - 1) in ("so", "this")):
                valence = valence * 1.25
            elif word_at(i - 3) == "without" and (
                    word_at(i - 2) == "doubt" or word_at(i - 1) == "doubt"):
                pass  # "without (a) doubt" is not a negation
            elif negated([word_at(i - 3)]):  # 3 words preceding the lexicon word position
                valence = valence * N_SCALAR
        return valence

    def _punctuation_emphasis(self, text):
        """Return the combined emphasis from exclamation points and question marks."""
        # add emphasis from exclamation points and question marks
        ep_amplifier = self._amplify_ep(text)
        qm_amplifier = self._amplify_qm(text)
        punct_emph_amplifier = ep_amplifier + qm_amplifier
        return punct_emph_amplifier

    @staticmethod
    def _amplify_ep(text):
        """Return 0.292 per exclamation point in ``text``, counting at most four."""
        # check for added emphasis resulting from exclamation points (up to 4 of them)
        ep_count = min(text.count("!"), 4)
        # (empirically derived mean sentiment intensity rating increase for
        # exclamation points)
        ep_amplifier = ep_count * 0.292
        return ep_amplifier

    @staticmethod
    def _amplify_qm(text):
        """
        Return the emphasis from question marks: 0 for fewer than two,
        0.18 each for two or three, and a flat 0.96 for more.
        """
        # check for added emphasis resulting from question marks (2 or 3+)
        qm_count = text.count("?")
        qm_amplifier = 0
        if qm_count > 1:
            if qm_count <= 3:
                # (empirically derived mean sentiment intensity rating increase for
                # question marks)
                qm_amplifier = qm_count * 0.18
            else:
                qm_amplifier = 0.96
        return qm_amplifier

    @staticmethod
    def _sift_sentiment_scores(sentiments):
        """
        Split valences into positive sum, negative sum and neutral count.

        :returns: ``(pos_sum, neg_sum, neu_count)``; every non-zero valence
            contributes one extra unit of magnitude so that it weighs in on
            the same scale as a neutral word, which counts as 1
        """
        # want separate positive versus negative sentiment scores
        pos_sum = 0.0
        neg_sum = 0.0
        neu_count = 0
        for sentiment_score in sentiments:
            if sentiment_score > 0:
                pos_sum += (float(sentiment_score) + 1)  # compensates for neutral words that are counted as 1
            if sentiment_score < 0:
                neg_sum += (float(sentiment_score) - 1)  # when used with math.fabs(), compensates for neutrals
            if sentiment_score == 0:
                neu_count += 1
        return pos_sum, neg_sum, neu_count

    def score_valence(self, sentiments, text):
        """
        Turn per-token valences into the final score dictionary.

        :param sentiments: list of valences, one per token
        :param text: the analysed text, used for punctuation emphasis
        :returns: dict with "neg", "neu", "pos" (proportions, rounded to
            three decimals) and "compound" (normalized sum, rounded to four
            decimals); all zeros when ``sentiments`` is empty
        """
        if sentiments:
            sum_s = float(sum(sentiments))
            # compute and add emphasis from punctuation in text
            punct_emph_amplifier = self._punctuation_emphasis(text)
            if sum_s > 0:
                sum_s += punct_emph_amplifier
            elif sum_s < 0:
                sum_s -= punct_emph_amplifier

            compound = normalize(sum_s)
            # discriminate between positive, negative and neutral sentiment scores
            pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)

            if pos_sum > math.fabs(neg_sum):
                pos_sum += punct_emph_amplifier
            elif pos_sum < math.fabs(neg_sum):
                neg_sum -= punct_emph_amplifier

            total = pos_sum + math.fabs(neg_sum) + neu_count
            pos = math.fabs(pos_sum / total)
            neg = math.fabs(neg_sum / total)
            neu = math.fabs(neu_count / total)

        else:
            compound = 0.0
            pos = 0.0
            neg = 0.0
            neu = 0.0

        sentiment_dict = \
            {"neg": round(neg, 3),
             "neu": round(neu, 3),
             "pos": round(pos, 3),
             "compound": round(compound, 4)}

        return sentiment_dict
|
|
|
|
|
|
if __name__ == '__main__':
    # Demo script: prints VADER scores for a series of example inputs.
    # --- examples -------
    sentences = ["VADER is smart, handsome, and funny.",  # positive sentence example
                 "VADER is smart, handsome, and funny!",
                 # punctuation emphasis handled correctly (sentiment intensity adjusted)
                 "VADER is very smart, handsome, and funny.",
                 # booster words handled correctly (sentiment intensity adjusted)
                 "VADER is VERY SMART, handsome, and FUNNY.",  # emphasis for ALLCAPS handled
                 "VADER is VERY SMART, handsome, and FUNNY!!!",
                 # combination of signals - VADER appropriately adjusts intensity
                 "VADER is VERY SMART, uber handsome, and FRIGGIN FUNNY!!!",
                 # booster words & punctuation make this close to ceiling for score
                 "VADER is not smart, handsome, nor funny.",  # negation sentence example
                 "The book was good.",  # positive sentence
                 "At least it isn't a horrible book.",  # negated negative sentence with contraction
                 "The book was only kind of good.",
                 # qualified positive sentence is handled correctly (intensity adjusted)
                 "The plot was good, but the characters are uncompelling and the dialog is not great.",
                 # mixed negation sentence
                 "Today SUX!",  # negative slang with capitalization emphasis
                 "Today only kinda sux! But I'll get by, lol",
                 # mixed sentiment example with slang and contrastive conjunction "but"
                 "Make sure you :) or :D today!",  # emoticons handled
                 "Catch utf-8 emoji such as 💘 and 💋 and 😁",  # emojis handled
                 "Not bad at all"  # Capitalized negation
                 ]

    analyzer = SentimentIntensityAnalyzer()

    print("----------------------------------------------------")
    print(" - Analyze typical example cases, including handling of:")
    print(" -- negations")
    print(" -- punctuation emphasis & punctuation flooding")
    print(" -- word-shape as emphasis (capitalization difference)")
    print(" -- degree modifiers (intensifiers such as 'very' and dampeners such as 'kind of')")
    print(" -- slang words as modifiers such as 'uber' or 'friggin' or 'kinda'")
    print(" -- contrastive conjunction 'but' indicating a shift in sentiment; sentiment of later text is dominant")
    print(" -- use of contractions as negations")
    print(" -- sentiment laden emoticons such as :) and :D")
    print(" -- utf-8 encoded emojis such as 💘 and 💋 and 😁")
    print(" -- sentiment laden slang words (e.g., 'sux')")
    print(" -- sentiment laden initialisms and acronyms (for example: 'lol') \n")
    for sentence in sentences:
        vs = analyzer.polarity_scores(sentence)
        print("{:-<65} {}".format(sentence, str(vs)))
    print("----------------------------------------------------")
    print(" - About the scoring: ")
    print(" -- The 'compound' score is computed by summing the valence scores of each word in the lexicon, adjusted\n"
          "according to the rules, and then normalized to be between -1 (most extreme negative) and +1 (most extreme positive).\n"
          "This is the most useful metric if you want a single unidimensional measure of sentiment for a given sentence.\n"
          "Calling it a 'normalized, weighted composite score' is accurate.")
    print(" -- The 'pos', 'neu', and 'neg' scores are ratios for proportions of text that fall in each category (so these\n"
          "should all add up to be 1... or close to it with float operation). These are the most useful metrics if\n"
          "you want multidimensional measures of sentiment for a given sentence.")
    print("----------------------------------------------------")

    # input("\nPress Enter to continue the demo...\n") # for DEMO purposes...

    tricky_sentences = ["Sentiment analysis has never been good.",
                        "Sentiment analysis has never been this good!",
                        "Most automated sentiment analysis tools are shit.",
                        "With VADER, sentiment analysis is the shit!",
                        "Other sentiment analysis tools can be quite bad.",
                        "On the other hand, VADER is quite bad ass",
                        "VADER is such a badass!",  # slang with punctuation emphasis
                        "Without a doubt, excellent idea.",
                        "Roger Dodger is one of the most compelling variations on this theme.",
                        "Roger Dodger is at least compelling as a variation on the theme.",
                        "Roger Dodger is one of the least compelling variations on this theme.",
                        "Not such a badass after all.",  # Capitalized negation with slang
                        "Without a doubt, an excellent idea."  # "without {any} doubt" as negation
                        ]
    print("----------------------------------------------------")
    print(" - Analyze examples of tricky sentences that cause trouble to other sentiment analysis tools.")
    print(" -- special case idioms - e.g., 'never good' vs 'never this good', or 'bad' vs 'bad ass'.")
    print(" -- special uses of 'least' as negation versus comparison \n")
    for sentence in tricky_sentences:
        vs = analyzer.polarity_scores(sentence)
        print("{:-<69} {}".format(sentence, str(vs)))
    print("----------------------------------------------------")

    # input("\nPress Enter to continue the demo...\n") # for DEMO purposes...

    print("----------------------------------------------------")
    print(
        " - VADER works best when analysis is done at the sentence level (but it can work on single words or entire novels).")
    paragraph = "It was one of the worst movies I've seen, despite good reviews. Unbelievably bad acting!! Poor direction. VERY poor production. The movie was bad. Very bad movie. VERY BAD movie!"
    print(" -- For example, given the following paragraph text from a hypothetical movie review:\n\t'{}'".format(
        paragraph))
    print(
        " -- You could use NLTK to break the paragraph into sentence tokens for VADER, then average the results for the paragraph like this: \n")
    # simple example to tokenize paragraph into sentences for VADER
    # (imported here so that NLTK is only needed when the demo is run)
    from nltk import tokenize

    sentence_list = tokenize.sent_tokenize(paragraph)
    paragraphSentiments = 0.0
    for sentence in sentence_list:
        vs = analyzer.polarity_scores(sentence)
        print("{:-<69} {}".format(sentence, str(vs["compound"])))
        paragraphSentiments += vs["compound"]
    print("AVERAGE SENTIMENT FOR PARAGRAPH: \t" + str(round(paragraphSentiments / len(sentence_list), 4)))
    print("----------------------------------------------------")

    # input("\nPress Enter to continue the demo...\n") # for DEMO purposes...

    print("----------------------------------------------------")
    print(" - Analyze sentiment of IMAGES/VIDEO data based on annotation 'tags' or image labels. \n")
    conceptList = ["balloons", "cake", "candles", "happy birthday", "friends", "laughing", "smiling", "party"]
    conceptSentiments = 0.0
    for concept in conceptList:
        vs = analyzer.polarity_scores(concept)
        print("{:-<15} {}".format(concept, str(vs['compound'])))
        conceptSentiments += vs["compound"]
    print("AVERAGE SENTIMENT OF TAGS/LABELS: \t" + str(round(conceptSentiments / len(conceptList), 4)))
    print("\t")
    conceptList = ["riot", "fire", "fight", "blood", "mob", "war", "police", "tear gas"]
    conceptSentiments = 0.0
    for concept in conceptList:
        vs = analyzer.polarity_scores(concept)
        print("{:-<15} {}".format(concept, str(vs['compound'])))
        conceptSentiments += vs["compound"]
    print("AVERAGE SENTIMENT OF TAGS/LABELS: \t" + str(round(conceptSentiments / len(conceptList), 4)))
    print("----------------------------------------------------")

    # input("\nPress Enter to continue the demo...") # for DEMO purposes...

    do_translate = input(
        "\nWould you like to run VADER demo examples with NON-ENGLISH text? \n (Note: requires Internet access and uses the 'requests' library) \n Type 'y' or 'n', then press Enter: ")
    if "y" in do_translate.lower():
        # imported here so that 'requests' is only needed for this optional part
        import requests
        print("\n----------------------------------------------------")
        print(" - Analyze sentiment of NON ENGLISH text...for example:")
        print(" -- French, German, Spanish, Italian, Russian, Japanese, Arabic, Chinese(Simplified) , Chinese(Traditional)")
        print(" -- many other languages supported. \n")
        languages = ["English", "French", "German", "Spanish", "Italian", "Russian", "Japanese", "Arabic", "Chinese(Simplified)", "Chinese(Traditional)"]
        language_codes = ["en", "fr", "de", "es", "it", "ru", "ja", "ar", "zh-CN", "zh-TW"]
        nonEnglish_sentences = ["I'm surprised to see just how amazingly helpful VADER is!",
                                "Je suis surpris de voir comment VADER est incroyablement utile !",
                                "Ich bin überrascht zu sehen, nur wie erstaunlich nützlich VADER!",
                                "Me sorprende ver sólo cómo increíblemente útil VADER!",
                                "Sono sorpreso di vedere solo come incredibilmente utile VADER è!",
                                "Я удивлен увидеть, как раз как удивительно полезно ВЕЙДЕРА!",
                                "私はちょうどどのように驚くほど役に立つベイダーを見て驚いています!",
                                "أنا مندهش لرؤية فقط كيف مثير للدهشة فيدر فائدة!",
                                "我很惊讶地看到VADER是如此有用!",
                                "我很驚訝地看到VADER是如此有用!"
                                ]
        to_lang = "en"
        # the three lists are parallel: walk them together instead of
        # looking every sentence up again with list.index()
        for language, from_lang, sentence in zip(languages, language_codes, nonEnglish_sentences):
            if (from_lang == "en") or (from_lang == "en-US"):
                translation = sentence
                translator_name = "No translation needed"
            else:  # please note usage limits for My Memory Translation Service: http://mymemory.translated.net/doc/usagelimits.php
                # using MY MEMORY NET http://mymemory.translated.net
                api_url = "http://mymemory.translated.net/api/get"
                # let requests URL-encode the query so that characters such
                # as '&', '#' or non-ASCII text in the sentence cannot
                # corrupt the request
                query = {"q": sentence, "langpair": "{}|{}".format(from_lang, to_lang)}
                hdrs = {
                    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                    'Accept-Encoding': 'none',
                    'Accept-Language': 'en-US,en;q=0.8',
                    'Connection': 'keep-alive'}
                # a timeout keeps the demo from hanging forever on a dead connection
                response = requests.get(api_url, params=query, headers=hdrs, timeout=30)
                response_json = json.loads(response.text)
                translation = response_json["responseData"]["translatedText"]
                translator_name = "MemoryNet Translation Service"
            vs = analyzer.polarity_scores(translation)
            print("- {: <8}: {: <69}\t {} ({})".format(language, sentence,
                                                       str(vs['compound']), translator_name))
        print("----------------------------------------------------")

    print("\n\n Demo Done!")