Over the last couple of years, a lot of change has happened within the United States. Everything from the Capitol riot and COVID-19 to social reform movements has changed how we function as a society. Especially during these times, Americans look to the President to lead the country and show support. In this project, the sentiment of the President's tweets (Twitter: @POTUS) will be analyzed. The objective is to see how the President addresses the nation through the use of Twitter. Machine learning models will be used toward the end of this project to help determine which one best models the President's tweets based on sentiment.
To get the data, the Python library Tweepy was used. It utilizes Twitter's API to retrieve tweets or even post them; for this project, the focus is on retrieval. The hope was to retrieve around 5000 tweets from the President's Twitter account; however, due to Tweepy's limitations and the restrictions on how many tweets a person can retrieve at once, only about 2243 tweets could be pulled. To avoid pulling data every time the code was rerun, and to avoid hitting the retrieval limit, the pulled data was saved to a CSV file called tweets.csv. Since tweets can contain emojis, @ or # symbols, and URLs, these were removed before the tweets were added to the CSV file. Since TweetRetrieval.ipynb contains sensitive authentication codes, they have been removed and replaced with "xxxx", but the general logic and the rest of the code remain intact to be viewed.
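Since the retrieval notebook itself is not shown here, the pre-save cleaning described above can be sketched roughly as follows. The function name and exact regex patterns are illustrative assumptions, not the actual code from TweetRetrieval.ipynb:

```python
import re

def clean_for_csv(text):
    """Roughly the kind of cleaning applied before saving tweets to tweets.csv."""
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # drop URLs
    text = re.sub(r'[@#]\w+', '', text)                # drop @mentions and #hashtags
    text = text.encode('ascii', 'ignore').decode()     # drop emojis / non-ASCII
    return ' '.join(text.split())                      # collapse leftover whitespace

print(clean_for_csv("Great news 🎉 @POTUS https://t.co/abc #Biden today"))
# Great news today
```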
The cell right below lists the two libraries that need to be installed before anything else is done. The cell below that contains all the imports from the Python libraries used in this project. The CSV file is read in, all duplicate tweets are dropped, and the index is reset to make the dataframe we are working with much cleaner.
#!pip install plotly
#!pip install textblob
import re
import string
import numpy as np
import pandas as pd
# plotting
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
#textblob
from textblob import TextBlob
# nltk
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk import tokenize
# sklearn
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, ConfusionMatrixDisplay
data = pd.read_csv('tweets.csv')
data.drop_duplicates(subset = 'Tweets', keep = 'first', inplace = True)
data.reset_index(drop = True, inplace = True)
data
| | Tweets |
|---|---|
0 | January 6, 2021 made it clear: there is a dagg... |
1 | Today, my administration announced that health... |
2 | We must be firm, resolute, and unyielding in o... |
3 | Today, on National Law Enforcement Appreciatio... |
4 | My message to everyone impacted by the Marshal... |
... | ... |
2225 | RT : Tune in for the first press briefing of t... |
2226 | After taking the oath of office this afternoon... |
2227 | The time to move forward is now. |
2228 | There is no time to waste when it comes to tac... |
2229 | Folks — This will be the account for my offici... |
2230 rows × 1 columns
Everything unnecessary is removed from the tweets: the 'RT' marker, punctuation, digits, and linked websites are removed, and all characters are lowercased for uniformity. Stop words are commonly used words that carry very little meaning; since they hold little value for the sentiment of a sentence, they can be removed as well.
#Removes the leading RT (retweet marker) from tweets
data['Tweets'] = data['Tweets'].apply(lambda x: re.sub(r'^\s*RT\b', '', str(x)))
#Forces all characters to be lowercase for uniformity
data['Tweets'] = data['Tweets'].str.lower()
#Removes all punctuations
def clean_punc(txt):
return txt.translate(str.maketrans('', '', string.punctuation))
data['Tweets'] = data['Tweets'].apply(lambda x: clean_punc(x))
data['Tweets'] = data['Tweets'].apply(lambda x: re.sub(r'\d+', '', x))
data['Tweets'] = data['Tweets'].apply(lambda x: re.sub(r'[^\w\d\s]+', '', x))
#Removes websites
data['Tweets'] = data['Tweets'].apply(lambda x: re.sub(r'www\.\S+', '', x))
#Removes all the common stop words in the English language
sw = stopwords.words("english")
data['Tweets'] = data['Tweets'].apply(lambda x: " ".join([word for word in str(x).split() if word not in sw]))
data
| | Tweets |
|---|---|
0 | january made clear dagger throat democracy tom... |
1 | today administration announced health insurers... |
2 | must firm resolute unyielding defense right vo... |
3 | today national law enforcement appreciation da... |
4 | message everyone impacted marshall fire intend... |
... | ... |
2225 | tune first press briefing bidenharris administ... |
2226 | taking oath office afternoon got right work ta... |
2227 | time move forward |
2228 | time waste comes tackling crises face thats to... |
2229 | folks account official duties president pm jan... |
2230 rows × 1 columns
By applying TextBlob's sentiment.polarity to each tweet, every tweet is given a sentiment polarity value. This will be used to analyze how the President tweets. The code then interprets the polarity (<0 is Negative, 0 is Neutral, and >0 is Positive), which makes analysis easier since the tweets are now placed into categories.
#Uses TextBlob's sentiment polarity function to get the sentiment of each tweet
sentiment_lst = data['Tweets'].apply(lambda x: TextBlob(x).sentiment.polarity)
data['Sentiment'] = sentiment_lst
#Interprets the Sentiment into English (<0 is Negative, 0 is Neutral, and >0 is Positive)
data['Meaning'] = [None]*len(data)
for index, row in data.iterrows():
if data.loc[index,'Sentiment'] < 0:
data.loc[index,'Meaning'] = "Negative"
elif data.loc[index,'Sentiment'] > 0:
data.loc[index,'Meaning'] = "Positive"
else:
data.loc[index,'Meaning'] = "Neutral"
data
| | Tweets | Sentiment | Meaning |
|---|---|---|---|
0 | january made clear dagger throat democracy tom... | 0.100000 | Positive |
1 | today administration announced health insurers... | 0.000000 | Neutral |
2 | must firm resolute unyielding defense right vo... | 0.042857 | Positive |
3 | today national law enforcement appreciation da... | 0.650000 | Positive |
4 | message everyone impacted marshall fire intend... | -0.025000 | Negative |
... | ... | ... | ... |
2225 | tune first press briefing bidenharris administ... | 0.250000 | Positive |
2226 | taking oath office afternoon got right work ta... | 0.195238 | Positive |
2227 | time move forward | 0.000000 | Neutral |
2228 | time waste comes tackling crises face thats to... | 0.103810 | Positive |
2229 | folks account official duties president pm jan... | -0.500000 | Negative |
2230 rows × 3 columns
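As a side note, the row-by-row labeling loop above can also be written in a vectorized form. A minimal sketch on a toy dataframe using numpy.select:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'Sentiment': [-0.5, 0.0, 0.1]})
# Conditions are checked in order; rows matching neither become the default
df['Meaning'] = np.select(
    [df['Sentiment'] < 0, df['Sentiment'] > 0],
    ['Negative', 'Positive'],
    default='Neutral')
print(list(df['Meaning']))  # ['Negative', 'Neutral', 'Positive']
```

On a dataframe of a few thousand rows this makes little practical difference, but it avoids the per-row `.loc` writes of the loop.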
Here, the tweets tied to each unique meaning (Positive, Negative, Neutral) are counted. Each count is stored alongside its meaning in a dataframe, which is then plotted as a pie chart.
values = []
meaning = []
#Goes through and gets all the values tied to each unique meaning to be added to a dataframe
for i in np.unique(data['Meaning']):
values.append(len(data[data['Meaning']==i]))
meaning.append(i)
#Dataframe created and values are added to be plotted below
df = pd.DataFrame()
df['Meaning'] = meaning
df['Value'] = values
fig = px.pie(df, values='Value', names='Meaning', title='Overall Sentiment of POTUS\'s Tweets')
fig.update_traces(textinfo='percent+label+value', textposition='inside')
fig.show()
The results shown by the graph above were not too surprising. A majority of the President's tweets are positive, the next largest category is neutral, and the smallest is negative. This makes sense: a leader of a nation is generally expected to convey hope and facts, which tend to read as positive or neutral. Only when the President conveys something as serious as a loss of life is it usually interpreted as negative. The sentiment picked up by this code therefore met expectations.
Since the previous plot showed what was expected of the three categories, the data needed to be explored from a different perspective. To see where most of the sentiment values lie, a boxplot was plotted to show how the data is distributed.
sns.set_theme(style="whitegrid")
ax = sns.boxplot(x=data["Sentiment"])
This plot shows that while there are plenty of outliers, most of the data clearly lies between 0.00 and 0.25, with the median close to 0. So while the majority of the data is positive, it is only mildly so; it is very close to neutral. It should also be noted that since Neutral was assigned a single exact value (0) rather than a range like Positive and Negative, this could influence how the previous plot is interpreted.
To better understand how the data is laid out, all the values are standardized using the z-score method, which shows how each value deviates from the mean. The result is then graphed as a histogram to be analyzed.
#Gets the mean and std of the data
mean = data['Sentiment'].mean()
std = data['Sentiment'].std()
standardized_value = []
#Goes through the data and gets each sentiment value and subtracts it from the mean and divides by the std to
#standardize the values
for index,row in data.iterrows():
standardized_value.append((data.loc[index,'Sentiment'] - mean)/std)
data['StandardizedValue'] = standardized_value
#Plots a histogram
ax = data['StandardizedValue'].plot(kind='hist', color='g')
ax.yaxis.set_major_formatter(PercentFormatter(xmax=len(data)))
ax.set_xlabel("Standardized Sentiment Values")
ax.set_ylabel("Percent of Tweets")
ax.set_title("Percent of Tweets Per Standardized Sentiment Value")
plt.show()
As expected, most of the data lies within one standard deviation of the mean. Approaching three standard deviations in either direction and beyond, the percentage of tweets drops to close to 0. Note that this analysis uses the numeric values only, so the labels 'Positive', 'Negative', and 'Neutral' do not bias it.
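For reference, the standardization loop above can be replaced with a single vectorized pandas expression. A minimal sketch on a toy series:

```python
import pandas as pd

s = pd.Series([0.1, 0.0, -0.5, 0.65])
z = (s - s.mean()) / s.std()  # pandas .std() uses the sample std (ddof=1)
# A standardized series has mean 0 and std 1 (up to floating-point error)
print(abs(z.mean()) < 1e-9, abs(z.std() - 1) < 1e-9)  # True True
```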
This part tokenizes each tweet, breaking it up into words. Each word is then stemmed to reduce it to its base/root form, which makes it easier for the models to work with the words and produces better results. Each word is also lemmatized, which maps it to a meaningful dictionary form for the ML models to run on.
#Tokenizes each tweet
data['Modified'] = data['Tweets'].apply(RegexpTokenizer(r'\w+').tokenize)
#Stems each word to put them in their base/root form
data['Modified'] = data['Modified'].apply(lambda x: [nltk.PorterStemmer().stem(c) for c in x])
#Lemmatizes each word to bring meaning
data['Modified'] = data['Modified'].apply(lambda x: [nltk.WordNetLemmatizer().lemmatize(c) for c in x])
data
| | Tweets | Sentiment | Meaning | StandardizedValue | Modified |
|---|---|---|---|---|---|
0 | january made clear dagger throat democracy tom... | 0.100000 | Positive | -0.078124 | [januari, made, clear, dagger, throat, democra... |
1 | today administration announced health insurers... | 0.000000 | Neutral | -0.515866 | [today, administr, announc, health, insur, req... |
2 | must firm resolute unyielding defense right vo... | 0.042857 | Positive | -0.328262 | [must, firm, resolut, unyield, defens, right, ... |
3 | today national law enforcement appreciation da... | 0.650000 | Positive | 2.329459 | [today, nation, law, enforc, appreci, day, jil... |
4 | message everyone impacted marshall fire intend... | -0.025000 | Negative | -0.625302 | [messag, everyon, impact, marshal, fire, inten... |
... | ... | ... | ... | ... | ... |
2225 | tune first press briefing bidenharris administ... | 0.250000 | Positive | 0.578490 | [tune, first, press, brief, bidenharri, admini... |
2226 | taking oath office afternoon got right work ta... | 0.195238 | Positive | 0.338774 | [take, oath, offic, afternoon, got, right, wor... |
2227 | time move forward | 0.000000 | Neutral | -0.515866 | [time, move, forward] |
2228 | time waste comes tackling crises face thats to... | 0.103810 | Positive | -0.061448 | [time, wast, come, tackl, crise, face, that, t... |
2229 | folks account official duties president pm jan... | -0.500000 | Negative | -2.704578 | [folk, account, offici, duti, presid, pm, janu... |
2230 rows × 5 columns
This part joins the words back together for each tweet so the data can be split into training and testing sets. A TF-IDF vectorizer is then used to transform the text by weighting its terms: TF-IDF assigns a value to a term according to its importance within a document, scaled by how common the term is across all documents. The TF-IDF table is displayed below.
#Combines the words to be ready for the data to be split into a training and testing set
data['Combined'] = data['Modified'].apply(lambda x: " ".join(c for c in x))
#Splits the data into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(np.array(data['Combined']),np.array(data['Meaning']),test_size = 0.3)
#Uses a TF-IDF conversion for the data and prints the data
vectoriser = TfidfVectorizer(ngram_range=(1,2))
vect = vectoriser.fit_transform(X_train)
table = pd.DataFrame(vect.T.todense(), index=vectoriser.get_feature_names_out())
table
| | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 1551 | 1552 | 1553 | 1554 | 1555 | 1556 | 1557 | 1558 | 1559 | 1560 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
aanhpi | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
aanhpi equal | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
aapi | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
aapi commun | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
aaron | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
zerocarbon renew | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
zeroemiss | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
zeroemiss unveil | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
zip | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
zip code | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
19739 rows × 1561 columns
The dataframe below shows the top 15 stemmed terms used by POTUS's Twitter account, ranked by total TF-IDF weight. This gives a sense of which words are heavily emphasized compared to everything else he has tweeted.
#Displays the top 15 words used
count = pd.DataFrame(table.sum(axis=1))
countdf = count.sort_values(0,ascending=False).head(15)
countdf
| | 0 |
|---|---|
american | 31.905965 |
get | 25.173573 |
vaccin | 23.665318 |
infrastructur | 20.577519 |
america | 20.527632 |
back | 20.394797 |
build | 19.753718 |
job | 19.453658 |
work | 19.251240 |
plan | 19.174209 |
better | 17.859784 |
tune | 17.676136 |
today | 17.329147 |
peopl | 16.830516 |
nation | 16.512541 |
It can be seen that 'american' is the top term used by the President. As the President of the United States of America, it is not surprising that this is the top term, since it helps bring unity to all citizens. Briefly looking at the rest of the words, they read like terms encouraging a comeback: 'build', 'back', 'job', 'plan', and 'better' all suggest that rebuilding is a huge focus of his tweets. For perspective, at the time these tweets were collected and this code was written, the world was still dealing with COVID-19, and the President was likely trying to promote a recovering economy and a return to normal for Americans. In that light, these top 15 words are not surprising.
The X_train and X_test sets are both transformed to TF-IDF weights. The machine learning models that will be used are Linear Support Vector Classifier (SVC), Bernoulli Naive Bayes, and Logistic Regression. Linear SVC finds a best-fit hyperplane that separates the categories. Bernoulli Naive Bayes assumes independence among features and gives each feature equal importance. Logistic Regression models the probability of each class from a linear combination of the features. For each model, a confusion matrix will be displayed to show the true positives, true neutrals, true negatives, and so on. In addition, each model will report its accuracy score and F1 score; the F1 score measures a model's accuracy on a dataset by combining the model's precision and recall.
#Uses the TF-IDF vectoriser on the data to transform it
X_train = vectoriser.transform(X_train)
X_test = vectoriser.transform(X_test)
#Function to predict values based on the data and model. Displays a confusion matrix to show true positives, neutrals,
#and negatives. Prints out the accuracy and f1 scores of the model.
def evaluate(model):
clf = model.fit(X_train, y_train)
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
plt.show()
print(str(model) + ' accuracy score: ' + str(accuracy_score(y_test, y_pred)))
print(str(model) + ' f1 score: ' + str(f1_score(y_test, y_pred,average=None,zero_division=0)) + '\n')
#Models to be used
models = [LinearSVC(), BernoulliNB(), LogisticRegression()]
#Goes through each model and runs the function above using the model.
for i in models:
evaluate(i)
LinearSVC() accuracy score: 0.7144992526158446
LinearSVC() f1 score: [0.4295302 0.58641975 0.81156069]

BernoulliNB() accuracy score: 0.5665171898355755
BernoulliNB() f1 score: [0. 0.06122449 0.72147002]

LogisticRegression() accuracy score: 0.6457399103139013
LogisticRegression() f1 score: [0.15384615 0.43243243 0.76299376]
As seen above, the Linear SVC performed the best, with an accuracy score of about 0.71 compared to roughly 0.57 and 0.65 for the other models. Its F1 score was also higher in every category than any other model's. One interesting thing can be observed in the confusion matrices: when the true label was negative, Linear SVC correctly predicted the most negative tweets, and when the true label was neutral, it correctly predicted the most neutral tweets; however, when the true label was positive, Bernoulli Naive Bayes correctly predicted the most positive tweets. For this project, the Linear SVC machine learning model best fit the sentiments of the tweets by the President of the United States of America.
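As a quick sanity check on the per-class F1 scores reported above, averaging Linear SVC's three values gives its macro F1, the same number sklearn's f1_score would return with average='macro':

```python
import numpy as np

# Per-class f1 scores printed above for LinearSVC, in sorted label order
# [Negative, Neutral, Positive]
f1_linear_svc = np.array([0.4295302, 0.58641975, 0.81156069])
macro_f1 = f1_linear_svc.mean()  # unweighted average over the three classes
print(round(macro_f1, 3))  # 0.609
```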