Naive Bayes for Text Classification

import pandas
import numpy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

# Download the SMS spam dataset; the CSV is decoded as latin-1.
url = "https://raw.githubusercontent.com/shantanuo/naive_bayes_for_text_classification/master/spam.csv"
data = pandas.read_csv(url, encoding="latin-1")

data.head()

# Remove the filler columns 'Unnamed: 2' .. 'Unnamed: 4' with one drop()
# call instead of rebuilding the frame once per column.
unnamed_columns = ['Unnamed: {}'.format(i) for i in range(2, 5)]
data = data.drop(columns=unnamed_columns)

data.head()

# Give the two remaining columns descriptive names and turn the string
# labels into integer codes.
data.columns = ['Target', 'Text']
data['Target'] = LabelEncoder().fit_transform(data['Target'])
data.head()

# Hold out 20% of the rows for evaluation; random_state=0 fixes the split.
train, test = train_test_split(data, test_size=0.2, random_state=0)
train['Text'].head()

1114    No no:)this is kallis home ground.amla home to...
3589    I am in escape theatre now. . Going to watch K...
3095    We walked from my moms. Right on stagwood pass...
1012       I dunno they close oredi not... ÌÏ v ma fan...
3320                               Yo im right by yo work
Name: Text, dtype: object

# Learn the vocabulary from the training messages and build the
# document-term count matrix (a SciPy sparse matrix).
countVectorizer = CountVectorizer()
X_train_vectorized = countVectorizer.fit_transform(train['Text'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(countVectorizer, 'get_feature_names_out'):
  feature_names = countVectorizer.get_feature_names_out()
else:
  feature_names = countVectorizer.get_feature_names()

# pandas.SparseDataFrame was removed in pandas 1.0. Building the frame
# straight from the sparse matrix also avoids densifying it with .toarray().
ndf = pandas.DataFrame.sparse.from_spmatrix(
    X_train_vectorized, columns=feature_names
)

# Show the non-zero word counts of the first training message.
ndf.iloc[0][ndf.iloc[0] > 0]

amla      1
durban    1
ground    1
home      2
is        2
kallis    1
no        2
this      1
town      1
Name: 0, dtype: Sparse[int64, nan]
IntIndex
Indices: array([0, 1, 2, 3, 4, 5, 6, 7, 8])

# Fit a multinomial Naive Bayes classifier on the training count matrix,
# using the integer-encoded 'Target' column as labels. alpha=0.1 is passed
# as the model's smoothing setting.
naive_bayes = MultinomialNB(alpha=0.1)
naive_bayes.fit(X_train_vectorized, train['Target'])

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

# Vectorise the held-out messages with the already-fitted vocabulary and
# predict a class code for each one.
test_counts = countVectorizer.transform(test['Text'])
y = naive_bayes.predict(test_counts)

# Pair every prediction with the message text it belongs to.
predictedData = {'Target': y, 'Text': test['Text']}
predictedDataFrame = pandas.DataFrame(predictedData)
predictedDataFrame.head(10)

# Swap the numeric class codes for readable labels in one pass.
predictedDataFrame['Target'] = predictedDataFrame['Target'].replace({0: 'ham', 1: 'spam'})

predictedDataFrame.head(10)

# Peek at the first messages predicted as ham ...
predictedDataFrame[predictedDataFrame['Target'] == 'ham'].head(10)

# ... and at the first messages predicted as spam.
predictedDataFrame[predictedDataFrame['Target'] == 'spam'].head(10)

# NOTE(review): y holds hard class predictions rather than probability
# scores - confirm that this is the intended input for ROC AUC.
roc_auc_score(test['Target'], y)

0.9723646958751762

# Load a second, hand-collected dataset to test the trained model on.
my_url = "https://raw.githubusercontent.com/hentai-lab/Machine-Learning/master/docs/my_own_spam_data.csv"
my_data = pandas.read_csv(my_url, encoding="latin-1")

# Encode the labels with a fresh LabelEncoder, then shuffle the rows.
# sample(frac=1) is unseeded, so the row order differs between runs.
# NOTE(review): presumably the label strings match those of the SMS dataset
# so the integer codes line up with the trained model - confirm.
my_data['Target'] = LabelEncoder().fit_transform(my_data['Target'])
my_data = my_data.sample(frac=1)

my_data

# Classify the custom messages with the vocabulary and model fitted earlier.
my_counts = countVectorizer.transform(my_data['Text'])
my_y = naive_bayes.predict(my_counts)

my_predictedData = {'Target': my_y, 'Text': my_data['Text']}
my_predictedDataFrame = pandas.DataFrame(my_predictedData)
my_predictedDataFrame

# Swap the numeric class codes for readable labels in one pass.
my_predictedDataFrame['Target'] = my_predictedDataFrame['Target'].replace({0: 'ham', 1: 'spam'})

my_predictedDataFrame

# NOTE(review): my_y holds hard class predictions rather than probability
# scores - confirm that this is the intended input for ROC AUC.
roc_auc_score(my_data['Target'], my_y)

0.5625

# Refit the same vectorizer on the custom messages only; this replaces the
# vocabulary that was learned from the SMS training split.
my_X_train_vectorized = countVectorizer.fit_transform(my_data['Text'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(countVectorizer, 'get_feature_names_out'):
  my_feature_names = countVectorizer.get_feature_names_out()
else:
  my_feature_names = countVectorizer.get_feature_names()

# pandas.SparseDataFrame was removed in pandas 1.0. Building the frame
# straight from the sparse matrix also avoids densifying it with .toarray().
ndf = pandas.DataFrame.sparse.from_spmatrix(
    my_X_train_vectorized, columns=my_feature_names
)
ndf.iloc[0][ndf.iloc[0] > 0]

# Retrain the classifier on the custom data, discarding the previous fit.
naive_bayes.fit(my_X_train_vectorized, my_data['Target'])

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

# Ask for a message and the user's own verdict, then show it next to the
# model's prediction.
input_text = [input('Enter your message: ')]
input_target = input('Is it spam or not?: ')

# Accept "yes"/"no" in any capitalisation and with stray whitespace.
# Anything else is kept as typed so it appears verbatim in the table.
answer_codes = {'no': 0, 'yes': 1}
input_target = answer_codes.get(input_target.strip().lower(), input_target)

another_y = naive_bayes.predict(countVectorizer.transform(input_text))

# predict() returns a one-element array; store the scalar inside it so the
# label replacement below compares plain values rather than an array.
another_predictedData = [
    ['User', input_target, input_text[0]],
    ['Naive Bayes', another_y[0], input_text[0]],
]
another_predictedData = pandas.DataFrame(another_predictedData, columns=['Source', 'Target', 'Text'])
another_predictedData['Target'] = another_predictedData['Target'].replace({0: 'ham', 1: 'spam'})

another_predictedData

Enter your message: Christmas promotion for you!
Is it spam or not?: Yes

	v1	v2	Unnamed: 2	Unnamed: 3	Unnamed: 4
0	ham	Go until jurong point, crazy.. Available only ...	NaN	NaN	NaN
1	ham	Ok lar... Joking wif u oni...	NaN	NaN	NaN
2	spam	Free entry in 2 a wkly comp to win FA Cup fina...	NaN	NaN	NaN
3	ham	U dun say so early hor... U c already then say...	NaN	NaN	NaN
4	ham	Nah I don't think he goes to usf, he lives aro...	NaN	NaN	NaN

	v1	v2
0	ham	Go until jurong point, crazy.. Available only ...
1	ham	Ok lar... Joking wif u oni...
2	spam	Free entry in 2 a wkly comp to win FA Cup fina...
3	ham	U dun say so early hor... U c already then say...
4	ham	Nah I don't think he goes to usf, he lives aro...

	Target	Text
0	0	Go until jurong point, crazy.. Available only ...
1	0	Ok lar... Joking wif u oni...
2	1	Free entry in 2 a wkly comp to win FA Cup fina...
3	0	U dun say so early hor... U c already then say...
4	0	Nah I don't think he goes to usf, he lives aro...

	Target	Text
4456	0	Aight should I just plan to come up later toni...
690	0	Was the farm open?
944	0	I sent my scores to sophas and i had to do sec...
3768	0	Was gr8 to see that message. So when r u leavi...
1189	0	In that case I guess I'll see you at campus lodge
4437	0	Nothing will ever be easy. But don't be lookin...
3587	0	If you were/are free i can give. Otherwise nal...
1982	0	Hey i will be late... i'm at amk. Need to drin...
2038	0	Hey are we going for the lo lesson or gym?
2078	1	85233 FREE>Ringtone!Reply REAL

	Target	Text
4456	ham	Aight should I just plan to come up later toni...
690	ham	Was the farm open?
944	ham	I sent my scores to sophas and i had to do sec...
3768	ham	Was gr8 to see that message. So when r u leavi...
1189	ham	In that case I guess I'll see you at campus lodge
4437	ham	Nothing will ever be easy. But don't be lookin...
3587	ham	If you were/are free i can give. Otherwise nal...
1982	ham	Hey i will be late... i'm at amk. Need to drin...
2038	ham	Hey are we going for the lo lesson or gym?
2078	spam	85233 FREE>Ringtone!Reply REAL

	Target	Text
0	1	Missed the webcast? We've got you covered. We'...
14	0	Just today you take the unpublished Syrah from...
1	1	Azure SQL Database gateway IP addresses in Eas...
11	0	What if you could travel back in time? If you ...
6	1	15% OFF to start the week off! Cinepolis and C...
4	1	Today is awesome! Come and see! Cinemark and C...
2	1	It's discount after discount for you! The 'Off...
5	1	BGS Box - Upgrade Your Ticket for Promotional ...
8	0	We have a new in our app! Friends never call f...
9	0	The Rainbow Six Siege Theme Park has been inva...
13	0	Are we going to Spain? HyperX takes you!
10	0	A/B testing, ecommerce, and more: Chatfuel new...
7	1	Let there be heart for so much promotion here ...
12	0	We have the perfect job for you!
15	0	Up to 65% off for you here this Halloween!
3	1	What do you think of Cinemark and Cinepolis on...

	Source	Target	Text
0	User	spam	Christmas promotion for you!
1	Naive Bayes	spam	Christmas promotion for you!

	Target	Text
2078	spam	85233 FREE>Ringtone!Reply REAL
2690	spam	sports fans - get the latest sports news str* ...
134	spam	Want 2 get laid tonight? Want real Dogging loc...
15	spam	XXXMobileMovieClub: To use your credit, click ...
348	spam	Fancy a shag? I do.Interested? sextextuk.com t...
3499	spam	Dorothy@kiefer.com (Bank of Granite issues Str...
3753	spam	Bloomberg -Message center +447797706009 Why wa...
2311	spam	Congratulations! Thanks to a good friend U hav...
1658	spam	RGENT! This is the 2nd attempt to contact U!U ...
1121	spam	Do you want 750 anytime any network mins 150 t...