In [1]:
import pandas
import numpy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
In [2]:
url = "https://raw.githubusercontent.com/shantanuo/naive_bayes_for_text_classification/master/spam.csv"
data = pandas.read_csv(url, encoding="latin-1")

data.head()
Out[2]:
v1 v2 Unnamed: 2 Unnamed: 3 Unnamed: 4
0 ham Go until jurong point, crazy.. Available only ... NaN NaN NaN
1 ham Ok lar... Joking wif u oni... NaN NaN NaN
2 spam Free entry in 2 a wkly comp to win FA Cup fina... NaN NaN NaN
3 ham U dun say so early hor... U c already then say... NaN NaN NaN
4 ham Nah I don't think he goes to usf, he lives aro... NaN NaN NaN
In [3]:
for i in range(2, 5):
  column_name = 'Unnamed: ' + str(i)
  data = data.drop(column_name, axis=1)
  
data.head()
Out[3]:
v1 v2
0 ham Go until jurong point, crazy.. Available only ...
1 ham Ok lar... Joking wif u oni...
2 spam Free entry in 2 a wkly comp to win FA Cup fina...
3 ham U dun say so early hor... U c already then say...
4 ham Nah I don't think he goes to usf, he lives aro...
In [4]:
data.columns = ['Target', 'Text']
data['Target'] = LabelEncoder().fit_transform(data['Target'])
data.head()
Out[4]:
Target Text
0 0 Go until jurong point, crazy.. Available only ...
1 0 Ok lar... Joking wif u oni...
2 1 Free entry in 2 a wkly comp to win FA Cup fina...
3 0 U dun say so early hor... U c already then say...
4 0 Nah I don't think he goes to usf, he lives aro...
In [5]:
train, test = train_test_split(data, test_size=0.2, random_state=0)
train['Text'].head()
Out[5]:
1114    No no:)this is kallis home ground.amla home to...
3589    I am in escape theatre now. . Going to watch K...
3095    We walked from my moms. Right on stagwood pass...
1012       I dunno they close oredi not... ÌÏ v ma fan...
3320                               Yo im right by yo work
Name: Text, dtype: object
In [6]:
countVectorizer = CountVectorizer()
X_train_vectorized = countVectorizer.fit_transform(train['Text'])
ndf = pandas.SparseDataFrame(
    X_train_vectorized.toarray(), columns=countVectorizer.get_feature_names()
)
ndf.iloc[0][ndf.iloc[0] > 0]
Out[6]:
amla      1
durban    1
ground    1
home      2
is        2
kallis    1
no        2
this      1
town      1
Name: 0, dtype: Sparse[int64, nan]
IntIndex
Indices: array([0, 1, 2, 3, 4, 5, 6, 7, 8])
In [7]:
naive_bayes = MultinomialNB(alpha=0.1)
naive_bayes.fit(X_train_vectorized, train['Target'])
Out[7]:
MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)
In [8]:
y = naive_bayes.predict(countVectorizer.transform(test['Text']))
In [9]:
predictedData = {'Target': y, 'Text': test['Text']}
predictedDataFrame = pandas.DataFrame(predictedData)
predictedDataFrame.head(10)
Out[9]:
Target Text
4456 0 Aight should I just plan to come up later toni...
690 0 Was the farm open?
944 0 I sent my scores to sophas and i had to do sec...
3768 0 Was gr8 to see that message. So when r u leavi...
1189 0 In that case I guess I'll see you at campus lodge
4437 0 Nothing will ever be easy. But don't be lookin...
3587 0 If you were/are free i can give. Otherwise nal...
1982 0 Hey i will be late... i'm at amk. Need to drin...
2038 0 Hey are we going for the lo lesson or gym?
2078 1 85233 FREE>Ringtone!Reply REAL
In [10]:
predictedDataFrame['Target'] = predictedDataFrame['Target'].replace(0, 'ham')
predictedDataFrame['Target'] = predictedDataFrame['Target'].replace(1, 'spam')
    
predictedDataFrame.head(10)
Out[10]:
Target Text
4456 ham Aight should I just plan to come up later toni...
690 ham Was the farm open?
944 ham I sent my scores to sophas and i had to do sec...
3768 ham Was gr8 to see that message. So when r u leavi...
1189 ham In that case I guess I'll see you at campus lodge
4437 ham Nothing will ever be easy. But don't be lookin...
3587 ham If you were/are free i can give. Otherwise nal...
1982 ham Hey i will be late... i'm at amk. Need to drin...
2038 ham Hey are we going for the lo lesson or gym?
2078 spam 85233 FREE>Ringtone!Reply REAL
In [11]:
predictedDataFrame[predictedDataFrame.Target == 'ham'].head(10)
Out[11]:
Target Text
4456 ham Aight should I just plan to come up later toni...
690 ham Was the farm open?
944 ham I sent my scores to sophas and i had to do sec...
3768 ham Was gr8 to see that message. So when r u leavi...
1189 ham In that case I guess I'll see you at campus lodge
4437 ham Nothing will ever be easy. But don't be lookin...
3587 ham If you were/are free i can give. Otherwise nal...
1982 ham Hey i will be late... i'm at amk. Need to drin...
2038 ham Hey are we going for the lo lesson or gym?
4224 ham \The world suffers a lot... Not because of the...
In [12]:
predictedDataFrame[predictedDataFrame.Target == 'spam'].head(10)
Out[12]:
Target Text
2078 spam 85233 FREE>Ringtone!Reply REAL
2690 spam sports fans - get the latest sports news str* ...
134 spam Want 2 get laid tonight? Want real Dogging loc...
15 spam XXXMobileMovieClub: To use your credit, click ...
348 spam Fancy a shag? I do.Interested? sextextuk.com t...
3499 spam Dorothy@kiefer.com (Bank of Granite issues Str...
3753 spam Bloomberg -Message center +447797706009 Why wa...
2311 spam Congratulations! Thanks to a good friend U hav...
1658 spam RGENT! This is the 2nd attempt to contact U!U ...
1121 spam Do you want 750 anytime any network mins 150 t...
In [13]:
roc_auc_score(test['Target'], y)
Out[13]:
0.9723646958751762
In [14]:
my_url = "https://raw.githubusercontent.com/hentai-lab/Machine-Learning/master/docs/my_own_spam_data.csv"
my_data = pandas.read_csv(my_url, encoding="latin-1")

my_data['Target'] = LabelEncoder().fit_transform(my_data['Target'])
my_data = my_data.sample(frac=1)

my_data
Out[14]:
Target Text
0 1 Missed the webcast? We've got you covered. We'...
14 0 Just today you take the unpublished Syrah from...
1 1 Azure SQL Database gateway IP addresses in Eas...
11 0 What if you could travel back in time? If you ...
6 1 15% OFF to start the week off! Cinepolis and C...
4 1 Today is awesome! Come and see! Cinemark and C...
2 1 It's discount after discount for you! The 'Off...
5 1 BGS Box - Upgrade Your Ticket for Promotional ...
8 0 We have a new in our app! Friends never call f...
9 0 The Rainbow Six Siege Theme Park has been inva...
13 0 Are we going to Spain? HyperX takes you!
10 0 A/B testing, ecommerce, and more: Chatfuel new...
7 1 Let there be heart for so much promotion here ...
12 0 We have the perfect job for you!
15 0 Up to 65% off for you here this Halloween!
3 1 What do you think of Cinemark and Cinepolis on...
In [15]:
my_y = naive_bayes.predict(countVectorizer.transform(my_data['Text']))
In [16]:
my_predictedData = {'Target': my_y, 'Text': my_data['Text']}
my_predictedDataFrame = pandas.DataFrame(my_predictedData)
my_predictedDataFrame
Out[16]:
Target Text
0 0 Missed the webcast? We've got you covered. We'...
14 1 Just today you take the unpublished Syrah from...
1 0 Azure SQL Database gateway IP addresses in Eas...
11 0 What if you could travel back in time? If you ...
6 0 15% OFF to start the week off! Cinepolis and C...
4 0 Today is awesome! Come and see! Cinemark and C...
2 1 It's discount after discount for you! The 'Off...
5 1 BGS Box - Upgrade Your Ticket for Promotional ...
8 0 We have a new in our app! Friends never call f...
9 0 The Rainbow Six Siege Theme Park has been inva...
13 0 Are we going to Spain? HyperX takes you!
10 0 A/B testing, ecommerce, and more: Chatfuel new...
7 0 Let there be heart for so much promotion here ...
12 0 We have the perfect job for you!
15 0 Up to 65% off for you here this Halloween!
3 0 What do you think of Cinemark and Cinepolis on...
In [17]:
my_predictedDataFrame['Target'] = my_predictedDataFrame['Target'].replace(0, 'ham')
my_predictedDataFrame['Target'] = my_predictedDataFrame['Target'].replace(1, 'spam')
    
my_predictedDataFrame
Out[17]:
Target Text
0 ham Missed the webcast? We've got you covered. We'...
14 spam Just today you take the unpublished Syrah from...
1 ham Azure SQL Database gateway IP addresses in Eas...
11 ham What if you could travel back in time? If you ...
6 ham 15% OFF to start the week off! Cinepolis and C...
4 ham Today is awesome! Come and see! Cinemark and C...
2 spam It's discount after discount for you! The 'Off...
5 spam BGS Box - Upgrade Your Ticket for Promotional ...
8 ham We have a new in our app! Friends never call f...
9 ham The Rainbow Six Siege Theme Park has been inva...
13 ham Are we going to Spain? HyperX takes you!
10 ham A/B testing, ecommerce, and more: Chatfuel new...
7 ham Let there be heart for so much promotion here ...
12 ham We have the perfect job for you!
15 ham Up to 65% off for you here this Halloween!
3 ham What do you think of Cinemark and Cinepolis on...
In [18]:
roc_auc_score(my_data['Target'], my_y)
Out[18]:
0.5625
In [19]:
my_X_train_vectorized = countVectorizer.fit_transform(my_data['Text'])
ndf = pandas.SparseDataFrame(
    my_X_train_vectorized.toarray(), columns=countVectorizer.get_feature_names()
)
ndf.iloc[0][ndf.iloc[0] > 0]
naive_bayes.fit(my_X_train_vectorized, my_data['Target'])
Out[19]:
MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)
In [20]:
input_text = [input('Enter your message: ')]
input_target = input('Is it spam or not?: ')
if input_target == 'no' or input_target == 'No' or input_target == 'NO':
  input_target = 0
elif input_target == 'yes' or input_target == 'Yes' or input_target == 'YES' or input_target == 'yEs' or input_target == 'yeS':
  input_target = 1

another_y = naive_bayes.predict(countVectorizer.transform(input_text))

another_predictedData = [['User', input_target, input_text[0]], ['Naive Bayes', another_y, input_text[0]]]
another_predictedData = pandas.DataFrame(another_predictedData, columns=['Source', 'Target', 'Text'])
another_predictedData['Target'] = another_predictedData['Target'].replace(0, 'ham')
another_predictedData['Target'] = another_predictedData['Target'].replace(1, 'spam')

another_predictedData
Enter your message: Christmas promotion for you!
Is it spam or not?: Yes
Out[20]:
Source Target Text
0 User spam Christmas promotion for you!
1 Naive Bayes spam Christmas promotion for you!