import pandas
import numpy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
# --- Load the SMS spam dataset and build the train/test split ---------------
url = "https://raw.githubusercontent.com/shantanuo/naive_bayes_for_text_classification/master/spam.csv"
data = pandas.read_csv(url, encoding="latin-1")
data.head()

# Drop the surplus 'Unnamed: 2' .. 'Unnamed: 4' columns in a single call so
# that only the label column and the message column remain.
surplus_columns = ['Unnamed: ' + str(i) for i in range(2, 5)]
data = data.drop(columns=surplus_columns)
data.head()

# Give the two remaining columns descriptive names and turn the labels into
# integers (the rest of the script reads 0 as 'ham' and 1 as 'spam').
data.columns = ['Target', 'Text']
data['Target'] = LabelEncoder().fit_transform(data['Target'])
data.head()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
train, test = train_test_split(data, test_size=0.2, random_state=0)
train['Text'].head()
# --- Bag-of-words features for the training messages ------------------------
countVectorizer = CountVectorizer()
X_train_vectorized = countVectorizer.fit_transform(train['Text'])

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; prefer
# the replacement and fall back only on releases that predate it.
try:
    feature_names = countVectorizer.get_feature_names_out()
except AttributeError:
    feature_names = countVectorizer.get_feature_names()

# pandas.SparseDataFrame was removed in pandas 1.0. Build the frame straight
# from the scipy sparse matrix so the document-term matrix is never densified.
ndf = pandas.DataFrame.sparse.from_spmatrix(
    X_train_vectorized, columns=feature_names
)

# Show which vocabulary terms occur in the first training message.
first_row = ndf.iloc[0]
first_row[first_row > 0]
# --- Train the classifier and evaluate it on the held-out split -------------
naive_bayes = MultinomialNB(alpha=0.1)
naive_bayes.fit(X_train_vectorized, train['Target'])

# Vectorise the test messages once; both the hard predictions and the class
# probabilities below are derived from the same matrix.
X_test_vectorized = countVectorizer.transform(test['Text'])
y = naive_bayes.predict(X_test_vectorized)

# Pair every test message with its predicted label for inspection.
predictedData = {'Target': y, 'Text': test['Text']}
predictedDataFrame = pandas.DataFrame(predictedData)
predictedDataFrame.head(10)

# Swap the numeric labels for readable names.
predictedDataFrame['Target'] = predictedDataFrame['Target'].replace(
    {0: 'ham', 1: 'spam'}
)
predictedDataFrame.head(10)
predictedDataFrame[predictedDataFrame.Target == 'ham'].head(10)
predictedDataFrame[predictedDataFrame.Target == 'spam'].head(10)

# ROC AUC is a ranking metric: it needs a continuous score per sample, not the
# thresholded 0/1 predictions. Use the predicted probability of class 1 (spam).
spam_column = list(naive_bayes.classes_).index(1)
spam_probability = naive_bayes.predict_proba(X_test_vectorized)[:, spam_column]
roc_auc_score(test['Target'], spam_probability)
# --- Evaluate the trained model on a second, hand-made dataset --------------
my_url = "https://raw.githubusercontent.com/hentai-lab/Machine-Learning/master/docs/my_own_spam_data.csv"
my_data = pandas.read_csv(my_url, encoding="latin-1")

# NOTE(review): a fresh LabelEncoder is fitted here, so the integer codes only
# agree with the training data if this file uses the same label set -- confirm.
my_data['Target'] = LabelEncoder().fit_transform(my_data['Target'])

# Shuffle the rows (frac=1 samples the whole frame without replacement).
my_data = my_data.sample(frac=1)
my_data

# Reuse the vocabulary learned from the training messages; transform the
# texts once and derive predictions and probabilities from the same matrix.
my_X_vectorized = countVectorizer.transform(my_data['Text'])
my_y = naive_bayes.predict(my_X_vectorized)

my_predictedData = {'Target': my_y, 'Text': my_data['Text']}
my_predictedDataFrame = pandas.DataFrame(my_predictedData)
my_predictedDataFrame

# Swap the numeric labels for readable names.
my_predictedDataFrame['Target'] = my_predictedDataFrame['Target'].replace(
    {0: 'ham', 1: 'spam'}
)
my_predictedDataFrame

# ROC AUC needs a continuous score per sample rather than thresholded 0/1
# predictions, so score with the predicted probability of class 1 (spam).
my_spam_column = list(naive_bayes.classes_).index(1)
my_spam_probability = naive_bayes.predict_proba(my_X_vectorized)[:, my_spam_column]
roc_auc_score(my_data['Target'], my_spam_probability)
# --- Refit the vectorizer and the classifier on the hand-made dataset -------
# NOTE(review): fit_transform() relearns the vocabulary and fit() retrains the
# model, so everything learned from the original training split is replaced
# rather than extended. Confirm that this is the intended behaviour.
my_X_train_vectorized = countVectorizer.fit_transform(my_data['Text'])

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; prefer
# the replacement and fall back only on releases that predate it.
try:
    my_feature_names = countVectorizer.get_feature_names_out()
except AttributeError:
    my_feature_names = countVectorizer.get_feature_names()

# pandas.SparseDataFrame was removed in pandas 1.0. Build the frame straight
# from the scipy sparse matrix so the document-term matrix is never densified.
ndf = pandas.DataFrame.sparse.from_spmatrix(
    my_X_train_vectorized, columns=my_feature_names
)

# Show which vocabulary terms occur in the first message of the new dataset.
my_first_row = ndf.iloc[0]
my_first_row[my_first_row > 0]

naive_bayes.fit(my_X_train_vectorized, my_data['Target'])
# --- Compare the user's own verdict with the model's on a typed message -----
# The message is wrapped in a list because the vectorizer expects an iterable
# of documents.
input_text = [input('Enter your message: ')]
input_target = input('Is it spam or not?: ')

# Accept yes/no in any letter case and ignore surrounding whitespace. Any
# other answer is kept verbatim and shown as typed in the comparison table.
normalized_answer = input_target.strip().lower()
if normalized_answer == 'no':
    input_target = 0
elif normalized_answer == 'yes':
    input_target = 1

another_y = naive_bayes.predict(countVectorizer.transform(input_text))

# predict() returns a one-element array; store the scalar label in the table
# so the 0/1 -> ham/spam replacement below can actually match it.
predicted_label = int(another_y[0])

another_predictedData = [
    ['User', input_target, input_text[0]],
    ['Naive Bayes', predicted_label, input_text[0]],
]
another_predictedData = pandas.DataFrame(
    another_predictedData, columns=['Source', 'Target', 'Text']
)
another_predictedData['Target'] = another_predictedData['Target'].replace(
    {0: 'ham', 1: 'spam'}
)
another_predictedData