We will write Python code that detects spam using a Naive Bayes classifier.
```python
import glob
import os
from nltk.corpus import names

emails, labels = [], []

def read_file(filename):
    with open(filename, 'r', encoding='ISO-8859-1') as infile:
        return infile.read()

# Spam emails are labeled 1
for filename in glob.glob(os.path.join('data/spam', '*.txt')):
    emails.append(read_file(filename))
    labels.append(1)

# Ham (non-spam) emails are labeled 0
for filename in glob.glob(os.path.join('data/ham', '*.txt')):
    emails.append(read_file(filename))
    labels.append(0)
```
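As a quick sanity check (a small illustrative snippet, not part of the original pipeline), we can confirm that both classes were actually loaded:

```python
# Illustrative check: both counts should be non-zero
print(len(emails), 'emails loaded')
print('spam:', labels.count(1), 'ham:', labels.count(0))
```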
Now it is time to clean the data:
```python
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
all_names = set(names.words())

def clean_text(docs):
    """Keep alphabetic, non-name words and reduce them to their lemmas."""
    cleaned_docs = []
    for doc in docs:
        cleaned_docs.append(
            ' '.join([
                lemmatizer.lemmatize(word.lower())
                for word in doc.split()
                if word.isalpha() and word not in all_names
            ])
        )
    return cleaned_docs

cleaned_emails = clean_text(emails)
```
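To see what the cleaning does, here is a small hypothetical example (the sentence is made up for illustration):

```python
sample = ["Groups are meeting at 5 PM, John said!"]
print(clean_text(sample))
# ['group are meeting at'] -- the number, the punctuation-attached
# tokens, and the name "John" are all filtered out
```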
Now it is time to vectorize the text documents:
```python
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english', max_features=500)
term_docs = cv.fit_transform(cleaned_emails)
feature_names = cv.get_feature_names_out()  # cv.get_feature_names() on older scikit-learn
```
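`term_docs` is a sparse document-term matrix: one row per email, one column per each of the 500 vocabulary terms. A quick illustrative inspection:

```python
print(term_docs.shape)         # (number_of_emails, 500)
print(term_docs[0].toarray())  # raw term counts for the first email
```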
Now let us restructure the labels so we can use them in processing:
```python
from collections import defaultdict

def get_label_index(labels):
    label_index = defaultdict(list)
    for index, label in enumerate(labels):
        label_index[label].append(index)
    return label_index

label_index = get_label_index(labels)
```
label_index will be in the format:

```python
{0: [1500, 1501, 1502, ...], 1: [0, 1, 2, ...]}
```
Now let us calculate the prior:
```python
def calculate_prior(label_index):
    # Each class's share of the training documents
    prior = {label: len(index) for label, index in label_index.items()}
    total_count = sum(prior.values())
    for label in prior:
        prior[label] /= float(total_count)
    return prior

prior = calculate_prior(label_index)
```
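What this computes is the maximum-likelihood estimate of the class prior, i.e. each class's share of the training set:

$$P(y = k) = \frac{N_k}{\sum_j N_j}$$

where $N_k$ is the number of training emails labeled $k$.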
Now let us calculate the likelihood:
```python
import numpy as np

def calculate_likelihood(term_document_matrix, label_index, smoothing=0):
    likelihood = {}
    for label, index in label_index.items():
        # Sum term counts over all documents of this class, then normalize
        likelihood[label] = term_document_matrix[index, :].sum(axis=0) + smoothing
        likelihood[label] = np.asarray(likelihood[label])[0]
        total_count = likelihood[label].sum()
        likelihood[label] = likelihood[label] / float(total_count)
    return likelihood

smoothing = 1
likelihood = calculate_likelihood(term_docs, label_index, smoothing)
```
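With smoothing set to 1 this is Laplace (add-one) smoothing; in formula form, what the function computes for each term $t$ and class $k$ is

$$P(t \mid y = k) = \frac{N_{t,k} + \alpha}{\sum_{t'} N_{t',k} + \alpha V}$$

where $N_{t,k}$ is the total count of term $t$ across the class-$k$ emails, $\alpha$ is the smoothing value, and $V = 500$ is the vocabulary size.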
Let us remember that Naive Bayes scores a document by multiplying the class prior with the likelihood of every term the document contains:

$$P(y = k \mid x) \propto P(y = k) \prod_t P(t \mid y = k)^{x_t}$$

But there is a problem, and to see it, let us look at the first five elements of the likelihood for the ham emails:
```python
likelihood[0][:5]
```
and you will see something similar to this:

```
array([9.042245e-05, 6.536442e-05, ...])
```
With such small numbers, multiplying hundreds of them together will underflow, and the product becomes zero.
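A tiny sketch (not from the original post) demonstrating the problem and the fix:

```python
import numpy as np

probs = np.full(500, 1e-4)   # 500 probabilities of about 1e-4
print(np.prod(probs))        # 0.0 -- the product underflows
print(np.log(probs).sum())   # about -4605.2, easily representable
```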
Instead, we will calculate the sum of their natural logarithms, and only convert back to the exponential scale at the end:
```python
def calculate_posterior(term_document_matrix, prior, likelihood):
    num_docs = term_document_matrix.shape[0]
    posteriors = []
    for i in range(num_docs):
        # Start from the log prior, then add the log likelihood of each term
        posterior = {key: np.log(prior_label) for key, prior_label in prior.items()}
        for label, likelihood_label in likelihood.items():
            term_document_vector = term_document_matrix.getrow(i)
            counts = term_document_vector.data
            indices = term_document_vector.indices
            for count, index in zip(counts, indices):
                posterior[label] += np.log(likelihood_label[index]) * count
        # Subtract the smallest log posterior before exponentiating,
        # so the values stay in a representable range
        min_log_posterior = min(posterior.values())
        for label in posterior:
            try:
                posterior[label] = np.exp(posterior[label] - min_log_posterior)
            except OverflowError:
                posterior[label] = float('inf')
        # Normalize so the posteriors of each document sum to 1
        sum_posterior = sum(posterior.values())
        for label in posterior:
            if posterior[label] == float('inf'):
                posterior[label] = 1.0
            else:
                posterior[label] /= sum_posterior
        posteriors.append(posterior.copy())
    return posteriors
```
To test this, let us create two emails, clean them, and compute their posteriors:
```python
email_tests = [
    ...
]
cleaned_test = clean_text(email_tests)
term_docs_test = cv.transform(cleaned_test)
posterior = calculate_posterior(term_docs_test, prior, likelihood)
```
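The result is a list with one dictionary per test email, mapping each label to its probability. The numbers below are purely illustrative; the actual values depend on your training data:

```python
print(posterior)
# e.g. [{1: 0.02, 0: 0.98},   # first email: most likely ham
#       {1: 0.99, 0: 0.01}]   # second email: most likely spam
```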
List of posts

This post is part of a series of posts:

- Preparation and introduction
- Naive Bayes by example
- Scrubbing natural language text
- Naive Bayes Classifier
- Writing Naive Bayes from scratch (this post)
- Using the Scikit-learn library