We will write Python code that detects spam using a Naive Bayes classifier.


import glob
import os
from nltk.corpus import names

emails, labels = [], []

def read_file(filename):
    # Some emails contain non-UTF-8 bytes, so read with a permissive encoding.
    with open(filename, 'r', encoding='ISO-8859-1') as infile:
        return infile.read()

# Label spam emails as 1.
for filename in glob.glob(os.path.join('data/spam', '*.txt')):
    emails.append(read_file(filename))
    labels.append(1)

# Label ham (legitimate) emails as 0.
for filename in glob.glob(os.path.join('data/ham', '*.txt')):
    emails.append(read_file(filename))
    labels.append(0)
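
As a quick sanity check (a small exploratory sketch; the exact counts depend on your copy of the dataset), we can confirm that the emails and labels line up:

print(len(emails), len(labels))                 # the two lists must have equal length
print(sum(labels), len(labels) - sum(labels))   # number of spam vs. ham emails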

Now it is time to clean the data:


from nltk.stem import WordNetLemmatizer

# Requires the NLTK corpora: nltk.download('names') and nltk.download('wordnet').
lemmatizer = WordNetLemmatizer()
all_names = set(names.words())
def clean_text(docs):
    cleaned_docs = []
    for doc in docs:
        cleaned_docs.append(
          ' '.join([
              lemmatizer.lemmatize(word.lower())
              for word in doc.split()
              if word.isalpha() and word not in all_names
          ])
        )
    return cleaned_docs

cleaned_emails = clean_text(emails)
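
To see what clean_text does, here is a hypothetical one-line example (the sentence is made up for illustration):

sample = ['Meeting Mary at 10am about the cars']
print(clean_text(sample))
# roughly: ['meeting at about the car'] -- 'Mary' is dropped as a name,
# '10am' fails isalpha(), and 'cars' is lemmatized to 'car'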


Now it is time to vectorize the text documents:

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english', max_features=500)
term_docs = cv.fit_transform(cleaned_emails)
# On scikit-learn < 1.0 this method is called get_feature_names().
feature_names = cv.get_feature_names_out()
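
A quick look at what the vectorizer learned (an exploratory sketch; the actual terms depend on your data):

print(term_docs.shape)      # (number of emails, 500)
print(feature_names[:5])    # first few of the 500 most frequent terms
print(term_docs[0])         # sparse term counts for the first email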


Now let us restructure the labels so we can use them in the computations that follow:

from collections import defaultdict

def get_label_index(labels):
    # Map each label to the list of document indices carrying that label.
    label_index = defaultdict(list)
    for index, label in enumerate(labels):
        label_index[label].append(index)
    return label_index

label_index = get_label_index(labels)

label_index will be in the format:

{0: [1500, 1501, 1502, ...], 1: [0, 1, 2, ...]}
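
The two index lists partition the training set, which we can verify (a small sanity check, assuming only the two labels 0 and 1):

assert len(label_index[0]) + len(label_index[1]) == len(labels)
print(len(label_index[1]), 'spam emails and', len(label_index[0]), 'ham emails')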


Now let us calculate the prior:

def calculate_prior(label_index):
    # The prior of each class is its fraction of the training documents.
    prior = {label: len(index) for label, index in label_index.items()}
    total_count = sum(prior.values())
    for label in prior:
        prior[label] /= float(total_count)
    return prior

prior = calculate_prior(label_index)
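
Since the prior is just the class frequencies, its values should sum to 1 (a quick check; the printed proportions depend on your dataset):

print(prior)    # {1: <fraction of spam>, 0: <fraction of ham>}
assert abs(sum(prior.values()) - 1.0) < 1e-9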


Now let us calculate the likelihood:

import numpy as np

def calculate_likelihood(term_document_matrix, label_index, smoothing=0):
    likelihood = {}
    for label, index in label_index.items():
        # Sum the term counts over all documents of this class, with additive
        # (Laplace) smoothing so unseen terms do not get zero probability.
        likelihood[label] = term_document_matrix[index, :].sum(axis=0) + smoothing
        likelihood[label] = np.asarray(likelihood[label])[0]
        total_count = likelihood[label].sum()
        likelihood[label] = likelihood[label] / float(total_count)
    return likelihood

smoothing = 1
likelihood = calculate_likelihood(term_docs, label_index, smoothing)
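
Each likelihood vector holds P(term | class) over the 500 features, so each should sum to 1 (a quick sanity check):

for label in likelihood:
    assert likelihood[label].shape == (500,)
    assert abs(likelihood[label].sum() - 1.0) < 1e-9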


Let us remember that the posterior is proportional to the prior multiplied by the likelihood of every term in the email:

P(label | email) ∝ P(label) × Π_i P(term_i | label)^count_i

But there is a problem, and to describe it, let us show the first five elements of the likelihood vector for the ham class:

likelihood[0][:5]

and you will see something similar to this:

array([9.042245e-05, 6.536442e-05, ...])

With such small numbers, multiplying hundreds of them together will underflow to zero, and dividing by their sum would then fail.
Instead, we will calculate the sum of their natural logarithms, and only convert the result back to its natural exponential value at the end.
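
To make the problem concrete, here is a minimal sketch with illustrative numbers (not taken from the dataset):

import numpy as np

probs = np.full(500, 1e-4)     # 500 tiny per-term probabilities
print(np.prod(probs))          # 0.0 -- the product underflows float64
print(np.log(probs).sum())     # about -4605.17, a perfectly manageable number

calculate_posterior below applies this log-sum trick, subtracting the smallest log value before exponentiating so that the ratio between the classes is preserved: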

def calculate_posterior(term_document_matrix, prior, likelihood):
    num_docs = term_document_matrix.shape[0]
    posteriors = []
    for i in range(num_docs):
        # Start from log(prior); we accumulate log-likelihoods instead of
        # multiplying raw probabilities, to avoid underflow.
        posterior = {key: np.log(prior_label) for key, prior_label in prior.items()}
        for label, likelihood_label in likelihood.items():
            term_document_vector = term_document_matrix.getrow(i)
            counts = term_document_vector.data
            indices = term_document_vector.indices
            for count, index in zip(counts, indices):
                posterior[label] += np.log(likelihood_label[index]) * count
        # Subtract the smallest log-posterior before exponentiating, so the
        # ratio between the classes is preserved while the values stay finite.
        min_log_posterior = min(posterior.values())
        for label in posterior:
            try:
                posterior[label] = np.exp(posterior[label] - min_log_posterior)
            except OverflowError:
                posterior[label] = float('inf')
        # Normalize so that the posteriors of all classes sum to 1.
        sum_posterior = sum(posterior.values())
        for label in posterior:
            if posterior[label] == float('inf'):
                posterior[label] = 1.0
            else:
                posterior[label] /= sum_posterior
        posteriors.append(posterior.copy())
    return posteriors

To test this, let us create two test emails, clean them, and compute their posteriors:

email_tests = [
  ...
]
cleaned_test = clean_text(email_tests)
term_docs_test = cv.transform(cleaned_test)
posterior = calculate_posterior(term_docs_test, prior, likelihood)
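
Each element of posterior maps a label to its probability for the corresponding test email; here is a sketch of reading off the predictions (recall that 1 means spam):

for i, post in enumerate(posterior):
    label = max(post, key=post.get)    # the most probable class
    print('email', i, '->', 'spam' if label == 1 else 'ham', post)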

List of posts

This post is part of a series of posts:

  1. Preparation and introduction
  2. Naive Bayes by example
  3. Scrubbing natural language text
  4. Naive Bayes' classifier
  5. Writing Naive Bayes from scratch (this post)
  6. Using the Scikit-learn library