Chapter 3: Gaussian Naïve Bayes

One of the most common probability distributions to use with Naïve Bayes classification is the Gaussian distribution. Indeed, in many cases, features can be assumed to follow a normal distribution.

import numpy as np

from supervised.nb_classifier import NBClassifier

EPSILON = 1E-16  # offset to avoid "divide by zero" errors


class GaussianNB(NBClassifier):

    def _pdf(self, x, mean, std):
        """Evaluate the Gaussian probability density at ``x``.

        ``mean`` and ``std`` are the parameters of the normal distribution.
        EPSILON is added to both denominators so that a zero standard
        deviation does not cause a division by zero.
        """
        variance = std**2

        exponent = -((x - mean)**2) / (EPSILON + 2 * variance)
        normaliser = np.sqrt(2 * np.pi * variance) + EPSILON

        return np.exp(exponent) / normaliser

Fitting

When features are expected to follow a normal distribution, fitting our model comes down to calculating the mean and standard deviation of each of our features. This means that fitting the evidence comes down to this:

    def _fit_evidence(self, X):

        feature_probas = []

        for feature in X.T:  # iterate through the features instead of the samples

            feature_probas.append(dict(mean=np.mean(feature),
                                       n=len(feature),
                                       std=np.std(feature, ddof=1)))

        return np.array(feature_probas)

We also keep track of the number of samples that were observed for each feature, which will be useful if we need to update the model. Fitting the likelihood then becomes trivial, as it is similar to fitting the evidence for each class.

    def _fit_likelihood(self, X, y):

        likelihood_ = []

        for c in self.classes_:

            samples = X[y == c]  # only keep samples of class c

            likelihood_.append(self._fit_evidence(samples))

        return np.array(likelihood_)

Getting

Assuming that our model is trained, we need to be able to make use of its state in order to compute the evidence and likelihood. We can then reuse the _pdf that was defined at the beginning.

    def _get_evidence(self, sample):

        evidence = 1.0

        for i, feature in enumerate(sample):

            mean = self.evidence_[i]["mean"]
            std = self.evidence_[i]["std"]

            evidence *= self._pdf(feature, mean, std)

        return evidence

    def _get_likelihood(self, sample, c):

        likelihood = 1.0

        for i, feature in enumerate(sample):

            mean = self.likelihood_[i]["mean"]
            std = self.likelihood_[i]["std"]

            likelihood *= self._pdf(feature, mean, std)

        return likelihood

Updating

Updating the model means that, given new data, the mean and the standard deviation of each feature have to be updated.

    def _update_evidence(self, X):

        for i, feature in enumerate(X.T):   # iterate through the features instead of the samples

            self.evidence_[i] = self._update_mean_std_n(feature, self.evidence_[i])

        return self.evidence_

    def _update_likelihood(self, X, y):

        for c in self.classes_:

            samples = X[y == c]  # only keep samples of class c

            for i, feature in enumerate(samples.T):  # iterate through the features instead of the samples

                self.likelihood_[i] = self._update_mean_std_n(feature, self.likelihood_[i])

        return self.likelihood_

There exists a way to do this online for both the mean and the standard deviation.

    def _update_mean_std_n(self, feature, mean_std_n):

        old_m = mean_std_n["mean"]
        old_std = mean_std_n["std"]
        old_n = mean_std_n["n"]

        n = old_n + len(feature)

        m = (old_m * old_n + np.mean(feature) * n) / (old_n + n)

        s = np.sqrt((old_n * (old_std**2 + (old_m - m)**2)
                     + len(feature) * (np.var(feature)
                                       + (np.mean(feature) - m)**2)
                     ) / (old_n + len(feature)))

        return dict(mean=m, std=std, n=n)