Bayesian Classifier

Our training data indicate whether each email is spam or not spam.

Training

  • Split data into spam / not spam

  • Fit a Gaussian mixture model with some number of components to each class

Predicting

  • Given a new point X, compute the likelihood that X came from each class's fitted mixture, then weight by the prior probability of class 0 or 1 (estimated from the training labels, without considering X)

  • Return the class with the higher posterior value (see the decision rule below)
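
Written out, this is the decision rule (with $\hat{\pi}_c$ the empirical prior for class $c$ and $p(x \mid y = c)$ the class-conditional density fit below):

$$\hat{y}(x) = \arg\max_{c \in \{0, 1\}} \; \hat{\pi}_c \, p(x \mid y = c)$$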

Load Data

import pandas as pd
import numpy as np
from scipy.stats import multivariate_normal
base_path = "/Users/trevorgordon/Library/Mobile Documents/com~apple~CloudDocs/Documents/root/Columbia/Fall2021/ELEN4720/Assignments/assignment3/"
X_test = pd.read_csv(base_path + "hw3-data/Prob3_Xtest.csv", header=None).to_numpy()
y_test = pd.read_csv(base_path + "hw3-data/Prob3_ytest.csv", header=None).to_numpy()
X_train = pd.read_csv(base_path + "hw3-data/Prob3_Xtrain.csv", header=None).to_numpy()
y_train = pd.read_csv(base_path + "hw3-data/Prob3_ytrain.csv", header=None).to_numpy()

What I need to do

I have training data labeled spam or not spam.

There are 10 predictor columns.

I need to split the training data by spam/not spam and compute the empirical means and covariances of each split.

  • The class-conditional density will be a Gaussian mixture model (written out below)
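
That is, for each class $c$ the class-conditional density is a $K$-component Gaussian mixture with weights $w_{c,k}$, means $\mu_{c,k}$, and covariances $\Sigma_{c,k}$:

$$p(x \mid y = c) = \sum_{k=1}^{K} w_{c,k} \, \mathcal{N}\!\left(x \mid \mu_{c,k}, \Sigma_{c,k}\right), \qquad \sum_{k=1}^{K} w_{c,k} = 1$$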

Initialize:

  • Mixing weights to be uniform

  • Covariance of each component to be the empirical covariance of the data

  • Initialize each component mean by sampling from a Gaussian with the empirical mean and covariance (see the sketch after this list)
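
A minimal sketch of this initialization, assuming X is the (n, d) feature matrix for one class (the function and variable names here are illustrative, not the implementation used below):

import numpy as np

def init_gmm_params(X, num_clusters, seed=0):
    """Illustrative initialization: uniform weights, empirical covariance, sampled means."""
    rng = np.random.default_rng(seed)
    emp_mean = X.mean(axis=0)                       # empirical mean of the class data
    emp_cov = np.cov(X, rowvar=False)               # empirical covariance of the class data
    weights = np.full(num_clusters, 1.0 / num_clusters)                    # uniform mixing weights
    covs = np.repeat(emp_cov[None, :, :], num_clusters, axis=0)            # every component starts at the empirical covariance
    means = rng.multivariate_normal(emp_mean, emp_cov, size=num_clusters)  # means sampled around the empirical mean
    return weights, means, covs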

Iterating
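
Each iteration alternates the standard EM updates implemented below, where $\phi_i(k)$ is the responsibility of component $k$ for point $x_i$ (stored in self.p) and $w_k$ are the mixing weights (self.w):

$$\phi_i(k) = \frac{w_k \, \mathcal{N}(x_i \mid \mu_k, \Sigma_k)}{\sum_{j=1}^{K} w_j \, \mathcal{N}(x_i \mid \mu_j, \Sigma_j)}, \qquad n_k = \sum_{i=1}^{n} \phi_i(k)$$

$$w_k = \frac{n_k}{n}, \qquad \mu_k = \frac{1}{n_k} \sum_{i=1}^{n} \phi_i(k) \, x_i, \qquad \Sigma_k = \frac{1}{n_k} \sum_{i=1}^{n} \phi_i(k) \, (x_i - \mu_k)(x_i - \mu_k)^{\top}$$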

class EMClusterIng():

    def __init__(self, num_clusters, max_iter, random_state, conv_tol):
        self.num_clusters = num_clusters
        self.max_iter = max_iter
        self.random_state = random_state  # stored but not used to seed, so repeated fits give different restarts
        self.conv_tol = conv_tol

    def initialize_cluster_means(self):
        # Sample each component mean from N(empirical mean, empirical covariance).
        self.cluster_means = np.zeros((self.num_clusters, self.num_feature_dims))
        for num_cluster in range(self.num_clusters):
            m1 = np.random.multivariate_normal(
                self.empirical_cluster_means,
                self.empirical_cluster_covariance)
            self.cluster_means[num_cluster] = m1


    def initialize_empirical_cluster_mean_covariance(self, X):
        # Empirical mean and covariance of the data; each component's covariance starts there.
        self.empirical_cluster_means = np.mean(X, axis=0)
        self.empirical_cluster_covariance = np.cov(X, rowvar=False)

        self.cluster_covariance = np.zeros((self.num_clusters, self.num_feature_dims, self.num_feature_dims))
        for num_cluster in range(self.num_clusters):
            self.cluster_covariance[num_cluster, :, :] = self.empirical_cluster_covariance

    def initialize_weights_and_prob(self):
        self.w = 1/self.num_clusters*np.ones(self.num_clusters)
        self.p = 1/self.num_clusters*np.ones((self.num_samples, self.num_clusters))
        self.resp = np.zeros(self.p.shape)
            

    def assign_data_to_cluster(self, X):
        """E-step: compute responsibilities and the current log-likelihood.

        - Compute each point's prior-weighted density under every Gaussian component.
        - Normalize across components so each row of self.resp sums to 1.

        Args:
            X (np.array): (num_samples, num_feature_dims) data matrix.

        Returns:
            float: log-likelihood of X under the current parameters.
        """

        self.compute_log_likelihood(X)
        log_likelihood = np.sum(np.log(np.sum(self.resp, axis=1)))

        # normalize over all possible cluster assignments
        self.resp = self.resp / self.resp.sum(axis=1, keepdims=True)
        self.p = self.resp
        return log_likelihood

    def refit_clusters(self, X):
        # M-step: update mixing weights, means, and covariances from the responsibilities.
        r_weights = self.p.sum(axis=0).reshape((self.num_clusters, 1))  # soft count per cluster
        self.w = r_weights / X.shape[0]
        new_u = self.p.T @ X
        self.cluster_means = new_u / r_weights

        # Calculate new covariances
        for num_cluster in range(self.num_clusters):
            this_p = self.p[:, num_cluster]
            diff = (X - self.cluster_means[num_cluster]).T
            weighted_scatter = (this_p * diff) @ diff.T
            self.cluster_covariance[num_cluster] = weighted_scatter / r_weights[num_cluster]

    def compute_log_likelihood(self, X):
        # Fill self.resp with the prior-weighted density of every point under each component.
        for num_cluster in range(self.num_clusters):
            prior = self.w[num_cluster]
            u = self.cluster_means[num_cluster]
            cov = self.cluster_covariance[num_cluster]
            likelihood = multivariate_normal(u, cov, allow_singular=True).pdf(X)
            self.resp[:, num_cluster] = prior * likelihood

        return

    def fit(self, X):
        """Run EM until convergence or max_iter and return the responsibilities.
        """
        self.num_samples, self.num_feature_dims = X.shape
        self.initialize_empirical_cluster_mean_covariance(X)
        self.initialize_cluster_means()
        self.initialize_weights_and_prob()
        self.log_likelihood_trace = []
        self.converged = False
        log_lik = 0

        for iter_i in range(self.max_iter):
            try:
                # E-step: compute responsibilities and the current log-likelihood.
                log_lik_new = self.assign_data_to_cluster(X)

            except np.linalg.LinAlgError:
                # A degenerate covariance can break the E-step; stop iterating.
                break

            self.log_likelihood_trace.append(log_lik_new)

            if abs(log_lik_new - log_lik) <= self.conv_tol:
                self.converged = True
                print("converged!")
                break

            log_lik = log_lik_new

            # M-step: refit the mixture parameters from the responsibilities.
            self.refit_clusters(X)

        return self.p
X_y_train = np.concatenate([X_train, y_train], axis=1)
X_0 = X_y_train[X_y_train[:, -1] == 0, :-1]
X_1 = X_y_train[X_y_train[:, -1] == 1, :-1]

all_data_x0 = []
all_data_x1 = []

first_admit_iter = 4

for i in range(10):
    # em = GMM(3, 30, 1, 2234)
    em = EMClusterIng(num_clusters=3, max_iter=30, random_state=2, conv_tol=5)
    em.fit(X_0)
    all_data_x0.extend(zip(range(len(em.log_likelihood_trace)), em.log_likelihood_trace, [i]*len(em.log_likelihood_trace)))

log_likelihood_trace_df_x0 = pd.DataFrame(all_data_x0, columns=["iter", "log_likelihood", "i"])
log_likelihood_trace_df_x0 = log_likelihood_trace_df_x0.loc[log_likelihood_trace_df_x0["iter"] > first_admit_iter, :]

all_data_x1 = []
for i in range(10):
    # em2 = GMM(3, 30, 1, 2234)
    em2 = EMClusterIng(num_clusters=3, max_iter=30, random_state=2, conv_tol=5)
    em2.fit(X_1)
    all_data_x1.extend(zip(range(len(em2.log_likelihood_trace)), em2.log_likelihood_trace, [i]*len(em2.log_likelihood_trace)))

log_likelihood_trace_df_x1 = pd.DataFrame(all_data_x1, columns=["iter", "log_likelihood", "i"])
log_likelihood_trace_df_x1 = log_likelihood_trace_df_x1.loc[log_likelihood_trace_df_x1["iter"] > first_admit_iter, :]
converged!
converged!
converged!
converged!
converged!
converged!
converged!
converged!
converged!
converged!
converged!
converged!
converged!
import plotly.express as px
fig = px.line(log_likelihood_trace_df_x0, x="iter", y="log_likelihood", color="i", title="Log likelihood for class X0 GMM mixture with 3 clusters", width=750, height=500)
# fig.write_image("/Users/loreliegordon/Library/Mobile Documents/com~apple~CloudDocs/Documents/root/Columbia/Fall2021/ELEN4720/Assignments/assignment3/submission/log_like_x0.png")
fig.show()
import plotly.express as px
fig = px.line(log_likelihood_trace_df_x1, x="iter", y="log_likelihood", color="i", title="Log likelihood for class X1 GMM mixture with 3 clusters", width=750, height=500)
# fig.write_image("/Users/loreliegordon/Library/Mobile Documents/com~apple~CloudDocs/Documents/root/Columbia/Fall2021/ELEN4720/Assignments/assignment3/submission/log_like_x1.png")
fig.show()

Putting it together with a Bayes classifier

class BayesClassifier():
    """Bayes classifier with Gaussian-mixture class-conditional densities.

    Fits one EM-trained Gaussian mixture per class and combines it with the
    empirical class priors to classify new points.
    """

    def __init__(self, num_guassians=3) -> None:
        self.num_guassians = num_guassians

    def fit(self, X_train, y_train):
        """Fit the model to the training data.

        After this call the model parameters are fit. We need:
        - pi hat for each class: the probability of class 0 or class 1 without considering X,
          i.e. pi_hat_c = (# training samples with y = c) / n
        - one EM-fitted Gaussian mixture per class (the class-conditional density)

        Args:
            X_train (np.array): Training features
            y_train (np.array): Single column with the binary target class, 0 or 1
        """

        X_y_train = np.concatenate([X_train, y_train], axis=1)
        X_0 = X_y_train[X_y_train[:, -1] == 0, :-1]
        X_1 = X_y_train[X_y_train[:, -1] == 1, :-1]

        self.em_x0 = EMClusterIng(num_clusters=self.num_guassians, max_iter=30, random_state=2, conv_tol=1e-3)
        self.em_x0.fit(X_0)

        self.em_x1 = EMClusterIng(num_clusters=self.num_guassians, max_iter=30, random_state=2, conv_tol=1e-3)
        self.em_x1.fit(X_1)

        self.d = len(X_train[0])
        self.n = len(y_train)
        self._n_y0 = len(y_train[y_train == 0])
        self._n_y1 = len(y_train[y_train == 1])
        self.p_y0 = self._n_y0 / self.n
        self.p_y1 = self._n_y1 / self.n

    def predict(self, X):
        """Predict new data.

        To predict we need to:
            - Calculate the prior-weighted likelihood of each point under each class's mixture
            - Choose the class with the higher value
        """

        y_pred_y0 = np.zeros(shape=(len(X)))
        y_pred_y1 = np.zeros(shape=(len(X)))

        # Class-conditional likelihood p(x | y) = sum_k w_k N(x; mu_k, Sigma_k) for each class's GMM.
        for k in range(self.num_guassians):
            y_pred_y0 += self.em_x0.w[k] * multivariate_normal(
                self.em_x0.cluster_means[k], self.em_x0.cluster_covariance[k],
                allow_singular=True).pdf(X)
            y_pred_y1 += self.em_x1.w[k] * multivariate_normal(
                self.em_x1.cluster_means[k], self.em_x1.cluster_covariance[k],
                allow_singular=True).pdf(X)

        # Weight by the class priors and pick the class with the larger value.
        y_pred_y0 = self.p_y0 * y_pred_y0
        y_pred_y1 = self.p_y1 * y_pred_y1
        predictions = (y_pred_y1 > y_pred_y0).astype(int)
        return predictions
for num_g in range(4, 5):
    bc = BayesClassifier(num_guassians=num_g)
    bc.fit(X_train, y_train)
    y_pred = bc.predict(X_test)

    joined = pd.concat([pd.DataFrame(y_pred, columns=["y_pred"]), pd.DataFrame(y_test, columns=["y_test"])], axis=1)
    joined["correct"] = joined["y_pred"] == joined["y_test"]
    y_test_1_y_pred_1 = joined.loc[(joined["correct"] == True) & (joined["y_test"] == 1),"correct"].count()
    y_test_0_y_pred_0 = joined.loc[(joined["correct"] == True) & (joined["y_test"] == 0),"correct"].count()
    y_test_1_y_pred_0 = joined.loc[(joined["correct"] == False) & (joined["y_test"] == 1),"correct"].count()
    y_test_0_y_pred_1 = joined.loc[(joined["correct"] == False) & (joined["y_test"] == 0),"correct"].count()

    print(f"Results for predictor with {num_g} guassians")
    print(pd.DataFrame([[y_test_0_y_pred_0, y_test_0_y_pred_1], [y_test_1_y_pred_0, y_test_1_y_pred_1]]))
/Users/loreliegordon/.virtualenvs/sandbox2/lib/python3.7/site-packages/ipykernel_launcher.py:102: RuntimeWarning:

invalid value encountered in true_divide
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/var/folders/w3/8cmbpqt17zgg1lcgbdw84r6c0000gn/T/ipykernel_43971/1690612699.py in <module>
      1 for num_g in range(4, 5):
      2     bc = BayesClassifier(num_guassians=num_g)
----> 3     bc.fit(X_train, y_train)
      4     y_pred = bc.predict(X_test)
      5 

/var/folders/w3/8cmbpqt17zgg1lcgbdw84r6c0000gn/T/ipykernel_43971/2922175977.py in fit(self, X_train, y_train)
     27 
     28         self.em_x1 = EMClusterIng(num_clusters=self.num_guassians, max_iter=30, random_state=2, conv_tol=1e-3)
---> 29         self.em_x1.fit(X_1)
     30 
     31         self.d = len(X_train[0])

/var/folders/w3/8cmbpqt17zgg1lcgbdw84r6c0000gn/T/ipykernel_43971/633934957.py in fit(self, X)
    145             try:
    146                 # Calculate distance from each datapoint to the cluster means.
--> 147                 log_lik_new = self.assign_data_to_cluster(X)
    148 
    149             except np.linalg.LinAlgError:

/var/folders/w3/8cmbpqt17zgg1lcgbdw84r6c0000gn/T/ipykernel_43971/633934957.py in assign_data_to_cluster(self, X)
     48         """
     49 
---> 50         self.compute_log_likelihood(X)
     51         log_likelihood = np.sum(np.log(np.sum(self.resp, axis = 1)))
     52 

/var/folders/w3/8cmbpqt17zgg1lcgbdw84r6c0000gn/T/ipykernel_43971/633934957.py in compute_log_likelihood(self, X)
    117             u = self.cluster_means[num_cluster]
    118             cov = self.cluster_covariance[num_cluster]
--> 119             likelihood = multivariate_normal(u, cov, allow_singular=True).pdf(X)
    120             self.resp[:, num_cluster] = prior * likelihood
    121 

~/.virtualenvs/sandbox2/lib/python3.7/site-packages/scipy/stats/_multivariate.py in __call__(self, mean, cov, allow_singular, seed)
    360         return multivariate_normal_frozen(mean, cov,
    361                                           allow_singular=allow_singular,
--> 362                                           seed=seed)
    363 
    364     def _process_parameters(self, dim, mean, cov):

~/.virtualenvs/sandbox2/lib/python3.7/site-packages/scipy/stats/_multivariate.py in __init__(self, mean, cov, allow_singular, seed, maxpts, abseps, releps)
    728         self.dim, self.mean, self.cov = self._dist._process_parameters(
    729                                                             None, mean, cov)
--> 730         self.cov_info = _PSD(self.cov, allow_singular=allow_singular)
    731         if not maxpts:
    732             maxpts = 1000000 * self.dim

~/.virtualenvs/sandbox2/lib/python3.7/site-packages/scipy/stats/_multivariate.py in __init__(self, M, cond, rcond, lower, check_finite, allow_singular)
    156         # Note that eigh takes care of array conversion, chkfinite,
    157         # and assertion that the matrix is square.
--> 158         s, u = scipy.linalg.eigh(M, lower=lower, check_finite=check_finite)
    159 
    160         eps = _eigvalsh_to_eps(s, cond, rcond)

~/.virtualenvs/sandbox2/lib/python3.7/site-packages/scipy/linalg/decomp.py in eigh(a, b, lower, eigvals_only, overwrite_a, overwrite_b, turbo, eigvals, type, check_finite, subset_by_index, subset_by_value, driver)
    443                          ''.format(driver, '", "'.join(drv_str[1:])))
    444 
--> 445     a1 = _asarray_validated(a, check_finite=check_finite)
    446     if len(a1.shape) != 2 or a1.shape[0] != a1.shape[1]:
    447         raise ValueError('expected square "a" matrix')

~/.virtualenvs/sandbox2/lib/python3.7/site-packages/scipy/_lib/_util.py in _asarray_validated(a, check_finite, sparse_ok, objects_ok, mask_ok, as_inexact)
    291             raise ValueError('masked arrays are not supported')
    292     toarray = np.asarray_chkfinite if check_finite else np.asarray
--> 293     a = toarray(a)
    294     if not objects_ok:
    295         if a.dtype is np.dtype('O'):

~/.virtualenvs/sandbox2/lib/python3.7/site-packages/numpy/lib/function_base.py in asarray_chkfinite(a, dtype, order)
    487     if a.dtype.char in typecodes['AllFloat'] and not np.isfinite(a).all():
    488         raise ValueError(
--> 489             "array must not contain infs or NaNs")
    490     return a
    491 

ValueError: array must not contain infs or NaNs
A reference full-covariance GMM implementation trained with EM (compare with the commented-out GMM(...) calls above):

class GMM:
    """
    Full covariance Gaussian Mixture Model,
    trained using Expectation Maximization.

    Parameters
    ----------
    n_components : int
        Number of clusters/mixture components in which the data will be
        partitioned into.

    n_iters : int
        Maximum number of iterations to run the algorithm.

    tol : float
        Tolerance. If the log-likelihood between two iterations is smaller than
        the specified tolerance level, the algorithm will stop performing the
        EM optimization.

    seed : int
        Seed / random state used to initialize the parameters.
    """

    def __init__(self, n_components: int, n_iters: int, tol: float, seed: int):
        self.n_components = n_components
        self.n_iters = n_iters
        self.tol = tol
        self.seed = seed

    def fit(self, X):

        # data's dimensionality and responsibility vector
        n_row, n_col = X.shape     
        self.resp = np.zeros((n_row, self.n_components))

        # initialize parameters
        np.random.seed(self.seed)
        chosen = np.random.choice(n_row, self.n_components, replace = False)
        self.means = X[chosen]
        self.weights = np.full(self.n_components, 1 / self.n_components)
        
        # for np.cov, rowvar=False indicates
        # that the rows represent observations
        shape = self.n_components, n_col, n_col
        self.covs = np.full(shape, np.cov(X, rowvar = False))

        log_likelihood = 0
        self.converged = False
        self.log_likelihood_trace = []      

        for i in range(self.n_iters):
            log_likelihood_new = self._do_estep(X)
            print(f"log_likelihood_new is {log_likelihood_new}")
            self._do_mstep(X)

            if abs(log_likelihood_new - log_likelihood) <= self.tol:
                self.converged = True
                print("converged!")
                break
  
            log_likelihood = log_likelihood_new
            self.log_likelihood_trace.append(log_likelihood)


        return self

    def _do_estep(self, X):
        """
        E-step: compute responsibilities,
        update resp matrix so that resp[j, k] is the responsibility of cluster k for data point j,
        to compute likelihood of seeing data point j given cluster k, use multivariate_normal.pdf
        """
        self._compute_log_likelihood(X)
        log_likelihood = np.sum(np.log(np.sum(self.resp, axis = 1)))

        # normalize over all possible cluster assignments
        self.resp = self.resp / self.resp.sum(axis = 1, keepdims = 1)
        return log_likelihood

    def _compute_log_likelihood(self, X):
        for k in range(self.n_components):
            prior = self.weights[k]
            likelihood = multivariate_normal(self.means[k], self.covs[k], allow_singular=True).pdf(X)
            self.resp[:, k] = prior * likelihood

        return self

    def _do_mstep(self, X):
        """M-step, update parameters"""

        # total responsibility assigned to each cluster, N^{soft}
        resp_weights = self.resp.sum(axis = 0)
        
        # weights
        self.weights = resp_weights / X.shape[0]

        # means
        weighted_sum = np.dot(self.resp.T, X)
        self.means = weighted_sum / resp_weights.reshape(-1, 1)
        # covariance
        for k in range(self.n_components):
            diff = (X - self.means[k]).T
            weighted_sum = np.dot(self.resp[:, k] * diff, diff.T)
            self.covs[k] = weighted_sum / resp_weights[k]
            
        return self
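
For comparison, a minimal usage sketch of this reference GMM on the class-0 split, mirroring the commented-out GMM(3, 30, 1, 2234) calls above (it assumes X_0 as defined earlier; the hyperparameter values are illustrative):

# Fit the reference GMM to the non-spam split and inspect its log-likelihood trace.
gmm_ref = GMM(n_components=3, n_iters=30, tol=1e-3, seed=2234)
gmm_ref.fit(X_0)
print(gmm_ref.converged, gmm_ref.log_likelihood_trace[-3:])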