Bayesian Classifier¶
Our training data indicate whether each email is spam or not spam.
Training
Split the data into spam / not spam.
Fit some number of Gaussian mixture components to each class.
Predicting
Given a new point X, compute the likelihood that X came from each class's mixture, then weight by the prior probability of belonging to class 0 or 1 (estimated without considering X).
Return the class with the higher weighted likelihood, as formalized below.
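Concretely, writing $\hat{\pi}_y$ for the class priors and $(w_{y,k}, \mu_{y,k}, \Sigma_{y,k})$ for each class's fitted mixture parameters, the rule implemented below amounts to (standard Bayes-classifier notation, not taken from the assignment handout):

$$\hat{y} = \underset{y \in \{0,1\}}{\arg\max}\;\hat{\pi}_y \sum_{k=1}^{K} w_{y,k}\,\mathcal{N}\!\left(x \mid \mu_{y,k}, \Sigma_{y,k}\right)$$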
Load Data¶
import pandas as pd
import numpy as np
from scipy.stats import multivariate_normal
base_path = "/Users/trevorgordon/Library/Mobile Documents/com~apple~CloudDocs/Documents/root/Columbia/Fall2021/ELEN4720/Assignments/assignment3/"
X_test = pd.read_csv(base_path + "hw3-data/Prob3_Xtest.csv", header=None).to_numpy()
y_test = pd.read_csv(base_path + "hw3-data/Prob3_ytest.csv", header=None).to_numpy()
X_train = pd.read_csv(base_path + "hw3-data/Prob3_Xtrain.csv", header=None).to_numpy()
y_train = pd.read_csv(base_path + "hw3-data/Prob3_ytrain.csv", header=None).to_numpy()
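A quick shape check (not part of the original run) to confirm the arrays loaded as expected; the 10-feature claim in the next section assumes the assignment's design matrix:

# Feature matrices should share a column count; y arrays are single columns
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
assert X_train.shape[1] == X_test.shape[1]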
What I need to do¶
I have training data that is spam or not spam.
I have 10 predictor columns.
I need to split the training data by spam/not spam and compute the empirical mean and covariance for each class.
The class-conditional density will be a Gaussian mixture model.
Initialize (a sketch follows this list):
Mixing weights to be uniform
Each cluster's covariance to be the empirical covariance
Each cluster's mean to be a sample drawn using the empirical mean and covariance
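A minimal NumPy sketch of just this initialization step, with illustrative names (init_gmm_params is a hypothetical helper, not part of the class below):

import numpy as np

def init_gmm_params(X, k, seed=0):
    """Initialize GMM parameters as described above (illustrative sketch)."""
    rng = np.random.default_rng(seed)
    mu_hat = X.mean(axis=0)                  # empirical mean
    sigma_hat = np.cov(X, rowvar=False)      # empirical covariance
    weights = np.full(k, 1.0 / k)            # uniform mixing weights
    means = rng.multivariate_normal(mu_hat, sigma_hat, size=k)  # sampled means
    covs = np.tile(sigma_hat, (k, 1, 1))     # every cluster starts at the empirical covariance
    return weights, means, covs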
Iterating¶
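The class below alternates the two standard EM updates for a Gaussian mixture; writing them out makes the code easier to check against (standard EM notation, not copied from the handout):

$$\text{E-step:}\qquad \phi_i(k) = \frac{\pi_k\,\mathcal{N}(x_i \mid \mu_k, \Sigma_k)}{\sum_{j=1}^{K} \pi_j\,\mathcal{N}(x_i \mid \mu_j, \Sigma_j)}$$

$$\text{M-step:}\qquad n_k = \sum_{i=1}^{n} \phi_i(k), \qquad \pi_k = \frac{n_k}{n}, \qquad \mu_k = \frac{1}{n_k}\sum_{i=1}^{n} \phi_i(k)\,x_i, \qquad \Sigma_k = \frac{1}{n_k}\sum_{i=1}^{n} \phi_i(k)\,(x_i - \mu_k)(x_i - \mu_k)^\top$$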
class EMClusterIng():

    def __init__(self, num_clusters, max_iter, random_state, conv_tol):
        self.num_clusters = num_clusters
        self.max_iter = max_iter
        self.random_state = random_state  # seeds the mean initialization in fit()
        self.conv_tol = conv_tol

    def initialize_cluster_means(self):
        # Draw each cluster mean from a Gaussian with the empirical mean and covariance
        self.cluster_means = np.zeros((self.num_clusters, self.num_feature_dims))
        for num_cluster in range(self.num_clusters):
            m1 = np.random.multivariate_normal(
                self.empirical_cluster_means,
                self.empirical_cluster_covariance)
            self.cluster_means[num_cluster] = m1

    def initialize_empirical_cluster_mean_covariance(self, X):
        # Empirical mean/covariance of the class data; every cluster starts at the empirical covariance
        self.cluster_covariance = np.zeros((self.num_clusters, self.num_feature_dims, self.num_feature_dims))
        self.empirical_cluster_means = np.mean(X, axis=0)
        self.empirical_cluster_covariance = np.cov(X, rowvar=False).reshape(
            (self.num_feature_dims, self.num_feature_dims))
        for num_cluster in range(self.num_clusters):
            self.cluster_covariance[num_cluster, :, :] = self.empirical_cluster_covariance

    def initialize_weights_and_prob(self):
        # Uniform mixing weights and uniform initial responsibilities
        self.w = 1 / self.num_clusters * np.ones(self.num_clusters)
        self.p = 1 / self.num_clusters * np.ones((self.num_samples, self.num_clusters))
        self.resp = np.zeros(self.p.shape)

    def assign_data_to_cluster(self, X):
        """E-step: compute responsibilities and the current log-likelihood.

        Evaluates the weighted density of each data point under every Gaussian,
        then normalizes so each row of responsibilities sums to 1.

        Args:
            X (np.ndarray): Data of shape (num_samples, num_feature_dims).

        Returns:
            float: Log-likelihood of X under the current mixture parameters.
        """
        self.compute_log_likelihood(X)
        log_likelihood = np.sum(np.log(np.sum(self.resp, axis=1)))
        # normalize over all possible cluster assignments
        self.resp = self.resp / self.resp.sum(axis=1, keepdims=1)
        self.p = self.resp
        return log_likelihood

    def refit_clusters(self, X):
        # M-step: update mixing weights, means, and covariances from the responsibilities
        r_weights = self.p.sum(axis=0).reshape((self.num_clusters, 1))
        self.w = r_weights / X.shape[0]
        new_u = self.p.T @ X
        self.cluster_means = new_u / r_weights
        # Calculate new covariances, normalizing each by the total
        # responsibility assigned to that cluster
        for num_cluster in range(self.num_clusters):
            this_p = self.p[:, num_cluster]
            diff = (X - self.cluster_means[num_cluster]).T
            diff_squared = this_p * diff @ diff.T
            self.cluster_covariance[num_cluster] = diff_squared / r_weights[num_cluster]

    def compute_log_likelihood(self, X):
        # Fill self.resp with unnormalized responsibilities: prior * per-cluster likelihood
        for num_cluster in range(self.num_clusters):
            prior = self.w[num_cluster]
            u = self.cluster_means[num_cluster]
            cov = self.cluster_covariance[num_cluster]
            likelihood = multivariate_normal(u, cov, allow_singular=True).pdf(X)
            self.resp[:, num_cluster] = prior * likelihood
        return

    def fit(self, X):
        """Run EM until the log-likelihood change falls below conv_tol."""
        self.num_samples, self.num_feature_dims = X.shape
        if self.random_state is not None:
            np.random.seed(self.random_state)
        self.initialize_empirical_cluster_mean_covariance(X)
        self.initialize_cluster_means()
        self.initialize_weights_and_prob()
        self.log_likelihood_trace = []
        self.converged = False
        log_lik = 0
        for iter_i in range(self.max_iter):
            try:
                # E-step: responsibilities and current log-likelihood
                log_lik_new = self.assign_data_to_cluster(X)
            except np.linalg.LinAlgError:
                # A covariance became singular; stop rather than continue with bad parameters
                break
            self.log_likelihood_trace.append(log_lik_new)
            if abs(log_lik_new - log_lik) <= self.conv_tol:
                self.converged = True
                print("converged!")
                break
            log_lik = log_lik_new
            # M-step
            self.refit_clusters(X)
        return self.p
X_y_train = np.concatenate([X_train, y_train], axis=1)
X_0 = X_y_train[X_y_train[:, -1] == 0, :-1]
X_1 = X_y_train[X_y_train[:, -1] == 1, :-1]
all_data_x0 = []
all_data_x1 = []
first_admit_iter = 4  # skip the first few EM iterations when plotting so the y-axis scale is readable
for i in range(10):
    # em = GMM(3, 30, 1, 2234)
    em = EMClusterIng(num_clusters=3, max_iter=30, random_state=i, conv_tol=5)  # distinct seed per restart
    em.fit(X_0)
    all_data_x0.extend(zip(range(len(em.log_likelihood_trace)), em.log_likelihood_trace, [i]*len(em.log_likelihood_trace)))
log_likelihood_trace_df_x0 = pd.DataFrame(all_data_x0, columns=["iter", "log_likelihood", "i"])
log_likelihood_trace_df_x0 = log_likelihood_trace_df_x0.loc[log_likelihood_trace_df_x0["iter"] > first_admit_iter, :]
all_data_x1 = []
for i in range(10):
    # em2 = GMM(3, 30, 1, 2234)
    em2 = EMClusterIng(num_clusters=3, max_iter=30, random_state=i, conv_tol=5)  # distinct seed per restart
    em2.fit(X_1)
    all_data_x1.extend(zip(range(len(em2.log_likelihood_trace)), em2.log_likelihood_trace, [i]*len(em2.log_likelihood_trace)))
log_likelihood_trace_df_x1 = pd.DataFrame(all_data_x1, columns=["iter", "log_likelihood", "i"])
log_likelihood_trace_df_x1 = log_likelihood_trace_df_x1.loc[log_likelihood_trace_df_x1["iter"] > first_admit_iter, :]
converged!  (printed 13 times, once per run that reached the tolerance)
import plotly.express as px
fig = px.line(log_likelihood_trace_df_x0, x="iter", y="log_likelihood", color="i", title="Log likelihood for class X0 GMM mixture with 3 clusters", width=750, height=500)
# fig.write_image("/Users/loreliegordon/Library/Mobile Documents/com~apple~CloudDocs/Documents/root/Columbia/Fall2021/ELEN4720/Assignments/assignment3/submission/log_like_x0.png")
fig.show()
import plotly.express as px
fig = px.line(log_likelihood_trace_df_x1, x="iter", y="log_likelihood", color="i", title="Log likelihood for class X1 GMM mixture with 3 clusters", width=750, height=500)
# fig.write_image("/Users/loreliegordon/Library/Mobile Documents/com~apple~CloudDocs/Documents/root/Columbia/Fall2021/ELEN4720/Assignments/assignment3/submission/log_like_x1.png")
fig.show()
Putting it together with a Bayes classifier¶
class BayesClassifier():
    """Bayes classifier with Gaussian-mixture class-conditional densities.

    A from-scratch implementation that fits one GMM per class via EM and
    combines the class-conditional likelihoods with the class priors.
    """

    def __init__(self, num_gaussians=3) -> None:
        self.num_gaussians = num_gaussians

    def fit(self, X_train, y_train):
        """Fit the input data.

        After this call the model parameters are fit. We need:
        - pi hat, the prior probability of encountering class 0 or class 1
          without considering X
        - one fitted GMM per class for the class-conditional densities

        Args:
            X_train (np.array): Training features
            y_train (np.array): Single column with the binary class label, 0 or 1
        """
        X_y_train = np.concatenate([X_train, y_train], axis=1)
        X_0 = X_y_train[X_y_train[:, -1] == 0, :-1]
        X_1 = X_y_train[X_y_train[:, -1] == 1, :-1]
        self.em_x0 = EMClusterIng(num_clusters=self.num_gaussians, max_iter=30,
                                  random_state=2, conv_tol=1e-3)
        self.em_x0.fit(X_0)
        self.em_x1 = EMClusterIng(num_clusters=self.num_gaussians, max_iter=30,
                                  random_state=2, conv_tol=1e-3)
        self.em_x1.fit(X_1)
        self.d = len(X_train[0])
        self.n = len(y_train)
        self._n_y0 = len(y_train[y_train == 0])
        self._n_y1 = len(y_train[y_train == 1])
        self.p_y0 = self._n_y0 / self.n  # pi hat for class 0
        self.p_y1 = self._n_y1 / self.n  # pi hat for class 1

    def predict(self, X):
        """Predict classes for new data.

        For each point, evaluate the fitted GMM density of each class, weight
        by the class prior, and choose the class with the higher posterior.
        """
        def mixture_density(em, X):
            # p(x | class) = sum_k w_k * N(x; mu_k, Sigma_k) under the fitted mixture
            density = np.zeros(len(X))
            for k in range(em.num_clusters):
                density += float(em.w[k]) * multivariate_normal(
                    em.cluster_means[k], em.cluster_covariance[k],
                    allow_singular=True).pdf(X)
            return density

        y_pred_y0 = self.p_y0 * mixture_density(self.em_x0, X)
        y_pred_y1 = self.p_y1 * mixture_density(self.em_x1, X)
        predictions = (y_pred_y1 > y_pred_y0).astype(int)
        return predictions
for num_g in range(4, 5):
    bc = BayesClassifier(num_gaussians=num_g)
    bc.fit(X_train, y_train)
    y_pred = bc.predict(X_test)
    joined = pd.concat([pd.DataFrame(y_pred, columns=["y_pred"]), pd.DataFrame(y_test, columns=["y_test"])], axis=1)
    joined["correct"] = joined["y_pred"] == joined["y_test"]
    y_test_1_y_pred_1 = joined.loc[(joined["correct"] == True) & (joined["y_test"] == 1), "correct"].count()
    y_test_0_y_pred_0 = joined.loc[(joined["correct"] == True) & (joined["y_test"] == 0), "correct"].count()
    y_test_1_y_pred_0 = joined.loc[(joined["correct"] == False) & (joined["y_test"] == 1), "correct"].count()
    y_test_0_y_pred_1 = joined.loc[(joined["correct"] == False) & (joined["y_test"] == 0), "correct"].count()
    print(f"Results for predictor with {num_g} Gaussians")
    # Confusion matrix: rows = true class, columns = predicted class
    print(pd.DataFrame([[y_test_0_y_pred_0, y_test_0_y_pred_1], [y_test_1_y_pred_0, y_test_1_y_pred_1]],
                       index=["true 0", "true 1"], columns=["pred 0", "pred 1"]))
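As a quick follow-up (not part of the original run), the same joined frame gives overall accuracy directly:

# Fraction of test emails classified correctly
accuracy = joined["correct"].mean()
print(f"Accuracy: {accuracy:.3f}")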
/Users/loreliegordon/.virtualenvs/sandbox2/lib/python3.7/site-packages/ipykernel_launcher.py:102: RuntimeWarning:
invalid value encountered in true_divide
ValueError: array must not contain infs or NaNs
(raised from multivariate_normal(...).pdf(X) inside EMClusterIng.fit during BayesClassifier.fit; long traceback truncated)
Reference GMM implementation¶
class GMM:
"""
Full covariance Gaussian Mixture Model,
trained using Expectation Maximization.
Parameters
----------
n_components : int
Number of clusters/mixture components in which the data will be
partitioned into.
n_iters : int
Maximum number of iterations to run the algorithm.
tol : float
Tolerance. If the log-likelihood between two iterations is smaller than
the specified tolerance level, the algorithm will stop performing the
EM optimization.
seed : int
Seed / random state used to initialize the parameters.
"""
def __init__(self, n_components: int, n_iters: int, tol: float, seed: int):
self.n_components = n_components
self.n_iters = n_iters
self.tol = tol
self.seed = seed
def fit(self, X):
# data's dimensionality and responsibility vector
n_row, n_col = X.shape
self.resp = np.zeros((n_row, self.n_components))
# initialize parameters
np.random.seed(self.seed)
chosen = np.random.choice(n_row, self.n_components, replace = False)
self.means = X[chosen]
self.weights = np.full(self.n_components, 1 / self.n_components)
        # for np.cov, rowvar=False indicates that
        # the rows represent observations
shape = self.n_components, n_col, n_col
self.covs = np.full(shape, np.cov(X, rowvar = False))
log_likelihood = 0
self.converged = False
self.log_likelihood_trace = []
for i in range(self.n_iters):
log_likelihood_new = self._do_estep(X)
print(f"log_likelihood_new is {log_likelihood_new}")
self._do_mstep(X)
if abs(log_likelihood_new - log_likelihood) <= self.tol:
self.converged = True
print("converged!")
break
log_likelihood = log_likelihood_new
self.log_likelihood_trace.append(log_likelihood)
return self
def _do_estep(self, X):
"""
E-step: compute responsibilities,
update resp matrix so that resp[j, k] is the responsibility of cluster k for data point j,
to compute likelihood of seeing data point j given cluster k, use multivariate_normal.pdf
"""
self._compute_log_likelihood(X)
log_likelihood = np.sum(np.log(np.sum(self.resp, axis = 1)))
# normalize over all possible cluster assignments
self.resp = self.resp / self.resp.sum(axis = 1, keepdims = 1)
return log_likelihood
def _compute_log_likelihood(self, X):
for k in range(self.n_components):
prior = self.weights[k]
likelihood = multivariate_normal(self.means[k], self.covs[k], allow_singular=True).pdf(X)
self.resp[:, k] = prior * likelihood
return self
def _do_mstep(self, X):
"""M-step, update parameters"""
# total responsibility assigned to each cluster, N^{soft}
resp_weights = self.resp.sum(axis = 0)
# weights
self.weights = resp_weights / X.shape[0]
# means
weighted_sum = np.dot(self.resp.T, X)
self.means = weighted_sum / resp_weights.reshape(-1, 1)
# covariance
for k in range(self.n_components):
diff = (X - self.means[k]).T
weighted_sum = np.dot(self.resp[:, k] * diff, diff.T)
self.covs[k] = weighted_sum / resp_weights[k]
return self
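For completeness, a usage sketch of this reference class mirroring the commented-out GMM(3, 30, 1, 2234) calls in the loops above (illustrative, not a recorded run):

# Fit the reference GMM to the class-0 training data with the same arguments
gmm = GMM(n_components=3, n_iters=30, tol=1, seed=2234)
gmm.fit(X_0)
print(f"converged: {gmm.converged}, iterations recorded: {len(gmm.log_likelihood_trace)}")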