K-Means Clustering
import pandas as pd
import numpy as np
from scipy.spatial import distance_matrix
import pdb
Generate synthetic training data from 3 Gaussians
# Draw a 2-D Gaussian-mixture training set: three isotropic components,
# mixed with probabilities 0.2 / 0.5 / 0.3.
num_obvs = 500
np.random.seed(235)

component_means = [[0, 0], [3, 0], [0, 3]]
identity_cov = [[1, 0], [0, 1]]
# One (num_obvs, 2) sample matrix per component, drawn in the same order as
# before so the seeded random stream produces identical values.
samples = [
    np.random.multivariate_normal(mu, identity_cov, size=(num_obvs))
    for mu in component_means
]
choose_index = np.random.choice([0, 1, 2], replace=True, p=[0.2, 0.5, 0.3], size=num_obvs)

# Stack to (3, num_obvs, 2), then pick, for observation i, row i of the
# component selected for it — yielding the mixed (num_obvs, 2) training set.
all_m = np.stack(samples)
X_train = all_m[choose_index, range(num_obvs), :]

import plotly.express as px
fig = px.scatter(x=X_train[:, 0], y=X_train[:, 1], width=400, height=400)
fig.show()
class KMeansCluster():
    """K-means clustering fit with Lloyd's algorithm.

    Attributes (set by ``fit``):
        cluster_info (np.ndarray): (num_clusters, num_features) centroid matrix.
        losses (list[float]): mean distance from each point to its assigned
            centroid, recorded once per iteration.
    """

    def __init__(self, num_clusters, max_iter, random_state):
        """
        Args:
            num_clusters (int): number of centroids k.
            max_iter (int): number of Lloyd iterations to run.
            random_state (int): seed for centroid initialization.
                (Bug fix: this argument was previously accepted but ignored.)
        """
        self.num_clusters = num_clusters
        self.max_iter = max_iter
        self.random_state = random_state

    def initialize_cluster_means(self, num_feature_dims, mins=None, maxs=None):
        """Initialize centroids uniformly at random within per-dimension bounds.

        Args:
            num_feature_dims (int): number of feature dimensions.
            mins, maxs (array-like, optional): per-dimension lower/upper bounds
                of the uniform draw; default to 0 and 1 when omitted.
                (Bug fix: the original referenced undefined globals ``mins``/
                ``maxs`` — a NameError — and used ``mins[0]``/``maxs[0]`` for
                every dimension.)
        """
        rng = np.random.default_rng(self.random_state)
        if mins is None:
            mins = np.zeros(num_feature_dims)
        if maxs is None:
            maxs = np.ones(num_feature_dims)
        # np.random.uniform broadcasts per-dimension bounds across the rows.
        self.cluster_info = rng.uniform(
            low=mins, high=maxs, size=(self.num_clusters, num_feature_dims)
        )

    def assign_data_to_cluster(self, X):
        """Assign each row of X to its nearest centroid.

        Args:
            X (np.ndarray): (num_points, num_features) data matrix.

        Returns:
            tuple: (assigned_clusters, loss) where ``assigned_clusters`` is the
            per-point centroid index and ``loss`` is the mean distance to the
            assigned centroid (the k-means objective).
        """
        dm = distance_matrix(X, self.cluster_info)
        assigned_clusters = np.argmin(dm, axis=1)
        # Objective function is the distance to the assigned cluster.
        loss = np.min(dm, axis=1).mean()
        return assigned_clusters, loss

    def fit(self, X):
        """Run Lloyd's algorithm for ``max_iter`` iterations.

        Args:
            X (np.ndarray): (num_points, num_features) training data.

        Returns:
            np.ndarray: (num_points, num_features + 1) — X with a trailing
            column giving each point's assigned cluster index.
        """
        # Bug fix: the feature count is X.shape[1]; the original used
        # X.shape[0] (the number of data points).
        num_feature_dims = X.shape[1]
        # Initialize centroids inside the data's bounding box rather than
        # the (previously crashing) undefined global bounds.
        self.initialize_cluster_means(
            num_feature_dims, mins=X.min(axis=0), maxs=X.max(axis=0)
        )
        self.losses = []
        for iter_i in range(self.max_iter):
            # Calculate distance from each datapoint to the cluster means.
            assigned_clusters, loss = self.assign_data_to_cluster(X)
            self.losses.append(loss)
            X_with_cluster = np.concatenate(
                [X, assigned_clusters.reshape(-1, 1)], axis=1
            )
            X_with_cluster_df = pd.DataFrame(X_with_cluster)
            means = X_with_cluster_df.groupby(X_with_cluster_df.columns[-1]).mean()
            # Bug fix: only update centroids of non-empty clusters. The
            # original assigned the groupby result wholesale, so an empty
            # cluster shrank cluster_info and misaligned labels thereafter.
            for label in means.index:
                self.cluster_info[int(label)] = means.loc[label].values
            # Reseed each empty cluster at the point farthest from its
            # current centroid so the cluster can be recovered.
            occupied = set(int(label) for label in means.index)
            if len(occupied) < self.num_clusters:
                dist_to_assigned = np.linalg.norm(
                    X - self.cluster_info[assigned_clusters], axis=1
                )
                for c in range(self.num_clusters):
                    if c not in occupied:
                        far_i = int(np.argmax(dist_to_assigned))
                        self.cluster_info[c] = X[far_i]
                        dist_to_assigned[far_i] = -1.0  # don't reuse the point
        return X_with_cluster
# Fit one k-means model and plot the resulting cluster assignments.
num_clusters = 5
num_iterations = 20
# Bug fix: max_iter was hard-coded to 10 while the plot title claimed
# `num_iterations` (= 20) iterations; use the variable so they agree.
km = KMeansCluster(num_clusters=num_clusters, max_iter=num_iterations, random_state=2)
y = km.fit(X_train)
df = pd.DataFrame(y, columns=["x", "y", "c"])
import plotly.express as px
fig = px.scatter(
    df, x="x", y="y", color="c", width=700, height=700,
    # Typo fix in the rendered title: "guassian" -> "Gaussian".
    title=f"K means clustering on Gaussian mixture after {num_iterations} iterations using {num_clusters} clusters",
)
# NOTE(review): hard-coded absolute user path — not portable; confirm before reuse.
fig.write_image(f"/Users/loreliegordon/Library/Mobile Documents/com~apple~CloudDocs/Documents/root/Columbia/Fall2021/ELEN4720/Assignments/assignment3/submission/kmeans_{num_clusters}.png")
fig.show()
# Train models for k = 2..5 and plot each training-loss curve.
all_data = []
for num_clusters in range(2, 6):
    km = KMeansCluster(num_clusters=num_clusters, max_iter=20, random_state=2)
    y = km.fit(X_train)
    # One (iteration, loss, k) row per recorded loss value.
    all_data.extend(
        zip(range(len(km.losses)), km.losses, [num_clusters] * len(km.losses))
    )
losses_df = pd.DataFrame(all_data, columns=["iter", "loss", "num_clusters"])
losses_df
import plotly.express as px
# Bug fix: the title hard-coded "3 models" but the loop trains 4 (k = 2..5);
# derive the count from the data so the title stays correct.
num_models = losses_df["num_clusters"].nunique()
fig = px.line(
    losses_df, x="iter", y="loss", color="num_clusters",
    title=f"K Means training loss (average L2 squared distance) for {num_models} models with a different number of clusters",
    width=1000, height=600,
)
# NOTE(review): hard-coded absolute user path — not portable; confirm before reuse.
fig.write_image("/Users/loreliegordon/Library/Mobile Documents/com~apple~CloudDocs/Documents/root/Columbia/Fall2021/ELEN4720/Assignments/assignment3/submission/kmeans_loss.png")
fig.show()