# K-Means Clustering

import pandas as pd
import numpy as np
from scipy.spatial import distance_matrix
import pdb

# Generate synthetic training data from a mixture of 3 Gaussians

# Build the training set: every one of the three Gaussian components is sampled
# in full, then for each row a component label is drawn and only the sample
# from that component is kept.
num_obvs = 500
np.random.seed(235)

# The components share one covariance and differ only in their mean.
unit_cov = [[1,0],[0,1]]
m1, m2, m3 = (
    np.random.multivariate_normal(center, unit_cov, size=num_obvs)
    for center in ([0,0], [3,0], [0,3])
)

# Component label for each row, drawn with mixing weights 0.2 / 0.5 / 0.3.
choose_index = np.random.choice(
    [0,1,2],
    size=num_obvs,
    replace=True,
    p=[0.2,0.5,0.3],
)

# all_m has shape (3, num_obvs, 2); row i of X_train is taken from the
# component selected by choose_index[i].
all_m = np.stack([m1, m2, m3])
X_train = all_m[choose_index, np.arange(num_obvs)]

# Quick visual check of the raw, unlabeled points.
import plotly.express as px
fig = px.scatter(
    x=X_train[:, 0],
    y=X_train[:, 1],
    width=400,
    height=400,
)
fig.show()
class KMeansCluster():
    """K-means clustering (Lloyd's algorithm) with uniform random initialisation.

    Attributes set while fitting:
        cluster_info: array of shape (num_clusters, num_features) holding the
            current centroids.
        losses: list with one entry per iteration, the mean squared Euclidean
            distance from each sample to its assigned centroid.
    """

    def __init__(self, num_clusters, max_iter, random_state):
        """Store the hyper-parameters.

        Args:
            num_clusters: Number of centroids to fit.
            max_iter: Number of assignment/update iterations run by ``fit``.
            random_state: Seed forwarded to ``np.random.default_rng``; it
                controls the centroid initialisation.
        """
        self.num_clusters = num_clusters
        self.max_iter = max_iter
        self.random_state = random_state
        # Created here as well so initialize_cluster_means can be called
        # directly, without going through fit.
        self._rng = np.random.default_rng(random_state)

    def initialize_cluster_means(self, num_feature_dims, X=None):
        """Draw the initial centroids uniformly at random.

        Args:
            num_feature_dims: Number of feature columns per centroid.
            X: Optional data of shape (num_samples, num_feature_dims). When
                given, each coordinate is drawn between that feature's own
                minimum and maximum. When omitted, coordinates are drawn from
                the unit interval.

        Sets ``self.cluster_info`` to an array of shape
        (num_clusters, num_feature_dims).
        """
        if X is None:
            lows = np.zeros(num_feature_dims)
            highs = np.ones(num_feature_dims)
        else:
            X = np.asarray(X, dtype=float)
            # Per-feature bounds, so every dimension is sampled in its own range.
            lows = X.min(axis=0)
            highs = X.max(axis=0)

        self.cluster_info = self._rng.uniform(
            low=lows,
            high=highs,
            size=(self.num_clusters, num_feature_dims),
        )

    def assign_data_to_cluster(self, X):
        """Assign every sample to its nearest centroid.

        Args:
            X: Data of shape (num_samples, num_features).

        Returns:
            A tuple ``(assigned_clusters, loss)``: the index of the closest
            centroid for each sample, and the mean squared Euclidean distance
            between the samples and their assigned centroids.
        """
        dm = distance_matrix(X, self.cluster_info)
        assigned_clusters = np.argmin(dm, axis=1)

        # K-means minimises the *squared* distance to the assigned centroid, so
        # that is what is reported; distance_matrix itself returns plain
        # Euclidean distances.
        loss = np.square(dm.min(axis=1)).mean()
        return assigned_clusters, loss

    def fit(self, X):
        """Fit the centroids to X and return X with its cluster labels.

        Centroids are initialised uniformly at random inside the bounding box
        of X, then ``max_iter`` rounds of nearest-centroid assignment followed
        by a centroid-mean update are run. The loss of every round is appended
        to ``self.losses``.

        Args:
            X (np.array): Training data with shape (num_samples, num_features).

        Returns:
            np.array of shape (num_samples, num_features + 1): X with one extra
            trailing column holding each sample's cluster index. The labels
            are those computed in the last iteration, before its centroid
            update.

        Raises:
            ValueError: If X is not a non-empty 2-D array.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2 or X.shape[0] == 0:
            raise ValueError(
                "X must be a non-empty 2-D array of shape "
                "(num_samples, num_features)"
            )

        # Features are the columns of X; the rows are samples.
        num_feature_dims = X.shape[1]

        # Re-seed so repeated fits with the same random_state are reproducible.
        self._rng = np.random.default_rng(self.random_state)
        self.initialize_cluster_means(num_feature_dims, X)
        self.losses = []

        assigned_clusters = None
        for _ in range(self.max_iter):
            assigned_clusters, loss = self.assign_data_to_cluster(X)
            self.losses.append(loss)

            # Move each centroid to the mean of its members. A cluster with no
            # members keeps its previous centroid, so the number of clusters
            # never shrinks.
            for cluster_id in range(self.num_clusters):
                members = X[assigned_clusters == cluster_id]
                if len(members):
                    self.cluster_info[cluster_id] = members.mean(axis=0)

        if assigned_clusters is None:
            # No iteration ran (max_iter < 1): label against the initial centroids.
            assigned_clusters, _ = self.assign_data_to_cluster(X)

        return np.concatenate([X, assigned_clusters.reshape(-1, 1)], axis=1)
# Fit one 5-cluster model on the synthetic data and plot the assignment.
num_clusters = 5
num_iterations = 20
# max_iter is taken from num_iterations so that the model actually runs the
# number of iterations reported in the plot title.
km = KMeansCluster(num_clusters=num_clusters, max_iter=num_iterations, random_state=2)
y = km.fit(X_train)
# fit returns the two feature columns followed by the cluster label column.
df = pd.DataFrame(y, columns=["x", "y", "c"])
import plotly.express as px
fig = px.scatter(df, x="x", y="y", color="c", width=700, height=700, title=f"K means clustering on gaussian mixture after {num_iterations} iterations using {num_clusters} clusters")
# NOTE(review): absolute, machine-specific output path; this only works on the
# original author's machine.
fig.write_image(f"/Users/loreliegordon/Library/Mobile Documents/com~apple~CloudDocs/Documents/root/Columbia/Fall2021/ELEN4720/Assignments/assignment3/submission/kmeans_{num_clusters}.png")
fig.show()
# Train one model per cluster count and collect the per-iteration training
# loss of each, as (iteration, loss, num_clusters) rows.
all_data = []
cluster_counts = range(2, 6)

for num_clusters in cluster_counts:

    km = KMeansCluster(num_clusters=num_clusters, max_iter=20, random_state=2)
    y = km.fit(X_train)
    all_data.extend(
        (iter_i, loss, num_clusters) for iter_i, loss in enumerate(km.losses)
    )

losses_df = pd.DataFrame(all_data, columns=["iter", "loss", "num_clusters"])
# Bare expression kept from the notebook, where it displays the table.
losses_df

import plotly.express as px
# The model count in the title is derived from cluster_counts so it cannot
# drift from the number of models actually trained.
# NOTE(review): the title describes the loss as an average squared L2 distance;
# confirm this matches what KMeansCluster records in `losses`.
fig = px.line(losses_df, x="iter", y="loss", color="num_clusters", title=f"K Means training loss (average L2 squared distance) for {len(cluster_counts)} models with a different number of clusters", width=1000, height=600)
# NOTE(review): absolute, machine-specific output path; this only works on the
# original author's machine.
fig.write_image("/Users/loreliegordon/Library/Mobile Documents/com~apple~CloudDocs/Documents/root/Columbia/Fall2021/ELEN4720/Assignments/assignment3/submission/kmeans_loss.png")
fig.show()