Gaussian Processes ML Model From Scratch¶

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import math
import pdb

Load Data¶

base_path = "/Users/loreliegordon/Library/Mobile Documents/com~apple~CloudDocs/Documents/root/Columbia/Fall2021/ELEN4720/Assignments/assignment1/"

X_all = pd.read_csv(base_path + "hw1-data/X_train.csv", header=None)
y_all = pd.read_csv(base_path + "hw1-data/y_train.csv", header=None)
X_test = pd.read_csv(base_path + "hw1-data/X_test.csv", header=None)
y_test = pd.read_csv(base_path + "hw1-data/y_test.csv", header=None)

X_all = X_all.copy().to_numpy()
y_all = y_all.copy().to_numpy()
X_test = X_test.copy().to_numpy()
y_test = y_test.copy().to_numpy()

y_train = y_all
X_train = X_all

class GaussianProcesses():
    """Implementation of ridge regression from scratch.
    """

    def __init__(self, sigma2=1, b=1) -> None:
        self.X_maxs = None
        self.X_means = None
        self.kernel = self.gaussian_kernel
        self.b = b
        self.theta = 1/b
        self.sigma2 = sigma2

    def gaussian_kernel(self, A, B):

        K = np.zeros((A.shape[0],B.shape[0]))
        for i in range(A.shape[0]):
            K[i,:] = np.exp(-np.sum(self.theta*(A[i,:]-B)**2, axis=1))
        return K
        

    def no_kernel(self, A, B):
        return A @ B.T

    def fit(self, X, y):
        """Fit the model to training data

        Args:
            X (np.array): Training data. Columns for features. Rows for test instances.
            y (np.array): Training target column. Must have same number of rows as X

        Returns:
            (np.array, np.array): Ridge regression weights, degrees of freedom.
        """
        self.X = X
        self.y = y
        self.n, self.d = self.X.shape

    def calc_means_and_stds(self, X):
        """Calculate and store the means and std.

        These will be used to normalize new data

        Args:
            X (np.array): Training data
        """
        self.X_maxs = X.max(axis=0)
        self.X_means = X.mean(axis=0)
        self.X_stds = X.std(axis=0)
        self.X_means[self.X_stds == 0] = 0 # Don't want to subtract any means if variance is zero (for 1 column)
        self.X_stds[self.X_stds == 0] = 1 # If STD is zero force it to one.

    def prepare_X(self, X):
        """Prepares any X data for model usage

        Subtract the column mean and normalize by variance.

        Args:
            X (np.array): Input array
        """
        self.calc_means_and_stds(X)
        if self.X_means is None:
            raise ValueError("Need to calc_means_and_stds at least once before this function.")
        X = (X - self.X_means) / self.X_stds
        return X

    def predict(self, X_in):
        """Predict y values for new X input

        Args:
            X_in (np.array): New data. Must have same shape as data this was trained on.

        Returns:
            np.array: The predicted data
        """
        part = np.linalg.inv(self.sigma2 * np.identity(self.n) + self.kernel(self.X, self.X))
        mean_pred = self.kernel(self.X, X_in).T @ part @ self.y
        return mean_pred

data = []
for b in range(5, 17, 2):
    for i in range(10):
        sig = (i+1)/10
        rr = GaussianProcesses(sigma2=sig, b=b)

        X_train_prepared = rr.prepare_X(X_train)
        rr.fit(X_train_prepared, y_train)
        y_pred = rr.predict(X_test)
        rmse = np.sqrt(sum((y_test - y_pred)**2)/len(y_pred))[0]
        data.append({"sig": sig, "b": b, "rmse":rmse})

df = pd.DataFrame(data)
dfpiv = df.pivot(values="rmse", index="b", columns="sig")
dfpiv

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
/var/folders/w3/8cmbpqt17zgg1lcgbdw84r6c0000gn/T/ipykernel_19959/1324821165.py in <module>
     11         data.append({"sig": sig, "b": b, "rmse":rmse})
     12 
---> 13 df = pd.DatFrame(data)

~/.virtualenvs/sandbox2/lib/python3.7/site-packages/pandas/__init__.py in __getattr__(name)
    242         return _SparseArray
    243 
--> 244     raise AttributeError(f"module 'pandas' has no attribute '{name}'")
    245 
    246 

AttributeError: module 'pandas' has no attribute 'DatFrame'

dfpiv = df.pivot(values="rmse", index="b", columns="sig")
dfpiv

sig	0.1	0.2	0.3	0.4	0.5	0.6	0.7	0.8	0.9	1.0
b
5	1.975933	1.936345	1.923342	1.919783	1.920475	1.923312	1.927312	1.931976	1.937034	1.942334
7	1.927620	1.909913	1.911403	1.917795	1.925456	1.933253	1.940816	1.948040	1.954922	1.961495
9	1.907233	1.910483	1.924117	1.937733	1.949864	1.960490	1.969857	1.978224	1.985811	1.992796
11	1.902687	1.925183	1.947469	1.965349	1.979662	1.991398	2.001297	2.009883	2.017524	2.024480
13	1.909892	1.947134	1.974492	1.994221	2.009113	2.020905	2.030658	2.039039	2.046482	2.053277
15	1.924686	1.971769	2.001363	2.021328	2.035884	2.047219	2.056548	2.064587	2.071777	2.078402

import plotly.figure_factory as ff
rounded = np.around(dfpiv.values, decimals=3)
fig = ff.create_annotated_heatmap(rounded, y=dfpiv.index.tolist(), x=dfpiv.columns.tolist(), colorscale="YlGnBu")
fig.show()

dfpiv.index.tolist()

[5, 7, 9, 11, 13, 15]

Visualizing¶

After retraining the model only using a single dimension, we can easily visualize a single dimension to see what the model is doing.

rr = GaussianProcesses(sigma2=2, b=5)

X_train_prepared = rr.prepare_X(X_train)[:, 3]
X_train_prepared = X_train_prepared.reshape(len(X_train_prepared), 1)
rr.fit(X_train_prepared, y_train)

X_test_using = X_test[:, 3]
X_test_using = X_test_using.reshape(len(X_test_using), 1)
x_stepped = np.arange(min(X_train_prepared), max(X_train_prepared), 0.1)
x_stepped = x_stepped.reshape(len(x_stepped), 1)
y_stepped = rr.predict(x_stepped)

import plotly.express as px
import plotly.graph_objects as go
import numpy as np

fig = go.Figure()

# Add traces
fig.add_trace(go.Scatter(mode="markers", x=X_train_prepared[:,0], y=y_train[:,0], name="training data"))
fig.add_trace(go.Scatter(mode="lines", x=x_stepped[:,0], y=y_stepped[:,0], name="prediction"))
fig.update_layout(title="Car Gass Mileage vs Car Weight Modeled Using Gaussian Processes", xaxis_title="Car Weight [Mean Subtracted Kg]", yaxis_title="Mileage [Mean subtracted Miles/Gallon]")

fig.show()

previous

Logistic Regression Classifier From Scratch

next

Baysian Classifier