import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import math
import pdb
Load Data¶
base_path = "/Users/loreliegordon/Library/Mobile Documents/com~apple~CloudDocs/Documents/root/Columbia/Fall2021/ELEN4720/Assignments/assignment1/"
X_all = pd.read_csv(base_path + "hw1-data/X_train.csv", header=None)
y_all = pd.read_csv(base_path + "hw1-data/y_train.csv", header=None)
X_test = pd.read_csv(base_path + "hw1-data/X_test.csv", header=None)
y_test = pd.read_csv(base_path + "hw1-data/y_test.csv", header=None)
X_all = X_all.copy().to_numpy()
y_all = y_all.copy().to_numpy()
X_test = X_test.copy().to_numpy()
y_test = y_test.copy().to_numpy()
y_train = y_all
X_train = X_all
class GaussianProcesses():
"""Implementation of ridge regression from scratch.
"""
def __init__(self, sigma2=1, b=1) -> None:
self.X_maxs = None
self.X_means = None
self.kernel = self.gaussian_kernel
self.b = b
self.theta = 1/b
self.sigma2 = sigma2
def gaussian_kernel(self, A, B):
K = np.zeros((A.shape[0],B.shape[0]))
for i in range(A.shape[0]):
K[i,:] = np.exp(-np.sum(self.theta*(A[i,:]-B)**2, axis=1))
return K
def no_kernel(self, A, B):
return A @ B.T
def fit(self, X, y):
"""Fit the model to training data
Args:
X (np.array): Training data. Columns for features. Rows for test instances.
y (np.array): Training target column. Must have same number of rows as X
Returns:
(np.array, np.array): Ridge regression weights, degrees of freedom.
"""
self.X = X
self.y = y
self.n, self.d = self.X.shape
def calc_means_and_stds(self, X):
"""Calculate and store the means and std.
These will be used to normalize new data
Args:
X (np.array): Training data
"""
self.X_maxs = X.max(axis=0)
self.X_means = X.mean(axis=0)
self.X_stds = X.std(axis=0)
self.X_means[self.X_stds == 0] = 0 # Don't want to subtract any means if variance is zero (for 1 column)
self.X_stds[self.X_stds == 0] = 1 # If STD is zero force it to one.
def prepare_X(self, X):
"""Prepares any X data for model usage
Subtract the column mean and normalize by variance.
Args:
X (np.array): Input array
"""
self.calc_means_and_stds(X)
if self.X_means is None:
raise ValueError("Need to calc_means_and_stds at least once before this function.")
X = (X - self.X_means) / self.X_stds
return X
def predict(self, X_in):
"""Predict y values for new X input
Args:
X_in (np.array): New data. Must have same shape as data this was trained on.
Returns:
np.array: The predicted data
"""
part = np.linalg.inv(self.sigma2 * np.identity(self.n) + self.kernel(self.X, self.X))
mean_pred = self.kernel(self.X, X_in).T @ part @ self.y
return mean_pred
data = []
for b in range(5, 17, 2):
for i in range(10):
sig = (i+1)/10
rr = GaussianProcesses(sigma2=sig, b=b)
X_train_prepared = rr.prepare_X(X_train)
rr.fit(X_train_prepared, y_train)
y_pred = rr.predict(X_test)
rmse = np.sqrt(sum((y_test - y_pred)**2)/len(y_pred))[0]
data.append({"sig": sig, "b": b, "rmse":rmse})
df = pd.DataFrame(data)
dfpiv = df.pivot(values="rmse", index="b", columns="sig")
dfpiv
sig | 0.1 | 0.2 | 0.3 | 0.4 | 0.5 | 0.6 | 0.7 | 0.8 | 0.9 | 1.0 |
---|---|---|---|---|---|---|---|---|---|---|
b | ||||||||||
5 | 1.975933 | 1.936345 | 1.923342 | 1.919783 | 1.920475 | 1.923312 | 1.927312 | 1.931976 | 1.937034 | 1.942334 |
7 | 1.927620 | 1.909913 | 1.911403 | 1.917795 | 1.925456 | 1.933253 | 1.940816 | 1.948040 | 1.954922 | 1.961495 |
9 | 1.907233 | 1.910483 | 1.924117 | 1.937733 | 1.949864 | 1.960490 | 1.969857 | 1.978224 | 1.985811 | 1.992796 |
11 | 1.902687 | 1.925183 | 1.947469 | 1.965349 | 1.979662 | 1.991398 | 2.001297 | 2.009883 | 2.017524 | 2.024480 |
13 | 1.909892 | 1.947134 | 1.974492 | 1.994221 | 2.009113 | 2.020905 | 2.030658 | 2.039039 | 2.046482 | 2.053277 |
15 | 1.924686 | 1.971769 | 2.001363 | 2.021328 | 2.035884 | 2.047219 | 2.056548 | 2.064587 | 2.071777 | 2.078402 |
dfpiv = df.pivot(values="rmse", index="b", columns="sig")
dfpiv
sig | 0.1 | 0.2 | 0.3 | 0.4 | 0.5 | 0.6 | 0.7 | 0.8 | 0.9 | 1.0 |
---|---|---|---|---|---|---|---|---|---|---|
b | ||||||||||
5 | 1.975933 | 1.936345 | 1.923342 | 1.919783 | 1.920475 | 1.923312 | 1.927312 | 1.931976 | 1.937034 | 1.942334 |
7 | 1.927620 | 1.909913 | 1.911403 | 1.917795 | 1.925456 | 1.933253 | 1.940816 | 1.948040 | 1.954922 | 1.961495 |
9 | 1.907233 | 1.910483 | 1.924117 | 1.937733 | 1.949864 | 1.960490 | 1.969857 | 1.978224 | 1.985811 | 1.992796 |
11 | 1.902687 | 1.925183 | 1.947469 | 1.965349 | 1.979662 | 1.991398 | 2.001297 | 2.009883 | 2.017524 | 2.024480 |
13 | 1.909892 | 1.947134 | 1.974492 | 1.994221 | 2.009113 | 2.020905 | 2.030658 | 2.039039 | 2.046482 | 2.053277 |
15 | 1.924686 | 1.971769 | 2.001363 | 2.021328 | 2.035884 | 2.047219 | 2.056548 | 2.064587 | 2.071777 | 2.078402 |
import plotly.figure_factory as ff
rounded = np.around(dfpiv.values, decimals=3)
fig = ff.create_annotated_heatmap(rounded, y=dfpiv.index.tolist(), x=dfpiv.columns.tolist(), colorscale="YlGnBu")
fig.show()
dfpiv.index.tolist()
[5, 7, 9, 11, 13, 15]
Visualizing¶
After retraining the model only using a single dimension, we can easily visualize a single dimension to see what the model is doing.
rr = GaussianProcesses(sigma2=2, b=5)
X_train_prepared = rr.prepare_X(X_train)[:, 3]
X_train_prepared = X_train_prepared.reshape(len(X_train_prepared), 1)
rr.fit(X_train_prepared, y_train)
X_test_using = X_test[:, 3]
X_test_using = X_test_using.reshape(len(X_test_using), 1)
x_stepped = np.arange(min(X_train_prepared), max(X_train_prepared), 0.1)
x_stepped = x_stepped.reshape(len(x_stepped), 1)
y_stepped = rr.predict(x_stepped)
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
fig = go.Figure()
# Add traces
fig.add_trace(go.Scatter(mode="markers", x=X_train_prepared[:,0], y=y_train[:,0], name="training data"))
fig.add_trace(go.Scatter(mode="lines", x=x_stepped[:,0], y=y_stepped[:,0], name="prediction"))
fig.update_layout(title="Car Gass Mileage vs Car Weight Modeled Using Gaussian Processes", xaxis_title="Car Weight [Mean Subtracted Kg]", yaxis_title="Mileage [Mean subtracted Miles/Gallon]")
fig.show()