from linreg import LinearRegression # my implementation
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
Linear Regression
In this blog post I implement least-squares linear regression, i.e. linear regression trained by minimizing a least-squares cost function. Minimizing this cost function actually has a closed-form analytical solution, which I have implemented in addition to gradient descent.
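Concretely, if X is the feature matrix padded with a constant column (so the last entry of w plays the role of the intercept), the least-squares cost is L(w) = ||Xw - y||^2, and setting its gradient to zero gives the closed-form estimate w_hat = (X^T X)^{-1} X^T y. As a minimal sketch of the analytical fit under these conventions (my illustration here, not the actual linreg source):

def fit_analytical_sketch(X_padded, y):
    # Closed-form least-squares solution w_hat = (X^T X)^{-1} X^T y.
    # np.linalg.solve is preferred over an explicit inverse for numerical stability.
    return np.linalg.solve(X_padded.T @ X_padded, X_padded.T @ y)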
def draw_line(w, x_min, x_max, *, color="black", ax=None, alpha=1, **kwargs):
    x = np.linspace(x_min, x_max, 101)
    if len(w) == 3:
        y = -(w[0]*x + w[2])/w[1]
    elif len(w) == 2:
        y = w[0]*x + w[1]
    if ax is None:
        plt.plot(x, y, color=color, alpha=alpha, **kwargs)
    else:
        ax.plot(x, y, color=color, alpha=alpha, **kwargs)

def pad(X):
    return np.append(X, np.ones((X.shape[0], 1)), 1)
def LR_data(n_train = 100, n_val = 100, p_features = 1, noise = .1, w = None):
    if w is None:
        w = np.random.rand(p_features + 1) + .2

    X_train = np.random.rand(n_train, p_features)
    y_train = pad(X_train)@w + noise*np.random.randn(n_train)

    X_val = np.random.rand(n_val, p_features)
    y_val = pad(X_val)@w + noise*np.random.randn(n_val)

    return X_train, y_train, X_val, y_val
n_train = 100
n_val = 100
p_features = 1
noise = 0.2

# create some data
X_train, y_train, X_val, y_val = LR_data(n_train, n_val, p_features, noise)
# train
LR_analytical = LinearRegression()
LR_analytical.fit_analytical(X_train, y_train)

LR_gradient = LinearRegression()
LR_gradient.fit_gradient(X_train, y_train, w=[.5, .5], max_steps=100, alpha=.005)
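For intuition, here is a sketch of the kind of update fit_gradient presumably performs; the mean-squared-error scaling and the fixed step count are my assumptions rather than the actual linreg source:

def fit_gradient_sketch(X, y, w, alpha=.005, max_steps=100):
    # Gradient descent on L(w) = ||Xw - y||^2 / n, whose gradient is
    # grad L(w) = 2 X^T (Xw - y) / n; each step moves w downhill by alpha.
    X_ = pad(X)                      # append the constant (intercept) feature
    w = np.asarray(w, dtype=float)
    for _ in range(max_steps):
        grad = 2 * X_.T @ (X_ @ w - y) / X_.shape[0]
        w = w - alpha * grad
    return w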
# plot best fit lines
fig, axarr = plt.subplots(1, 2, figsize=(8, 4))
axarr[0].scatter(X_train, y_train, color="gray", alpha=.5, label="Train", s=15)
axarr[0].scatter(X_val, y_val, color="black", alpha=1, label="Validation", s=20)
labs = axarr[0].set(title = "Training", xlabel = "x", ylabel = "y")

draw_line(LR_analytical.w, 0, 1, color="blue", ax=axarr[0], label="Analytical", lw=2)
draw_line(LR_gradient.w, 0, 1, color="red", ax=axarr[0], label="Gradient", linestyle="dotted", lw=4)
axarr[0].legend()

# plot score
axarr[1].plot(LR_gradient.score_history)
labels = axarr[1].set(xlabel = "Iteration", ylabel = "Score", title = "Score Through Training")
axarr[1].set_ylim([0, 1])
plt.tight_layout()
print("\nAnalytical method:")
print(f"Training score = {LR_gradient.score(X_train, y_train).round(4)}")
print(f"Validation score = {LR_gradient.score(X_val, y_val).round(4)}")
print("\nGradient method:")
print(f"Training score = {LR_gradient.score(X_train, y_train).round(4)}")
print(f"Validation score = {LR_gradient.score(X_val, y_val).round(4)}")
Analytical method:
Training score = 0.4919
Validation score = 0.4647
Gradient method:
Training score = 0.4919
Validation score = 0.4647
Experiments
Experiments 1 and 2: Many Features and LASSO Regularization
In this experiment we will increase the number of features up to n - 1, one fewer than the number of training points, in order to study what happens to the training and validation scores. We will also use a linear regression model that adds a regularizing term to its loss function, scikit-learn's Lasso, to fight overfitting when the number of features is very high.
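For reference, scikit-learn's Lasso minimizes the L1-regularized least-squares objective (1 / (2 * n_samples)) * ||y - Xw||_2^2 + alpha * ||w||_1. The L1 penalty shrinks the weights and drives many of them to exactly zero, which is what tames the model as the number of features approaches the number of training points; the small alpha = 0.001 used below keeps the penalty gentle.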
from sklearn.linear_model import Lasso
n_train = 100
n_val = 100
noise = 0.2
scores = []
scores_lasso = []

for i in range(1, n_train):
    p_features = i

    X_train, y_train, X_val, y_val = LR_data(n_train, n_val, p_features, noise)

    LR = LinearRegression()
    LR.fit_analytical(X_train, y_train)

    LR_lasso = Lasso(alpha = 0.001)
    LR_lasso.fit(X_train, y_train)

    scores.append({"train": LR.score(X_train, y_train), "validation": LR.score(X_val, y_val)})
    scores_lasso.append({"train": LR_lasso.score(X_train, y_train), "validation": LR_lasso.score(X_val, y_val)})
# plot score
fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(12, 6), sharex=True, sharey=True)

scores_df = pd.DataFrame(scores)
scores_df.index = np.arange(1, len(scores_df) + 1)
scores_df.plot(ax=ax0, xlabel="Number of features", ylabel="Score")
ax0.set_ylim([0, 1.05])

scores_lasso_df = pd.DataFrame(scores_lasso)
scores_lasso_df.index = np.arange(1, len(scores_lasso_df) + 1)
scores_lasso_df.plot(ax=ax1, xlabel="Number of features", ylabel="Score")
print(f"Scores with {n_train} training samples and {n_train-1} features:")
print(f"Training score = {round(scores[-1]['train'], 4)}")
print(f"Validation score = {round(scores[-1]['validation'], 4)}")
print(f"\nScores while using modified loss function with regularization term:")
print(f"Training score = {round(scores_lasso[-1]['train'], 4)}")
print(f"Validation score = {round(scores_lasso[-1]['validation'], 4)}")
Scores with 100 training samples and 99 features:
Training score = 1.0
Validation score = 0.5331
Scores while using modified loss function with regularization term:
Training score = 0.9982
Validation score = 0.8288
As we can clearly see, our implementation becomes severely overfit as the number of features approaches the number of training examples: the training score approaches perfection while the validation score gets worse. This is expected: once p_features = n_train - 1, the padded feature matrix is square and (for generic random data) invertible, so the least-squares fit can interpolate the training points exactly, fitting the noise along with the signal.
The scikit-learn implementation with the regularization term also exhibits serious overfitting, but not to the same degree as our implementation: when the number of features is nearly equal to the number of training examples, the regularization term keeps the validation score from collapsing.
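One way to see the mechanism directly is a quick check on the LR_lasso model left over from the final loop iteration above, counting how many coefficients the L1 penalty zeroed out:

# Count how many coefficients the L1 penalty drove to exactly zero in the
# final fit (LR_lasso still holds the p_features = 99 model from the loop).
n_zero = np.sum(LR_lasso.coef_ == 0)
print(f"{n_zero} of {len(LR_lasso.coef_)} Lasso coefficients are exactly zero")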