import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
from sklearn import tree
from sklearn import neighbors
from sklearn import svm
from sklearn import ensemble
from sklearn import cluster
import seaborn as sns
os.chdir(r'D:\projects\wordpress\ex47')
print(os.getcwd())
#Regression datasets available in sklearn.datasets include
#fetch_california_housing and the synthetic generator make_regression,
#which is used below. (load_boston was removed in scikit-learn 1.2.)
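#A minimal sketch of loading a real dataset instead (not used below;
#assumes the file can be downloaded and cached on the first call):
housing = datasets.fetch_california_housing()
print(housing.data.shape, housing.target.shape)  #(20640, 8) and (20640,)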

#Linear Regression
np.random.seed(123)
#we have 50 known features, but only 10 of those features contribute to
#the predictive power of the regression model.
X_all, y_all = datasets.make_regression(n_samples=50, n_features=50, n_informative=10) #, noise=2.5)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, train_size=0.5)
#25 training and 25 testing samples, each with 50 features.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
model = linear_model.LinearRegression()
model.fit(X_train, y_train)

#Sum of squared errors (SSE) between the model predictions and the data.
def sse(resid):
    return np.sum(resid**2)

resid_train = y_train - model.predict(X_train)
sse_train = sse(resid_train)
print(sse_train)
resid_test = y_test - model.predict(X_test)
sse_test = sse(resid_test)
print(sse_test)
#R^2 score on the training and testing data, respectively.
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))
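#The same diagnostics are available through sklearn.metrics (imported
#above); a minimal equivalent sketch, since MSE is just SSE/n:
print(metrics.mean_squared_error(y_test, model.predict(X_test)))
print(metrics.r2_score(y_test, model.predict(X_test)))  #matches model.score(X_test, y_test)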

def plot_residuals_and_coeff(resid_train, resid_test, coeff):
    fig, axes = plt.subplots(1, 3, figsize=(12, 3))
    axes[0].bar(np.arange(len(resid_train)), resid_train)
    axes[0].set_xlabel("sample number")
    axes[0].set_ylabel("residual")
    axes[0].set_title("training data")
    axes[1].bar(np.arange(len(resid_test)), resid_test)
    axes[1].set_xlabel("sample number")
    axes[1].set_ylabel("residual")
    axes[1].set_title("testing data")
    axes[2].bar(np.arange(len(coeff)), coeff)
    axes[2].set_xlabel("coefficient number")
    axes[2].set_ylabel("coefficient")
    fig.tight_layout()
    return fig, axes

fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)
plt.savefig("example47_1.png", dpi=100)

#Ridge regression
model = linear_model.Ridge()  #or linear_model.Ridge(alpha=2.5) for a stronger penalty
model.fit(X_train, y_train)
resid_train = y_train - model.predict(X_train)
sse_train = sse(resid_train)
print(sse_train)
resid_test = y_test - model.predict(X_test)
sse_test = sse(resid_test)
print(sse_test)
print(model.score(X_train, y_train), model.score(X_test, y_test))
fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)
plt.savefig("example47_2.png", dpi=100)
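#A hedged sketch of choosing alpha for Ridge automatically: RidgeCV
#cross-validates over a grid of candidates (this grid is an illustrative
#choice, not part of the original example).
model_cv = linear_model.RidgeCV(alphas=np.logspace(-4, 2, 100))
model_cv.fit(X_train, y_train)
print(model_cv.alpha_)  #penalty selected by (leave-one-out) cross-validation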

#Lasso regression
model = linear_model.Lasso(alpha=1.0)
model.fit(X_train, y_train)
resid_train = y_train - model.predict(X_train)
sse_train = sse(resid_train)
print(sse_train)
resid_test = y_test - model.predict(X_test)
sse_test = sse(resid_test)
print(sse_test)
fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)
plt.savefig("example47_3.png", dpi=100)
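#Lasso drives many coefficients exactly to zero; a quick check (not in
#the original) of how many of the 50 features the model kept:
print(np.count_nonzero(model.coef_), "nonzero coefficients out of", len(model.coef_))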

#Regularization strength parameter alpha

alphas = np.logspace(-4, 2, 100)
coeffs = np.zeros((len(alphas), X_train.shape[1]))
sse_train = np.zeros_like(alphas)
sse_test = np.zeros_like(alphas)
for n, alpha in enumerate(alphas):
    model = linear_model.Lasso(alpha=alpha)
    model.fit(X_train, y_train)
    coeffs[n, :] = model.coef_
    resid = y_train - model.predict(X_train)
    sse_train[n] = sse(resid)
    resid = y_test - model.predict(X_test)
    sse_test[n] = sse(resid)

fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharex=True)

for n in range(coeffs.shape[1]):
    axes[0].plot(np.log10(alphas), coeffs[:, n], color='k', lw=0.5)

axes[1].semilogy(np.log10(alphas), sse_train, label="train")
axes[1].semilogy(np.log10(alphas), sse_test, label="test")
axes[1].legend(loc=0)

axes[0].set_xlabel(r"$\log_{10}\alpha$", fontsize=18)
axes[0].set_ylabel(r"coefficients", fontsize=18)
axes[1].set_xlabel(r"$\log_{10}\alpha$", fontsize=18)
axes[1].set_ylabel(r"SSE", fontsize=18)
fig.tight_layout()
plt.savefig("example47_4.png", dpi=100)
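#The loop above can also be written with scikit-learn's built-in path
#solver; a minimal sketch that computes the same coefficient profiles
#(without the SSE bookkeeping):
alphas_path, coefs_path, _ = linear_model.lasso_path(X_train, y_train, alphas=alphas)
print(coefs_path.shape)  #(n_features, n_alphas)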

#LassoCV: Lasso with the regularization strength alpha chosen by cross-validation
model = linear_model.LassoCV()
model.fit(X_all, y_all)
print(model.alpha_)
resid_train = y_train - model.predict(X_train)
sse_train = sse(resid_train)
print(sse_train)
resid_test = y_test - model.predict(X_test)
sse_test = sse(resid_test)
print(sse_test)
print(model.score(X_train, y_train), model.score(X_test, y_test))
fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)
plt.savefig("example47_5.png", dpi=100)
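#Note that LassoCV was fit on X_all, so cross-validation has already seen
#the test rows; a stricter sketch would select alpha on the training
#split only:
model_cv = linear_model.LassoCV(cv=5)
model_cv.fit(X_train, y_train)
print(model_cv.alpha_)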

#ElasticNetCV: elastic-net regression with alpha chosen by cross-validation
model = linear_model.ElasticNetCV()
model.fit(X_all, y_all)
print(model.alpha_)
print(model.l1_ratio_)
resid_train = y_train - model.predict(X_train)
sse_train = sse(resid_train)
print(sse_train)
resid_test = y_test - model.predict(X_test)
sse_test = sse(resid_test)
print(sse_test)
print(model.score(X_train, y_train), model.score(X_test, y_test))
fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)
plt.savefig("example47_6.png", dpi=100)
plt.show()
plt.close()
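#By default ElasticNetCV tunes only alpha; a hedged sketch of also
#searching over the l1/l2 mix (the candidate ratios are illustrative):
model_en = linear_model.ElasticNetCV(l1_ratio=[0.1, 0.5, 0.9, 1.0])
model_en.fit(X_all, y_all)
print(model_en.alpha_, model_en.l1_ratio_)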
