import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
#the remaining imports are not used in this example
from sklearn import tree
from sklearn import neighbors
from sklearn import svm
from sklearn import ensemble
from sklearn import cluster
import seaborn as sns

os.chdir(r'D:\projects\wordpress\ex47')
print(os.getcwd())

#some regression data sources available in sklearn.datasets:
#datasets.load_boston (removed in scikit-learn 1.2),
#datasets.fetch_california_housing, datasets.make_regression

#Linear Regression
np.random.seed(123)
#we have 50 known features, but only 10 of those features contribute to
#the predictive power of the regression model.
X_all, y_all = datasets.make_regression(n_samples=50, n_features=50,
                                        n_informative=10)  #, noise=2.5)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all,
                                                    train_size=0.5)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
model = linear_model.LinearRegression()
model.fit(X_train, y_train)

#Sum of squared errors (SSE) between the model and the data.
def sse(resid):
    return sum(resid**2)

resid_train = y_train - model.predict(X_train)
sse_train = sse(resid_train)
print(sse_train)
resid_test = y_test - model.predict(X_test)
sse_test = sse(resid_test)
print(sse_test)
print(model.score(X_train, y_train), model.score(X_test, y_test))

def plot_residuals_and_coeff(resid_train, resid_test, coeff):
    fig, axes = plt.subplots(1, 3, figsize=(12, 3))
    axes[0].bar(np.arange(len(resid_train)), resid_train)
    axes[0].set_xlabel("sample number")
    axes[0].set_ylabel("residual")
    axes[0].set_title("training data")
    axes[1].bar(np.arange(len(resid_test)), resid_test)
    axes[1].set_xlabel("sample number")
    axes[1].set_ylabel("residual")
    axes[1].set_title("testing data")
    axes[2].bar(np.arange(len(coeff)), coeff)
    axes[2].set_xlabel("coefficient number")
    axes[2].set_ylabel("coefficient")
    fig.tight_layout()
    return fig, axes

fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)
plt.savefig("example47_1.png", dpi=100)

#Ridge regression
model = linear_model.Ridge()  #alpha=2.5)
model.fit(X_train, y_train)
resid_train = y_train - model.predict(X_train)
sse_train = sse(resid_train)
print(sse_train)
resid_test = y_test - model.predict(X_test)
sse_test = sse(resid_test)
print(sse_test)
print(model.score(X_train, y_train), model.score(X_test, y_test))
fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)
plt.savefig("example47_2.png", dpi=100)

#Lasso regression
model = linear_model.Lasso(alpha=1.0)
model.fit(X_train, y_train)
resid_train = y_train - model.predict(X_train)
sse_train = sse(resid_train)
print(sse_train)
resid_test = y_test - model.predict(X_test)
sse_test = sse(resid_test)
print(sse_test)
fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)
plt.savefig("example47_3.png", dpi=100)

#Regularization strength parameter alpha
alphas = np.logspace(-4, 2, 100)
coeffs = np.zeros((len(alphas), X_train.shape[1]))
sse_train = np.zeros_like(alphas)
sse_test = np.zeros_like(alphas)
for n, alpha in enumerate(alphas):
    model = linear_model.Lasso(alpha=alpha)
    model.fit(X_train, y_train)
    coeffs[n, :] = model.coef_
    resid = y_train - model.predict(X_train)
    sse_train[n] = sse(resid)
    resid = y_test - model.predict(X_test)
    sse_test[n] = sse(resid)

fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharex=True)
for n in range(coeffs.shape[1]):
    axes[0].plot(np.log10(alphas), coeffs[:, n], color='k', lw=0.5)
axes[1].semilogy(np.log10(alphas), sse_train, label="train")
axes[1].semilogy(np.log10(alphas), sse_test, label="test")
axes[1].legend(loc=0)
axes[0].set_xlabel(r"${\log_{10}}\alpha$", fontsize=18)
axes[0].set_ylabel(r"coefficients", fontsize=18)
axes[1].set_xlabel(r"${\log_{10}}\alpha$", fontsize=18)
axes[1].set_ylabel(r"sse", fontsize=18)
fig.tight_layout()
plt.savefig("example47_4.png", dpi=100)
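
#A minimal extra sketch: the sweep above can also report which alpha value
#minimizes the test-set SSE, which is handy to compare against the alpha_
#that LassoCV selects below. best_n is a new helper variable, not part of
#the original listing.
best_n = np.argmin(sse_test)
print("alpha minimizing test SSE: %g" % alphas[best_n])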
#LassoCV: regularized regression that cross-validates over several values
#of alpha
model = linear_model.LassoCV()
model.fit(X_all, y_all)
print(model.alpha_)
resid_train = y_train - model.predict(X_train)
sse_train = sse(resid_train)
print(sse_train)
resid_test = y_test - model.predict(X_test)
sse_test = sse(resid_test)
print(sse_test)
print(model.score(X_train, y_train), model.score(X_test, y_test))
fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)
plt.savefig("example47_5.png", dpi=100)

#ElasticNetCV: regularized regression with a mix of L1 and L2 penalties,
#cross-validated over several values of alpha
model = linear_model.ElasticNetCV()
model.fit(X_all, y_all)
print(model.alpha_)
print(model.l1_ratio_)
resid_train = y_train - model.predict(X_train)
sse_train = sse(resid_train)
print(sse_train)
resid_test = y_test - model.predict(X_test)
sse_test = sse(resid_test)
print(sse_test)
print(model.score(X_train, y_train), model.score(X_test, y_test))
fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)
plt.savefig("example47_6.png", dpi=100)
plt.show()
plt.close()
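
#A minimal extra check, not part of the original listing: the metrics module
#imported above provides mean_squared_error, which equals the SSE divided by
#the number of samples, so it can double-check the sse() helper on the last
#fitted model. Both prints should show the same value.
y_pred = model.predict(X_test)
print(metrics.mean_squared_error(y_test, y_pred))
print(sse(y_test - y_pred) / len(y_test))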