{"id":232,"date":"2019-01-13T04:08:02","date_gmt":"2019-01-13T12:08:02","guid":{"rendered":"http:\/\/gantovnik.com\/bio-tips\/?p=232"},"modified":"2024-07-21T05:24:32","modified_gmt":"2024-07-21T12:24:32","slug":"regression","status":"publish","type":"post","link":"https:\/\/gantovnik.com\/bio-tips\/2019\/01\/regression\/","title":{"rendered":"#47 Regression"},"content":{"rendered":"<pre class=\"brush: python; title: ; notranslate\" title=\"\">\nimport os\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn import datasets\nfrom sklearn.model_selection import train_test_split\nfrom sklearn import linear_model\nfrom sklearn import metrics\nfrom sklearn import tree\nfrom sklearn import neighbors\nfrom sklearn import svm\nfrom sklearn import ensemble\nfrom sklearn import cluster\nimport seaborn as sns\nos.chdir(r&amp;#039;D:\\projects\\wordpress\\ex47&amp;#039;)\nos.getcwd()\ndatasets.load_boston\ndatasets.fetch_california_housing\ndatasets.make_regression\n\n#Linear Regression\nnp.random.seed(123)\n#we have 50 known features, but only 10 of those features contribute to\n#the predictive power of the regression model.\nX_all, y_all = datasets.make_regression(n_samples=50, n_features=50, n_informative=10) #, noise=2.5)\nX_train, X_test, y_train, y_test = train_test_split(X_all, y_all, train_size=0.5)\nX_train.shape, y_train.shape\nX_test.shape, y_test.shape\nmodel = linear_model.LinearRegression()\nmodel.fit(X_train, y_train)\n\n#Sum of squared errors (SSE) between models and the data.\ndef sse(resid):\n    return sum(resid**2)\n\nresid_train = y_train - model.predict(X_train)\nsse_train = sse(resid_train)\nprint(sse_train)\nresid_test = y_test - model.predict(X_test)\nsse_test = sse(resid_train)\nprint(sse_test)\nmodel.score(X_train, y_train)\nmodel.score(X_test, y_test)\n\ndef plot_residuals_and_coeff(resid_train, resid_test, coeff):\n    fig, axes = plt.subplots(1, 3, figsize=(12, 3))\n    axes&#x5B;0].bar(np.arange(len(resid_train)), resid_train)\n    
axes&#x5B;0].set_xlabel(&amp;quot;sample number&amp;quot;)\n    axes&#x5B;0].set_ylabel(&amp;quot;residual&amp;quot;)\n    axes&#x5B;0].set_title(&amp;quot;training data&amp;quot;)\n    axes&#x5B;1].bar(np.arange(len(resid_test)), resid_test)\n    axes&#x5B;1].set_xlabel(&amp;quot;sample number&amp;quot;)\n    axes&#x5B;1].set_ylabel(&amp;quot;residual&amp;quot;)\n    axes&#x5B;1].set_title(&amp;quot;testing data&amp;quot;)\n    axes&#x5B;2].bar(np.arange(len(coeff)), coeff)\n    axes&#x5B;2].set_xlabel(&amp;quot;coefficient number&amp;quot;)\n    axes&#x5B;2].set_ylabel(&amp;quot;coefficient&amp;quot;)\n    fig.tight_layout()\n    return fig, axes\n\nfig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)\nplt.savefig(&amp;quot;example47_1.png&amp;quot;, dpi=100)\n\n#Ridge regression\nmodel = linear_model.Ridge() #alpha=2.5)\nmodel.fit(X_train, y_train)\nresid_train = y_train - model.predict(X_train)\nsse_train = sum(resid_train**2)\nprint(sse_train)\nresid_test = y_test - model.predict(X_test)\nsse_test = sum(resid_test**2)\nprint(sse_test)\nmodel.score(X_train, y_train), model.score(X_test, y_test)\nfig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)\nplt.savefig(&amp;quot;example47_2.png&amp;quot;, dpi=100)\n\n#Lasso regression\nmodel = linear_model.Lasso(alpha=1.0)\nmodel.fit(X_train, y_train)\nresid_train = y_train - model.predict(X_train)\nsse_train = sse(resid_train)\nprint(sse_train)\nresid_test = y_test - model.predict(X_test)\nsse_test = sse(resid_test)\nprint(sse_test)\nfig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)\nplt.savefig(&amp;quot;example47_3.png&amp;quot;, dpi=100)\n\n#Regularization strength parameter alpha\n\nalphas = np.logspace(-4, 2, 100)\ncoeffs = np.zeros((len(alphas), X_train.shape&#x5B;1]))\nsse_train = np.zeros_like(alphas)\nsse_test = np.zeros_like(alphas)\nfor n, alpha in enumerate(alphas):\n    model = 
linear_model.Lasso(alpha=alpha)\n    model.fit(X_train, y_train)\n    coeffs&#x5B;n, :] = model.coef_\n    resid = y_train - model.predict(X_train)\n    sse_train&#x5B;n] = sum(resid**2)\n    resid = y_test - model.predict(X_test)\n    sse_test&#x5B;n] = sum(resid**2)\n\nfig, axes = plt.subplots(1, 2, figsize=(12, 4), sharex=True)\n\nfor n in range(coeffs.shape&#x5B;1]):\n    axes&#x5B;0].plot(np.log10(alphas), coeffs&#x5B;:, n], color=&amp;#039;k&amp;#039;, lw=0.5)\n\naxes&#x5B;1].semilogy(np.log10(alphas), sse_train, label=&amp;quot;train&amp;quot;)\naxes&#x5B;1].semilogy(np.log10(alphas), sse_test, label=&amp;quot;test&amp;quot;)\naxes&#x5B;1].legend(loc=0)\n\naxes&#x5B;0].set_xlabel(r&amp;quot;${\\log_{10}}\\alpha$&amp;quot;, fontsize=18)\naxes&#x5B;0].set_ylabel(r&amp;quot;coefficients&amp;quot;, fontsize=18)\naxes&#x5B;1].set_xlabel(r&amp;quot;${\\log_{10}}\\alpha$&amp;quot;, fontsize=18)\naxes&#x5B;1].set_ylabel(r&amp;quot;sse&amp;quot;, fontsize=18)\nfig.tight_layout()\nplt.savefig(&amp;quot;example47_4.png&amp;quot;, dpi=100)\n\n#LassoCV with testing a regularized regression with several values of alpha\nmodel = linear_model.LassoCV()\nmodel.fit(X_all, y_all)\nprint(model.alpha_)\nresid_train = y_train - model.predict(X_train)\nsse_train = sse(resid_train)\nprint(sse_train)\nmodel.score(X_train, y_train), model.score(X_test, y_test)\nfig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)\nplt.savefig(&amp;quot;example47_5.png&amp;quot;, dpi=100)\n\n#ElasticNetCV with testing a regularized regression with several values of alpha\nmodel = linear_model.ElasticNetCV()\nmodel.fit(X_all, y_all)\nprint(model.alpha_)\nprint(model.l1_ratio)\nresid_train = y_train - model.predict(X_train)\nsse_train = sum(resid_train**2)\nprint(sse_train)\nresid_test = y_test - model.predict(X_test)\nsse_test = sum(resid_test**2)\nprint(sse_test)\nmodel.score(X_train, y_train), model.score(X_test, y_test)\nfig, ax 
= plot_residuals_and_coeff(resid_train, resid_test, model.coef_)\nplt.savefig(&amp;quot;example47_6.png&amp;quot;, dpi=100)\nplt.show()\nplt.close()\n<\/pre>\n<div class=\"et_pb_slider et_pb_slider_fullwidth_off et_pb_gallery_post_type\">\n\t\t\t\t<div class=\"et_pb_slides\">\n\t\t\t\t\t<div class=\"et_pb_slide\" style=\"background: url(https:\/\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2019\/01\/example47_2.png);\"><\/div><div class=\"et_pb_slide\" style=\"background: url(https:\/\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2019\/01\/example47_3.png);\"><\/div><div class=\"et_pb_slide\" style=\"background: url(https:\/\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2019\/01\/example47_4.png);\"><\/div><div class=\"et_pb_slide\" style=\"background: url(https:\/\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2019\/01\/example47_5.png);\"><\/div><div class=\"et_pb_slide\" style=\"background: url(https:\/\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2019\/01\/example47_6.png);\"><\/div><div class=\"et_pb_slide\" style=\"background: url(https:\/\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2019\/01\/example47_1.png);\"><\/div>\n\t\t\t\t<\/div>\n\t\t\t<\/div>\n<pre>\n<\/pre>\n","protected":false},"excerpt":{"rendered":"<p>import os import numpy as np import matplotlib.pyplot as plt from sklearn import datasets from sklearn.model_selection import train_test_split from sklearn import linear_model from sklearn import metrics from sklearn import tree from sklearn import neighbors from sklearn import svm from sklearn import ensemble from sklearn import cluster import seaborn as sns os.chdir(r&amp;#039;D:\\projects\\wordpress\\ex47&amp;#039;) os.getcwd() datasets.load_boston datasets.fetch_california_housing datasets.make_regression 
[&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"nf_dc_page":"","_et_pb_use_builder":"","_et_pb_old_content":"","_et_gb_content_width":"","_lmt_disableupdate":"yes","_lmt_disable":"","_jetpack_newsletter_access":"","_jetpack_dont_email_post_to_subs":false,"_jetpack_newsletter_tier_id":0,"_jetpack_memberships_contains_paywalled_content":false,"_jetpack_memberships_contains_paid_content":false,"footnotes":"","jetpack_post_was_ever_published":false},"categories":[2],"tags":[],"class_list":["post-232","post","type-post","status-publish","format-standard","hentry","category-python"],"modified_by":"gantovnik","jetpack_featured_media_url":"","jetpack_sharing_enabled":true,"jetpack_shortlink":"https:\/\/wp.me\/p8bH0k-3K","jetpack_likes_enabled":true,"jetpack-related-posts":[{"id":243,"url":"https:\/\/gantovnik.com\/bio-tips\/2019\/01\/classification\/","url_meta":{"origin":232,"position":0},"title":"#48 Classification","author":"gantovnik","date":"2019-01-13","format":false,"excerpt":"[code language=\"python\"] import os import numpy as np import matplotlib.pyplot as plt from sklearn import datasets from sklearn.model_selection import train_test_split from sklearn import linear_model from sklearn import metrics from sklearn import tree from sklearn import neighbors from sklearn import svm from sklearn import ensemble from sklearn import cluster import seaborn\u2026","rel":"","context":"In &quot;python&quot;","block_context":{"text":"python","link":"https:\/\/gantovnik.com\/bio-tips\/category\/python\/"},"img":{"alt_text":"example48","src":"https:\/\/i0.wp.com\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2019\/01\/example48.png?resize=350%2C200","width":350,"height":200,"srcset":"https:\/\/i0.wp.com\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2019\/01\/example48.png?resize=350%2C200 1x, 
https:\/\/i0.wp.com\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2019\/01\/example48.png?resize=525%2C300 1.5x"},"classes":[]},{"id":248,"url":"https:\/\/gantovnik.com\/bio-tips\/2019\/01\/clustering\/","url_meta":{"origin":232,"position":1},"title":"#49 Clustering","author":"gantovnik","date":"2019-01-13","format":false,"excerpt":"[code language=\"python\"] import os import numpy as np import matplotlib.pyplot as plt from sklearn import datasets from sklearn import metrics from sklearn import cluster os.chdir(r'D:\\projects\\wordpress\\ex49') os.getcwd() iris = datasets.load_iris() X, y = iris.data, iris.target np.random.seed(123) n_clusters = 3 c = cluster.KMeans(n_clusters=n_clusters) c.fit(X) y_pred = c.predict(X) print(y_pred[::8]) print(y[::8]) idx_0, idx_1, idx_2\u2026","rel":"","context":"In &quot;python&quot;","block_context":{"text":"python","link":"https:\/\/gantovnik.com\/bio-tips\/category\/python\/"},"img":{"alt_text":"example49","src":"https:\/\/i0.wp.com\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2019\/01\/example49.png?resize=350%2C200","width":350,"height":200,"srcset":"https:\/\/i0.wp.com\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2019\/01\/example49.png?resize=350%2C200 1x, https:\/\/i0.wp.com\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2019\/01\/example49.png?resize=525%2C300 1.5x"},"classes":[]},{"id":2104,"url":"https:\/\/gantovnik.com\/bio-tips\/2024\/01\/411-clustering-using-dbscan-algorithm-in-sklearn-cluster-in-python\/","url_meta":{"origin":232,"position":2},"title":"#411 Clustering using DBSCAN algorithm in sklearn.cluster in python","author":"gantovnik","date":"2024-01-18","format":false,"excerpt":"DBSCAN works by finding core points that have many data points within a given radius. Once the core is defined, the process is iteratively computed until there are no more core points definable within the maximum radius. 
This algorithm does exceptionally well compared to kmeans where there is noise present\u2026","rel":"","context":"In &quot;cluster&quot;","block_context":{"text":"cluster","link":"https:\/\/gantovnik.com\/bio-tips\/category\/cluster\/"},"img":{"alt_text":"","src":"https:\/\/i0.wp.com\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2024\/01\/ex411.png?resize=350%2C200&ssl=1","width":350,"height":200},"classes":[]},{"id":304,"url":"https:\/\/gantovnik.com\/bio-tips\/2019\/01\/training-a-perceptron-via-scikit-learn\/","url_meta":{"origin":232,"position":3},"title":"#53 Training a perceptron via scikit-learn","author":"gantovnik","date":"2019-01-22","format":false,"excerpt":"[code language=\"python\"] import os import matplotlib.pyplot as plt import numpy as np from sklearn import datasets os.chdir(r'D:\\projects\\wordpress\\ex53') os.getcwd() iris = datasets.load_iris() X = iris.data[:, [2, 3]] y = iris.target from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) from sklearn.preprocessing import StandardScaler sc = StandardScaler() sc.fit(X_train)\u2026","rel":"","context":"In &quot;python&quot;","block_context":{"text":"python","link":"https:\/\/gantovnik.com\/bio-tips\/category\/python\/"},"img":{"alt_text":"","src":"https:\/\/i0.wp.com\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2019\/01\/example53.png?resize=350%2C200&ssl=1","width":350,"height":200,"srcset":"https:\/\/i0.wp.com\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2019\/01\/example53.png?resize=350%2C200&ssl=1 1x, https:\/\/i0.wp.com\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2019\/01\/example53.png?resize=525%2C300&ssl=1 1.5x"},"classes":[]},{"id":1041,"url":"https:\/\/gantovnik.com\/bio-tips\/2021\/11\/186-generate-the-feature-importance\/","url_meta":{"origin":232,"position":4},"title":"#186 Generate the feature importance","author":"gantovnik","date":"2021-11-07","format":false,"excerpt":"[code 
language=\"python\"] import pandas as pd import seaborn as sns import os import matplotlib.pyplot as plt os.chdir(r'D:\\projects\\wordpress\\ex186') sns.set(style=\"ticks\") # read the downloaded input data (marketing data) df = pd.read_csv('https:\/\/raw.githubusercontent.com\/TrainingByPackt\/Big-Data-Analysis-with-Python\/master\/Lesson07\/Dataset\/bank.csv', sep=';') df['y'].replace(['yes','no'],[1,0],inplace=True) df['default'].replace(['yes','no'],[1,0],inplace=True) df['housing'].replace(['yes','no'],[1,0],inplace=True) df['loan'].replace(['yes','no'],[1,0],inplace=True) corr_df = df.corr() sns.heatmap(corr_df, xticklabels=corr_df.columns.values, yticklabels=corr_df.columns.values, annot = True, annot_kws={'size':12}) heat_map=plt.gcf(); heat_map.set_size_inches(10,5) plt.xticks(fontsize=10); plt.yticks(fontsize=10); plt.savefig('ex186a.jpg', dpi=300)\u2026","rel":"","context":"In &quot;python&quot;","block_context":{"text":"python","link":"https:\/\/gantovnik.com\/bio-tips\/category\/python\/"},"img":{"alt_text":"","src":"https:\/\/i0.wp.com\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2021\/11\/ex186b.jpg?resize=350%2C200&ssl=1","width":350,"height":200,"srcset":"https:\/\/i0.wp.com\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2021\/11\/ex186b.jpg?resize=350%2C200&ssl=1 1x, https:\/\/i0.wp.com\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2021\/11\/ex186b.jpg?resize=525%2C300&ssl=1 1.5x, https:\/\/i0.wp.com\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2021\/11\/ex186b.jpg?resize=700%2C400&ssl=1 2x, https:\/\/i0.wp.com\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2021\/11\/ex186b.jpg?resize=1050%2C600&ssl=1 3x"},"classes":[]},{"id":1002,"url":"https:\/\/gantovnik.com\/bio-tips\/2021\/10\/180-linear-regression-using-python\/","url_meta":{"origin":232,"position":5},"title":"#180 Linear regression using python","author":"gantovnik","date":"2021-10-26","format":false,"excerpt":"[code language=\"python\"] import os import matplotlib.pyplot as plt import 
numpy as np import seaborn as sns from sklearn.linear_model import LinearRegression sns.set() os.chdir(r'D:\\projects\\wordpress\\ex66') os.getcwd() rng = np.random.RandomState(1) x = 10 * rng.rand(50) y = 2 * x - 5 + rng.randn(50) plt.scatter(x, y) model = LinearRegression(fit_intercept=True) model.fit(x[:, np.newaxis], y) xfit =\u2026","rel":"","context":"In &quot;python&quot;","block_context":{"text":"python","link":"https:\/\/gantovnik.com\/bio-tips\/category\/python\/"},"img":{"alt_text":"","src":"https:\/\/i0.wp.com\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2021\/10\/ex180.png?resize=350%2C200&ssl=1","width":350,"height":200,"srcset":"https:\/\/i0.wp.com\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2021\/10\/ex180.png?resize=350%2C200&ssl=1 1x, https:\/\/i0.wp.com\/gantovnik.com\/bio-tips\/wp-content\/uploads\/2021\/10\/ex180.png?resize=525%2C300&ssl=1 1.5x"},"classes":[]}],"_links":{"self":[{"href":"https:\/\/gantovnik.com\/bio-tips\/wp-json\/wp\/v2\/posts\/232","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/gantovnik.com\/bio-tips\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/gantovnik.com\/bio-tips\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/gantovnik.com\/bio-tips\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/gantovnik.com\/bio-tips\/wp-json\/wp\/v2\/comments?post=232"}],"version-history":[{"count":1,"href":"https:\/\/gantovnik.com\/bio-tips\/wp-json\/wp\/v2\/posts\/232\/revisions"}],"predecessor-version":[{"id":2906,"href":"https:\/\/gantovnik.com\/bio-tips\/wp-json\/wp\/v2\/posts\/232\/revisions\/2906"}],"wp:attachment":[{"href":"https:\/\/gantovnik.com\/bio-tips\/wp-json\/wp\/v2\/media?parent=232"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/gantovnik.com\/bio-tips\/wp-json\/wp\/v2\/categories?post=232"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/gantovnik.com\/bio-tips\/wp-json\/wp\/v2\/tags?post=232"}],"cur
ies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}