import pandas as pd
import seaborn as sns
import os
import matplotlib.pyplot as plt
# NOTE(review): hard-coded, machine-specific working directory. The image
# saved below is written relative to it, and os.chdir raises if the folder
# does not exist.
os.chdir(r'D:\projects\wordpress\ex186')
sns.set(style="ticks")

# Read the input data (bank marketing data); the file is ';'-separated.
df = pd.read_csv(
    'https://raw.githubusercontent.com/TrainingByPackt/Big-Data-Analysis-with-Python/master/Lesson07/Dataset/bank.csv',
    sep=';',
)

# Recode the yes/no columns as 1/0.
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: that chained in-place form is
# deprecated in pandas 2.x and does not modify `df` under Copy-on-Write.
# Assumes these columns only hold 'yes'/'no'; any other value would become
# NaN -- TODO confirm against the dataset.
for binary_col in ['y', 'default', 'housing', 'loan']:
    df[binary_col] = df[binary_col].map({'yes': 1, 'no': 0})

# Correlation matrix of the numeric columns only. The frame still contains
# string columns at this point, and pandas >= 2.0 raises on them unless
# numeric_only=True is passed explicitly.
corr_df = df.corr(numeric_only=True)

# Annotated heatmap of the correlation matrix.
sns.heatmap(
    corr_df,
    xticklabels=corr_df.columns.values,
    yticklabels=corr_df.columns.values,
    annot=True,
    annot_kws={'size': 12},
)
heat_map = plt.gcf()
heat_map.set_size_inches(10, 5)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.savefig('ex186a.jpg', dpi=300)
plt.show()
# pip install boruta --upgrade
# import RandomForestClassifier from sklearn and BorutaPy from boruta
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

# Convert every non-numeric column to integer category codes.
# This is label encoding via `cat.codes`, not one-hot encoding.
# The check is "not numeric" rather than `dtype == 'object'` so that columns
# stored with pandas' dedicated string dtype are encoded as well.
for col_name in df.columns:
    if not pd.api.types.is_numeric_dtype(df[col_name]):
        df[col_name] = df[col_name].astype('category').cat.codes

# Separate arrays for the independent variables (X) and the target (Y).
X = df.drop(['y'], axis=1).values
Y = df['y'].values

# Build the random-forest estimator and the Boruta selector wrapped around it.
rfc = RandomForestClassifier(
    n_estimators=200,
    n_jobs=4,
    class_weight='balanced',
    max_depth=6,
)
boruta_selector = BorutaPy(rfc, n_estimators='auto', verbose=2)
n_train = len(X)  # number of rows; not used in the visible part of the script

# Fit the Boruta algorithm.
boruta_selector.fit(X, Y)

# Pair each feature name with its Boruta rank and sort by rank, lowest first.
feature_df = pd.DataFrame(
    df.drop(['y'], axis=1).columns.tolist(),
    columns=['features'],
)
feature_df['rank'] = boruta_selector.ranking_
feature_df = feature_df.sort_values('rank', ascending=True).reset_index(drop=True)

# Start a fresh figure so the bar plot is never drawn onto a figure left open
# by an earlier plot (e.g. with a non-interactive backend).
plt.figure()
sns.barplot(x='rank', y='features', data=feature_df)
plt.savefig('ex186b.jpg', dpi=300)
# (Blog page footer text -- "Like this", "Like Loading...", "Related", "Recent Comments" -- not part of the script.)