import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
1 对目标值的分布画图,观察连续变量的分布情况,是否接近正态分布,是否可以log来校正:
# Descriptive statistics summary of the target variable.
df_train['SalePrice'].describe()
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale draws the equivalent histogram-plus-density view.
sns.histplot(df_train['SalePrice'], kde=True, stat='density', kde_kws=dict(cut=3))
2 分析连续型(数值型)变量与目标值的关系,散点图:
# Scatter plot of one numeric feature against the sale price.
var = 'TotalBsmtSF'
data = df_train[['SalePrice', var]].copy()
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));
3 类别型变量与目标值的关系,箱线图:
# Box plot of the sale price grouped by overall quality.
var = 'OverallQual'
data = df_train[['SalePrice', var]].copy()
f, ax = plt.subplots(figsize=(8, 6))
# Draw on the axes created above and cap the y-range at 800k.
fig = sns.boxplot(x=var, y="SalePrice", data=data, ax=ax)
fig.set_ylim(0, 800000);
4 所有变量间的相关度热图:
# Correlation matrix of the numeric features.
# Select numeric/bool columns explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently in `DataFrame.corr` and raises instead.
corrmat = df_train.select_dtypes(include=['number', 'bool']).corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True)
5 筛选与目标值相关性最高的 k 个变量,画相关度热图
# Correlation heatmap of the k variables most correlated with SalePrice.
k = 10  # number of variables shown in the heatmap
top_rows = corrmat.nlargest(k, 'SalePrice')
cols = top_rows.index
# Columns are the variables, hence rowvar=False.
cm = np.corrcoef(df_train[cols].values, rowvar=False)
sns.set(font_scale=1.25)
tick_labels = cols.values
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=tick_labels,
    xticklabels=tick_labels,
)
plt.show()
6 全景交叉散点分析,不用区分连续型或类别型
# Pairwise scatter plots of the target and a hand-picked set of features.
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
# `size` was renamed to `height` in seaborn 0.9; the old keyword is deprecated.
sns.pairplot(df_train[cols], height=2.5)
plt.show();
7 每列的缺失值信息汇总
# Missing-data summary: null count and null fraction per column.
null_mask = df_train.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
# Show the 20 columns with the most missing values.
missing_data.head(20)
缺失值占比超过15%的考虑放弃。
8 列标准化处理
# Standardize the target column with StandardScaler.
# The scaler expects a 2-D array. Convert to NumPy before adding the column
# axis, because multi-dimensional indexing on a Series (`s[:, np.newaxis]`)
# is no longer supported by pandas.
saleprice_scaled = StandardScaler().fit_transform(df_train['SalePrice'].values[:, np.newaxis])
9 one-hot 处理
# One-hot encode: replace categorical columns with dummy/indicator columns.
df_train = pd.get_dummies(data=df_train)
10 选择特定类型的字段(例如 df_train.select_dtypes(include=['object']) 选出类别型字段)
11 取 log(依赖第 12 步创建的 HasBsmt 列):
# Log-transform the basement area only for houses that have a basement, so
# log(0) is never evaluated. `HasBsmt` must already exist (1 = has basement).
has_bsmt = df_train['HasBsmt'] == 1
# Cast first: writing float logs into an integer column relies on implicit
# upcasting, which newer pandas versions deprecate.
df_train['TotalBsmtSF'] = df_train['TotalBsmtSF'].astype(float)
df_train.loc[has_bsmt, 'TotalBsmtSF'] = np.log(df_train.loc[has_bsmt, 'TotalBsmtSF'])
12 根据某列创造新列:
# Create a binary indicator column (one is enough for a binary feature):
# 1 if the basement area is > 0, otherwise 0.
has_bsmt = df_train['TotalBsmtSF'] > 0
df_train['HasBsmt'] = has_bsmt.astype('int64')
# Log-transform the basement area only where a basement exists, so log(0) is
# never evaluated. Cast first: writing float logs into an integer column
# relies on implicit upcasting, which newer pandas versions deprecate.
df_train['TotalBsmtSF'] = df_train['TotalBsmtSF'].astype(float)
df_train.loc[has_bsmt, 'TotalBsmtSF'] = np.log(df_train.loc[has_bsmt, 'TotalBsmtSF'])
13 类别变量对类别目标值的影响,计数柱状图和饼图显示:
# Count plot of survival per category, with the group sizes written on the bars.
plt.figure(figsize=(11, 7))
ax = sns.countplot(x='Category', hue='Survived', data=p_data)
ax.set_xticklabels(['Passenger', 'Crew'])
# (x, y, category code, survived flag): hand-tuned text positions per bar.
annotations = [
    (-0.22, 700, 'P', 0),
    (0.19, 100, 'P', 1),
    (0.79, 157, 'C', 0),
    (1.19, 45, 'C', 1),
]
for x_pos, y_pos, category, survived in annotations:
    in_group = (p_data['Category'] == category) & (p_data['Survived'] == survived)
    ax.text(x_pos, y_pos, p_data['Survived'][in_group].count(), fontsize=12, fontweight='bold')
plt.show()
# Imported here because the file only imports `go` further down.
import plotly.graph_objs as go

# Split by sex; this code treats Sex == 0 as male and Sex == 1 as female.
male = ds[ds["Sex"] == 0]
female = ds[ds["Sex"] == 1]
# Filter each subset with its own mask. Indexing a subset with a mask built
# from the full frame forces pandas to reindex the mask and emits a warning.
male_survi = male[male["Survived"] == 1]
male_not = male[male["Survived"] == 0]
female_survi = female[female["Survived"] == 1]
female_not = female[female["Survived"] == 0]
labels = ['Male - Survived','Male - Not Survived', "Female - Survived", "Female - Not Survived"]
# Reuse the subsets computed above instead of filtering a second time.
values = [len(male_survi), len(male_not), len(female_survi), len(female_not)]
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.4)])
fig.update_layout(title_text="Analysis on Survival - Gender")
fig.show()
14 连续型特征 对正负样本的影响
import matplotlib.pyplot as plt
import seaborn as sns
# Box plot of age for each survival class.
sns.set_style('darkgrid')
age_fig = plt.figure(figsize=(11, 5))
sns.boxplot(data=p_data, x='Survived', y='Age', ax=age_fig.gca())
plt.show()
# Imported here because `ff` is not imported anywhere in the visible file.
import plotly.figure_factory as ff

# Age distributions of survivors vs. non-survivors.
# Missing ages are dropped: the density curve cannot be estimated from NaN.
surv = ds.loc[ds["Survived"] == 1, "Age"].dropna()
not_surv = ds.loc[ds["Survived"] == 0, "Age"].dropna()
hist_data = [surv, not_surv]
group_labels = ['Survived', 'Not Survived']
fig = ff.create_distplot(hist_data, group_labels, bin_size=.2)
fig.update_layout(title_text="Analysis in Age on Survival Status")
fig.show()
15 混淆矩阵可视化
def plot_confusion_matrix(cm, names, title="Random Forest Model- Confusion matrix", cmap=plt.cm.Blues):
    """Draw a confusion matrix as a colored image on the current figure.

    Args:
        cm: 2-D array of counts; rows are labelled as true classes and
            columns as predicted classes.
        names: Class names used as tick labels on both axes.
        title: Title shown above the plot.
        cmap: Matplotlib colormap used for the cells.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run tight_layout last so the axis labels are included in the layout.
    plt.tight_layout()
# Imported here because `confusion_matrix` is not imported anywhere in the
# visible file.
from sklearn.metrics import confusion_matrix

# Compute the raw (non-normalized) confusion matrix, print it, then plot it.
cm = confusion_matrix(y_test, pred)
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)
plt.figure()
plot_confusion_matrix(cm, ["Not Survived","Survived"])
16
遍历改造字段的每个值:
# Bucket `Age` into 8-year bands with one vectorized call instead of a
# per-row loop with chained assignment (`data[col][i] = ...`), which pandas
# does not guarantee to write back to `data`.
age_edges = list(range(0, 81, 8))  # 0, 8, 16, ..., 80
age_labels = ['0-7', '8-15', '16-23', '24-31', '32-39',
              '40-47', '48-55', '56-63', '64-71', '72-79']
# right=False makes each band closed on the left: [0, 8), [8, 16), ...
age_band = pd.cut(data['Age'], bins=age_edges, labels=age_labels, right=False)
# Everything outside [0, 80), including missing ages, goes to the catch-all
# bucket, exactly as the original `else` branch did.
# NOTE(review): the label '60+' is kept for compatibility, but it overlaps
# the '64-71' and '72-79' bands; '80+' was probably intended -- confirm.
age_band = age_band.cat.add_categories(['60+']).fillna('60+')
# Store plain strings, as the original loop did.
data['Age_Category'] = age_band.astype(object)
17 旭日图(sunburst)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import squarify
from wordcloud import WordCloud
import plotly.graph_objs as go
# `df` (plotly's sample tips dataset) is loaded but not used by the chart below.
df = px.data.tips()
# Sunburst hierarchy: Age_Category -> Survived -> Country, sectors sized by Age.
fig = px.sunburst(
    data_frame=data,
    path=['Age_Category', 'Survived', 'Country'],
    values='Age',
    title='Dont Forget to Click Chart to Examine Deeply',
)
fig.show()
18
# NOTE(review): `bar_plot` is not defined in the visible part of this file;
# presumably a helper that plots the distribution of the given column of
# `data` -- confirm against its definition.
bar_plot('Survived',data)
19
# Treemap of the most common regions.
plt.figure(figsize=(20, 12))
# Take sizes and labels from the same value_counts() result so each tile is
# labelled with its own country. `unique()` returns values in order of
# appearance, which does not match the count-sorted sizes.
country_counts = data.Country.value_counts()
squarify.plot(sizes=country_counts.values, label=country_counts.index, alpha=0.8)
plt.title('Most Common Regions', fontsize=20)
plt.axis('off')
plt.show()
20