Kaggle 比赛笔记

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

1 对目标值的分布画图，观察该连续变量的分布情况：是否接近正态分布，是否可以用 log 变换来校正：

# Summary statistics of the target column.
df_train['SalePrice'].describe()

# Plot the distribution of the target to judge its shape by eye.
# NOTE(review): distplot is deprecated in recent seaborn releases — confirm the
# installed version, or switch to sns.histplot(..., kde=True).
sns.distplot(df_train['SalePrice'])

2 分析连续型变量与目标值的关系，散点图：

# Scatter plot of one numeric feature against the sale price.
var = 'TotalBsmtSF'
data = df_train[['SalePrice', var]].copy()
data.plot(kind='scatter', x=var, y='SalePrice', ylim=(0, 800000));

3 类别型变量与目标值的关系，箱线图：

# Box plot of SalePrice for each OverallQual level.
var = 'OverallQual'
data = df_train[['SalePrice', var]].copy()
f, ax = plt.subplots(figsize=(8, 6))
# sns.boxplot returns the Axes it drew on; the name `fig` is kept from the notes.
fig = sns.boxplot(data=data, x=var, y="SalePrice")
fig.axis(ymin=0, ymax=800000);

4 所有变量间的相关度热图：

# Correlation matrix of the numeric columns, drawn as a heatmap.
# Non-numeric columns are dropped explicitly: pandas >= 2.0 no longer skips
# them silently in DataFrame.corr() and raises on string columns instead.
corrmat = df_train.select_dtypes(include=['number', 'bool']).corr()
f, ax = plt.subplots(figsize=(12, 9))
# vmax caps the colour scale at 0.8.
sns.heatmap(corrmat, vmax=.8, square=True)

5 筛选与目标值相关性最高的前 k 个变量，并画出它们之间的相关系数热图

# Heatmap of the k variables most correlated with SalePrice.
k = 10  # number of variables shown in the heatmap
cols = corrmat['SalePrice'].nlargest(k).index
# rowvar=False treats every column as one variable, same as transposing first.
cm = np.corrcoef(df_train[cols].values, rowvar=False)
sns.set(font_scale=1.25)
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
plt.show()

6 全景交叉散点分析，不用区分连续型或类别型

# Pairwise scatter plots (histograms on the diagonal) for a hand-picked set of columns.
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
# `height` replaces the `size` keyword, which seaborn renamed in 0.9 and later removed.
sns.pairplot(df_train[cols], height=2.5)
plt.show()

7 每列的缺失值信息汇总

# Per-column missing-value summary: absolute count and fraction of rows,
# both sorted from most to least missing.
null_mask = df_train.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat({'Total': total, 'Percent': percent}, axis=1)
missing_data.head(20)

缺失值占比超过15%的考虑放弃。

8 列标准化处理

# Standardize SalePrice with StandardScaler, which expects a 2-D input.
# The column is reshaped on the underlying NumPy array because indexing a
# Series with [:, np.newaxis] is no longer supported by pandas >= 2.0.
saleprice_scaled = StandardScaler().fit_transform(df_train['SalePrice'].values[:, np.newaxis])

9 one-hot 处理

# Convert categorical variables into dummy (one-hot) columns; the encoded
# frame replaces df_train.
df_train = pd.get_dummies(df_train)

10 选择特定类型的字段（例如 df_train.select_dtypes(include=['object']) 只保留字符串/类别型的列）

11 取 log（只对 HasBsmt==1 的行取对数，避免对 0 取 log；HasBsmt 列的构造见第 12 节）：

# Log-transform TotalBsmtSF only on the rows flagged with HasBsmt == 1.
# The logarithm is evaluated on the selected rows only, so zero areas never
# reach np.log (no -inf values, no divide-by-zero warnings).
has_bsmt = df_train['HasBsmt'] == 1
# Cast first so the float results are not written into an integer column.
df_train['TotalBsmtSF'] = df_train['TotalBsmtSF'].astype(float)
df_train.loc[has_bsmt, 'TotalBsmtSF'] = np.log(df_train.loc[has_bsmt, 'TotalBsmtSF'])

12 根据某列创造新列：

# Binary indicator column: 1 when TotalBsmtSF > 0, otherwise 0.
# One column is enough because the feature is binary.
df_train['HasBsmt'] = (df_train['TotalBsmtSF'] > 0).astype(int)

# Log-transform the area only where the indicator is set; the logarithm is
# evaluated on those rows alone so zero areas never reach np.log.
has_bsmt = df_train['HasBsmt'] == 1
# Cast first so the float results are not written into an integer column.
df_train['TotalBsmtSF'] = df_train['TotalBsmtSF'].astype(float)
df_train.loc[has_bsmt, 'TotalBsmtSF'] = np.log(df_train.loc[has_bsmt, 'TotalBsmtSF'])

13 类别变量对类别目标值的影响，分组计数柱状图和饼图显示：

# Grouped count plot of Survived per Category, with each count written next
# to its bar at a hand-picked position.
plt.figure(figsize=(11, 7))
ax = sns.countplot(x='Category', hue='Survived', data=p_data)
# presumably the x order is 'P' then 'C' — verify against the plotted data
ax.set_xticklabels(['Passenger', 'Crew'])

# (x position, y position, category code, survived flag) for every annotation
annotations = [
    (-0.22, 700, 'P', 0),
    (0.19, 100, 'P', 1),
    (0.79, 157, 'C', 0),
    (1.19, 45, 'C', 1),
]
for x_pos, y_pos, category, survived in annotations:
    selected = (p_data['Category'] == category) & (p_data['Survived'] == survived)
    ax.text(x_pos, y_pos, p_data['Survived'][selected].count(), fontsize=12, fontweight='bold')

plt.show()

# Donut chart of survival counts split by sex.
import plotly.graph_objs as go  # imported here because the notes only import it further down

# Rows with Sex == 0 are labelled "Male" and Sex == 1 "Female" below.
male = ds[ds["Sex"] == 0]
female = ds[ds["Sex"] == 1]

# Each subset is filtered with a mask built from itself; a mask built from the
# full frame `ds` has a different index and has to be realigned by pandas.
male_survi = male[male["Survived"] == 1]
male_not = male[male["Survived"] == 0]
female_survi = female[female["Survived"] == 1]
female_not = female[female["Survived"] == 0]

labels = ['Male - Survived', 'Male - Not Survived', "Female - Survived", "Female - Not Survived"]
# Reuse the subsets computed above instead of filtering a second time.
values = [len(male_survi), len(male_not), len(female_survi), len(female_not)]

fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.4)])
fig.update_layout(title_text="Analysis on Survival - Gender")
fig.show()

14 连续型特征对正负样本的影响

# Box plot of Age for each value of Survived, drawn from p_data.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
plt.figure(figsize=(11,5))
sns.boxplot(x='Survived',y='Age',data=p_data)
plt.show()

# Overlaid age distributions of survivors and non-survivors.
import plotly.figure_factory as ff  # `ff` is used below but never imported elsewhere in these notes

# Missing ages are dropped so that no NaN reaches the histogram / density estimate.
surv = ds[ds["Survived"] == 1]["Age"].dropna()
not_surv = ds[ds["Survived"] == 0]["Age"].dropna()

hist_data = [surv, not_surv]
group_labels = ['Survived', 'Not Survived']

fig = ff.create_distplot(hist_data, group_labels, bin_size=.2)
fig.update_layout(title_text="Analysis in Age on Survival Status")
fig.show()

15 混淆矩阵可视化

def plot_confusion_matrix(cm, names, title="Random Forest Model- Confusion matrix", cmap=plt.cm.Blues):
    """Draw a confusion matrix as a colour-coded image on the current figure.

    Args:
        cm: Matrix of counts, passed straight to ``plt.imshow``.
        names: Class names used as tick labels on both axes.
        title: Title shown above the plot.
        cmap: Matplotlib colormap used to colour the cells.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # One tick per class, labelled with the class name on both axes.
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Compute the confusion matrix for the test predictions, print it and plot it.
from sklearn.metrics import confusion_matrix  # used below but never imported elsewhere in these notes

cm = confusion_matrix(y_test, pred)
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)

plt.figure()
plot_confusion_matrix(cm, ["Not Survived", "Survived"])

16 遍历改造字段的每个值（把连续的年龄分箱为类别）：

# Bucket Age into 8-year categories and store the label in Age_Category.
# The binning is vectorised with pd.cut instead of a row-by-row loop using
# chained assignment (data[col][i] = ...), which pandas may apply to a
# temporary copy and which filled an int-initialised column with strings.
age_edges = [0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80]
age_labels = ['0-7', '8-15', '16-23', '24-31', '32-39',
              '40-47', '48-55', '56-63', '64-71', '72-79']

# right=False makes every bin half-open, [low, high), like the original
# `>= low and < high` comparisons.
age_bins = pd.cut(data['Age'], bins=age_edges, right=False, labels=age_labels)

# Every value outside [0, 80) takes the fallback label. The original notes
# labelled this bucket '60+' although its comment described it as [80-87].
# NOTE(review): as in the original else-branch, missing and negative ages
# also land in this bucket — confirm that is intended.
data['Age_Category'] = age_bins.astype(object).where(age_bins.notna(), '80+')

17 旭日图（sunburst）与矩形树图（treemap）

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import squarify
from wordcloud import WordCloud
import plotly.graph_objs as go

# NOTE(review): `df` is loaded from plotly's bundled sample data but is not
# used by the chart below, which reads from `data` — confirm it can be dropped.
df = px.data.tips()
# Sunburst chart nested as Age_Category -> Survived -> Country, with sector
# sizes taken from the Age column.
fig = px.sunburst(data, path=['Age_Category', 'Survived', 'Country'], values='Age',title='Dont Forget to Click Chart to Examine Deeply' )
fig.show()

# NOTE(review): bar_plot is not defined in the visible part of these notes —
# presumably a user-defined plotting helper; define or import it before this call.
bar_plot('Survived',data)

# Treemap of how often each Country value occurs.
plt.figure(figsize=(20, 12))
# Sizes and labels come from the same value_counts() result so that every tile
# carries the label of the country it represents. value_counts() is ordered by
# frequency while unique() is ordered by first appearance, so pairing the two
# mislabels the tiles.
country_counts = data.Country.value_counts()
squarify.plot(sizes=country_counts.values, alpha=0.8,
              label=country_counts.index)
plt.title('Most Common Regions', fontsize=20)
plt.axis('off')
plt.show()

相关推荐

取消回复欢迎你发表评论:

Google 黑客常用搜索语句一览原力计划

npx简介（npxvip是哪国的）

在 Android 模拟器上运行 ARM 应用（android模拟器原理）

GB28181,B接口协议之SIPRTSPRTPRTMP协议从入门到精通

手机实时提取SIM卡打电话的信令和声音-辅助外设与商用通话方案

安装使用Hoppscotch构建API请求访问与测试

轻松转换!AppleNumbers到Excel的快捷教程

Python自动化办公——后台截图（python 自动截图）

电脑端腾讯文档如何导出excel

网络流媒体经典开源软件宝典webRTC, FFMpeg, SIP_流媒体开发教程