以下是我在 Kaggle 泰坦尼克(Titanic)入门竞赛中进入前 2% 所使用的特征工程,分享给大家;如有任何疑问,欢迎交流!
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
%matplotlib inline
warnings.filterwarnings('ignore')
# Load the competition data. NOTE(review): `path` is not defined in this
# section; it has to be set before these lines run.
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')
# Keep a handle on the test-set passenger ids.
PassengerId = test['PassengerId']
# (1) Sex: binary flag, 1 for 'male' and 0 for every other value.
train['Sex'] = train['Sex'].eq('male').astype('int64')
test['Sex'] = test['Sex'].eq('male').astype('int64')
#(2)姓名
def Name_Title_Code(x):
    """Map an honorific string (with trailing dot) to an integer group code.

    Codes: 1 for 'Mr.'; 2 for 'Mrs.', 'Ms.', 'Lady.', 'Mlle.' and 'Mme.';
    3 for 'Miss.'; 4 for 'Rev.'; 5 for anything else.
    """
    title_groups = (
        (('Mr.',), 1),
        (('Mrs.', 'Ms.', 'Lady.', 'Mlle.', 'Mme.'), 2),
        (('Miss.',), 3),
        (('Rev.',), 4),
    )
    for titles, code in title_groups:
        if x in titles:
            return code
    # Every title not listed above shares one catch-all code.
    return 5
# Name_Title is the first whitespace-delimited token of the text that follows
# the first comma in Name.
train['Name_Title'] = train['Name'].map(
    lambda full_name: full_name.split(',')[1].split()[0]
)
test['Name_Title'] = test['Name'].map(
    lambda full_name: full_name.split(',')[1].split()[0]
)
#(3)年龄
def Age_feature(train, test):
    """Flag missing ages and impute them from training-set group means.

    For both frames (modified in place):
      * ``Age_Null_Flag`` is set to 1 where ``Age`` was missing, else 0.
      * Missing ``Age`` values are filled with the mean training age of the
        passenger's (``Name_Title``, ``Pclass``) group. If that group does not
        occur in ``train`` or has no known age, the overall training mean is
        used instead.

    Known ages are never changed. The previous implementation assigned the
    train-indexed transform result to ``test['Age']``, which silently replaced
    test ages with the ages of training passengers sharing the same row index.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Must contain ``Age``, ``Name_Title`` and ``Pclass`` columns.

    Returns
    -------
    tuple of (train, test)
        The same two frame objects that were passed in.
    """
    group_keys = ['Name_Title', 'Pclass']
    # Statistics come from the training data only and are computed once,
    # before either frame is modified.
    group_means = train.groupby(group_keys)['Age'].mean().rename('_group_mean')
    overall_mean = train['Age'].mean()

    for frame in (train, test):
        frame['Age_Null_Flag'] = frame['Age'].isnull().astype('int64')
        # Left join keeps the frame's own index and row order, so the looked-up
        # means line up with the rows they belong to.
        looked_up = frame[group_keys].join(group_means, on=group_keys)['_group_mean']
        frame['Age'] = frame['Age'].fillna(looked_up).fillna(overall_mean)
    return train, test
#(4)家庭人口
def Family_feature(train, test):
    """Replace SibSp and Parch with a categorical family-size feature.

    ``Fam_Size`` is 'Solo' when SibSp + Parch equals 0, 'Nuclear' when the sum
    is at most 3, and 'Big' otherwise. Both frames are modified in place and
    their ``SibSp`` and ``Parch`` columns are removed.

    Returns the same (train, test) pair.
    """
    for frame in (train, test):
        relatives = frame['SibSp'] + frame['Parch']
        # Conditions are checked in order; the first match wins.
        frame['Fam_Size'] = np.select(
            [relatives == 0, relatives <= 3],
            ['Solo', 'Nuclear'],
            default='Big',
        )
        frame.drop(columns=['SibSp', 'Parch'], inplace=True)
    return train, test
#(5)票
def ticket_grouped(train, test):
    """Derive ticket prefix and ticket length features, then drop Ticket.

    ``Ticket_Lett`` is based on the first character of ``str(ticket)``:
    characters in the first list below are kept unchanged, characters in the
    second list become 'Low_ticket', and everything else becomes
    'Other_ticket'. ``Ticket_Len`` is ``len(ticket)``. Both frames are
    modified in place.

    Returns the same (train, test) pair.
    """
    kept_letters = ['1', '2', '3', 'S', 'P', 'C', 'A']
    low_letters = ['W', '4', '7', '6', 'L', '5', '8']

    def bucket(letter):
        # Map one leading character to its final category label.
        if letter in kept_letters:
            return letter
        if letter in low_letters:
            return 'Low_ticket'
        return 'Other_ticket'

    for frame in (train, test):
        leading = frame['Ticket'].map(lambda ticket: str(ticket)[0])
        frame['Ticket_Lett'] = leading.map(bucket)
        frame['Ticket_Len'] = frame['Ticket'].map(len)
        frame.drop(columns='Ticket', inplace=True)
    return train, test
#(6)仓号
def Cabin_feature(train, test):
    """Add Cabin_Letter (first character of ``str(Cabin)``) and drop Cabin.

    Missing cabins stringify to 'nan', so their letter is 'n'. Both frames are
    modified in place and returned as (train, test).
    """
    for frame in (train, test):
        frame['Cabin_Letter'] = frame['Cabin'].map(lambda cabin: str(cabin)[0])
        frame.drop(columns='Cabin', inplace=True)
    return train, test
def cabin_num(train, test):
    """Add one-hot tercile indicators for the cabin number.

    The cabin number is the numeric part of the last space-separated entry in
    ``Cabin`` with its leading character removed (e.g. 'B57 B59' -> 59).
    Missing cabins and entries without a numeric part yield NaN.

    Tercile edges are learned from the training cabin numbers and the same
    edges are applied to both frames, so train and test get identical
    ``Cabin_num_<interval>`` columns. Rows with no cabin number, or with a
    number outside the training range, get 0 in every indicator column.

    The previous implementation assigned ``pd.qcut(train[...])`` to the test
    frame, which gave each test row the bin of the training row with the same
    index. It also relied on ``np.NaN`` (removed in NumPy 2.0) and on an
    in-place ``replace`` on a column selection.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Must contain a ``Cabin`` column; it is left in place.

    Returns
    -------
    tuple of (train, test)
        New frames with the indicator columns appended.

    Raises
    ------
    ValueError
        Propagated from ``pd.qcut`` when the training cabin numbers cannot be
        split into three distinct quantile bins.
    """
    def _cabin_number(cabin):
        # Numeric part of the last listed cabin, or NaN when there is none.
        if pd.isnull(cabin):
            return np.nan
        digits = str(cabin).split(' ')[-1][1:]
        return int(digits) if digits.isdecimal() else np.nan

    train_numbers = train['Cabin'].apply(_cabin_number)
    test_numbers = test['Cabin'].apply(_cabin_number)

    # Learn the bin edges on the training data only.
    _, bin_edges = pd.qcut(train_numbers, 3, retbins=True)

    def _with_bin_dummies(frame, numbers):
        # include_lowest=True reproduces qcut's handling of the lowest edge,
        # so the interval labels match what qcut would produce on train.
        binned = pd.cut(numbers, bins=bin_edges, include_lowest=True)
        indicators = pd.get_dummies(binned, prefix='Cabin_num')
        return pd.concat((frame, indicators), axis=1)

    return (
        _with_bin_dummies(train, train_numbers),
        _with_bin_dummies(test, test_numbers),
    )
#(7)港口
def embarked_impute(train, test):
    """Fill missing Embarked values with 'C' in both frames (in place).

    Returns the same (train, test) pair.
    """
    for frame in (train, test):
        port = frame['Embarked']
        # Keep known ports, substitute 'C' wherever the value is missing.
        frame['Embarked'] = port.where(port.notnull(), 'C')
    return train, test
#(8)票价
# Fill missing test fares with the training-set mean fare. The result is
# assigned back to the column: calling fillna(inplace=True) on the selection
# test['Fare'] is not guaranteed to write through to `test` (it is a no-op
# under pandas Copy-on-Write).
test['Fare'] = test['Fare'].fillna(train['Fare'].mean())
#特征处理
def dummies(train, test, columns = ['Pclass','Sex','Embarked', 'Ticket_Lett', 'Cabin_Letter', 'Name_Title', 'Fam_Size']):
    """One-hot encode ``columns``, keeping only levels present in both frames.

    Each listed column is converted to strings, expanded into
    ``<column>_<level>`` indicator columns, and removed. A level is kept only
    if it occurs in both train and test; indicator columns follow the order in
    which the levels first appear in train.

    The default ``columns`` list is only read, never mutated.

    Returns new (train, test) frames.
    """
    for col in columns:
        train[col] = train[col].map(str)
        test[col] = test[col].map(str)

        levels_in_test = set(test[col].unique())
        shared = [
            col + '_' + level
            for level in train[col].unique()
            if level in levels_in_test
        ]

        train_onehot = pd.get_dummies(train[col], prefix=col)[shared]
        test_onehot = pd.get_dummies(test[col], prefix=col)[shared]

        # Drop the source column and append its indicators at the end.
        train = pd.concat((train.drop(columns=col), train_onehot), axis=1)
        test = pd.concat((test.drop(columns=col), test_onehot), axis=1)
    return train, test
def drop(train, test, bye = ['PassengerId']):
    """Remove every column named in ``bye`` from both frames, in place.

    Raises KeyError if a listed column is missing from a frame. The default
    ``bye`` list is only read, never mutated.

    Returns the same (train, test) pair.
    """
    for frame in (train, test):
        for column in bye:
            frame.pop(column)
    return train, test
# Run the feature pipeline.
# Age imputation runs first because Age_feature groups on the raw Name_Title
# strings, which are replaced by numeric codes immediately afterwards.
train, test = Age_feature(train, test)
train['Name_Title'] = train['Name_Title'].apply(Name_Title_Code)
test['Name_Title'] = test['Name_Title'].apply(Name_Title_Code)
train = pd.get_dummies(columns = ['Name_Title'], data = train)
test = pd.get_dummies(columns = ['Name_Title'], data = test)
# cabin_num reads the Cabin column, so it has to run before Cabin_feature
# deletes that column.
train, test = cabin_num(train, test)
train, test = Cabin_feature(train, test)
train, test = embarked_impute(train, test)
train, test = Family_feature(train, test)
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selection test['Fare'], which is not guaranteed to modify `test` (it is a
# no-op under pandas Copy-on-Write).
test['Fare'] = test['Fare'].fillna(train['Fare'].mean())
train, test = ticket_grouped(train, test)
train, test = dummies(train, test, columns = ['Pclass','Sex','Embarked', 'Ticket_Lett', 'Fam_Size','Cabin_Letter'])
train, test = drop(train, test)
# Name has served its purpose (title extraction) and is removed.
train.drop('Name',axis=1,inplace=True)
test.drop('Name',axis=1,inplace=True)