1. Read Data from Titanic Dataset
Original dataset downloaded from: https://www.kaggle.com/c/titanic
import pandas as pd
import numpy as np
train_path = 'raw_titanic/train.csv'
test_path = 'raw_titanic/test.csv'
test_surv_path = 'raw_titanic/gender_submission.csv'
train = pd.read_csv(train_path,
engine='python')
cols = train.columns
test = pd.read_csv(test_path,
engine='python')
test_surv = pd.read_csv(test_surv_path,
engine='python')
test = pd.merge(test, test_surv,
on='PassengerId',
how='outer')[cols]
t = train.append(test)
t = t.reset_index(drop=True)
print(t.shape)
t.head()
(1309, 12)
|
PassengerId |
Survived |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
0 |
1 |
0 |
3 |
Braund, Mr. Owen Harris |
male |
22.0 |
1 |
0 |
A/5 21171 |
7.2500 |
NaN |
S |
1 |
2 |
1 |
1 |
Cumings, Mrs. John Bradley (Florence Briggs Th... |
female |
38.0 |
1 |
0 |
PC 17599 |
71.2833 |
C85 |
C |
2 |
3 |
1 |
3 |
Heikkinen, Miss. Laina |
female |
26.0 |
0 |
0 |
STON/O2. 3101282 |
7.9250 |
NaN |
S |
3 |
4 |
1 |
1 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
female |
35.0 |
1 |
0 |
113803 |
53.1000 |
C123 |
S |
4 |
5 |
0 |
3 |
Allen, Mr. William Henry |
male |
35.0 |
0 |
0 |
373450 |
8.0500 |
NaN |
S |
2. Clean Data
Cabin is only useful to obtain the Deck.
t['Deck'] = t['Cabin'].str.extract(r'([A-Z])?(\d)')[0]
Ticket, Name, PassengerID are all not useful.
t.reset_index(drop=True)
t['idx'] = t.index
t = t[['idx','Pclass','Sex','Age','SibSp','Parch','Fare','Deck','Embarked','Survived']].copy()
t.columns = ['idx','class','sex','age','sibs','par/ch','fare','deck','embarked','survived']
Deal with nan values
Fill nan values except for deck and embarked with the mean
t['age'] = t['age'].fillna(t['age'].mean())
t['fare'] = t['fare'].fillna(t['fare'].mean())
Fill nan values of embarked with the most common value, S
t['embarked'] = t['embarked'].fillna('S')
Fill Deck with 0
t['deck'] = t['deck'].fillna(0)
t.head()
|
idx |
class |
sex |
age |
sibs |
par/ch |
fare |
deck |
embarked |
survived |
0 |
0 |
3 |
male |
22.0 |
1 |
0 |
7.2500 |
0 |
S |
0 |
1 |
1 |
1 |
female |
38.0 |
1 |
0 |
71.2833 |
C |
C |
1 |
2 |
2 |
3 |
female |
26.0 |
0 |
0 |
7.9250 |
0 |
S |
1 |
3 |
3 |
1 |
female |
35.0 |
1 |
0 |
53.1000 |
C |
S |
1 |
4 |
4 |
3 |
male |
35.0 |
0 |
0 |
8.0500 |
0 |
S |
0 |
3. Convert Categorical (nominal, not ordinal) data to Numeric data
t['class raw'] = t['class']
t['sex raw'] = t['sex']
t['par/ch raw'] = t['par/ch']
t['fare raw'] = t['fare']
t['deck raw'] = t['deck']
t['embarked raw'] = t['embarked']
t['survived raw'] = t['survived']
t['class'] = t['class'].astype('category')
t['class'] = t['class'].cat.codes
t[['class','class raw','idx']].groupby(['class',
'class raw']).count()
|
|
idx |
class |
class raw |
|
0 |
1 |
323 |
1 |
2 |
277 |
2 |
3 |
709 |
t['sex'] = t['sex'].astype('category')
t['sex'] = t['sex'].cat.codes
t[['sex','sex raw','idx']].groupby(['sex',
'sex raw']).count()
|
|
idx |
sex |
sex raw |
|
0 |
female |
466 |
1 |
male |
843 |
t['deck'] = t['deck'].astype('category')
t['deck'] = t['deck'].cat.codes
t[['deck','deck raw','idx']].groupby(['deck',
'deck raw']).count()
|
|
idx |
deck |
deck raw |
|
0 |
0 |
1020 |
1 |
A |
22 |
2 |
B |
65 |
3 |
C |
94 |
4 |
D |
42 |
5 |
E |
44 |
6 |
F |
13 |
7 |
G |
9 |
t['embarked'] = t['embarked'].astype('category')
t['embarked'] = t['embarked'].cat.codes
t[['embarked','embarked raw','idx']].groupby(['embarked',
'embarked raw']).count()
|
|
idx |
embarked |
embarked raw |
|
0 |
C |
270 |
1 |
Q |
123 |
2 |
S |
916 |
t['survived'] = t['survived'].astype('category')
t['survived'] = t['survived'].cat.codes
t[['survived','survived raw','idx']].groupby(['survived',
'survived raw']).count()
|
|
idx |
survived |
survived raw |
|
0 |
0 |
815 |
1 |
1 |
494 |
t = t[['idx','class','sex','age','sibs','par/ch','fare','deck','embarked','survived']].copy()
print(t.shape)
t.head()
(1309, 10)
|
idx |
class |
sex |
age |
sibs |
par/ch |
fare |
deck |
embarked |
survived |
0 |
0 |
2 |
1 |
22.0 |
1 |
0 |
7.2500 |
0 |
2 |
0 |
1 |
1 |
0 |
0 |
38.0 |
1 |
0 |
71.2833 |
3 |
0 |
1 |
2 |
2 |
2 |
0 |
26.0 |
0 |
0 |
7.9250 |
0 |
2 |
1 |
3 |
3 |
0 |
0 |
35.0 |
1 |
0 |
53.1000 |
3 |
2 |
1 |
4 |
4 |
2 |
1 |
35.0 |
0 |
0 |
8.0500 |
0 |
2 |
0 |
4. Normalize Continuous Data
age
and fare
print(' Mean Age = ' + str(t['age'].mean()))
print('Std Dev Age = ' + str(t['age'].std()))
t['age'] = (t['age'] - t['age'].mean()) / t['age'].std()
Mean Age = 29.881137667303985
Std Dev Age = 12.883193243702001
print(' Mean fare = ' + str(t['fare'].mean()))
print('Std Dev fare = ' + str(t['fare'].std()))
t['fare'] = (t['fare'] - t['fare'].mean()) / t['fare'].std()
t.head()
Mean fare = 33.2954792813456
Std Dev fare = 51.73887903247138
|
idx |
class |
sex |
age |
sibs |
par/ch |
fare |
deck |
embarked |
survived |
0 |
0 |
2 |
1 |
-0.611738 |
1 |
0 |
-0.503402 |
0 |
2 |
0 |
1 |
1 |
0 |
0 |
0.630190 |
1 |
0 |
0.734222 |
3 |
0 |
1 |
2 |
2 |
2 |
0 |
-0.301256 |
0 |
0 |
-0.490356 |
0 |
2 |
1 |
3 |
3 |
0 |
0 |
0.397329 |
1 |
0 |
0.382778 |
3 |
2 |
1 |
4 |
4 |
2 |
1 |
0.397329 |
0 |
0 |
-0.487940 |
0 |
2 |
0 |
5. One-Hot (Dummy) Encode Categoricals (nominal, not ordinal)
print(t.shape)
t = pd.get_dummies(t,
columns=['deck',
'embarked'],
drop_first=True)
t = t[['idx', 'class', 'sex', 'age',
'sibs', 'par/ch', 'fare',
'deck_1', 'deck_2', 'deck_3', 'deck_4', 'deck_5', 'deck_6', 'deck_7',
'embarked_1', 'embarked_2',
'survived']].copy()
print(t.shape)
t.head()
(1309, 10)
(1309, 17)
|
idx |
class |
sex |
age |
sibs |
par/ch |
fare |
deck_1 |
deck_2 |
deck_3 |
deck_4 |
deck_5 |
deck_6 |
deck_7 |
embarked_1 |
embarked_2 |
survived |
0 |
0 |
2 |
1 |
-0.611738 |
1 |
0 |
-0.503402 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
1 |
1 |
0 |
0 |
0.630190 |
1 |
0 |
0.734222 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
2 |
2 |
2 |
0 |
-0.301256 |
0 |
0 |
-0.490356 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
1 |
3 |
3 |
0 |
0 |
0.397329 |
1 |
0 |
0.382778 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
1 |
1 |
4 |
4 |
2 |
1 |
0.397329 |
0 |
0 |
-0.487940 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
t.to_csv('titanic_clean.csv',
index=False)