1. Read Data from Adult Dataset
Original dataset downloaded from: https://archive.ics.uci.edu/ml/datasets/Adult
import pandas as pd
import numpy as np
adult_data_path = 'raw_adult/adult.data'
adult_test_path = 'raw_adult/adult.test'
cols = ['age','workclass','fnlwgt','education','education-num','marital-status',
'occupation','relationship','race','sex','capital-gain', 'capital-loss',
'hours-per-week', 'native-country','compensation']
a = (pd.read_csv(adult_data_path,
names=cols,
sep=', ',
engine='python')
.append(pd.read_csv(adult_test_path,
skiprows=1,
names=cols,
sep=', ',
engine='python')))
print(a.shape)
a.head()
(48842, 15)
|
age |
workclass |
fnlwgt |
education |
education-num |
marital-status |
occupation |
relationship |
race |
sex |
capital-gain |
capital-loss |
hours-per-week |
native-country |
compensation |
0 |
39 |
State-gov |
77516 |
Bachelors |
13 |
Never-married |
Adm-clerical |
Not-in-family |
White |
Male |
2174 |
0 |
40 |
United-States |
<=50K |
1 |
50 |
Self-emp-not-inc |
83311 |
Bachelors |
13 |
Married-civ-spouse |
Exec-managerial |
Husband |
White |
Male |
0 |
0 |
13 |
United-States |
<=50K |
2 |
38 |
Private |
215646 |
HS-grad |
9 |
Divorced |
Handlers-cleaners |
Not-in-family |
White |
Male |
0 |
0 |
40 |
United-States |
<=50K |
3 |
53 |
Private |
234721 |
11th |
7 |
Married-civ-spouse |
Handlers-cleaners |
Husband |
Black |
Male |
0 |
0 |
40 |
United-States |
<=50K |
4 |
28 |
Private |
338409 |
Bachelors |
13 |
Married-civ-spouse |
Prof-specialty |
Wife |
Black |
Female |
0 |
0 |
40 |
Cuba |
<=50K |
2. Clean Data
a = a.replace({'<=50K.' : '<=50K',
'>50K.' : '>50K'})
Drop entries where workclass
and occupation
are unknown, and where workclass
is Without-pay
.
a = (a[(a['workclass']!='?')&
(a['occupation']!='?')&
(a['workclass']!='Without-pay')]
.reset_index(drop=True))
a['idx'] = a.index
Map the very small Armed-Forces
category of occupation
to Protective-serv
.
a.loc[a['occupation']=='Armed-Forces','occupation'] = 'Protective-serv'
Map Ages, Education, Workclass, and Weekly-Hours to smaller category set.
a.loc[a['workclass'].isin(['State-gov', 'Federal-gov', 'Local-gov']),
'employment-type'] = 'Government'
a.loc[a['workclass'].isin(['Self-emp-not-inc', 'Self-emp-inc']),
'employment-type'] = 'Self-Employed'
a.loc[a['workclass'].isin(['Private']),
'employment-type'] = 'Privately-Employed'
a['employment-type'].unique()
array(['Government', 'Self-Employed', 'Privately-Employed'], dtype=object)
a.loc[a['education-num'] <= 8, 'education'] = 'Less than High School'
a.loc[a['education-num'].isin([ 9,10]), 'education'] = 'High School'
a.loc[a['education-num'].isin([11,12]), 'education'] = 'Associates'
a.loc[a['education-num'].isin([13]), 'education'] = 'Bachelors'
a.loc[a['education-num'].isin([14]), 'education'] = 'Masters'
a.loc[a['education-num'].isin([15,16]), 'education'] = 'PhD/Professional'
a['education'].unique()
array(['Bachelors', 'High School', 'Less than High School', 'Masters',
'Associates', 'PhD/Professional'], dtype=object)
a = a.rename(columns={'hours-per-week':'weekly-hours'})
a.head()
|
age |
workclass |
fnlwgt |
education |
education-num |
marital-status |
occupation |
relationship |
race |
sex |
capital-gain |
capital-loss |
weekly-hours |
native-country |
compensation |
idx |
employment-type |
0 |
39 |
State-gov |
77516 |
Bachelors |
13 |
Never-married |
Adm-clerical |
Not-in-family |
White |
Male |
2174 |
0 |
40 |
United-States |
<=50K |
0 |
Government |
1 |
50 |
Self-emp-not-inc |
83311 |
Bachelors |
13 |
Married-civ-spouse |
Exec-managerial |
Husband |
White |
Male |
0 |
0 |
13 |
United-States |
<=50K |
1 |
Self-Employed |
2 |
38 |
Private |
215646 |
High School |
9 |
Divorced |
Handlers-cleaners |
Not-in-family |
White |
Male |
0 |
0 |
40 |
United-States |
<=50K |
2 |
Privately-Employed |
3 |
53 |
Private |
234721 |
Less than High School |
7 |
Married-civ-spouse |
Handlers-cleaners |
Husband |
Black |
Male |
0 |
0 |
40 |
United-States |
<=50K |
3 |
Privately-Employed |
4 |
28 |
Private |
338409 |
Bachelors |
13 |
Married-civ-spouse |
Prof-specialty |
Wife |
Black |
Female |
0 |
0 |
40 |
Cuba |
<=50K |
4 |
Privately-Employed |
3. Convert all Categorical data to Numeric data
a['age raw'] = a['age']
a['employment-type raw'] = a['employment-type']
a['education raw'] = a['education']
a['occupation raw'] = a['occupation']
a['race raw'] = a['race']
a['sex raw'] = a['sex']
a['weekly-hours raw'] = a['weekly-hours']
a['compensation raw'] = a['compensation']
a['employment-type'] = a['employment-type'].astype('category')
a['employment-type'] = a['employment-type'].cat.codes
a[['employment-type','employment-type raw','idx']].groupby(['employment-type',
'employment-type raw']).count()
|
|
idx |
employment-type |
employment-type raw |
|
0 |
Government |
6549 |
1 |
Privately-Employed |
33906 |
2 |
Self-Employed |
5557 |
a['education'] = a['education'].astype('category')
a['education'] = a['education'].cat.codes
a[['education','education raw','idx']].groupby(['education',
'education raw']).count()
|
|
idx |
education |
education raw |
|
0 |
Associates |
3505 |
1 |
Bachelors |
7772 |
2 |
High School |
24991 |
3 |
Less than High School |
5768 |
4 |
Masters |
2590 |
5 |
PhD/Professional |
1386 |
a['occupation'] = a['occupation'].astype('category')
a['occupation'] = a['occupation'].cat.codes
a[['occupation','occupation raw','idx']].groupby(['occupation',
'occupation raw']).count()
|
|
idx |
occupation |
occupation raw |
|
0 |
Adm-clerical |
5608 |
1 |
Craft-repair |
6111 |
2 |
Exec-managerial |
6085 |
3 |
Farming-fishing |
1482 |
4 |
Handlers-cleaners |
2070 |
5 |
Machine-op-inspct |
3020 |
6 |
Other-service |
4921 |
7 |
Priv-house-serv |
242 |
8 |
Prof-specialty |
6172 |
9 |
Protective-serv |
998 |
10 |
Sales |
5503 |
11 |
Tech-support |
1446 |
12 |
Transport-moving |
2354 |
a['race'] = a['race'].astype('category')
a['race'] = a['race'].cat.codes
a[['race','race raw','idx']].groupby(['race',
'race raw']).count()
|
|
idx |
race |
race raw |
|
0 |
Amer-Indian-Eskimo |
435 |
1 |
Asian-Pac-Islander |
1422 |
2 |
Black |
4355 |
3 |
Other |
375 |
4 |
White |
39425 |
a['sex'] = a['sex'].astype('category')
a['sex'] = a['sex'].cat.codes
a[['sex','sex raw','idx']].groupby(['sex',
'sex raw']).count()
|
|
idx |
sex |
sex raw |
|
0 |
Female |
14912 |
1 |
Male |
31100 |
a['compensation'] = a['compensation'].astype('category')
a['compensation'] = a['compensation'].cat.codes
a[['compensation','compensation raw','idx']].groupby(['compensation',
'compensation raw']).count()
|
|
idx |
compensation |
compensation raw |
|
0 |
<=50K |
34592 |
1 |
>50K |
11420 |
a = a[['idx',
'age',
'employment-type',
'education',
'occupation',
'race',
'sex',
'weekly-hours',
'compensation']].copy()
print(a.shape)
a.head()
(46012, 9)
|
idx |
age |
employment-type |
education |
occupation |
race |
sex |
weekly-hours |
compensation |
0 |
0 |
39 |
0 |
1 |
0 |
4 |
1 |
40 |
0 |
1 |
1 |
50 |
2 |
1 |
2 |
4 |
1 |
13 |
0 |
2 |
2 |
38 |
1 |
2 |
4 |
4 |
1 |
40 |
0 |
3 |
3 |
53 |
1 |
3 |
4 |
2 |
1 |
40 |
0 |
4 |
4 |
28 |
1 |
1 |
8 |
2 |
0 |
40 |
0 |
4. Normalize Continuous Data
Age
and Weekly-Hours
print(' Mean Age = ' + str(a['age'].mean()))
print('Std Dev Age = ' + str(a['age'].std()))
a['age'] = (a['age'] - a['age'].mean()) / a['age'].std()
Mean Age = 38.55261670868469
Std Dev Age = 13.199319129893926
print(' Mean Weekly-Hours = ' + str(a['weekly-hours'].mean()))
print('Std Dev Weekly-Hours = ' + str(a['weekly-hours'].std()))
a['weekly-hours'] = (a['weekly-hours'] - a['weekly-hours'].mean()) / a['weekly-hours'].std()
a.head()
Mean Weekly-Hours = 40.95159958271755
Std Dev Weekly-Hours = 12.007797424174404
|
idx |
age |
employment-type |
education |
occupation |
race |
sex |
weekly-hours |
compensation |
0 |
0 |
0.033894 |
0 |
1 |
0 |
4 |
1 |
-0.079248 |
0 |
1 |
1 |
0.867271 |
2 |
1 |
2 |
4 |
1 |
-2.327787 |
0 |
2 |
2 |
-0.041867 |
1 |
2 |
4 |
4 |
1 |
-0.079248 |
0 |
3 |
3 |
1.094555 |
1 |
3 |
4 |
2 |
1 |
-0.079248 |
0 |
4 |
4 |
-0.799482 |
1 |
1 |
8 |
2 |
0 |
-0.079248 |
0 |
5. One-Hot (Dummy) Encode Categoricals (nominal, not ordinal)
print(a.shape)
a = pd.get_dummies(a,
columns=['employment-type',
'occupation',
'race'],
drop_first=True)
a = a[['idx', 'age', 'education', 'sex', 'weekly-hours',
'employment-type_1', 'employment-type_2',
'occupation_1', 'occupation_2', 'occupation_3', 'occupation_4', 'occupation_5', 'occupation_6',
'occupation_7', 'occupation_8', 'occupation_9', 'occupation_10', 'occupation_11', 'occupation_12',
'race_1', 'race_2', 'race_3', 'race_4',
'compensation']].copy()
print(a.shape)
a.head()
(46012, 9)
(46012, 24)
|
idx |
age |
education |
sex |
weekly-hours |
employment-type_1 |
employment-type_2 |
occupation_1 |
occupation_2 |
occupation_3 |
... |
occupation_8 |
occupation_9 |
occupation_10 |
occupation_11 |
occupation_12 |
race_1 |
race_2 |
race_3 |
race_4 |
compensation |
0 |
0 |
0.033894 |
1 |
1 |
-0.079248 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
1 |
1 |
0.867271 |
1 |
1 |
-2.327787 |
0 |
1 |
0 |
1 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
2 |
2 |
-0.041867 |
2 |
1 |
-0.079248 |
1 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
3 |
3 |
1.094555 |
3 |
1 |
-0.079248 |
1 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
4 |
4 |
-0.799482 |
1 |
0 |
-0.079248 |
1 |
0 |
0 |
0 |
0 |
... |
1 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
5 rows × 24 columns
a.to_csv('adult_clean.csv',
index=False)