Cleaning the Adult Dataset

1. Read Data from Adult Dataset

Original dataset downloaded from: https://archive.ics.uci.edu/ml/datasets/Adult

import pandas as pd
import numpy as np
adult_data_path = 'raw_adult/adult.data'
adult_test_path = 'raw_adult/adult.test'
cols = ['age','workclass','fnlwgt','education','education-num','marital-status',
        'occupation','relationship','race','sex','capital-gain', 'capital-loss',
        'hours-per-week', 'native-country','compensation']
a = (pd.read_csv(adult_data_path,
                 names=cols,
                 sep=', ',
                 engine='python')
     .append(pd.read_csv(adult_test_path,
                         skiprows=1,
                         names=cols,
                         sep=', ',
                         engine='python')))
print(a.shape)
a.head()
(48842, 15)

age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country compensation
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K

2. Clean Data

a = a.replace({'<=50K.' : '<=50K',
               '>50K.'  : '>50K'})

Drop entries where workclass and occupation are unknown, and where workclass is Without-pay.

a = (a[(a['workclass']!='?')&
       (a['occupation']!='?')&
       (a['workclass']!='Without-pay')]
     .reset_index(drop=True))
a['idx'] = a.index

Map the very small Armed-Forces category of occupation to Protective-serv.

a.loc[a['occupation']=='Armed-Forces','occupation'] = 'Protective-serv'

Map Ages, Education, Workclass, and Weekly-Hours to smaller category set.

a.loc[a['workclass'].isin(['State-gov', 'Federal-gov', 'Local-gov']), 
      'employment-type'] = 'Government'
a.loc[a['workclass'].isin(['Self-emp-not-inc', 'Self-emp-inc']),      
      'employment-type'] = 'Self-Employed'
a.loc[a['workclass'].isin(['Private']),                               
      'employment-type'] = 'Privately-Employed'
a['employment-type'].unique()
array(['Government', 'Self-Employed', 'Privately-Employed'], dtype=object)
a.loc[a['education-num'] <= 8,          'education'] = 'Less than High School'
a.loc[a['education-num'].isin([ 9,10]), 'education'] = 'High School'
a.loc[a['education-num'].isin([11,12]), 'education'] = 'Associates'
a.loc[a['education-num'].isin([13]),    'education'] = 'Bachelors'
a.loc[a['education-num'].isin([14]),    'education'] = 'Masters'
a.loc[a['education-num'].isin([15,16]), 'education'] = 'PhD/Professional'
a['education'].unique()
array(['Bachelors', 'High School', 'Less than High School', 'Masters',
       'Associates', 'PhD/Professional'], dtype=object)
a = a.rename(columns={'hours-per-week':'weekly-hours'})
a.head()

age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss weekly-hours native-country compensation idx employment-type
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K 0 Government
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K 1 Self-Employed
2 38 Private 215646 High School 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K 2 Privately-Employed
3 53 Private 234721 Less than High School 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K 3 Privately-Employed
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K 4 Privately-Employed

3. Convert all Categorical data to Numeric data

a['age raw'] = a['age']
a['employment-type raw'] = a['employment-type']
a['education raw'] = a['education']
a['occupation raw'] = a['occupation']
a['race raw'] = a['race']
a['sex raw'] = a['sex']
a['weekly-hours raw'] = a['weekly-hours']
a['compensation raw'] = a['compensation']
a['employment-type'] = a['employment-type'].astype('category')
a['employment-type'] = a['employment-type'].cat.codes
a[['employment-type','employment-type raw','idx']].groupby(['employment-type',
                                                            'employment-type raw']).count()

idx
employment-type employment-type raw
0 Government 6549
1 Privately-Employed 33906
2 Self-Employed 5557
a['education'] = a['education'].astype('category')
a['education'] = a['education'].cat.codes
a[['education','education raw','idx']].groupby(['education',
                                                'education raw']).count()

idx
education education raw
0 Associates 3505
1 Bachelors 7772
2 High School 24991
3 Less than High School 5768
4 Masters 2590
5 PhD/Professional 1386
a['occupation'] = a['occupation'].astype('category')
a['occupation'] = a['occupation'].cat.codes
a[['occupation','occupation raw','idx']].groupby(['occupation',
                                                  'occupation raw']).count()

idx
occupation occupation raw
0 Adm-clerical 5608
1 Craft-repair 6111
2 Exec-managerial 6085
3 Farming-fishing 1482
4 Handlers-cleaners 2070
5 Machine-op-inspct 3020
6 Other-service 4921
7 Priv-house-serv 242
8 Prof-specialty 6172
9 Protective-serv 998
10 Sales 5503
11 Tech-support 1446
12 Transport-moving 2354
a['race'] = a['race'].astype('category')
a['race'] = a['race'].cat.codes
a[['race','race raw','idx']].groupby(['race',
                                      'race raw']).count()

idx
race race raw
0 Amer-Indian-Eskimo 435
1 Asian-Pac-Islander 1422
2 Black 4355
3 Other 375
4 White 39425
a['sex'] = a['sex'].astype('category')
a['sex'] = a['sex'].cat.codes
a[['sex','sex raw','idx']].groupby(['sex',
                                    'sex raw']).count()

idx
sex sex raw
0 Female 14912
1 Male 31100
a['compensation'] = a['compensation'].astype('category')
a['compensation'] = a['compensation'].cat.codes
a[['compensation','compensation raw','idx']].groupby(['compensation',
                                                      'compensation raw']).count()

idx
compensation compensation raw
0 <=50K 34592
1 >50K 11420
a = a[['idx',
       'age',
       'employment-type',
       'education',
       'occupation',
       'race',
       'sex',
       'weekly-hours',
       'compensation']].copy()
print(a.shape)
a.head()
(46012, 9)

idx age employment-type education occupation race sex weekly-hours compensation
0 0 39 0 1 0 4 1 40 0
1 1 50 2 1 2 4 1 13 0
2 2 38 1 2 4 4 1 40 0
3 3 53 1 3 4 2 1 40 0
4 4 28 1 1 8 2 0 40 0

4. Normalize Continuous Data

Age and Weekly-Hours

print('   Mean Age = ' + str(a['age'].mean()))
print('Std Dev Age = ' + str(a['age'].std()))
a['age'] = (a['age'] - a['age'].mean()) / a['age'].std()
   Mean Age = 38.55261670868469
Std Dev Age = 13.199319129893926
print('   Mean Weekly-Hours = ' + str(a['weekly-hours'].mean()))
print('Std Dev Weekly-Hours = ' + str(a['weekly-hours'].std()))
a['weekly-hours'] = (a['weekly-hours'] - a['weekly-hours'].mean()) / a['weekly-hours'].std()
a.head()
   Mean Weekly-Hours = 40.95159958271755
Std Dev Weekly-Hours = 12.007797424174404

idx age employment-type education occupation race sex weekly-hours compensation
0 0 0.033894 0 1 0 4 1 -0.079248 0
1 1 0.867271 2 1 2 4 1 -2.327787 0
2 2 -0.041867 1 2 4 4 1 -0.079248 0
3 3 1.094555 1 3 4 2 1 -0.079248 0
4 4 -0.799482 1 1 8 2 0 -0.079248 0

5. One-Hot (Dummy) Encode Categoricals (nominal, not ordinal)

print(a.shape)
a = pd.get_dummies(a,
                   columns=['employment-type',
                            'occupation',
                            'race'],
                   drop_first=True)
a = a[['idx', 'age', 'education', 'sex', 'weekly-hours',
       'employment-type_1', 'employment-type_2', 
       'occupation_1', 'occupation_2', 'occupation_3', 'occupation_4', 'occupation_5', 'occupation_6', 
       'occupation_7', 'occupation_8', 'occupation_9', 'occupation_10', 'occupation_11', 'occupation_12', 
       'race_1', 'race_2', 'race_3', 'race_4',
       'compensation']].copy()
print(a.shape)
a.head()
(46012, 9)
(46012, 24)

idx age education sex weekly-hours employment-type_1 employment-type_2 occupation_1 occupation_2 occupation_3 ... occupation_8 occupation_9 occupation_10 occupation_11 occupation_12 race_1 race_2 race_3 race_4 compensation
0 0 0.033894 1 1 -0.079248 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
1 1 0.867271 1 1 -2.327787 0 1 0 1 0 ... 0 0 0 0 0 0 0 0 1 0
2 2 -0.041867 2 1 -0.079248 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
3 3 1.094555 3 1 -0.079248 1 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
4 4 -0.799482 1 0 -0.079248 1 0 0 0 0 ... 1 0 0 0 0 0 1 0 0 0

5 rows × 24 columns

a.to_csv('adult_clean.csv',
         index=False)