Cleaning the Adult Dataset

05 Oct 2019

1. Read Data from Adult Dataset

Original dataset downloaded from: https://archive.ics.uci.edu/ml/datasets/Adult

import pandas as pd
import numpy as np

adult_data_path = 'raw_adult/adult.data'
adult_test_path = 'raw_adult/adult.test'
cols = ['age','workclass','fnlwgt','education','education-num','marital-status',
        'occupation','relationship','race','sex','capital-gain', 'capital-loss',
        'hours-per-week', 'native-country','compensation']
a = (pd.read_csv(adult_data_path,
                 names=cols,
                 sep=', ',
                 engine='python')
     .append(pd.read_csv(adult_test_path,
                         skiprows=1,
                         names=cols,
                         sep=', ',
                         engine='python')))
print(a.shape)
a.head()

(48842, 15)

	age	workclass	fnlwgt	education	education-num	marital-status	occupation	relationship	race	sex	capital-gain	hours-per-week	native-country	compensation
0	39	State-gov	77516	Bachelors	13	Never-married	Adm-clerical	Not-in-family	White	Male	2174	40	United-States	<=50K
1	50	Self-emp-not-inc	83311	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	0	13	United-States	<=50K
2	38	Private	215646	HS-grad	9	Divorced	Handlers-cleaners	Not-in-family	White	Male	0	40	United-States	<=50K
3	53	Private	234721	11th	7	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	0	40	United-States	<=50K
4	28	Private	338409	Bachelors	13	Married-civ-spouse	Prof-specialty	Wife	Black	Female	0	40	Cuba	<=50K

2. Clean Data

a = a.replace({'<=50K.' : '<=50K',
               '>50K.'  : '>50K'})

Drop entries where workclass and occupation are unknown, and where workclass is Without-pay.

a = (a[(a['workclass']!='?')&
       (a['occupation']!='?')&
       (a['workclass']!='Without-pay')]
     .reset_index(drop=True))
a['idx'] = a.index

Map the very small Armed-Forces category of occupation to Protective-serv.

a.loc[a['occupation']=='Armed-Forces','occupation'] = 'Protective-serv'

Map Ages, Education, Workclass, and Weekly-Hours to smaller category set.

a.loc[a['workclass'].isin(['State-gov', 'Federal-gov', 'Local-gov']), 
      'employment-type'] = 'Government'
a.loc[a['workclass'].isin(['Self-emp-not-inc', 'Self-emp-inc']),      
      'employment-type'] = 'Self-Employed'
a.loc[a['workclass'].isin(['Private']),                               
      'employment-type'] = 'Privately-Employed'
a['employment-type'].unique()

array(['Government', 'Self-Employed', 'Privately-Employed'], dtype=object)

a.loc[a['education-num'] <= 8,          'education'] = 'Less than High School'
a.loc[a['education-num'].isin([ 9,10]), 'education'] = 'High School'
a.loc[a['education-num'].isin([11,12]), 'education'] = 'Associates'
a.loc[a['education-num'].isin([13]),    'education'] = 'Bachelors'
a.loc[a['education-num'].isin([14]),    'education'] = 'Masters'
a.loc[a['education-num'].isin([15,16]), 'education'] = 'PhD/Professional'
a['education'].unique()

array(['Bachelors', 'High School', 'Less than High School', 'Masters',
       'Associates', 'PhD/Professional'], dtype=object)

a = a.rename(columns={'hours-per-week':'weekly-hours'})
a.head()

	age	workclass	fnlwgt	education	education-num	marital-status	occupation	relationship	race	sex	capital-gain	weekly-hours	native-country	compensation	idx	employment-type
0	39	State-gov	77516	Bachelors	13	Never-married	Adm-clerical	Not-in-family	White	Male	2174	40	United-States	<=50K	0	Government
1	50	Self-emp-not-inc	83311	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	0	13	United-States	<=50K	1	Self-Employed
2	38	Private	215646	High School	9	Divorced	Handlers-cleaners	Not-in-family	White	Male	0	40	United-States	<=50K	2	Privately-Employed
3	53	Private	234721	Less than High School	7	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	0	40	United-States	<=50K	3	Privately-Employed
4	28	Private	338409	Bachelors	13	Married-civ-spouse	Prof-specialty	Wife	Black	Female	0	40	Cuba	<=50K	4	Privately-Employed

3. Convert all Categorical data to Numeric data

a['age raw'] = a['age']
a['employment-type raw'] = a['employment-type']
a['education raw'] = a['education']
a['occupation raw'] = a['occupation']
a['race raw'] = a['race']
a['sex raw'] = a['sex']
a['weekly-hours raw'] = a['weekly-hours']
a['compensation raw'] = a['compensation']

a['employment-type'] = a['employment-type'].astype('category')
a['employment-type'] = a['employment-type'].cat.codes
a[['employment-type','employment-type raw','idx']].groupby(['employment-type',
                                                            'employment-type raw']).count()

		idx
employment-type	employment-type raw
0	Government	6549
1	Privately-Employed	33906
2	Self-Employed	5557

a['education'] = a['education'].astype('category')
a['education'] = a['education'].cat.codes
a[['education','education raw','idx']].groupby(['education',
                                                'education raw']).count()

		idx
education	education raw
0	Associates	3505
1	Bachelors	7772
2	High School	24991
3	Less than High School	5768
4	Masters	2590
5	PhD/Professional	1386

a['occupation'] = a['occupation'].astype('category')
a['occupation'] = a['occupation'].cat.codes
a[['occupation','occupation raw','idx']].groupby(['occupation',
                                                  'occupation raw']).count()

		idx
occupation	occupation raw
0	Adm-clerical	5608
1	Craft-repair	6111
2	Exec-managerial	6085
3	Farming-fishing	1482
4	Handlers-cleaners	2070
5	Machine-op-inspct	3020
6	Other-service	4921
7	Priv-house-serv	242
8	Prof-specialty	6172
9	Protective-serv	998
10	Sales	5503
11	Tech-support	1446
12	Transport-moving	2354

a['race'] = a['race'].astype('category')
a['race'] = a['race'].cat.codes
a[['race','race raw','idx']].groupby(['race',
                                      'race raw']).count()

		idx
race	race raw
0	Amer-Indian-Eskimo	435
1	Asian-Pac-Islander	1422
2	Black	4355
3	Other	375
4	White	39425

a['sex'] = a['sex'].astype('category')
a['sex'] = a['sex'].cat.codes
a[['sex','sex raw','idx']].groupby(['sex',
                                    'sex raw']).count()

		idx
sex	sex raw
0	Female	14912
1	Male	31100

a['compensation'] = a['compensation'].astype('category')
a['compensation'] = a['compensation'].cat.codes
a[['compensation','compensation raw','idx']].groupby(['compensation',
                                                      'compensation raw']).count()

		idx
compensation	compensation raw
0	<=50K	34592
1	>50K	11420

a = a[['idx',
       'age',
       'employment-type',
       'education',
       'occupation',
       'race',
       'sex',
       'weekly-hours',
       'compensation']].copy()
print(a.shape)
a.head()

(46012, 9)

	idx	age	employment-type	education	occupation	race	sex	weekly-hours
0	0	39	0	1	0	4	1	40
1	1	50	2	1	2	4	1	13
2	2	38	1	2	4	4	1	40
3	3	53	1	3	4	2	1	40
4	4	28	1	1	8	2	0	40

4. Normalize Continuous Data

Age and Weekly-Hours

print('   Mean Age = ' + str(a['age'].mean()))
print('Std Dev Age = ' + str(a['age'].std()))
a['age'] = (a['age'] - a['age'].mean()) / a['age'].std()

   Mean Age = 38.55261670868469
Std Dev Age = 13.199319129893926

print('   Mean Weekly-Hours = ' + str(a['weekly-hours'].mean()))
print('Std Dev Weekly-Hours = ' + str(a['weekly-hours'].std()))
a['weekly-hours'] = (a['weekly-hours'] - a['weekly-hours'].mean()) / a['weekly-hours'].std()
a.head()

   Mean Weekly-Hours = 40.95159958271755
Std Dev Weekly-Hours = 12.007797424174404

	idx	age	employment-type	education	occupation	race	sex	weekly-hours
0	0	0.033894	0	1	0	4	1	-0.079248
1	1	0.867271	2	1	2	4	1	-2.327787
2	2	-0.041867	1	2	4	4	1	-0.079248
3	3	1.094555	1	3	4	2	1	-0.079248
4	4	-0.799482	1	1	8	2	0	-0.079248

5. One-Hot (Dummy) Encode Categoricals (nominal, not ordinal)

print(a.shape)
a = pd.get_dummies(a,
                   columns=['employment-type',
                            'occupation',
                            'race'],
                   drop_first=True)
a = a[['idx', 'age', 'education', 'sex', 'weekly-hours',
       'employment-type_1', 'employment-type_2', 
       'occupation_1', 'occupation_2', 'occupation_3', 'occupation_4', 'occupation_5', 'occupation_6', 
       'occupation_7', 'occupation_8', 'occupation_9', 'occupation_10', 'occupation_11', 'occupation_12', 
       'race_1', 'race_2', 'race_3', 'race_4',
       'compensation']].copy()
print(a.shape)
a.head()

(46012, 9)
(46012, 24)

	idx	age	education	sex	weekly-hours	employment-type_1	employment-type_2	occupation_2	...	occupation_8	race_2	race_4
0	0	0.033894	1	1	-0.079248	0	0	0	...	0	0	1
1	1	0.867271	1	1	-2.327787	0	1	1	...	0	0	1
2	2	-0.041867	2	1	-0.079248	1	0	0	...	0	0	1
3	3	1.094555	3	1	-0.079248	1	0	0	...	0	1	0
4	4	-0.799482	1	0	-0.079248	1	0	0	...	1	1	0

5 rows × 24 columns

a.to_csv('adult_clean.csv',
         index=False)

	idx	age	employment-type	education	occupation	race	sex	weekly-hours
0	0	39	0	1	0	4	1	40
1	1	50	2	1	2	4	1	13
2	2	38	1	2	4	4	1	40
3	3	53	1	3	4	2	1	40
4	4	28	1	1	8	2	0	40

	idx	age	employment-type	education	occupation	race	sex	weekly-hours
0	0	39	0	1	0	4	1	40
1	1	50	2	1	2	4	1	13
2	2	38	1	2	4	4	1	40
3	3	53	1	3	4	2	1	40
4	4	28	1	1	8	2	0	40

	idx	age	employment-type	education	occupation	race	sex	weekly-hours
0	0	39	0	1	0	4	1	40
1	1	50	2	1	2	4	1	13
2	2	38	1	2	4	4	1	40
3	3	53	1	3	4	2	1	40
4	4	28	1	1	8	2	0	40