Parse and Combine the Kaggle Machine Learning and Data Science Survey Data

06 Jan 2020

Compile Questions
Combine Questions
Compile Responses
Clean
Combine Responses
Define Datatypes
Write to File

import pandas as pd
import numpy as np

Display full text of columns with long strings.

pd.set_option('display.max_colwidth', -1)

Compile Question Text

2017

q7 = (pd.read_csv('kaggle-survey-2017/schema.csv')
      .reset_index()
      .rename(columns={'Column'   : 'q7',
                       'Question' : 't'})
      [['q7','t']])

q7 = (q7[q7['q7'].isin(['Age',
                        'Country',
                        'FormalEducation',
                        'CurrentJobTitleSelect',
                        'CompensationAmount',
                        'LanguageRecommendationSelect'])]
      .reset_index(drop=True))
q7['d'] = pd.Series(['country',
                     'age',
                     'title',
                     'language',
                     'education',
                     'compensation'])
q7['y'] = 2017

2018

q8 = (pd.read_csv('kaggle-survey-2018/SurveySchema.csv')
      .T
      .reset_index()
      .rename(columns={'index' : 'q8',
                       0       : 't'})
      [['q8','t']])

q8 = (q8[q8['q8'].isin(['Q2',
                        'Q3',
                        'Q4',
                        'Q6',
                        'Q9',
                        'Q18'])]
      .reset_index(drop=True))
q8['d'] = pd.Series(['language',
                     'age',
                     'country',
                     'education',
                     'title',
                     'compensation'])
q8['y'] = 2018

2019

q9 = (pd.read_csv('kaggle-survey-2019/questions_only.csv')
      .T
      .reset_index()
      .rename(columns={'index' : 'q9',
                       0       : 't'})
      [['q9','t']])

q9 = (q9[q9['q9'].isin(['Q1',
                        'Q3',
                        'Q4',
                        'Q5',
                        'Q10',
                        'Q19'])]
      .reset_index(drop=True))
q9['d'] = pd.Series(['age',
                     'country',
                     'education',
                     'title',
                     'compensation',
                     'language'])
q9['y'] = 2019

Combine Question Text

q = (q7[['d','t','y']]
     .append(q8[['d','t','y']])
     .append(q9[['d','t','y']])
     .sort_values(['d','y'])
     .reset_index(drop=True)
    [['d','y','t']])

response_cols = list(q['d'].unique())

Compile Responses

2017

r7 = (pd.read_csv('kaggle-survey-2017/multipleChoiceResponses.csv',
                  encoding = "ISO-8859-1",
                  skiprows=[1]))
r7.loc[(r7['CompensationCurrency'].fillna('USD')!='USD')&
       (r7['Country']!='United States'), 'CompensationAmount'] = -1
q7_subset = list(q7['q7'])
q7_col_dict = dict(zip(q7['q7'],q7['d']))
r7 = (r7[q7_subset]
      .rename(columns=q7_col_dict))
r7 = r7[response_cols]
r7.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16715 entries, 0 to 16714
Data columns (total 6 columns):
age             16385 non-null float64
compensation    5340 non-null object
country         16595 non-null object
education       15014 non-null object
language        10997 non-null object
title           11829 non-null object
dtypes: float64(1), object(5)
memory usage: 783.6+ KB


/Users/ryanwingate/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3057: DtypeWarning: Columns (31,83,86,87,98,99,109,116,123,124,127,129,130,164) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

2018

r8 = (pd.read_csv('kaggle-survey-2018/multipleChoiceResponses.csv',
                  skiprows=[1]))
q8_subset = list(q8['q8'])
q8_col_dict = dict(zip(q8['q8'],q8['d']))
r8 = (r8[q8_subset]
      .rename(columns=q8_col_dict))
r8 = r8[response_cols]
r8.info()

/Users/ryanwingate/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3057: DtypeWarning: Columns (157,172,174,210,218,219,246,368,371,384,389,390,391,393) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23859 entries, 0 to 23858
Data columns (total 6 columns):
age             23859 non-null object
compensation    20185 non-null object
country         23859 non-null object
education       23438 non-null object
language        18788 non-null object
title           22900 non-null object
dtypes: object(6)
memory usage: 1.1+ MB

2019

r9 = pd.read_csv('kaggle-survey-2019/multiple_choice_responses.csv',
                 skiprows=[1])
q9_subset = list(q9['q9'])
q9_col_dict = dict(zip(q9['q9'],q9['d']))
r9 = (r9[q9_subset]
      .rename(columns=q9_col_dict))
r9 = r9[response_cols]
r9.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19717 entries, 0 to 19716
Data columns (total 6 columns):
age             19717 non-null object
compensation    12497 non-null object
country         19717 non-null object
education       19323 non-null object
language        14377 non-null object
title           19107 non-null object
dtypes: object(6)
memory usage: 924.3+ KB

Clean

r7 = r7.fillna('None')
r8 = r8.fillna('None')
r9 = r9.fillna('None')

Categorize Continuous Age

r7['continuous_age'] = r7['age'].replace('None',np.nan).astype(float)
r7['age'] = 'None'
r7.loc[r7['continuous_age'] >= 18, 'age'] = '18-21'
r7.loc[r7['continuous_age'] >= 22, 'age'] = '22-24'
r7.loc[r7['continuous_age'] >= 25, 'age'] = '25-29'
r7.loc[r7['continuous_age'] >= 30, 'age'] = '30-34'
r7.loc[r7['continuous_age'] >= 35, 'age'] = '35-39'
r7.loc[r7['continuous_age'] >= 40, 'age'] = '40-44'
r7.loc[r7['continuous_age'] >= 45, 'age'] = '45-49'
r7.loc[r7['continuous_age'] >= 50, 'age'] = '50-54'
r7.loc[r7['continuous_age'] >= 55, 'age'] = '55-59'
r7.loc[r7['continuous_age'] >= 60, 'age'] = '60-69'
r7.loc[r7['continuous_age'] >= 70, 'age'] = '70+'
r7 = r7.drop(columns=['continuous_age'])
print(r7.shape)

(16715, 6)

Categorize Continuous Compensation

r7['continuous_compensation'] = (r7['compensation'].str.replace('\D','',regex=True)
                                 .fillna(-1)
                                 .replace({'':-1})
                                 .astype(int))

r7.loc[r7['continuous_compensation'] <  10000,  'compensation'] = '<$10K',
r7.loc[r7['continuous_compensation'] >= 10000,  'compensation'] = '$10K+',
r7.loc[r7['continuous_compensation'] >= 20000,  'compensation'] = '$20K+',
r7.loc[r7['continuous_compensation'] >= 30000,  'compensation'] = '$30K+',
r7.loc[r7['continuous_compensation'] >= 40000,  'compensation'] = '$40K+',
r7.loc[r7['continuous_compensation'] >= 50000,  'compensation'] = '$50K+',
r7.loc[r7['continuous_compensation'] >= 60000,  'compensation'] = '$60K+',
r7.loc[r7['continuous_compensation'] >= 70000,  'compensation'] = '$70K+',
r7.loc[r7['continuous_compensation'] >= 80000,  'compensation'] = '$80K+',
r7.loc[r7['continuous_compensation'] >= 90000,  'compensation'] = '$90K+',
r7.loc[r7['continuous_compensation'] >= 100000, 'compensation'] = '$100K+',
r7.loc[r7['continuous_compensation'] >= 125000, 'compensation'] = '$125K+',
r7.loc[r7['continuous_compensation'] >= 150000, 'compensation'] = '$150K+',
r7.loc[r7['continuous_compensation'] >= 200000, 'compensation'] = '$200K+',
r7.loc[r7['continuous_compensation'] == -1,     'compensation'] = 'None'

Map to Common Categories

country_mapping = {
    'United States of America'                             : 'United States',
    'United Kingdom of Great Britain and Northern Ireland' : 'United Kingdom',
    "People 's Republic of China"                          : 'China',
    'Republic of China'                                    : 'China',
    'Viet Nam'                                             : 'Vietnam',
    'Iran, Islamic Republic of...'                         : 'Iran',
    'Hong Kong (S.A.R.)'                                   : 'Hong Kong',
    'Republic of Korea'                                    : 'South Korea',
    'I do not wish to disclose my location'                : 'None'
}

age_mapping = {
    '70-79' : '70+',
    '80+'   : '70+'
}

title_mapping = {
    'Software Developer/Software Engineer' : 'Software Engineer',
    'Scientist/Researcher'                 : 'Researcher',
    'Research Scientist'                   : 'Researcher',
    'Research Assistant'                   : 'Researcher'
}

language_mapping = {
    'C'      : 'C/C++/C#',
    'C++'    : 'C/C++/C#',
    'Matlab' : 'MATLAB'
}

education_mapping = {
    'Master’s degree'                                                   : "Master's degree",
    'Bachelor’s degree'                                                 : "Bachelor's degree",
    "Some college/university study without earning a bachelor's degree" : 'Some College',
    'Some college/university study without earning a bachelor’s degree' : 'Some College',
    'No formal education past high school'                              : 'High School',
    'I did not complete any formal education past high school'          : 'High School',
    'I prefer not to answer'                                            : 'None'
}

compensation_mapping = {
    'I do not wish to disclose my approximate yearly compensation' : 'None',
    '$0-999'          : '<$10K',
    '0-10,000'        : '<$10K',
    '2,000-2,999'     : '<$10K',
    '3,000-3,999'     : '<$10K',
    '4,000-4,999'     : '<$10K',
    '5,000-7,499'     : '<$10K',
    '7,500-9,999'     : '<$10K',
    '1,000-1,999'     : '<$10K',
    '10-20,000'       : '$10K+',
    '10,000-14,999'   : '$10K+',
    '15,000-19,999'   : '$10K+',
    '25,000-29,999'   : '$20K+',
    '20-30,000'       : '$20K+',
    '20,000-24,999'   : '$20K+',
    '30,000-39,999'   : '$30K+',
    '30-40,000'       : '$30K+',
    '40,000-49,999'   : '$40K+',
    '40-50,000'       : '$40K+',
    '50,000-59,999'   : '$50K+',
    '50-60,000'       : '$50K+',
    '60,000-69,999'   : '$60K+',
    '60-70,000'       : '$60K+',
    '70,000-79,999'   : '$70K+',
    '70-80,000'       : '$70K+',
    '80,000-89,999'   : '$80K+',
    '80-90,000'       : '$80K+',
    '90,000-99,999'   : '$90K+',
    '90-100,000'      : '$90K+',
    '100-125,000'     : '$100K+',
    '100,000-124,999' : '$100K+',
    '125,000-149,999' : '$125K+',
    '125-150,000'     : '$125K+',
    '150-200,000'     : '$150K+',
    '150,000-199,999' : '$150K+',
    '200-250,000'     : '$200K+',
    '200,000-249,999' : '$200K+',
    '250-300,000'     : '$200K+',
    '250,000-299,999' : '$200K+',
    '300-400,000'     : '$200K+',
    '400-500,000'     : '$200K+',
    '500,000+'        : '$200K+',
    '> $500,000'      : '$200K+',
    '300,000-500,000' : '$200K+'
}

r7['year'] = 2017
r8['year'] = 2018
r9['year'] = 2019

for df in [r7,r8,r9]:
    df['country'] = df['country'].replace(country_mapping)
    df['age'] = df['age'].replace(age_mapping)
    df['title'] = df['title'].replace(title_mapping)
    df['language'] = df['language'].replace(language_mapping)
    df['education'] = df['education'].replace(education_mapping)
    df['compensation'] = df['compensation'].replace(compensation_mapping)

cols = ['country','age','title','language','education','compensation','year']
r7 = r7[cols].copy()
r8 = r8[cols].copy()
r9 = r9[cols].copy()

Combine Responses

r = (r7
     .append(r8)
     .append(r9)
     .reset_index(drop=True))
r.shape

(60291, 7)

Only keep common values in each category.

for col in ['country',
            'age',
            'title',
            'language',
            'education',
            'compensation']:
    r7_set = set(r7[col])
    r8_set = set(r8[col])
    r9_set = set(r9[col])
    intersection_set = r7_set.intersection(r8_set, r9_set)
    intersection_set.add('None')
    r.loc[~r[col].isin(intersection_set),col] = 'Other'

Define Categorical Datatypes for Age, Education, Compensation

age_ordered_list = ['18-21','22-24','25-29','30-34','35-39','40-44',
                    '45-49','50-54','55-59','60-69','70+','None']
age_dtype = pd.Categorical(
    age_ordered_list,
    categories=age_ordered_list,
    ordered=True)

education_ordered_list = ['High School','Some College',"Bachelor's degree","Master's degree",
                          'Doctoral degree','Professional degree','None']
education_dtype = pd.Categorical(
    education_ordered_list,
    categories=education_ordered_list,
    ordered=True)

compensation_ordered_list = ['<$10K','$10K+','$20K+','$30K+','$40K+','$50K+','$60K+','$70K+',
                             '$80K+','$90K+','$100K+','$125K+','$150K+','$200K+','None']
compensation_dtype = pd.Categorical(
    compensation_ordered_list,
    categories=compensation_ordered_list,
    ordered=True)

Create Simplified Versions to Reduce Category Counts

simplified_age_ordered_list = ['<30','30\'s','40\'s','50\'s+','None']
simplified_age_dtype = pd.Categorical(
    simplified_age_ordered_list,
    categories=simplified_age_ordered_list,
    ordered=True)

simplified_education_ordered_list = ['No Degree','Bachelor\'s','Master\'s',
                                     'PhD','Professional','None']
simplified_education_dtype = pd.Categorical(
    simplified_education_ordered_list,
    categories=simplified_education_ordered_list,
    ordered=True)

simplified_compensation_ordered_list = ['$30K+','$60K+','$90K+','$125K+','$150K+','None']
simplified_compensation_dtype = pd.Categorical(
    simplified_compensation_ordered_list,
    categories=simplified_compensation_ordered_list,
    ordered=True)

Map the Categories to the Simpler Versions

r['s_age'] = r['age'].replace({
    '18-21' : '<30',
    '22-24' : '<30',
    '25-29' : '<30',
    '30-34' : '30\'s',
    '35-39' : '30\'s',
    '40-44' : '40\'s',
    '45-49' : '40\'s',
    '50-54' : '50\'s+',
    '55-59' : '50\'s+',
    '60-69' : '50\'s+',
    '70+'   : '50\'s+',
    'None'  : 'None'
})

r['s_education'] = r['education'].replace({
    'High School'         : 'No Degree',
    'Some College'        : 'No Degree',
    "Bachelor's degree"   : 'Bachelor\'s',
    "Master's degree"     : 'Master\'s',
    'Doctoral degree'     : 'PhD',
    'Professional degree' : 'Professional',
    'None'                : 'None'
})

r['s_compensation'] = r['compensation'].replace({
    '<$10K' : 'None',
    '$10K+' : 'None',
    '$20K+' : 'None',
    '$30K+' : '$30K+',
    '$40K+' : '$30K+',
    '$50K+' : '$30K+',
    '$60K+' : '$60K+',
    '$70K+' : '$60K+',
    '$80K+' : '$60K+',
    '$90K+' : '$90K+',
    '$100K+': '$90K+',
    '$125K+': '$125K+',
    '$150K+': '$150K+',
    '$200K+': '$150K+'})

Assign Datatypes

for col, dtype in [('age',            age_dtype),
                   ('education',      education_dtype),
                   ('compensation',   compensation_dtype),
                   ('s_age',          simplified_age_dtype),
                   ('s_education',    simplified_education_dtype),
                   ('s_compensation', simplified_compensation_dtype)]:
    r[col] = r[col].astype(dtype)
r = r[['country', 'age', 's_age', 'title', 'language', 
       'education', 's_education', 'compensation', 's_compensation',
       'year']].copy()

Write to File

q.to_excel('processed/questions.xlsx')

r.to_excel('processed/responses.xlsx')