Parse and Combine the Kaggle Machine Learning and Data Science Survey Data

import pandas as pd
import numpy as np

Display full text of columns with long strings.

pd.set_option('display.max_colwidth', -1)

Compile Question Text

2017

q7 = (pd.read_csv('kaggle-survey-2017/schema.csv')
      .reset_index()
      .rename(columns={'Column'   : 'q7',
                       'Question' : 't'})
      [['q7','t']])
q7 = (q7[q7['q7'].isin(['Age',
                        'Country',
                        'FormalEducation',
                        'CurrentJobTitleSelect',
                        'CompensationAmount',
                        'LanguageRecommendationSelect'])]
      .reset_index(drop=True))
q7['d'] = pd.Series(['country',
                     'age',
                     'title',
                     'language',
                     'education',
                     'compensation'])
q7['y'] = 2017

2018

q8 = (pd.read_csv('kaggle-survey-2018/SurveySchema.csv')
      .T
      .reset_index()
      .rename(columns={'index' : 'q8',
                       0       : 't'})
      [['q8','t']])
q8 = (q8[q8['q8'].isin(['Q2',
                        'Q3',
                        'Q4',
                        'Q6',
                        'Q9',
                        'Q18'])]
      .reset_index(drop=True))
q8['d'] = pd.Series(['language',
                     'age',
                     'country',
                     'education',
                     'title',
                     'compensation'])
q8['y'] = 2018

2019

q9 = (pd.read_csv('kaggle-survey-2019/questions_only.csv')
      .T
      .reset_index()
      .rename(columns={'index' : 'q9',
                       0       : 't'})
      [['q9','t']])
q9 = (q9[q9['q9'].isin(['Q1',
                        'Q3',
                        'Q4',
                        'Q5',
                        'Q10',
                        'Q19'])]
      .reset_index(drop=True))
q9['d'] = pd.Series(['age',
                     'country',
                     'education',
                     'title',
                     'compensation',
                     'language'])
q9['y'] = 2019

Combine Question Text

q = (q7[['d','t','y']]
     .append(q8[['d','t','y']])
     .append(q9[['d','t','y']])
     .sort_values(['d','y'])
     .reset_index(drop=True)
    [['d','y','t']])
response_cols = list(q['d'].unique())

Compile Responses

2017

r7 = (pd.read_csv('kaggle-survey-2017/multipleChoiceResponses.csv',
                  encoding = "ISO-8859-1",
                  skiprows=[1]))
r7.loc[(r7['CompensationCurrency'].fillna('USD')!='USD')&
       (r7['Country']!='United States'), 'CompensationAmount'] = -1
q7_subset = list(q7['q7'])
q7_col_dict = dict(zip(q7['q7'],q7['d']))
r7 = (r7[q7_subset]
      .rename(columns=q7_col_dict))
r7 = r7[response_cols]
r7.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16715 entries, 0 to 16714
Data columns (total 6 columns):
age             16385 non-null float64
compensation    5340 non-null object
country         16595 non-null object
education       15014 non-null object
language        10997 non-null object
title           11829 non-null object
dtypes: float64(1), object(5)
memory usage: 783.6+ KB


/Users/ryanwingate/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3057: DtypeWarning: Columns (31,83,86,87,98,99,109,116,123,124,127,129,130,164) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

2018

r8 = (pd.read_csv('kaggle-survey-2018/multipleChoiceResponses.csv',
                  skiprows=[1]))
q8_subset = list(q8['q8'])
q8_col_dict = dict(zip(q8['q8'],q8['d']))
r8 = (r8[q8_subset]
      .rename(columns=q8_col_dict))
r8 = r8[response_cols]
r8.info()
/Users/ryanwingate/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3057: DtypeWarning: Columns (157,172,174,210,218,219,246,368,371,384,389,390,391,393) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23859 entries, 0 to 23858
Data columns (total 6 columns):
age             23859 non-null object
compensation    20185 non-null object
country         23859 non-null object
education       23438 non-null object
language        18788 non-null object
title           22900 non-null object
dtypes: object(6)
memory usage: 1.1+ MB

2019

r9 = pd.read_csv('kaggle-survey-2019/multiple_choice_responses.csv',
                 skiprows=[1])
q9_subset = list(q9['q9'])
q9_col_dict = dict(zip(q9['q9'],q9['d']))
r9 = (r9[q9_subset]
      .rename(columns=q9_col_dict))
r9 = r9[response_cols]
r9.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19717 entries, 0 to 19716
Data columns (total 6 columns):
age             19717 non-null object
compensation    12497 non-null object
country         19717 non-null object
education       19323 non-null object
language        14377 non-null object
title           19107 non-null object
dtypes: object(6)
memory usage: 924.3+ KB

Clean

r7 = r7.fillna('None')
r8 = r8.fillna('None')
r9 = r9.fillna('None')

Categorize Continuous Age

r7['continuous_age'] = r7['age'].replace('None',np.nan).astype(float)
r7['age'] = 'None'
r7.loc[r7['continuous_age'] >= 18, 'age'] = '18-21'
r7.loc[r7['continuous_age'] >= 22, 'age'] = '22-24'
r7.loc[r7['continuous_age'] >= 25, 'age'] = '25-29'
r7.loc[r7['continuous_age'] >= 30, 'age'] = '30-34'
r7.loc[r7['continuous_age'] >= 35, 'age'] = '35-39'
r7.loc[r7['continuous_age'] >= 40, 'age'] = '40-44'
r7.loc[r7['continuous_age'] >= 45, 'age'] = '45-49'
r7.loc[r7['continuous_age'] >= 50, 'age'] = '50-54'
r7.loc[r7['continuous_age'] >= 55, 'age'] = '55-59'
r7.loc[r7['continuous_age'] >= 60, 'age'] = '60-69'
r7.loc[r7['continuous_age'] >= 70, 'age'] = '70+'
r7 = r7.drop(columns=['continuous_age'])
print(r7.shape)
(16715, 6)

Categorize Continuous Compensation

r7['continuous_compensation'] = (r7['compensation'].str.replace('\D','',regex=True)
                                 .fillna(-1)
                                 .replace({'':-1})
                                 .astype(int))
r7.loc[r7['continuous_compensation'] <  10000,  'compensation'] = '<$10K',
r7.loc[r7['continuous_compensation'] >= 10000,  'compensation'] = '$10K+',
r7.loc[r7['continuous_compensation'] >= 20000,  'compensation'] = '$20K+',
r7.loc[r7['continuous_compensation'] >= 30000,  'compensation'] = '$30K+',
r7.loc[r7['continuous_compensation'] >= 40000,  'compensation'] = '$40K+',
r7.loc[r7['continuous_compensation'] >= 50000,  'compensation'] = '$50K+',
r7.loc[r7['continuous_compensation'] >= 60000,  'compensation'] = '$60K+',
r7.loc[r7['continuous_compensation'] >= 70000,  'compensation'] = '$70K+',
r7.loc[r7['continuous_compensation'] >= 80000,  'compensation'] = '$80K+',
r7.loc[r7['continuous_compensation'] >= 90000,  'compensation'] = '$90K+',
r7.loc[r7['continuous_compensation'] >= 100000, 'compensation'] = '$100K+',
r7.loc[r7['continuous_compensation'] >= 125000, 'compensation'] = '$125K+',
r7.loc[r7['continuous_compensation'] >= 150000, 'compensation'] = '$150K+',
r7.loc[r7['continuous_compensation'] >= 200000, 'compensation'] = '$200K+',
r7.loc[r7['continuous_compensation'] == -1,     'compensation'] = 'None'

Map to Common Categories

country_mapping = {
    'United States of America'                             : 'United States',
    'United Kingdom of Great Britain and Northern Ireland' : 'United Kingdom',
    "People 's Republic of China"                          : 'China',
    'Republic of China'                                    : 'China',
    'Viet Nam'                                             : 'Vietnam',
    'Iran, Islamic Republic of...'                         : 'Iran',
    'Hong Kong (S.A.R.)'                                   : 'Hong Kong',
    'Republic of Korea'                                    : 'South Korea',
    'I do not wish to disclose my location'                : 'None'
}
age_mapping = {
    '70-79' : '70+',
    '80+'   : '70+'
}
title_mapping = {
    'Software Developer/Software Engineer' : 'Software Engineer',
    'Scientist/Researcher'                 : 'Researcher',
    'Research Scientist'                   : 'Researcher',
    'Research Assistant'                   : 'Researcher'
}
language_mapping = {
    'C'      : 'C/C++/C#',
    'C++'    : 'C/C++/C#',
    'Matlab' : 'MATLAB'
}
education_mapping = {
    'Master’s degree'                                                   : "Master's degree",
    'Bachelor’s degree'                                                 : "Bachelor's degree",
    "Some college/university study without earning a bachelor's degree" : 'Some College',
    'Some college/university study without earning a bachelor’s degree' : 'Some College',
    'No formal education past high school'                              : 'High School',
    'I did not complete any formal education past high school'          : 'High School',
    'I prefer not to answer'                                            : 'None'
}
compensation_mapping = {
    'I do not wish to disclose my approximate yearly compensation' : 'None',
    '$0-999'          : '<$10K',
    '0-10,000'        : '<$10K',
    '2,000-2,999'     : '<$10K',
    '3,000-3,999'     : '<$10K',
    '4,000-4,999'     : '<$10K',
    '5,000-7,499'     : '<$10K',
    '7,500-9,999'     : '<$10K',
    '1,000-1,999'     : '<$10K',
    '10-20,000'       : '$10K+',
    '10,000-14,999'   : '$10K+',
    '15,000-19,999'   : '$10K+',
    '25,000-29,999'   : '$20K+',
    '20-30,000'       : '$20K+',
    '20,000-24,999'   : '$20K+',
    '30,000-39,999'   : '$30K+',
    '30-40,000'       : '$30K+',
    '40,000-49,999'   : '$40K+',
    '40-50,000'       : '$40K+',
    '50,000-59,999'   : '$50K+',
    '50-60,000'       : '$50K+',
    '60,000-69,999'   : '$60K+',
    '60-70,000'       : '$60K+',
    '70,000-79,999'   : '$70K+',
    '70-80,000'       : '$70K+',
    '80,000-89,999'   : '$80K+',
    '80-90,000'       : '$80K+',
    '90,000-99,999'   : '$90K+',
    '90-100,000'      : '$90K+',
    '100-125,000'     : '$100K+',
    '100,000-124,999' : '$100K+',
    '125,000-149,999' : '$125K+',
    '125-150,000'     : '$125K+',
    '150-200,000'     : '$150K+',
    '150,000-199,999' : '$150K+',
    '200-250,000'     : '$200K+',
    '200,000-249,999' : '$200K+',
    '250-300,000'     : '$200K+',
    '250,000-299,999' : '$200K+',
    '300-400,000'     : '$200K+',
    '400-500,000'     : '$200K+',
    '500,000+'        : '$200K+',
    '> $500,000'      : '$200K+',
    '300,000-500,000' : '$200K+'
}
r7['year'] = 2017
r8['year'] = 2018
r9['year'] = 2019
for df in [r7,r8,r9]:
    df['country'] = df['country'].replace(country_mapping)
    df['age'] = df['age'].replace(age_mapping)
    df['title'] = df['title'].replace(title_mapping)
    df['language'] = df['language'].replace(language_mapping)
    df['education'] = df['education'].replace(education_mapping)
    df['compensation'] = df['compensation'].replace(compensation_mapping)
cols = ['country','age','title','language','education','compensation','year']
r7 = r7[cols].copy()
r8 = r8[cols].copy()
r9 = r9[cols].copy()

Combine Responses

r = (r7
     .append(r8)
     .append(r9)
     .reset_index(drop=True))
r.shape
(60291, 7)

Only keep common values in each category.

for col in ['country',
            'age',
            'title',
            'language',
            'education',
            'compensation']:
    r7_set = set(r7[col])
    r8_set = set(r8[col])
    r9_set = set(r9[col])
    intersection_set = r7_set.intersection(r8_set, r9_set)
    intersection_set.add('None')
    r.loc[~r[col].isin(intersection_set),col] = 'Other'

Define Categorical Datatypes for Age, Education, Compensation

age_ordered_list = ['18-21','22-24','25-29','30-34','35-39','40-44',
                    '45-49','50-54','55-59','60-69','70+','None']
age_dtype = pd.Categorical(
    age_ordered_list,
    categories=age_ordered_list,
    ordered=True)
education_ordered_list = ['High School','Some College',"Bachelor's degree","Master's degree",
                          'Doctoral degree','Professional degree','None']
education_dtype = pd.Categorical(
    education_ordered_list,
    categories=education_ordered_list,
    ordered=True)
compensation_ordered_list = ['<$10K','$10K+','$20K+','$30K+','$40K+','$50K+','$60K+','$70K+',
                             '$80K+','$90K+','$100K+','$125K+','$150K+','$200K+','None']
compensation_dtype = pd.Categorical(
    compensation_ordered_list,
    categories=compensation_ordered_list,
    ordered=True)

Create Simplified Versions to Reduce Category Counts

simplified_age_ordered_list = ['<30','30\'s','40\'s','50\'s+','None']
simplified_age_dtype = pd.Categorical(
    simplified_age_ordered_list,
    categories=simplified_age_ordered_list,
    ordered=True)
simplified_education_ordered_list = ['No Degree','Bachelor\'s','Master\'s',
                                     'PhD','Professional','None']
simplified_education_dtype = pd.Categorical(
    simplified_education_ordered_list,
    categories=simplified_education_ordered_list,
    ordered=True)
simplified_compensation_ordered_list = ['$30K+','$60K+','$90K+','$125K+','$150K+','None']
simplified_compensation_dtype = pd.Categorical(
    simplified_compensation_ordered_list,
    categories=simplified_compensation_ordered_list,
    ordered=True)

Map the Categories to the Simpler Versions

r['s_age'] = r['age'].replace({
    '18-21' : '<30',
    '22-24' : '<30',
    '25-29' : '<30',
    '30-34' : '30\'s',
    '35-39' : '30\'s',
    '40-44' : '40\'s',
    '45-49' : '40\'s',
    '50-54' : '50\'s+',
    '55-59' : '50\'s+',
    '60-69' : '50\'s+',
    '70+'   : '50\'s+',
    'None'  : 'None'
})
r['s_education'] = r['education'].replace({
    'High School'         : 'No Degree',
    'Some College'        : 'No Degree',
    "Bachelor's degree"   : 'Bachelor\'s',
    "Master's degree"     : 'Master\'s',
    'Doctoral degree'     : 'PhD',
    'Professional degree' : 'Professional',
    'None'                : 'None'
})
r['s_compensation'] = r['compensation'].replace({
    '<$10K' : 'None',
    '$10K+' : 'None',
    '$20K+' : 'None',
    '$30K+' : '$30K+',
    '$40K+' : '$30K+',
    '$50K+' : '$30K+',
    '$60K+' : '$60K+',
    '$70K+' : '$60K+',
    '$80K+' : '$60K+',
    '$90K+' : '$90K+',
    '$100K+': '$90K+',
    '$125K+': '$125K+',
    '$150K+': '$150K+',
    '$200K+': '$150K+'})

Assign Datatypes

for col, dtype in [('age',            age_dtype),
                   ('education',      education_dtype),
                   ('compensation',   compensation_dtype),
                   ('s_age',          simplified_age_dtype),
                   ('s_education',    simplified_education_dtype),
                   ('s_compensation', simplified_compensation_dtype)]:
    r[col] = r[col].astype(dtype)
r = r[['country', 'age', 's_age', 'title', 'language', 
       'education', 's_education', 'compensation', 's_compensation',
       'year']].copy()

Write to File

q.to_excel('processed/questions.xlsx')
r.to_excel('processed/responses.xlsx')