Parse and Combine the Kaggle Machine Learning and Data Science Survey Data
- Compile Questions
- Combine Questions
- Compile Responses
- Clean
- Combine Responses
- Define Datatypes
- Write to File
import pandas as pd
import numpy as np
Display full text of columns with long strings.
pd.set_option('display.max_colwidth', -1)
Compile Question Text
2017
q7 = (pd.read_csv('kaggle-survey-2017/schema.csv')
.reset_index()
.rename(columns={'Column' : 'q7',
'Question' : 't'})
[['q7','t']])
q7 = (q7[q7['q7'].isin(['Age',
'Country',
'FormalEducation',
'CurrentJobTitleSelect',
'CompensationAmount',
'LanguageRecommendationSelect'])]
.reset_index(drop=True))
q7['d'] = pd.Series(['country',
'age',
'title',
'language',
'education',
'compensation'])
q7['y'] = 2017
2018
q8 = (pd.read_csv('kaggle-survey-2018/SurveySchema.csv')
.T
.reset_index()
.rename(columns={'index' : 'q8',
0 : 't'})
[['q8','t']])
q8 = (q8[q8['q8'].isin(['Q2',
'Q3',
'Q4',
'Q6',
'Q9',
'Q18'])]
.reset_index(drop=True))
q8['d'] = pd.Series(['language',
'age',
'country',
'education',
'title',
'compensation'])
q8['y'] = 2018
2019
q9 = (pd.read_csv('kaggle-survey-2019/questions_only.csv')
.T
.reset_index()
.rename(columns={'index' : 'q9',
0 : 't'})
[['q9','t']])
q9 = (q9[q9['q9'].isin(['Q1',
'Q3',
'Q4',
'Q5',
'Q10',
'Q19'])]
.reset_index(drop=True))
q9['d'] = pd.Series(['age',
'country',
'education',
'title',
'compensation',
'language'])
q9['y'] = 2019
Combine Question Text
q = (q7[['d','t','y']]
.append(q8[['d','t','y']])
.append(q9[['d','t','y']])
.sort_values(['d','y'])
.reset_index(drop=True)
[['d','y','t']])
response_cols = list(q['d'].unique())
Compile Responses
2017
r7 = (pd.read_csv('kaggle-survey-2017/multipleChoiceResponses.csv',
encoding = "ISO-8859-1",
skiprows=[1]))
r7.loc[(r7['CompensationCurrency'].fillna('USD')!='USD')&
(r7['Country']!='United States'), 'CompensationAmount'] = -1
q7_subset = list(q7['q7'])
q7_col_dict = dict(zip(q7['q7'],q7['d']))
r7 = (r7[q7_subset]
.rename(columns=q7_col_dict))
r7 = r7[response_cols]
r7.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16715 entries, 0 to 16714
Data columns (total 6 columns):
age 16385 non-null float64
compensation 5340 non-null object
country 16595 non-null object
education 15014 non-null object
language 10997 non-null object
title 11829 non-null object
dtypes: float64(1), object(5)
memory usage: 783.6+ KB
/Users/ryanwingate/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3057: DtypeWarning: Columns (31,83,86,87,98,99,109,116,123,124,127,129,130,164) have mixed types. Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
2018
r8 = (pd.read_csv('kaggle-survey-2018/multipleChoiceResponses.csv',
skiprows=[1]))
q8_subset = list(q8['q8'])
q8_col_dict = dict(zip(q8['q8'],q8['d']))
r8 = (r8[q8_subset]
.rename(columns=q8_col_dict))
r8 = r8[response_cols]
r8.info()
/Users/ryanwingate/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3057: DtypeWarning: Columns (157,172,174,210,218,219,246,368,371,384,389,390,391,393) have mixed types. Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23859 entries, 0 to 23858
Data columns (total 6 columns):
age 23859 non-null object
compensation 20185 non-null object
country 23859 non-null object
education 23438 non-null object
language 18788 non-null object
title 22900 non-null object
dtypes: object(6)
memory usage: 1.1+ MB
2019
r9 = pd.read_csv('kaggle-survey-2019/multiple_choice_responses.csv',
skiprows=[1])
q9_subset = list(q9['q9'])
q9_col_dict = dict(zip(q9['q9'],q9['d']))
r9 = (r9[q9_subset]
.rename(columns=q9_col_dict))
r9 = r9[response_cols]
r9.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19717 entries, 0 to 19716
Data columns (total 6 columns):
age 19717 non-null object
compensation 12497 non-null object
country 19717 non-null object
education 19323 non-null object
language 14377 non-null object
title 19107 non-null object
dtypes: object(6)
memory usage: 924.3+ KB
Clean
r7 = r7.fillna('None')
r8 = r8.fillna('None')
r9 = r9.fillna('None')
Categorize Continuous Age
r7['continuous_age'] = r7['age'].replace('None',np.nan).astype(float)
r7['age'] = 'None'
r7.loc[r7['continuous_age'] >= 18, 'age'] = '18-21'
r7.loc[r7['continuous_age'] >= 22, 'age'] = '22-24'
r7.loc[r7['continuous_age'] >= 25, 'age'] = '25-29'
r7.loc[r7['continuous_age'] >= 30, 'age'] = '30-34'
r7.loc[r7['continuous_age'] >= 35, 'age'] = '35-39'
r7.loc[r7['continuous_age'] >= 40, 'age'] = '40-44'
r7.loc[r7['continuous_age'] >= 45, 'age'] = '45-49'
r7.loc[r7['continuous_age'] >= 50, 'age'] = '50-54'
r7.loc[r7['continuous_age'] >= 55, 'age'] = '55-59'
r7.loc[r7['continuous_age'] >= 60, 'age'] = '60-69'
r7.loc[r7['continuous_age'] >= 70, 'age'] = '70+'
r7 = r7.drop(columns=['continuous_age'])
print(r7.shape)
(16715, 6)
Categorize Continuous Compensation
r7['continuous_compensation'] = (r7['compensation'].str.replace('\D','',regex=True)
.fillna(-1)
.replace({'':-1})
.astype(int))
r7.loc[r7['continuous_compensation'] < 10000, 'compensation'] = '<$10K',
r7.loc[r7['continuous_compensation'] >= 10000, 'compensation'] = '$10K+',
r7.loc[r7['continuous_compensation'] >= 20000, 'compensation'] = '$20K+',
r7.loc[r7['continuous_compensation'] >= 30000, 'compensation'] = '$30K+',
r7.loc[r7['continuous_compensation'] >= 40000, 'compensation'] = '$40K+',
r7.loc[r7['continuous_compensation'] >= 50000, 'compensation'] = '$50K+',
r7.loc[r7['continuous_compensation'] >= 60000, 'compensation'] = '$60K+',
r7.loc[r7['continuous_compensation'] >= 70000, 'compensation'] = '$70K+',
r7.loc[r7['continuous_compensation'] >= 80000, 'compensation'] = '$80K+',
r7.loc[r7['continuous_compensation'] >= 90000, 'compensation'] = '$90K+',
r7.loc[r7['continuous_compensation'] >= 100000, 'compensation'] = '$100K+',
r7.loc[r7['continuous_compensation'] >= 125000, 'compensation'] = '$125K+',
r7.loc[r7['continuous_compensation'] >= 150000, 'compensation'] = '$150K+',
r7.loc[r7['continuous_compensation'] >= 200000, 'compensation'] = '$200K+',
r7.loc[r7['continuous_compensation'] == -1, 'compensation'] = 'None'
Map to Common Categories
country_mapping = {
'United States of America' : 'United States',
'United Kingdom of Great Britain and Northern Ireland' : 'United Kingdom',
"People 's Republic of China" : 'China',
'Republic of China' : 'China',
'Viet Nam' : 'Vietnam',
'Iran, Islamic Republic of...' : 'Iran',
'Hong Kong (S.A.R.)' : 'Hong Kong',
'Republic of Korea' : 'South Korea',
'I do not wish to disclose my location' : 'None'
}
age_mapping = {
'70-79' : '70+',
'80+' : '70+'
}
title_mapping = {
'Software Developer/Software Engineer' : 'Software Engineer',
'Scientist/Researcher' : 'Researcher',
'Research Scientist' : 'Researcher',
'Research Assistant' : 'Researcher'
}
language_mapping = {
'C' : 'C/C++/C#',
'C++' : 'C/C++/C#',
'Matlab' : 'MATLAB'
}
education_mapping = {
'Master’s degree' : "Master's degree",
'Bachelor’s degree' : "Bachelor's degree",
"Some college/university study without earning a bachelor's degree" : 'Some College',
'Some college/university study without earning a bachelor’s degree' : 'Some College',
'No formal education past high school' : 'High School',
'I did not complete any formal education past high school' : 'High School',
'I prefer not to answer' : 'None'
}
compensation_mapping = {
'I do not wish to disclose my approximate yearly compensation' : 'None',
'$0-999' : '<$10K',
'0-10,000' : '<$10K',
'2,000-2,999' : '<$10K',
'3,000-3,999' : '<$10K',
'4,000-4,999' : '<$10K',
'5,000-7,499' : '<$10K',
'7,500-9,999' : '<$10K',
'1,000-1,999' : '<$10K',
'10-20,000' : '$10K+',
'10,000-14,999' : '$10K+',
'15,000-19,999' : '$10K+',
'25,000-29,999' : '$20K+',
'20-30,000' : '$20K+',
'20,000-24,999' : '$20K+',
'30,000-39,999' : '$30K+',
'30-40,000' : '$30K+',
'40,000-49,999' : '$40K+',
'40-50,000' : '$40K+',
'50,000-59,999' : '$50K+',
'50-60,000' : '$50K+',
'60,000-69,999' : '$60K+',
'60-70,000' : '$60K+',
'70,000-79,999' : '$70K+',
'70-80,000' : '$70K+',
'80,000-89,999' : '$80K+',
'80-90,000' : '$80K+',
'90,000-99,999' : '$90K+',
'90-100,000' : '$90K+',
'100-125,000' : '$100K+',
'100,000-124,999' : '$100K+',
'125,000-149,999' : '$125K+',
'125-150,000' : '$125K+',
'150-200,000' : '$150K+',
'150,000-199,999' : '$150K+',
'200-250,000' : '$200K+',
'200,000-249,999' : '$200K+',
'250-300,000' : '$200K+',
'250,000-299,999' : '$200K+',
'300-400,000' : '$200K+',
'400-500,000' : '$200K+',
'500,000+' : '$200K+',
'> $500,000' : '$200K+',
'300,000-500,000' : '$200K+'
}
r7['year'] = 2017
r8['year'] = 2018
r9['year'] = 2019
for df in [r7,r8,r9]:
df['country'] = df['country'].replace(country_mapping)
df['age'] = df['age'].replace(age_mapping)
df['title'] = df['title'].replace(title_mapping)
df['language'] = df['language'].replace(language_mapping)
df['education'] = df['education'].replace(education_mapping)
df['compensation'] = df['compensation'].replace(compensation_mapping)
cols = ['country','age','title','language','education','compensation','year']
r7 = r7[cols].copy()
r8 = r8[cols].copy()
r9 = r9[cols].copy()
Combine Responses
r = (r7
.append(r8)
.append(r9)
.reset_index(drop=True))
r.shape
(60291, 7)
Only keep common values in each category.
for col in ['country',
'age',
'title',
'language',
'education',
'compensation']:
r7_set = set(r7[col])
r8_set = set(r8[col])
r9_set = set(r9[col])
intersection_set = r7_set.intersection(r8_set, r9_set)
intersection_set.add('None')
r.loc[~r[col].isin(intersection_set),col] = 'Other'
Define Categorical Datatypes for Age, Education, Compensation
age_ordered_list = ['18-21','22-24','25-29','30-34','35-39','40-44',
'45-49','50-54','55-59','60-69','70+','None']
age_dtype = pd.Categorical(
age_ordered_list,
categories=age_ordered_list,
ordered=True)
education_ordered_list = ['High School','Some College',"Bachelor's degree","Master's degree",
'Doctoral degree','Professional degree','None']
education_dtype = pd.Categorical(
education_ordered_list,
categories=education_ordered_list,
ordered=True)
compensation_ordered_list = ['<$10K','$10K+','$20K+','$30K+','$40K+','$50K+','$60K+','$70K+',
'$80K+','$90K+','$100K+','$125K+','$150K+','$200K+','None']
compensation_dtype = pd.Categorical(
compensation_ordered_list,
categories=compensation_ordered_list,
ordered=True)
Create Simplified Versions to Reduce Category Counts
simplified_age_ordered_list = ['<30','30\'s','40\'s','50\'s+','None']
simplified_age_dtype = pd.Categorical(
simplified_age_ordered_list,
categories=simplified_age_ordered_list,
ordered=True)
simplified_education_ordered_list = ['No Degree','Bachelor\'s','Master\'s',
'PhD','Professional','None']
simplified_education_dtype = pd.Categorical(
simplified_education_ordered_list,
categories=simplified_education_ordered_list,
ordered=True)
simplified_compensation_ordered_list = ['$30K+','$60K+','$90K+','$125K+','$150K+','None']
simplified_compensation_dtype = pd.Categorical(
simplified_compensation_ordered_list,
categories=simplified_compensation_ordered_list,
ordered=True)
Map the Categories to the Simpler Versions
r['s_age'] = r['age'].replace({
'18-21' : '<30',
'22-24' : '<30',
'25-29' : '<30',
'30-34' : '30\'s',
'35-39' : '30\'s',
'40-44' : '40\'s',
'45-49' : '40\'s',
'50-54' : '50\'s+',
'55-59' : '50\'s+',
'60-69' : '50\'s+',
'70+' : '50\'s+',
'None' : 'None'
})
r['s_education'] = r['education'].replace({
'High School' : 'No Degree',
'Some College' : 'No Degree',
"Bachelor's degree" : 'Bachelor\'s',
"Master's degree" : 'Master\'s',
'Doctoral degree' : 'PhD',
'Professional degree' : 'Professional',
'None' : 'None'
})
r['s_compensation'] = r['compensation'].replace({
'<$10K' : 'None',
'$10K+' : 'None',
'$20K+' : 'None',
'$30K+' : '$30K+',
'$40K+' : '$30K+',
'$50K+' : '$30K+',
'$60K+' : '$60K+',
'$70K+' : '$60K+',
'$80K+' : '$60K+',
'$90K+' : '$90K+',
'$100K+': '$90K+',
'$125K+': '$125K+',
'$150K+': '$150K+',
'$200K+': '$150K+'})
Assign Datatypes
for col, dtype in [('age', age_dtype),
('education', education_dtype),
('compensation', compensation_dtype),
('s_age', simplified_age_dtype),
('s_education', simplified_education_dtype),
('s_compensation', simplified_compensation_dtype)]:
r[col] = r[col].astype(dtype)
r = r[['country', 'age', 's_age', 'title', 'language',
'education', 's_education', 'compensation', 's_compensation',
'year']].copy()
Write to File
q.to_excel('processed/questions.xlsx')
r.to_excel('processed/responses.xlsx')