Summarize and Visualize the Kaggle Machine Learning and Data Science Survey Data
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import seaborn as sns
%matplotlib notebook
plt.style.use('tableau-colorblind10')
Display full text of columns with long strings.
pd.set_option('display.max_colwidth', -1)
Read Questions
pd.read_excel('processed/questions.xlsx',
index_col=0)
d | y | t | |
---|---|---|---|
0 | age | 2017 | What's your age? |
1 | age | 2018 | What is your age (# years)? |
2 | age | 2019 | What is your age (# years)? |
3 | compensation | 2017 | What is your current total yearly compensation (salary + bonus)? - Total Amount (e.g. 75,000) - Your Compensation: - |
4 | compensation | 2018 | What is your current yearly compensation (approximate $USD)? |
5 | compensation | 2019 | What is your current yearly compensation (approximate $USD)? |
6 | country | 2017 | Select the country you currently live in. |
7 | country | 2018 | In which country do you currently reside? |
8 | country | 2019 | In which country do you currently reside? |
9 | education | 2017 | Which level of formal education have you attained? |
10 | education | 2018 | What is the highest level of formal education that you have attained or plan to attain within the next 2 years? |
11 | education | 2019 | What is the highest level of formal education that you have attained or plan to attain within the next 2 years? |
12 | language | 2017 | What programming language would you recommend a new data scientist learn first? (Select one option) - Selected Choice |
13 | language | 2018 | What programming language would you recommend an aspiring data scientist to learn first? - Selected Choice |
14 | language | 2019 | What programming language would you recommend an aspiring data scientist to learn first? - Selected Choice |
15 | title | 2017 | Select the option that's most similar to your current job/professional title (or most recent title if retired). - Selected Choice |
16 | title | 2018 | Select the title most similar to your current role (or most recent title if retired): - Selected Choice |
17 | title | 2019 | Select the title most similar to your current role (or most recent title if retired): - Selected Choice |
Read Responses
d = (pd.read_excel('processed/responses.xlsx',
index_col=0)
.replace('None',np.nan))
print(d.info())
d = d.replace(np.nan,'None');
<class 'pandas.core.frame.DataFrame'>
Int64Index: 60291 entries, 0 to 60290
Data columns (total 10 columns):
country 59777 non-null object
age 59847 non-null object
s_age 59847 non-null object
title 53836 non-null object
language 43961 non-null object
education 57007 non-null object
s_education 57007 non-null object
compensation 30319 non-null object
s_compensation 15880 non-null object
year 60291 non-null int64
dtypes: int64(1), object(9)
memory usage: 5.1+ MB
None
Define Datatypes
age_ordered_list = ['18-21','22-24','25-29','30-34','35-39','40-44',
'45-49','50-54','55-59','60-69','70+','None']
age_dtype = pd.Categorical(
age_ordered_list,
categories=age_ordered_list,
ordered=True)
education_ordered_list = ['High School','Some College',"Bachelor's degree","Master's degree",
'Doctoral degree','Professional degree','None']
education_dtype = pd.Categorical(
education_ordered_list,
categories=education_ordered_list,
ordered=True)
compensation_ordered_list = ['<$10K','$10K+','$20K+','$30K+','$40K+','$50K+','$60K+','$70K+',
'$80K+','$90K+','$100K+','$125K+','$150K+','$200K+','None']
compensation_dtype = pd.Categorical(
compensation_ordered_list,
categories=compensation_ordered_list,
ordered=True)
simplified_age_ordered_list = ['<30','30\'s','40\'s','50\'s+','None']
simplified_age_dtype = pd.Categorical(
simplified_age_ordered_list,
categories=simplified_age_ordered_list,
ordered=True)
simplified_education_ordered_list = ['No Degree','Bachelor\'s','Master\'s',
'PhD','Professional','None']
simplified_education_dtype = pd.Categorical(
simplified_education_ordered_list,
categories=simplified_education_ordered_list,
ordered=True)
simplified_compensation_ordered_list = ['$30K+','$60K+','$90K+','$125K+','$150K+','None']
simplified_compensation_dtype = pd.Categorical(
simplified_compensation_ordered_list,
categories=simplified_compensation_ordered_list,
ordered=True)
Assign Datatypes
for col, dtype in [('age', age_dtype),
('education', education_dtype),
('compensation', compensation_dtype),
('s_age', simplified_age_dtype),
('s_education', simplified_education_dtype),
('s_compensation', simplified_compensation_dtype)]:
d[col] = d[col].astype(dtype)
d = d[['country', 'age', 's_age', 'title', 'language',
'education', 's_education', 'compensation', 's_compensation',
'year']].copy()
Define Function to Create Summary Table
simplify_dataset
returns a simplified version of the dataset with one of the dimensions mapped down to the number of categories passed in as the simplified_count
parameter.
def simplify_dataset(dataset,
simplified_count,
dimension_to_simplify,
other_dimension):
top_counts = ((dataset[(dataset[dimension_to_simplify]!='None')]
[[dimension_to_simplify,other_dimension]])
.groupby(dimension_to_simplify)
.count()
.reset_index()
.rename(columns={other_dimension:'total'})
.sort_values('total',
ascending=False)
.reset_index(drop=True))
top_list = list(top_counts.head(simplified_count)[dimension_to_simplify])
simplified_dataset = dataset[(dataset[dimension_to_simplify]!='None')].copy()
simplified_dataset.loc[~(simplified_dataset[dimension_to_simplify].isin(top_list)),
dimension_to_simplify] = 'Other'
return simplified_dataset
create_summary
returns a pivot table run on the dataset on the dependent and independent parameters passed in as parameters.
- The rows become the columns in the stacked barplot.
- The columns become the stacks in the stacked barplot.
def create_summary(subset,
dependent,
independent):
s = (subset[(subset[dependent]!='None')]
[[dependent,independent]])
p = pd.pivot_table(s,
index=dependent,
columns=independent,
aggfunc=len)
p.columns = p.columns.astype(str)
return p
convert_to_percentage
converts the summarized data created by create_summary
and converts it to percentages, returning the totals for each column as a list as well.
def convert_to_percentage(summary):
p_summary = summary.copy()
p_summary.columns = p_summary.columns.astype(str)
p_summary['Total'] = (p_summary
.sum(axis='columns')
.astype(int))
heights = list(p_summary['Total'])
for col in p_summary.columns:
p_summary[col] = round(p_summary[col]/p_summary['Total']*100,3)
p_summary = p_summary.drop(columns=['Total'])
return heights, p_summary
plot_barchart
creates a barchart of counts based on the summarized data passed in as a parameter. The stacks are labeled by percent of the total and the count of each stack is shown at the top of each respective column.
#fignum = 1
def plot_barchart(summary,
plot_type, # count or percentage
cat_or_ord,
title,
labels_to_skip):
global fignum
fig = plt.figure();
n_columns, n_stacks = summary.shape
heights = []
if plot_type == 'percentage':
heights, summary = convert_to_percentage(summary)
b = 0
if cat_or_ord == 'categorical':
for n, col in enumerate(summary.columns):
plt.bar(summary.index,
summary[col],
bottom = b,
label = col,
width=0.65)
b += summary[col]
elif cat_or_ord == 'ordinal':
colors = plt.cm.cividis(np.linspace(0.2,0.8,n_stacks))
for n, col in enumerate(summary.columns):
plt.bar(summary.index,
summary[col],
bottom = b,
label = col,
width=0.65,
color=colors[n])
b += summary[col]
rects = plt.gca().patches
# Iterate and find the tallest column to determine vertical_offset of column labels
max_height = 0
for r in rects:
if int(r.get_y() + r.get_height()) > max_height:
max_height = int(r.get_y() + r.get_height())
# Add column totals
for n, r in enumerate(rects[(-1*n_columns):]):
if plot_type == 'count':
height = int(r.get_y() + r.get_height())
heights.append(height)
plt.gca().text(r.get_x() + r.get_width() / 2,
height + (.01 * max_height),
str(height),
ha='center',
va='bottom',
color='black',
fontsize=8)
if plot_type == 'percentage':
plt.gca().text(r.get_x() + r.get_width() / 2,
100 + (.01 * max_height),
str(heights[n]),
ha='center',
va='bottom',
color='black',
fontsize=8)
# Add Percentage labels
for n, r in enumerate(rects):
if n not in labels_to_skip:
height = r.get_height()
if plot_type == 'count':
label = (height/heights[n % n_columns])*100
else:
label = height
plt.gca().text(r.get_x() + r.get_width() / 2,
r.get_y() + height / 2,
'{0:.0f}%'.format(label),
ha='center',
va='center',
color='white',
fontsize=8)
plt.suptitle(title)
if plot_type == 'count':
plt.ylabel('Respondent Count')
if plot_type == 'percentage':
plt.ylabel('% of Responses')
ax = plt.gca()
plt.tick_params(
axis='y',
left=False,
labelleft=False)
for spine in ax.spines.values():
spine.set_visible(False)
ax.legend(loc='lower center',
bbox_to_anchor=(0.5, 1),
ncol=n_stacks,
frameon=False,
prop={'size':8})
# plt.savefig('output_figures/'+str(fignum)+'.pdf')
# fignum += 1
Create Visuals
Compensation By Title
d_sub = simplify_dataset(d,
5,
'title',
's_compensation')
c_t = (create_summary(d_sub,
's_compensation',
'title')
[['Data Scientist','Other','Software Engineer','Researcher','Data Analyst']])
c_t.columns = c_t.columns.str.replace(' ','\n')
c_t = (c_t
.T
[['$150K+','$125K+','$90K+','$60K+','$30K+']])
plot_barchart(c_t,
'percentage',
'ordinal',
'Kaggle Survey Respondents\' Compensation Category by Job Title',
[])
<IPython.core.display.Javascript object>
Compensation by Age
c_a = (create_summary(d,
's_compensation',
's_age')
.drop(columns=['None'])
.T
[['$150K+','$125K+','$90K+','$60K+','$30K+']])
plot_barchart(c_a,
'percentage',
'ordinal',
'Kaggle Survey Respondents\' Compensation Category by Age Group',
[])
<IPython.core.display.Javascript object>
Language by Age
d_sub = simplify_dataset(d,
5,
'language',
's_age')
l_a = (create_summary(d_sub,
'language',
's_age')
.drop(columns=['None'])
.T
[['Python','R','SQL','C/C++/C#', 'Other']])
plot_barchart(l_a,
'percentage',
'categorical',
'Kaggle Survey Respondents\' Recommended 1st Language by Age Group',
[])
<IPython.core.display.Javascript object>
Language by Year
d_sub = simplify_dataset(d,
5,
'language',
'year')
l = create_summary(d_sub,
'language',
'year')
l = l.T
l = l[['Python','R','SQL','C/C++/C#','Other']]
plot_barchart(l,
'percentage',
'categorical',
'Kaggle Survey Respondents\' Recommended 1st Language by Survey Year',
[])
<IPython.core.display.Javascript object>
Language By Title
d_sub = simplify_dataset(d,
5,
'language',
'title')
d_sub = simplify_dataset(d_sub,
5,
'title',
'language')
l_t = (create_summary(d_sub,
'language',
'title')
[['Software Engineer','Other','Data Scientist','Researcher','Data Analyst']])
l_t.columns = l_t.columns.str.replace(' ','\n')
l_t = (l_t.T
[['Python','R','SQL','C/C++/C#','Other']])
plot_barchart(l_t,
'percentage',
'categorical',
'Kaggle Survey Respondents\' Recommended 1st Language by Job Title',
[13,17,19])
<IPython.core.display.Javascript object>
Education By Compensation
t_c = (create_summary(d_sub[d_sub['s_compensation']!='None'],
's_education',
's_compensation')
[['$150K+','$125K+','$90K+','$60K+','$30K+']]
.T
[['PhD','Professional','Master\'s','Bachelor\'s','No Degree']]
)
plot_barchart(t_c,
'percentage',
'ordinal',
'Kaggle Survey Respondents\' Educational Attainment by Compensation',
[])
<IPython.core.display.Javascript object>
Education By Title
d_sub = simplify_dataset(d,
5,
'title',
's_education')
e_t = (create_summary(d_sub,
's_education',
'title')
[['Researcher','Data Scientist','Other','Data Analyst','Software Engineer']])
e_t.columns = e_t.columns.str.replace(' ','\n')
e_t = (e_t.T
[['PhD', 'Professional', 'Master\'s', 'Bachelor\'s', 'No Degree']])
plot_barchart(e_t,
'percentage',
'ordinal',
'Kaggle Survey Respondents\' Educational Attainment by Job Title',
[20])
<IPython.core.display.Javascript object>
Education by Age
e_a = (create_summary(d,
's_education',
's_age')
.drop(columns=['None'])
.T
[['PhD', 'Professional', 'Master\'s', 'Bachelor\'s', 'No Degree']])
plot_barchart(e_a,
'percentage',
'ordinal',
'Kaggle Survey Respondents\' Education Attainment by Age Group',
[])
<IPython.core.display.Javascript object>
Country by Year
d_sub = simplify_dataset(d,
4,
'country',
'year')
c = create_summary(d_sub,
'country',
'year')
c = c.T
c = c[['United States','India','China','Other']]
plot_barchart(c,
'percentage',
'categorical',
'Kaggle Survey Respondents\' Country by Survey Year',
[]);
<IPython.core.display.Javascript object>