Summarize and Visualize the Kaggle Machine Learning and Data Science Survey Data

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import seaborn as sns
%matplotlib notebook
plt.style.use('tableau-colorblind10')

Display full text of columns with long strings.

pd.set_option('display.max_colwidth', -1)

Read Questions

pd.read_excel('processed/questions.xlsx',
              index_col=0)

d y t
0 age 2017 What's your age?
1 age 2018 What is your age (# years)?
2 age 2019 What is your age (# years)?
3 compensation 2017 What is your current total yearly compensation (salary + bonus)? - Total Amount (e.g. 75,000) - Your Compensation: -
4 compensation 2018 What is your current yearly compensation (approximate $USD)?
5 compensation 2019 What is your current yearly compensation (approximate $USD)?
6 country 2017 Select the country you currently live in.
7 country 2018 In which country do you currently reside?
8 country 2019 In which country do you currently reside?
9 education 2017 Which level of formal education have you attained?
10 education 2018 What is the highest level of formal education that you have attained or plan to attain within the next 2 years?
11 education 2019 What is the highest level of formal education that you have attained or plan to attain within the next 2 years?
12 language 2017 What programming language would you recommend a new data scientist learn first? (Select one option) - Selected Choice
13 language 2018 What programming language would you recommend an aspiring data scientist to learn first? - Selected Choice
14 language 2019 What programming language would you recommend an aspiring data scientist to learn first? - Selected Choice
15 title 2017 Select the option that's most similar to your current job/professional title (or most recent title if retired). - Selected Choice
16 title 2018 Select the title most similar to your current role (or most recent title if retired): - Selected Choice
17 title 2019 Select the title most similar to your current role (or most recent title if retired): - Selected Choice

Read Responses

d = (pd.read_excel('processed/responses.xlsx',
                   index_col=0)
     .replace('None',np.nan))
print(d.info())
d = d.replace(np.nan,'None');
<class 'pandas.core.frame.DataFrame'>
Int64Index: 60291 entries, 0 to 60290
Data columns (total 10 columns):
country           59777 non-null object
age               59847 non-null object
s_age             59847 non-null object
title             53836 non-null object
language          43961 non-null object
education         57007 non-null object
s_education       57007 non-null object
compensation      30319 non-null object
s_compensation    15880 non-null object
year              60291 non-null int64
dtypes: int64(1), object(9)
memory usage: 5.1+ MB
None

Define Datatypes

age_ordered_list = ['18-21','22-24','25-29','30-34','35-39','40-44',
                    '45-49','50-54','55-59','60-69','70+','None']
age_dtype = pd.Categorical(
    age_ordered_list,
    categories=age_ordered_list,
    ordered=True)
education_ordered_list = ['High School','Some College',"Bachelor's degree","Master's degree",
                          'Doctoral degree','Professional degree','None']
education_dtype = pd.Categorical(
    education_ordered_list,
    categories=education_ordered_list,
    ordered=True)
compensation_ordered_list = ['<$10K','$10K+','$20K+','$30K+','$40K+','$50K+','$60K+','$70K+',
                             '$80K+','$90K+','$100K+','$125K+','$150K+','$200K+','None']
compensation_dtype = pd.Categorical(
    compensation_ordered_list,
    categories=compensation_ordered_list,
    ordered=True)
simplified_age_ordered_list = ['<30','30\'s','40\'s','50\'s+','None']
simplified_age_dtype = pd.Categorical(
    simplified_age_ordered_list,
    categories=simplified_age_ordered_list,
    ordered=True)
simplified_education_ordered_list = ['No Degree','Bachelor\'s','Master\'s',
                                     'PhD','Professional','None']
simplified_education_dtype = pd.Categorical(
    simplified_education_ordered_list,
    categories=simplified_education_ordered_list,
    ordered=True)
simplified_compensation_ordered_list = ['$30K+','$60K+','$90K+','$125K+','$150K+','None']
simplified_compensation_dtype = pd.Categorical(
    simplified_compensation_ordered_list,
    categories=simplified_compensation_ordered_list,
    ordered=True)

Assign Datatypes

for col, dtype in [('age',            age_dtype),
                   ('education',      education_dtype),
                   ('compensation',   compensation_dtype),
                   ('s_age',          simplified_age_dtype),
                   ('s_education',    simplified_education_dtype),
                   ('s_compensation', simplified_compensation_dtype)]:
    d[col] = d[col].astype(dtype)
d = d[['country', 'age', 's_age', 'title', 'language', 
       'education', 's_education', 'compensation', 's_compensation',
       'year']].copy()

Define Function to Create Summary Table

simplify_dataset returns a simplified version of the dataset with one of the dimensions mapped down to the number of categories passed in as the simplified_count parameter.

def simplify_dataset(dataset,
                     simplified_count,
                     dimension_to_simplify,
                     other_dimension):
    top_counts = ((dataset[(dataset[dimension_to_simplify]!='None')]
                   [[dimension_to_simplify,other_dimension]])
                  .groupby(dimension_to_simplify)
                  .count()
                  .reset_index()
                  .rename(columns={other_dimension:'total'})
                  .sort_values('total',
                               ascending=False)
                  .reset_index(drop=True))
    top_list = list(top_counts.head(simplified_count)[dimension_to_simplify])
    simplified_dataset = dataset[(dataset[dimension_to_simplify]!='None')].copy()
    simplified_dataset.loc[~(simplified_dataset[dimension_to_simplify].isin(top_list)), 
                             dimension_to_simplify] = 'Other'
    return simplified_dataset

create_summary returns a pivot table run on the dataset on the dependent and independent parameters passed in as parameters.

  • The rows become the columns in the stacked barplot.
  • The columns become the stacks in the stacked barplot.
def create_summary(subset, 
                   dependent, 
                   independent):
    s = (subset[(subset[dependent]!='None')]
         [[dependent,independent]])
    p = pd.pivot_table(s,
                       index=dependent,
                       columns=independent,
                       aggfunc=len)
    p.columns = p.columns.astype(str)

    return p

convert_to_percentage converts the summarized data created by create_summary and converts it to percentages, returning the totals for each column as a list as well.

def convert_to_percentage(summary):
    p_summary = summary.copy()
    p_summary.columns = p_summary.columns.astype(str)
    
    p_summary['Total'] = (p_summary
                          .sum(axis='columns')
                          .astype(int))
    heights = list(p_summary['Total'])
    
    for col in p_summary.columns:
        p_summary[col] = round(p_summary[col]/p_summary['Total']*100,3)
    p_summary = p_summary.drop(columns=['Total'])
    
    return heights, p_summary

plot_barchart creates a barchart of counts based on the summarized data passed in as a parameter. The stacks are labeled by percent of the total and the count of each stack is shown at the top of each respective column.

#fignum = 1
def plot_barchart(summary,
                  plot_type, # count or percentage
                  cat_or_ord,
                  title,
                  labels_to_skip):
    global fignum
    fig = plt.figure();
    n_columns, n_stacks = summary.shape
    heights = []
    
    if plot_type == 'percentage':
        heights, summary = convert_to_percentage(summary)
        
    b = 0
    if cat_or_ord == 'categorical':
        for n, col in enumerate(summary.columns):
            plt.bar(summary.index, 
                    summary[col],
                    bottom = b,
                    label = col,
                    width=0.65)
            b += summary[col]
    elif cat_or_ord == 'ordinal':
        colors = plt.cm.cividis(np.linspace(0.2,0.8,n_stacks))
        for n, col in enumerate(summary.columns):
            plt.bar(summary.index, 
                    summary[col],
                    bottom = b,
                    label = col,
                    width=0.65,
                    color=colors[n])
            b += summary[col]
    
    rects = plt.gca().patches
    
    # Iterate and find the tallest column to determine vertical_offset of column labels
    max_height = 0
    for r in rects:
        if int(r.get_y() + r.get_height()) > max_height:
            max_height = int(r.get_y() + r.get_height())
    
    # Add column totals
    for n, r in enumerate(rects[(-1*n_columns):]):
        if plot_type == 'count':
            height = int(r.get_y() + r.get_height())
            heights.append(height)
            plt.gca().text(r.get_x() + r.get_width() / 2,
                           height + (.01 * max_height),
                           str(height),
                           ha='center',
                           va='bottom',
                           color='black',
                           fontsize=8)
        if plot_type == 'percentage':
            plt.gca().text(r.get_x() + r.get_width() / 2,
                           100 + (.01 * max_height),
                           str(heights[n]),
                           ha='center',
                           va='bottom',
                           color='black',
                           fontsize=8)

    # Add Percentage labels
    for n, r in enumerate(rects):
        if n not in labels_to_skip:
            height = r.get_height()
            if plot_type == 'count':
                label = (height/heights[n % n_columns])*100
            else:
                label = height
            plt.gca().text(r.get_x() + r.get_width() / 2,
                           r.get_y() + height / 2,
                           '{0:.0f}%'.format(label),
                           ha='center',
                           va='center',
                           color='white',
                           fontsize=8)

    plt.suptitle(title)
    
    if plot_type == 'count':
        plt.ylabel('Respondent Count')
    if plot_type == 'percentage':
        plt.ylabel('% of Responses')

    ax = plt.gca()
    
    plt.tick_params(
        axis='y',
        left=False,
        labelleft=False)
    for spine in ax.spines.values():
        spine.set_visible(False)

    ax.legend(loc='lower center',
              bbox_to_anchor=(0.5, 1),
              ncol=n_stacks,
              frameon=False,
              prop={'size':8})
    
#    plt.savefig('output_figures/'+str(fignum)+'.pdf')
#    fignum += 1

Create Visuals

Compensation By Title

d_sub = simplify_dataset(d,
                         5,
                         'title',
                         's_compensation')
c_t = (create_summary(d_sub,
                      's_compensation',
                      'title')
      [['Data Scientist','Other','Software Engineer','Researcher','Data Analyst']])
c_t.columns = c_t.columns.str.replace(' ','\n')
c_t = (c_t
       .T
       [['$150K+','$125K+','$90K+','$60K+','$30K+']])
plot_barchart(c_t,
              'percentage',
              'ordinal',
              'Kaggle Survey Respondents\' Compensation Category by Job Title',
              [])
<IPython.core.display.Javascript object>

Compensation by Age

c_a = (create_summary(d,
                      's_compensation',
                      's_age')
      .drop(columns=['None'])
      .T
      [['$150K+','$125K+','$90K+','$60K+','$30K+']])
plot_barchart(c_a,
              'percentage',
              'ordinal',
              'Kaggle Survey Respondents\' Compensation Category by Age Group',
              [])
<IPython.core.display.Javascript object>

Language by Age

d_sub = simplify_dataset(d,
                         5,
                         'language',
                         's_age')
l_a = (create_summary(d_sub,
                      'language',
                      's_age')
       .drop(columns=['None'])
       .T
       [['Python','R','SQL','C/C++/C#', 'Other']])
plot_barchart(l_a,
              'percentage',
              'categorical',
              'Kaggle Survey Respondents\' Recommended 1st Language by Age Group',
              [])
<IPython.core.display.Javascript object>

Language by Year

d_sub = simplify_dataset(d,
                         5,
                         'language',
                         'year')
l = create_summary(d_sub,
                  'language',
                  'year')
l = l.T
l = l[['Python','R','SQL','C/C++/C#','Other']]
plot_barchart(l,
              'percentage',
              'categorical',
              'Kaggle Survey Respondents\' Recommended 1st Language by Survey Year',
              [])
<IPython.core.display.Javascript object>

Language By Title

d_sub = simplify_dataset(d,
                         5,
                         'language',
                         'title')
d_sub = simplify_dataset(d_sub,
                         5,
                         'title',
                         'language')
l_t = (create_summary(d_sub,
                      'language',
                      'title')
      [['Software Engineer','Other','Data Scientist','Researcher','Data Analyst']])
l_t.columns = l_t.columns.str.replace(' ','\n')
l_t = (l_t.T
       [['Python','R','SQL','C/C++/C#','Other']])
plot_barchart(l_t,
              'percentage',
              'categorical',
              'Kaggle Survey Respondents\' Recommended 1st Language by Job Title',
              [13,17,19])
<IPython.core.display.Javascript object>

Education By Compensation

t_c = (create_summary(d_sub[d_sub['s_compensation']!='None'],
                      's_education',
                      's_compensation')
       [['$150K+','$125K+','$90K+','$60K+','$30K+']]
       .T
       [['PhD','Professional','Master\'s','Bachelor\'s','No Degree']]
       )
plot_barchart(t_c,
              'percentage',
              'ordinal',
              'Kaggle Survey Respondents\' Educational Attainment by Compensation',
              [])
<IPython.core.display.Javascript object>

Education By Title

d_sub = simplify_dataset(d,
                         5,
                         'title',
                         's_education')
e_t = (create_summary(d_sub,
                      's_education',
                      'title')
      [['Researcher','Data Scientist','Other','Data Analyst','Software Engineer']])
e_t.columns = e_t.columns.str.replace(' ','\n')
e_t = (e_t.T
       [['PhD', 'Professional', 'Master\'s', 'Bachelor\'s', 'No Degree']])
plot_barchart(e_t,
              'percentage',
              'ordinal',
              'Kaggle Survey Respondents\' Educational Attainment by Job Title',
              [20])
<IPython.core.display.Javascript object>

Education by Age

e_a = (create_summary(d,
                      's_education',
                      's_age')
      .drop(columns=['None'])
      .T
      [['PhD', 'Professional', 'Master\'s', 'Bachelor\'s', 'No Degree']])
plot_barchart(e_a,
              'percentage',
              'ordinal',
              'Kaggle Survey Respondents\' Education Attainment by Age Group',
              [])
<IPython.core.display.Javascript object>

Country by Year

d_sub = simplify_dataset(d,
                         4,
                         'country',
                         'year')
c = create_summary(d_sub,
                   'country',
                   'year')
c = c.T
c = c[['United States','India','China','Other']]
plot_barchart(c,
              'percentage',
              'categorical',
              'Kaggle Survey Respondents\' Country by Survey Year',
              []);
<IPython.core.display.Javascript object>