Summarize and Visualize the Kaggle Machine Learning and Data Science Survey Data

06 Jan 2020

Read Questions
Read Responses
- Define Datatypes
- Assign Datatypes
Define Functions
Create Visuals

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import seaborn as sns

%matplotlib notebook

plt.style.use('tableau-colorblind10')

Display full text of columns with long strings.

pd.set_option('display.max_colwidth', -1)

Read Questions

pd.read_excel('processed/questions.xlsx',
              index_col=0)

	d	y	t
0	age	2017	What's your age?
1	age	2018	What is your age (# years)?
2	age	2019	What is your age (# years)?
3	compensation	2017	What is your current total yearly compensation (salary + bonus)? - Total Amount (e.g. 75,000) - Your Compensation: -
4	compensation	2018	What is your current yearly compensation (approximate $USD)?
5	compensation	2019	What is your current yearly compensation (approximate $USD)?
6	country	2017	Select the country you currently live in.
7	country	2018	In which country do you currently reside?
8	country	2019	In which country do you currently reside?
9	education	2017	Which level of formal education have you attained?
10	education	2018	What is the highest level of formal education that you have attained or plan to attain within the next 2 years?
11	education	2019	What is the highest level of formal education that you have attained or plan to attain within the next 2 years?
12	language	2017	What programming language would you recommend a new data scientist learn first? (Select one option) - Selected Choice
13	language	2018	What programming language would you recommend an aspiring data scientist to learn first? - Selected Choice
14	language	2019	What programming language would you recommend an aspiring data scientist to learn first? - Selected Choice
15	title	2017	Select the option that's most similar to your current job/professional title (or most recent title if retired). - Selected Choice
16	title	2018	Select the title most similar to your current role (or most recent title if retired): - Selected Choice
17	title	2019	Select the title most similar to your current role (or most recent title if retired): - Selected Choice

Read Responses

d = (pd.read_excel('processed/responses.xlsx',
                   index_col=0)
     .replace('None',np.nan))
print(d.info())
d = d.replace(np.nan,'None');

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60291 entries, 0 to 60290
Data columns (total 10 columns):
country           59777 non-null object
age               59847 non-null object
s_age             59847 non-null object
title             53836 non-null object
language          43961 non-null object
education         57007 non-null object
s_education       57007 non-null object
compensation      30319 non-null object
s_compensation    15880 non-null object
year              60291 non-null int64
dtypes: int64(1), object(9)
memory usage: 5.1+ MB
None

Define Datatypes

age_ordered_list = ['18-21','22-24','25-29','30-34','35-39','40-44',
                    '45-49','50-54','55-59','60-69','70+','None']
age_dtype = pd.Categorical(
    age_ordered_list,
    categories=age_ordered_list,
    ordered=True)

education_ordered_list = ['High School','Some College',"Bachelor's degree","Master's degree",
                          'Doctoral degree','Professional degree','None']
education_dtype = pd.Categorical(
    education_ordered_list,
    categories=education_ordered_list,
    ordered=True)

compensation_ordered_list = ['<$10K','$10K+','$20K+','$30K+','$40K+','$50K+','$60K+','$70K+',
                             '$80K+','$90K+','$100K+','$125K+','$150K+','$200K+','None']
compensation_dtype = pd.Categorical(
    compensation_ordered_list,
    categories=compensation_ordered_list,
    ordered=True)

simplified_age_ordered_list = ['<30','30\'s','40\'s','50\'s+','None']
simplified_age_dtype = pd.Categorical(
    simplified_age_ordered_list,
    categories=simplified_age_ordered_list,
    ordered=True)

simplified_education_ordered_list = ['No Degree','Bachelor\'s','Master\'s',
                                     'PhD','Professional','None']
simplified_education_dtype = pd.Categorical(
    simplified_education_ordered_list,
    categories=simplified_education_ordered_list,
    ordered=True)

simplified_compensation_ordered_list = ['$30K+','$60K+','$90K+','$125K+','$150K+','None']
simplified_compensation_dtype = pd.Categorical(
    simplified_compensation_ordered_list,
    categories=simplified_compensation_ordered_list,
    ordered=True)

Assign Datatypes

for col, dtype in [('age',            age_dtype),
                   ('education',      education_dtype),
                   ('compensation',   compensation_dtype),
                   ('s_age',          simplified_age_dtype),
                   ('s_education',    simplified_education_dtype),
                   ('s_compensation', simplified_compensation_dtype)]:
    d[col] = d[col].astype(dtype)
d = d[['country', 'age', 's_age', 'title', 'language', 
       'education', 's_education', 'compensation', 's_compensation',
       'year']].copy()

Define Function to Create Summary Table

simplify_dataset returns a simplified version of the dataset with one of the dimensions mapped down to the number of categories passed in as the simplified_count parameter.

def simplify_dataset(dataset,
                     simplified_count,
                     dimension_to_simplify,
                     other_dimension):
    top_counts = ((dataset[(dataset[dimension_to_simplify]!='None')]
                   [[dimension_to_simplify,other_dimension]])
                  .groupby(dimension_to_simplify)
                  .count()
                  .reset_index()
                  .rename(columns={other_dimension:'total'})
                  .sort_values('total',
                               ascending=False)
                  .reset_index(drop=True))
    top_list = list(top_counts.head(simplified_count)[dimension_to_simplify])
    simplified_dataset = dataset[(dataset[dimension_to_simplify]!='None')].copy()
    simplified_dataset.loc[~(simplified_dataset[dimension_to_simplify].isin(top_list)), 
                             dimension_to_simplify] = 'Other'
    return simplified_dataset

create_summary returns a pivot table run on the dataset on the dependent and independent parameters passed in as parameters.

The rows become the columns in the stacked barplot.
The columns become the stacks in the stacked barplot.

def create_summary(subset, 
                   dependent, 
                   independent):
    s = (subset[(subset[dependent]!='None')]
         [[dependent,independent]])
    p = pd.pivot_table(s,
                       index=dependent,
                       columns=independent,
                       aggfunc=len)
    p.columns = p.columns.astype(str)

    return p

convert_to_percentage converts the summarized data created by create_summary and converts it to percentages, returning the totals for each column as a list as well.

def convert_to_percentage(summary):
    p_summary = summary.copy()
    p_summary.columns = p_summary.columns.astype(str)
    
    p_summary['Total'] = (p_summary
                          .sum(axis='columns')
                          .astype(int))
    heights = list(p_summary['Total'])
    
    for col in p_summary.columns:
        p_summary[col] = round(p_summary[col]/p_summary['Total']*100,3)
    p_summary = p_summary.drop(columns=['Total'])
    
    return heights, p_summary

plot_barchart creates a barchart of counts based on the summarized data passed in as a parameter. The stacks are labeled by percent of the total and the count of each stack is shown at the top of each respective column.

#fignum = 1
def plot_barchart(summary,
                  plot_type, # count or percentage
                  cat_or_ord,
                  title,
                  labels_to_skip):
    global fignum
    fig = plt.figure();
    n_columns, n_stacks = summary.shape
    heights = []
    
    if plot_type == 'percentage':
        heights, summary = convert_to_percentage(summary)
        
    b = 0
    if cat_or_ord == 'categorical':
        for n, col in enumerate(summary.columns):
            plt.bar(summary.index, 
                    summary[col],
                    bottom = b,
                    label = col,
                    width=0.65)
            b += summary[col]
    elif cat_or_ord == 'ordinal':
        colors = plt.cm.cividis(np.linspace(0.2,0.8,n_stacks))
        for n, col in enumerate(summary.columns):
            plt.bar(summary.index, 
                    summary[col],
                    bottom = b,
                    label = col,
                    width=0.65,
                    color=colors[n])
            b += summary[col]
    
    rects = plt.gca().patches
    
    # Iterate and find the tallest column to determine vertical_offset of column labels
    max_height = 0
    for r in rects:
        if int(r.get_y() + r.get_height()) > max_height:
            max_height = int(r.get_y() + r.get_height())
    
    # Add column totals
    for n, r in enumerate(rects[(-1*n_columns):]):
        if plot_type == 'count':
            height = int(r.get_y() + r.get_height())
            heights.append(height)
            plt.gca().text(r.get_x() + r.get_width() / 2,
                           height + (.01 * max_height),
                           str(height),
                           ha='center',
                           va='bottom',
                           color='black',
                           fontsize=8)
        if plot_type == 'percentage':
            plt.gca().text(r.get_x() + r.get_width() / 2,
                           100 + (.01 * max_height),
                           str(heights[n]),
                           ha='center',
                           va='bottom',
                           color='black',
                           fontsize=8)

    # Add Percentage labels
    for n, r in enumerate(rects):
        if n not in labels_to_skip:
            height = r.get_height()
            if plot_type == 'count':
                label = (height/heights[n % n_columns])*100
            else:
                label = height
            plt.gca().text(r.get_x() + r.get_width() / 2,
                           r.get_y() + height / 2,
                           '{0:.0f}%'.format(label),
                           ha='center',
                           va='center',
                           color='white',
                           fontsize=8)

    plt.suptitle(title)
    
    if plot_type == 'count':
        plt.ylabel('Respondent Count')
    if plot_type == 'percentage':
        plt.ylabel('% of Responses')

    ax = plt.gca()
    
    plt.tick_params(
        axis='y',
        left=False,
        labelleft=False)
    for spine in ax.spines.values():
        spine.set_visible(False)

    ax.legend(loc='lower center',
              bbox_to_anchor=(0.5, 1),
              ncol=n_stacks,
              frameon=False,
              prop={'size':8})
    
#    plt.savefig('output_figures/'+str(fignum)+'.pdf')
#    fignum += 1

Create Visuals

Compensation By Title

d_sub = simplify_dataset(d,
                         5,
                         'title',
                         's_compensation')
c_t = (create_summary(d_sub,
                      's_compensation',
                      'title')
      [['Data Scientist','Other','Software Engineer','Researcher','Data Analyst']])
c_t.columns = c_t.columns.str.replace(' ','\n')
c_t = (c_t
       .T
       [['$150K+','$125K+','$90K+','$60K+','$30K+']])
plot_barchart(c_t,
              'percentage',
              'ordinal',
              'Kaggle Survey Respondents\' Compensation Category by Job Title',
              [])

<IPython.core.display.Javascript object>

Compensation by Age

c_a = (create_summary(d,
                      's_compensation',
                      's_age')
      .drop(columns=['None'])
      .T
      [['$150K+','$125K+','$90K+','$60K+','$30K+']])
plot_barchart(c_a,
              'percentage',
              'ordinal',
              'Kaggle Survey Respondents\' Compensation Category by Age Group',
              [])

<IPython.core.display.Javascript object>

Language by Age

d_sub = simplify_dataset(d,
                         5,
                         'language',
                         's_age')
l_a = (create_summary(d_sub,
                      'language',
                      's_age')
       .drop(columns=['None'])
       .T
       [['Python','R','SQL','C/C++/C#', 'Other']])
plot_barchart(l_a,
              'percentage',
              'categorical',
              'Kaggle Survey Respondents\' Recommended 1st Language by Age Group',
              [])

<IPython.core.display.Javascript object>

Language by Year

d_sub = simplify_dataset(d,
                         5,
                         'language',
                         'year')
l = create_summary(d_sub,
                  'language',
                  'year')
l = l.T
l = l[['Python','R','SQL','C/C++/C#','Other']]
plot_barchart(l,
              'percentage',
              'categorical',
              'Kaggle Survey Respondents\' Recommended 1st Language by Survey Year',
              [])

<IPython.core.display.Javascript object>

Language By Title

d_sub = simplify_dataset(d,
                         5,
                         'language',
                         'title')
d_sub = simplify_dataset(d_sub,
                         5,
                         'title',
                         'language')
l_t = (create_summary(d_sub,
                      'language',
                      'title')
      [['Software Engineer','Other','Data Scientist','Researcher','Data Analyst']])
l_t.columns = l_t.columns.str.replace(' ','\n')
l_t = (l_t.T
       [['Python','R','SQL','C/C++/C#','Other']])
plot_barchart(l_t,
              'percentage',
              'categorical',
              'Kaggle Survey Respondents\' Recommended 1st Language by Job Title',
              [13,17,19])

<IPython.core.display.Javascript object>

Education By Compensation

t_c = (create_summary(d_sub[d_sub['s_compensation']!='None'],
                      's_education',
                      's_compensation')
       [['$150K+','$125K+','$90K+','$60K+','$30K+']]
       .T
       [['PhD','Professional','Master\'s','Bachelor\'s','No Degree']]
       )
plot_barchart(t_c,
              'percentage',
              'ordinal',
              'Kaggle Survey Respondents\' Educational Attainment by Compensation',
              [])

<IPython.core.display.Javascript object>

Education By Title

d_sub = simplify_dataset(d,
                         5,
                         'title',
                         's_education')
e_t = (create_summary(d_sub,
                      's_education',
                      'title')
      [['Researcher','Data Scientist','Other','Data Analyst','Software Engineer']])
e_t.columns = e_t.columns.str.replace(' ','\n')
e_t = (e_t.T
       [['PhD', 'Professional', 'Master\'s', 'Bachelor\'s', 'No Degree']])
plot_barchart(e_t,
              'percentage',
              'ordinal',
              'Kaggle Survey Respondents\' Educational Attainment by Job Title',
              [20])

<IPython.core.display.Javascript object>

Education by Age

e_a = (create_summary(d,
                      's_education',
                      's_age')
      .drop(columns=['None'])
      .T
      [['PhD', 'Professional', 'Master\'s', 'Bachelor\'s', 'No Degree']])
plot_barchart(e_a,
              'percentage',
              'ordinal',
              'Kaggle Survey Respondents\' Education Attainment by Age Group',
              [])

<IPython.core.display.Javascript object>

Country by Year

d_sub = simplify_dataset(d,
                         4,
                         'country',
                         'year')
c = create_summary(d_sub,
                   'country',
                   'year')
c = c.T
c = c[['United States','India','China','Other']]
plot_barchart(c,
              'percentage',
              'categorical',
              'Kaggle Survey Respondents\' Country by Survey Year',
              []);

<IPython.core.display.Javascript object>