Stacked Bar Charts

This note demonstrates a function that can be used to quickly build a stacked bar chart using Pandas and Matplotlib. It also demonstrates a quick way to categorize continuous data using Pandas.

import numpy as np
import pandas as pd

Discretize a Continuous Variable

Two Pandas functions can be used to categorize rows based on a continuous feature; in this case, they classify fruits by mass.

  • Pandas' cut function splits the range of fruit masses into bins of equal width, so the number of fruits in each bin can differ.
  • Pandas' qcut function splits the fruits into quantile-based bins, so each bin holds roughly the same number of fruits while the bin widths can differ.
# Load the tab-separated fruit table and expose the name column as 'fruit'.
f = pd.read_csv('stacked-bar-charts/fruit_data_with_colors.txt', sep='\t')
f = f.rename(columns={'fruit_name': 'fruit'})

# Title-case the fruit names so they read well as chart labels.
f['fruit'] = f['fruit'].str.title()

# Summary statistics of the mass column, rounded to one decimal place.
f['mass'].describe().round(1)
count     59.0
mean     163.1
std       55.0
min       76.0
25%      140.0
50%      158.0
75%      177.0
max      362.0
Name: mass, dtype: float64
# Equal-width binning: split the mass range into 3 intervals, stored once as
# the interval itself and once under a readable label.
f['cut'] = pd.cut(f['mass'], bins=3)
f['cut labels'] = pd.cut(f['mass'], bins=3,
                         labels=['Small', 'Medium', 'Large'])

# Quantile binning: split the fruits into 4 quantile-based bins, again stored
# both as intervals and as labels.
f['qcut'] = pd.qcut(f['mass'], q=4)
f['qcut labels'] = pd.qcut(f['mass'], q=4,
                           labels=['Small', 'Medium', 'Large', 'Extra Large'])

# Keep only the columns used in the rest of the note, then preview them.
f = f[['fruit', 'mass', 'cut', 'cut labels', 'qcut', 'qcut labels']]
f.head()
fruit mass cut cut labels qcut qcut labels
0 Apple 192 (171.333, 266.667] Medium (177.0, 362.0] Extra Large
1 Apple 180 (171.333, 266.667] Medium (177.0, 362.0] Extra Large
2 Apple 176 (171.333, 266.667] Medium (158.0, 177.0] Large
3 Mandarin 86 (75.714, 171.333] Small (75.999, 140.0] Small
4 Mandarin 84 (75.714, 171.333] Small (75.999, 140.0] Small

Cut

# Count the fruits that fall into each equal-width bin.
#
# Both grouping keys are categorical. observed=True limits the result to the
# (label, interval) pairs that actually occur in the data; without it the
# result also contains every unobserved combination, which (depending on the
# pandas version) shows up as NaN or as zero-count rows.
cut_count = (f[['cut labels', 'fruit', 'cut']]
             .groupby(['cut labels', 'cut'], observed=True)
             .count()
             .reset_index()
             .rename(columns={'cut labels': 'labels',
                              'fruit': 'fruit count'}))
cut_count['fruit count'] = cut_count['fruit count'].astype(int)

# Read the bin edges from the Interval objects themselves instead of parsing
# their string representation.
cut_count['upper'] = [interval.right for interval in cut_count['cut']]
cut_count['lower'] = [interval.left for interval in cut_count['cut']]

# Width of each bin, rounded to one decimal place for display.
cut_count['bin size'] = (cut_count['upper'] - cut_count['lower']).round(1)
cut_count[['labels', 'cut', 'bin size', 'fruit count']]
labels cut bin size fruit count
0 Small (75.714, 171.333] 95.6 40
1 Medium (171.333, 266.667] 95.3 16
2 Large (266.667, 362.0] 95.3 3

QCut

# Count the fruits that fall into each quantile-based bin.
#
# Both grouping keys are categorical. observed=True limits the result to the
# (label, interval) pairs that actually occur in the data; without it the
# result also contains every unobserved combination, which (depending on the
# pandas version) shows up as NaN or as zero-count rows.
qcut_count = (f[['qcut labels', 'fruit', 'qcut']]
              .groupby(['qcut labels', 'qcut'], observed=True)
              .count()
              .reset_index()
              .rename(columns={'qcut labels': 'labels',
                               'fruit': 'fruit count'}))
qcut_count['fruit count'] = qcut_count['fruit count'].astype(int)

# Read the bin edges from the Interval objects themselves instead of parsing
# their string representation.
qcut_count['upper'] = [interval.right for interval in qcut_count['qcut']]
qcut_count['lower'] = [interval.left for interval in qcut_count['qcut']]

# Width of each bin, rounded to one decimal place for display.
qcut_count['bin size'] = (qcut_count['upper'] - qcut_count['lower']).round(1)
qcut_count[['labels', 'qcut', 'bin size', 'fruit count']]
labels qcut bin size fruit count
0 Small (75.999, 140.0] 64.0 16
1 Medium (140.0, 158.0] 18.0 14
2 Large (158.0, 177.0] 19.0 14
3 Extra Large (177.0, 362.0] 185.0 15

Visualize

Given data that has two overlapping categorical classifications, a stacked bar chart may provide some quick insight.

Define Function to Create Plot

%matplotlib notebook

import matplotlib.pyplot as plt
plt.style.use('tableau-colorblind10')

def create_stacked_bar_plot(p,
                            y_label,
                            title):
    """Draw a labelled stacked bar chart from a pivot-style DataFrame.

    One bar is drawn per row of ``p``. Each bar is annotated with its total
    above it, and every segment that is at least 10% as tall as the tallest
    bar is annotated with its share of its own bar and its raw value.

    Parameters
    ----------
    p : pandas.DataFrame
        The first column supplies the x-axis categories; every remaining
        column becomes one stack segment (and one legend entry).
    y_label : str
        Label for the y axis.
    title : str
        Title of the plot.

    Returns
    -------
    None
        The chart is drawn on a new 8x6 pyplot figure.
    """
    plt.figure(figsize=(8, 6))
    ax = plt.gca()

    rows = len(p.index)
    categories = p[p.columns[0]]

    # Running height of each bar; an explicit array so that the accumulation
    # below is element-wise by construction.
    bottoms = np.zeros(rows, dtype=int)

    # Create the stacks
    for col in p.columns[1:]:
        ax.bar(categories,
               p[col],
               label=col,
               width=0.5,
               bottom=bottoms)
        # Rebind rather than add in place: keeps the array just handed to
        # bar() untouched and lets the dtype widen for non-integer data.
        bottoms = bottoms + p[col].values

    # bottoms now holds the overall height of each bar
    heights = list(bottoms)
    tallest = max(heights, default=0)  # default covers an empty table

    # Make plot slightly taller than the max height (skipped when there is
    # nothing to show, which would otherwise request a zero-height axis)
    if tallest > 0:
        ax.set_ylim(0, tallest * 1.1)

    # Add denominators at the top of each bar.
    # NOTE(review): assumes the categories are unique strings, which
    # matplotlib places at x = 0, 1, 2, ... - confirm for other inputs.
    for x, height in enumerate(heights):
        ax.text(x,
                height + tallest * .01,
                str(height),
                ha='center',
                va='bottom',
                color='black',
                fontsize=8)

    # Add percentages to each stack. Bars are added column by column, so
    # patch n belongs to row n % rows.
    label_threshold = tallest * 0.1
    for n, r in enumerate(ax.patches):
        height = r.get_height()
        total = heights[n % rows]

        # Skip segments too small to hold a label, and bars whose total is
        # zero (their percentage is undefined).
        if height < label_threshold or total == 0:
            continue

        perc_w_denom = '{0:.0f}%'.format((height / total) * 100) + '\n(' + str(height) + ')'

        ax.text(r.get_x() + r.get_width() / 2,
                r.get_y() + height / 2,
                perc_w_denom,
                ha='center',
                va='center',
                color='white',
                fontsize=8)

    # Hide labels and ticks on left to improve data-ink ratio
    ax.tick_params(axis='y',
                   left=False,
                   labelleft=False)

    # Hide frame to improve data-ink ratio
    for spine in ax.spines.values():
        spine.set_visible(False)

    # Add legend at the top
    ax.legend(loc='lower center',
              bbox_to_anchor=(0.5, 0.95),
              ncol=4,
              frameon=False,
              prop={'size': 8})

    ax.set_ylabel(y_label)
    ax.set_title(title)

Cut

# Preview the first rows of the binned table that is pivoted below.
f.head()
fruit mass cut cut labels qcut qcut labels
0 Apple 192 (171.333, 266.667] Medium (177.0, 362.0] Extra Large
1 Apple 180 (171.333, 266.667] Medium (177.0, 362.0] Extra Large
2 Apple 176 (171.333, 266.667] Medium (158.0, 177.0] Large
3 Mandarin 86 (75.714, 171.333] Small (75.999, 140.0] Small
4 Mandarin 84 (75.714, 171.333] Small (75.999, 140.0] Small
# Count the rows for every (fruit, cut label) pair; pairs that never occur
# are filled with 0 so the counts can be stored as integers.
p = pd.pivot_table(f[['fruit', 'cut labels']],
                   index='fruit',
                   columns='cut labels',
                   aggfunc=len)
p = p.fillna(0).astype(int)

# Turn the column labels into plain strings before 'fruit' is moved from the
# index into a regular column.
# NOTE(review): presumably needed because the labels form a categorical
# index - confirm.
p.columns = p.columns.astype(str)
p = p.reset_index()

# Blank out the columns' name and title-case the headers for display.
p.columns.name = ''
p.columns = p.columns.str.title()
p
Fruit Small Medium Large
0 Apple 13 6 0
1 Lemon 10 6 0
2 Mandarin 5 0 0
3 Orange 12 4 3
# Plot the equal-width (cut) size distribution per fruit.
create_stacked_bar_plot(
    p,
    y_label='Fruit Count',
    title="Fruit Size Distribution - Cut (Equal Bin Size)",
)
<IPython.core.display.Javascript object>

QCut

# Preview the first rows of the binned table that is pivoted below.
f.head()
fruit mass cut cut labels qcut qcut labels
0 Apple 192 (171.333, 266.667] Medium (177.0, 362.0] Extra Large
1 Apple 180 (171.333, 266.667] Medium (177.0, 362.0] Extra Large
2 Apple 176 (171.333, 266.667] Medium (158.0, 177.0] Large
3 Mandarin 86 (75.714, 171.333] Small (75.999, 140.0] Small
4 Mandarin 84 (75.714, 171.333] Small (75.999, 140.0] Small
# Count the rows for every (fruit, qcut label) pair; pairs that never occur
# are filled with 0 so the counts can be stored as integers.
p = pd.pivot_table(f[['fruit', 'qcut labels']],
                   index='fruit',
                   columns='qcut labels',
                   aggfunc=len)
p = p.fillna(0).astype(int)

# Turn the column labels into plain strings before 'fruit' is moved from the
# index into a regular column.
# NOTE(review): presumably needed because the labels form a categorical
# index - confirm.
p.columns = p.columns.astype(str)
p = p.reset_index()

# Blank out the columns' name and title-case the headers for display.
p.columns.name = ''
p.columns = p.columns.str.title()
p
Fruit Small Medium Large Extra Large
0 Apple 1 5 10 3
1 Lemon 9 1 1 5
2 Mandarin 5 0 0 0
3 Orange 1 8 3 7
# Plot the quantile-based (qcut) size distribution per fruit.
create_stacked_bar_plot(
    p,
    y_label='Fruit Count',
    title="Fruit Size Distribution - QCut (Equal Bin Counts)",
)
<IPython.core.display.Javascript object>