Stacked Bar Charts

29 Feb 2020

This note demonstrates a function that can be used to quickly build a stacked bar chart using Pandas and Matplotlib. It also demonstrates a quick way to categorize continuous data using Pandas.

import numpy as np
import pandas as pd

Discretize a Continuous Variable

Two pandas functions can be used to categorize rows based on a continuous feature. In this case, fruits are classified by mass.

The pandas cut function divides the fruits by mass into bins of equal width.
The pandas qcut function divides the fruits by mass into bins that each contain an (approximately) equal number of fruits.

# Load the tab-separated fruit data and shorten the fruit-name column to 'fruit'.
f = pd.read_csv('stacked-bar-charts/fruit_data_with_colors.txt', sep='\t')
f = f.rename(columns={'fruit_name': 'fruit'})

# Title-case the fruit names (str.title capitalizes the first letter of each word).
f['fruit'] = f['fruit'].str.title()

# Summary statistics for mass, rounded to one decimal place.
f['mass'].describe().round(1)

count     59.0
mean     163.1
std       55.0
min       76.0
25%      140.0
50%      158.0
75%      177.0
max      362.0
Name: mass, dtype: float64

# Bin the fruit masses two ways, keeping both the raw intervals and readable
# size labels for each:
#   cut  -> 3 equal-width bins
#   qcut -> 4 quantile-based bins
# All four columns are computed from the untouched 'mass' column, then only
# the columns used in the rest of the note are kept, in display order.
f = f.assign(**{
    'cut': pd.cut(f['mass'], bins=3),
    'cut labels': pd.cut(f['mass'],
                         bins=3,
                         labels=['Small', 'Medium', 'Large']),
    'qcut': pd.qcut(f['mass'], q=4),
    'qcut labels': pd.qcut(f['mass'],
                           q=4,
                           labels=['Small', 'Medium', 'Large', 'Extra Large']),
})[['fruit', 'mass', 'cut', 'cut labels', 'qcut', 'qcut labels']]
f.head()

	fruit	mass	cut	cut labels	qcut	qcut labels
0	Apple	192	(171.333, 266.667]	Medium	(177.0, 362.0]	Extra Large
1	Apple	180	(171.333, 266.667]	Medium	(177.0, 362.0]	Extra Large
2	Apple	176	(171.333, 266.667]	Medium	(158.0, 177.0]	Large
3	Mandarin	86	(75.714, 171.333]	Small	(75.999, 140.0]	Small
4	Mandarin	84	(75.714, 171.333]	Small	(75.999, 140.0]	Small

Cut

# Count the fruits in each equal-width bin.
# Both grouping keys are categoricals, so observed=True is passed explicitly:
# only the (label, interval) pairs that actually occur are returned, instead
# of relying on how unobserved combinations happen to be counted and then
# filtering them out afterwards.
cut_count = (f[['cut labels', 'fruit', 'cut']]
             .groupby(['cut labels', 'cut'], observed=True)
             .count()
             .reset_index()
             .rename(columns={'cut labels': 'labels',
                              'fruit': 'fruit count'}))
cut_count['fruit count'] = cut_count['fruit count'].astype(int)

# Read the bin edges straight from the Interval objects rather than parsing
# their string representation.
for limit, side in [('upper', 'right'),
                    ('lower', 'left')]:
    cut_count[limit] = (cut_count['cut'].astype(object)
                        .map(lambda interval, side=side: getattr(interval, side))
                        .astype(float))

# Width of each bin; for cut() these are (nearly) identical by construction.
cut_count['bin size'] = (cut_count['upper'] - cut_count['lower']).round(1)
cut_count[['labels', 'cut', 'bin size', 'fruit count']]

	labels	cut	bin size	fruit count
0	Small	(75.714, 171.333]	95.6	40
1	Medium	(171.333, 266.667]	95.3	16
2	Large	(266.667, 362.0]	95.3	3

QCut

# Count the fruits in each quantile-based bin.
# Both grouping keys are categoricals, so observed=True is passed explicitly:
# only the (label, interval) pairs that actually occur are returned, instead
# of relying on how unobserved combinations happen to be counted and then
# filtering them out afterwards.
qcut_count = (f[['qcut labels', 'fruit', 'qcut']]
              .groupby(['qcut labels', 'qcut'], observed=True)
              .count()
              .reset_index()
              .rename(columns={'qcut labels': 'labels',
                               'fruit': 'fruit count'}))
qcut_count['fruit count'] = qcut_count['fruit count'].astype(int)

# Read the bin edges straight from the Interval objects rather than parsing
# their string representation.
for limit, side in [('upper', 'right'),
                    ('lower', 'left')]:
    qcut_count[limit] = (qcut_count['qcut'].astype(object)
                         .map(lambda interval, side=side: getattr(interval, side))
                         .astype(float))

# Width of each bin; for qcut() the widths vary while the counts stay similar.
qcut_count['bin size'] = (qcut_count['upper'] - qcut_count['lower']).round(1)
qcut_count[['labels', 'qcut', 'bin size', 'fruit count']]

	labels	qcut	bin size	fruit count
0	Small	(75.999, 140.0]	64.0	16
1	Medium	(140.0, 158.0]	18.0	14
2	Large	(158.0, 177.0]	19.0	14
3	Extra Large	(177.0, 362.0]	185.0	15

Visualize

Given data that has two overlapping categorical classifications, a stacked bar chart may provide some quick insight.

Define Function to Create Plot

%matplotlib notebook

import matplotlib.pyplot as plt
# Apply the 'tableau-colorblind10' style sheet to the figures created below.
plt.style.use('tableau-colorblind10')

def create_stacked_bar_plot(p,
                            y_label,
                            title):
    """Draw an annotated stacked bar chart from a pivot-style DataFrame.

    The first column of ``p`` supplies the x-axis categories (one bar per
    row). Every remaining column is drawn as one layer of the stack, in
    column order, and appears in the legend under its column name. The total
    of each bar is written above it, and every segment that is at least 10%
    as tall as the tallest bar is annotated with its share of its own bar
    and its raw value.

    Args:
        p: DataFrame whose first column holds the bar categories and whose
            remaining columns hold the numeric values to stack.
        y_label: Text for the y-axis label.
        title: Text for the plot title.

    Returns:
        None. A new 8x6 inch Matplotlib figure is created and drawn on.
    """
    plt.figure(figsize=(8, 6))
    ax = plt.gca()

    rows = len(p.index)

    # Running top of every bar. Kept as an array from the start so the
    # per-column update below is an explicit elementwise addition.
    bottoms = np.zeros(rows, dtype=int)

    # Create the stacks, one layer per value column
    for col in p.columns[1:]:
        ax.bar(p[p.columns[0]],
               p[col],
               label=col,
               width=0.5,
               bottom=bottoms)
        bottoms = bottoms + p[col].values

    # After the last layer, bottoms holds the overall height of each bar
    heights = list(bottoms)
    tallest = max(heights, default=0)

    # Make plot slightly taller than the max height (skipped when there is
    # nothing to show, where the limits would otherwise collapse to 0..0)
    if tallest > 0:
        ax.set_ylim(0, tallest * 1.1)

    # Add denominators at the top of each bar.
    # Bars are assumed to sit at x = 0, 1, 2, ... which is where Matplotlib
    # places distinct string categories.
    for x, height in enumerate(heights):
        ax.text(x,
                height + tallest * .01,
                str(height),
                ha='center',
                va='bottom',
                color='black',
                fontsize=8)

    # Add percentages to each stack. Patches are created layer by layer with
    # `rows` rectangles per layer, so n % rows recovers the bar a rectangle
    # belongs to.
    min_labelled_height = tallest * 0.1

    for n, r in enumerate(ax.patches):
        height = r.get_height()

        # Skip empty segments (avoids 0/0 on all-zero data) and segments too
        # small to hold a readable label.
        if height <= 0 or height < min_labelled_height:
            continue

        perc_w_denom = ('{0:.0f}%'.format((height / heights[n % rows]) * 100)
                        + '\n(' + str(height) + ')')

        ax.text(r.get_x() + r.get_width() / 2,
                r.get_y() + height / 2,
                perc_w_denom,
                ha='center',
                va='center',
                color='white',
                fontsize=8)

    # Hide labels and ticks on left to improve data-ink ratio
    ax.tick_params(axis='y',
                   left=False,
                   labelleft=False)

    # Hide frame to improve data-ink ratio
    for spine in ax.spines.values():
        spine.set_visible(False)

    # Add legend at the top
    ax.legend(loc='lower center',
              bbox_to_anchor=(0.5, 0.95),
              ncol=4,
              frameon=False,
              prop={'size': 8})

    ax.set_ylabel(y_label)
    ax.set_title(title)

Cut

# Preview the first rows of the binned fruit data before pivoting on 'cut labels'.
f.head()

	fruit	mass	cut	cut labels	qcut	qcut labels
0	Apple	192	(171.333, 266.667]	Medium	(177.0, 362.0]	Extra Large
1	Apple	180	(171.333, 266.667]	Medium	(177.0, 362.0]	Extra Large
2	Apple	176	(171.333, 266.667]	Medium	(158.0, 177.0]	Large
3	Mandarin	86	(75.714, 171.333]	Small	(75.999, 140.0]	Small
4	Mandarin	84	(75.714, 171.333]	Small	(75.999, 140.0]	Small

# Frequency table: one row per fruit, one column per equal-width size label.
# crosstab is the dedicated API for counting co-occurrences of two factors;
# missing combinations are filled with 0.
p = pd.crosstab(f['fruit'], f['cut labels']).astype(int)

# The size labels form a categorical column index; convert it to plain
# strings so reset_index() can insert the 'fruit' column alongside them.
p.columns = p.columns.astype(str)
p = p.reset_index()

# Drop the leftover axis name and title-case the headers ('fruit' -> 'Fruit').
p.columns.name = ''
p.columns = p.columns.str.title()
p

	Fruit	Small	Medium	Large
0	Apple	13	6	0
1	Lemon	10	6	0
2	Mandarin	5	0	0
3	Orange	12	4	3

# Plot the equal-width (cut) size distribution per fruit.
create_stacked_bar_plot(
    p,
    y_label='Fruit Count',
    title="Fruit Size Distribution - Cut (Equal Bin Size)",
)

<IPython.core.display.Javascript object>

QCut

# Preview the first rows of the binned fruit data before pivoting on 'qcut labels'.
f.head()

	fruit	mass	cut	cut labels	qcut	qcut labels
0	Apple	192	(171.333, 266.667]	Medium	(177.0, 362.0]	Extra Large
1	Apple	180	(171.333, 266.667]	Medium	(177.0, 362.0]	Extra Large
2	Apple	176	(171.333, 266.667]	Medium	(158.0, 177.0]	Large
3	Mandarin	86	(75.714, 171.333]	Small	(75.999, 140.0]	Small
4	Mandarin	84	(75.714, 171.333]	Small	(75.999, 140.0]	Small

# Frequency table: one row per fruit, one column per quantile-based size label.
# crosstab is the dedicated API for counting co-occurrences of two factors;
# missing combinations are filled with 0.
p = pd.crosstab(f['fruit'], f['qcut labels']).astype(int)

# The size labels form a categorical column index; convert it to plain
# strings so reset_index() can insert the 'fruit' column alongside them.
p.columns = p.columns.astype(str)
p = p.reset_index()

# Drop the leftover axis name and title-case the headers ('fruit' -> 'Fruit').
p.columns.name = ''
p.columns = p.columns.str.title()
p

	Fruit	Small	Medium	Large	Extra Large
0	Apple	1	5	10	3
1	Lemon	9	1	1	5
2	Mandarin	5	0	0	0
3	Orange	1	8	3	7

# Plot the quantile-based (qcut) size distribution per fruit.
create_stacked_bar_plot(
    p,
    y_label='Fruit Count',
    title="Fruit Size Distribution - QCut (Equal Bin Counts)",
)

<IPython.core.display.Javascript object>