This note demonstrates a function that can be used to quickly build a stacked bar chart using Pandas and Matplotlib. It also demonstrates a quick way to categorize continuous data using Pandas.
import numpy as np
import pandas as pd
Discretize a Continuous Variable
Two pandas functions can be used to categorize rows based on a continuous feature; in this case, they classify fruits by mass.
- The pandas `cut` function divides the fruits by mass into bins of equal width.
- The pandas `qcut` function divides the fruits by mass into quantile-based bins, so that each bin holds (approximately) the same number of instances.
# Read the tab-separated fruit measurements.
f = pd.read_csv('stacked-bar-charts/fruit_data_with_colors.txt', sep='\t')
# Shorten the name column and title-case the fruit names.
f = f.rename(columns={'fruit_name': 'fruit'})
f['fruit'] = f['fruit'].str.title()
# Summary statistics of the mass column, rounded to one decimal place.
f['mass'].describe().round(1)
count 59.0
mean 163.1
std 55.0
min 76.0
25% 140.0
50% 158.0
75% 177.0
max 362.0
Name: mass, dtype: float64
# Add four binned views of mass in one step, then keep only the columns
# used below:
#   cut / cut labels   -> 3 bins produced by pd.cut
#   qcut / qcut labels -> 4 bins produced by pd.qcut
f = f.assign(**{
    'cut': pd.cut(f['mass'], bins=3),
    'cut labels': pd.cut(f['mass'], bins=3,
                         labels=['Small', 'Medium', 'Large']),
    'qcut': pd.qcut(f['mass'], q=4),
    'qcut labels': pd.qcut(f['mass'], q=4,
                           labels=['Small', 'Medium', 'Large', 'Extra Large']),
})[['fruit', 'mass', 'cut', 'cut labels', 'qcut', 'qcut labels']]
f.head()
|   | fruit | mass | cut | cut labels | qcut | qcut labels |
|---|-------|------|-----|------------|------|-------------|
| 0 | Apple | 192 | (171.333, 266.667] | Medium | (177.0, 362.0] | Extra Large |
| 1 | Apple | 180 | (171.333, 266.667] | Medium | (177.0, 362.0] | Extra Large |
| 2 | Apple | 176 | (171.333, 266.667] | Medium | (158.0, 177.0] | Large |
| 3 | Mandarin | 86 | (75.714, 171.333] | Small | (75.999, 140.0] | Small |
| 4 | Mandarin | 84 | (75.714, 171.333] | Small | (75.999, 140.0] | Small |
Cut
# Count the fruits that fall in each equal-width bin.
# Both grouping keys are categorical, so observed=True restricts the result
# to the (label, interval) pairs that actually occur. Without it, the
# unobserved combinations appear as extra rows, and whether .dropna() removes
# them depends on the pandas version (NaN in older releases, 0 in newer ones).
cut_count = (f[['cut labels', 'fruit', 'cut']]
             .groupby(['cut labels', 'cut'], observed=True)
             .count()
             .reset_index()
             .rename(columns={'cut labels': 'labels',
                              'fruit': 'fruit count'})
             .dropna()
             .reset_index(drop=True))
cut_count['fruit count'] = cut_count['fruit count'].astype(int)
# Read the bin edges straight from the Interval objects rather than parsing
# their string representation.
cut_count['upper'] = [interval.right for interval in cut_count['cut']]
cut_count['lower'] = [interval.left for interval in cut_count['cut']]
cut_count['bin size'] = (cut_count['upper'] - cut_count['lower']).round(1)
cut_count[['labels', 'cut', 'bin size', 'fruit count']]
|   | labels | cut | bin size | fruit count |
|---|--------|-----|----------|-------------|
| 0 | Small | (75.714, 171.333] | 95.6 | 40 |
| 1 | Medium | (171.333, 266.667] | 95.3 | 16 |
| 2 | Large | (266.667, 362.0] | 95.3 | 3 |
QCut
# Count the fruits that fall in each quantile-based bin.
# Both grouping keys are categorical, so observed=True restricts the result
# to the (label, interval) pairs that actually occur. Without it, the
# unobserved combinations appear as extra rows, and whether .dropna() removes
# them depends on the pandas version (NaN in older releases, 0 in newer ones).
qcut_count = (f[['qcut labels', 'fruit', 'qcut']]
              .groupby(['qcut labels', 'qcut'], observed=True)
              .count()
              .reset_index()
              .rename(columns={'qcut labels': 'labels',
                               'fruit': 'fruit count'})
              .dropna()
              .reset_index(drop=True))
qcut_count['fruit count'] = qcut_count['fruit count'].astype(int)
# Read the bin edges straight from the Interval objects rather than parsing
# their string representation.
qcut_count['upper'] = [interval.right for interval in qcut_count['qcut']]
qcut_count['lower'] = [interval.left for interval in qcut_count['qcut']]
qcut_count['bin size'] = (qcut_count['upper'] - qcut_count['lower']).round(1)
qcut_count[['labels', 'qcut', 'bin size', 'fruit count']]
|   | labels | qcut | bin size | fruit count |
|---|--------|------|----------|-------------|
| 0 | Small | (75.999, 140.0] | 64.0 | 16 |
| 1 | Medium | (140.0, 158.0] | 18.0 | 14 |
| 2 | Large | (158.0, 177.0] | 19.0 | 14 |
| 3 | Extra Large | (177.0, 362.0] | 185.0 | 15 |
Visualize
Given data that has two overlapping categorical classifications, a stacked bar chart may provide some quick insight.
Define Function to Create Plot
%matplotlib notebook
import matplotlib.pyplot as plt
plt.style.use('tableau-colorblind10')
def create_stacked_bar_plot(p,
                            y_label,
                            title):
    """Draw an annotated stacked bar chart from a pivot-style table.

    Parameters
    ----------
    p : pandas.DataFrame
        The first column supplies the bar (x-axis) labels; every remaining
        column supplies the values of one stack segment, one value per bar.
    y_label : str
        Text for the y-axis label.
    title : str
        Text for the plot title.

    Each bar gets its total printed just above it. Each segment whose height
    is at least 10% of the tallest bar is labelled with its share of its own
    bar (as a percentage) and its raw value. The y ticks and the frame are
    hidden, and the legend is placed above the plot area.
    """
    plt.figure(figsize=(8, 6))
    ax = plt.gca()

    rows = len(p.index)
    categories = p[p.columns[0]]

    # Running top of each bar. An explicit array keeps the accumulation
    # element-wise by design, and rebinding (rather than adding in place)
    # lets integer and float columns be mixed.
    bottoms = np.zeros(rows, dtype=int)

    # Create the stacks
    for col in p.columns[1:]:
        ax.bar(categories,
               p[col],
               label=col,
               width=0.5,
               bottom=bottoms)
        bottoms = bottoms + p[col].to_numpy()

    # bottoms now contains the overall height of each bar
    heights = list(bottoms)
    tallest = max(heights) if heights else 0

    # Make plot slightly taller than the max height (skipped when there is
    # nothing to show, to avoid a degenerate 0..0 axis range)
    if tallest > 0:
        ax.set_ylim(0, tallest * 1.1)

    # Add denominators at the top of each bar
    for x, height in enumerate(heights):
        ax.text(x,
                height + tallest * .01,
                str(height),
                ha='center',
                va='bottom',
                color='black',
                fontsize=8)

    # Add percentages to each stack. Patches were added column by column,
    # so patch n belongs to bar n % rows.
    for n, rect in enumerate(ax.patches):
        height = rect.get_height()
        # Only label segments large enough to hold the text; with an
        # all-zero table there is nothing to label (and no valid ratio).
        if tallest <= 0 or height < tallest * 0.1:
            continue
        total = heights[n % rows]
        perc_w_denom = ('{0:.0f}%'.format((height / total) * 100)
                        + '\n(' + str(height) + ')')
        ax.text(rect.get_x() + rect.get_width() / 2,
                rect.get_y() + height / 2,
                perc_w_denom,
                ha='center',
                va='center',
                color='white',
                fontsize=8)

    # Hide labels and ticks on left to improve data-ink ratio
    ax.tick_params(axis='y',
                   left=False,
                   labelleft=False)

    # Hide frame to improve data-ink ratio
    for spine in ax.spines.values():
        spine.set_visible(False)

    # Add legend at the top
    ax.legend(loc='lower center',
              bbox_to_anchor=(0.5, 0.95),
              ncol=4,
              frameon=False,
              prop={'size': 8})

    ax.set_ylabel(y_label)
    ax.set_title(title)
Cut
|   | fruit | mass | cut | cut labels | qcut | qcut labels |
|---|-------|------|-----|------------|------|-------------|
| 0 | Apple | 192 | (171.333, 266.667] | Medium | (177.0, 362.0] | Extra Large |
| 1 | Apple | 180 | (171.333, 266.667] | Medium | (177.0, 362.0] | Extra Large |
| 2 | Apple | 176 | (171.333, 266.667] | Medium | (158.0, 177.0] | Large |
| 3 | Mandarin | 86 | (75.714, 171.333] | Small | (75.999, 140.0] | Small |
| 4 | Mandarin | 84 | (75.714, 171.333] | Small | (75.999, 140.0] | Small |
# Count fruits per (fruit, cut label) pair: one row per fruit, one column
# per size label.
p = pd.pivot_table(f[['fruit', 'cut labels']],
                   index='fruit',
                   columns='cut labels',
                   aggfunc=len)
# Combinations with no fruit come back as NaN; show them as a count of 0.
p = p.fillna(0).astype(int)
# Turn the label columns into plain strings before moving 'fruit' out of
# the index and into a regular column.
p.columns = p.columns.astype(str)
p = p.reset_index()
# Blank out the columns' axis name and title-case the headers for display.
p.columns.name = ''
p.columns = p.columns.str.title()
p
|   | Fruit | Small | Medium | Large |
|---|-------|-------|--------|-------|
| 0 | Apple | 13 | 6 | 0 |
| 1 | Lemon | 10 | 6 | 0 |
| 2 | Mandarin | 5 | 0 | 0 |
| 3 | Orange | 12 | 4 | 3 |
# Plot the per-fruit counts for the equal-width (cut) bins.
create_stacked_bar_plot(
    p,
    y_label='Fruit Count',
    title="Fruit Size Distribution - Cut (Equal Bin Size)",
)
<IPython.core.display.Javascript object>
QCut
|   | fruit | mass | cut | cut labels | qcut | qcut labels |
|---|-------|------|-----|------------|------|-------------|
| 0 | Apple | 192 | (171.333, 266.667] | Medium | (177.0, 362.0] | Extra Large |
| 1 | Apple | 180 | (171.333, 266.667] | Medium | (177.0, 362.0] | Extra Large |
| 2 | Apple | 176 | (171.333, 266.667] | Medium | (158.0, 177.0] | Large |
| 3 | Mandarin | 86 | (75.714, 171.333] | Small | (75.999, 140.0] | Small |
| 4 | Mandarin | 84 | (75.714, 171.333] | Small | (75.999, 140.0] | Small |
# Count fruits per (fruit, qcut label) pair: one row per fruit, one column
# per size label.
p = pd.pivot_table(f[['fruit', 'qcut labels']],
                   index='fruit',
                   columns='qcut labels',
                   aggfunc=len)
# Combinations with no fruit come back as NaN; show them as a count of 0.
p = p.fillna(0).astype(int)
# Turn the label columns into plain strings before moving 'fruit' out of
# the index and into a regular column.
p.columns = p.columns.astype(str)
p = p.reset_index()
# Blank out the columns' axis name and title-case the headers for display.
p.columns.name = ''
p.columns = p.columns.str.title()
p
|   | Fruit | Small | Medium | Large | Extra Large |
|---|-------|-------|--------|-------|-------------|
| 0 | Apple | 1 | 5 | 10 | 3 |
| 1 | Lemon | 9 | 1 | 1 | 5 |
| 2 | Mandarin | 5 | 0 | 0 | 0 |
| 3 | Orange | 1 | 8 | 3 | 7 |
# Plot the per-fruit counts for the quantile-based (qcut) bins.
create_stacked_bar_plot(
    p,
    y_label='Fruit Count',
    title="Fruit Size Distribution - QCut (Equal Bin Counts)",
)
<IPython.core.display.Javascript object>