Real-World Histograms and Bar Charts
The purpose of this data visualization effort was to help demonstrate the efficacy of using a special teaching technique in the classroom. The data is a set of weekly test scores for 23 students in each of 2 subjects over an 8-week time span. For weeks 1-4, standard techniques were used. For weeks 5-8, a special technique was used.
The data is very convincing, so the efficacy of the “treatment” was demonstrated by a basic comparison of average test scores for the class week-over-week. This was faciliated by a simple bar chart. Additionally, the distribution of individual scores were examined with and without the special teach technique. This was faciliated by superimposed histograms with colors indicating whether the special teaching technique was applied.
In order to establish efficacy of the technique for lower-performing students, second versions of the same visuals were generated that excluded students who were deemed to have already mastered the material. This “mastery” was determined based on their performance on the tests in weeks 1-4, when standard techniques were used.
The remainder of this page describes how the data were manipulated and then visualized using standard Python data visualization libraries.
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
Raw Data
The format of the data is an Excel spreadsheet split onto four worksheets. These worksheets contain data for two, four-week periods for each of two subjects.
raw = pd.read_excel('data/data.xlsx', sheet_name='VD (no TPR)')
raw.head()
|
Student Number |
Week 1 (%) |
Week 2 (%) |
Week 3 (%) |
Week 4 (%) |
0 |
Student 1 |
100.0 |
100.0 |
100.0 |
100 |
1 |
Student 2 |
100.0 |
100.0 |
100.0 |
83.3 |
2 |
Student 3 |
100.0 |
66.7 |
100.0 |
100 |
3 |
Student 4 |
33.3 |
50.0 |
66.7 |
33.3 |
4 |
Student 5 |
16.7 |
50 |
66.7 |
50.0 |
Data Manipulation
Stack into Tall Data
d = pd.DataFrame()
for sheet, subject, treatment in [('VD (no TPR)', 'Vocab Development', 'No TPR'),
('VD (TPR)', 'Vocab Development', 'TPR'),
('TS (no TPR)', 'Text Sequencing', 'No TPR'),
('TS (TPR)', 'Text Sequencing', 'TPR')]:
new_sheet = pd.read_excel('data/data.xlsx',
sheet_name=sheet)
new_sheet['Subject'] = subject
new_sheet['Treatment'] = treatment
new_sheet = (new_sheet
.set_index(['Student Number','Subject','Treatment'])
.stack()
.reset_index()
.rename(columns={'Student Number':'Student',
'level_3':'Week',
0:'Score'}))
new_sheet['Student'] = new_sheet['Student'].str.extract('(\d+)')
new_sheet['Week'] = new_sheet['Week'].str.extract('(\d+)')
d = (d
.append(new_sheet))
d = d.reset_index(drop=True)
d['Student'] = d['Student'].astype(int)
d['Score'] = d['Score'].astype(float)
d['Week'] = d['Week'].astype(int)
d = (d
.sort_values(['Subject','Student','Week'])
.reset_index(drop=True))
d.info()
d.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368 entries, 0 to 367
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Student 368 non-null int64
1 Subject 368 non-null object
2 Treatment 368 non-null object
3 Week 368 non-null int64
4 Score 368 non-null float64
dtypes: float64(1), int64(2), object(2)
memory usage: 14.5+ KB
|
Student |
Subject |
Treatment |
Week |
Score |
0 |
1 |
Text Sequencing |
No TPR |
1 |
100.0 |
1 |
1 |
Text Sequencing |
No TPR |
2 |
100.0 |
2 |
1 |
Text Sequencing |
No TPR |
3 |
100.0 |
3 |
1 |
Text Sequencing |
No TPR |
4 |
100.0 |
4 |
1 |
Text Sequencing |
TPR |
5 |
100.0 |
Create Summary Statistics
def create_summaries(sub):
s = (sub[['Student','Score']]
.groupby(['Student'])
.mean()
.rename(columns={'Score':'Week 1-8 Mean'}))
s_2 = (sub[sub['Week'].isin([1,2,3,4])][['Student','Score']]
.groupby(['Student'])
.mean()
.rename(columns={'Score':'Week 1-4 Mean'}))
s_3 = (sub[sub['Week'].isin([5,6,7,8])][['Student','Score']]
.groupby(['Student'])
.mean()
.rename(columns={'Score':'Week 5-8 Mean'}))
for new_col in [s_2, s_3]:
s = pd.merge(s, new_col,
left_index=True,
right_index=True)
return s
s = pd.DataFrame()
for subject in ['Text Sequencing', 'Vocab Development']:
new_summaries = create_summaries(d[d['Subject']==subject])
new_summaries['Subject'] = subject
s = s.append(new_summaries)
s = s.reset_index()
s = (s[['Student', 'Subject',
'Week 1-4 Mean', 'Week 5-8 Mean']]#,
.sort_values(['Student','Subject'])
.reset_index(drop=True))
s.info()
s.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46 entries, 0 to 45
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Student 46 non-null int64
1 Subject 46 non-null object
2 Week 1-4 Mean 46 non-null float64
3 Week 5-8 Mean 46 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 1.6+ KB
|
Student |
Subject |
Week 1-4 Mean |
Week 5-8 Mean |
0 |
1 |
Text Sequencing |
100.000 |
100.0 |
1 |
1 |
Vocab Development |
100.000 |
100.0 |
2 |
2 |
Text Sequencing |
100.000 |
100.0 |
3 |
2 |
Vocab Development |
95.825 |
100.0 |
4 |
3 |
Text Sequencing |
75.000 |
100.0 |
Segment the Students
s.loc[s['Week 1-4 Mean']>=90.,'Week 1-4 90%+ Mean'] = 'Mastery'
s.loc[s['Week 5-8 Mean'] > s['Week 1-4 Mean'], 'Improvement'] = 'Improved'
s.loc[s['Week 5-8 Mean'] < s['Week 1-4 Mean'], 'Improvement'] = 'Worsened'
s['Improvement'] = s['Improvement'].fillna('No Change')
|
Student |
Subject |
Week 1-4 Mean |
Week 5-8 Mean |
Week 1-4 90%+ Mean |
Improvement |
0 |
1 |
Text Sequencing |
100.000 |
100.0 |
Mastery |
No Change |
1 |
1 |
Vocab Development |
100.000 |
100.0 |
Mastery |
No Change |
2 |
2 |
Text Sequencing |
100.000 |
100.0 |
Mastery |
No Change |
3 |
2 |
Vocab Development |
95.825 |
100.0 |
Mastery |
Improved |
4 |
3 |
Text Sequencing |
75.000 |
100.0 |
NaN |
Improved |
Merge the Segments back into Grades Data
if 'Week 1-4 90%+ Mean' not in d.columns:
d = pd.merge(d, s.drop(columns=['Week 1-4 Mean','Week 5-8 Mean']),
on=['Student','Subject'])
|
Student |
Subject |
Treatment |
Week |
Score |
Week 1-4 90%+ Mean |
Improvement |
0 |
1 |
Text Sequencing |
No TPR |
1 |
100.0 |
Mastery |
No Change |
1 |
1 |
Text Sequencing |
No TPR |
2 |
100.0 |
Mastery |
No Change |
2 |
1 |
Text Sequencing |
No TPR |
3 |
100.0 |
Mastery |
No Change |
3 |
1 |
Text Sequencing |
No TPR |
4 |
100.0 |
Mastery |
No Change |
4 |
1 |
Text Sequencing |
TPR |
5 |
100.0 |
Mastery |
No Change |
|
Student |
Subject |
Week 1-4 Mean |
Week 5-8 Mean |
Week 1-4 90%+ Mean |
Improvement |
0 |
1 |
Text Sequencing |
100.000 |
100.0 |
Mastery |
No Change |
1 |
1 |
Vocab Development |
100.000 |
100.0 |
Mastery |
No Change |
2 |
2 |
Text Sequencing |
100.000 |
100.0 |
Mastery |
No Change |
3 |
2 |
Vocab Development |
95.825 |
100.0 |
Mastery |
Improved |
4 |
3 |
Text Sequencing |
75.000 |
100.0 |
NaN |
Improved |
Create Visuals
Define Function to Create Histograms
def create_histograms(subject, exclude, labels_to_skip):
plt.figure(figsize=(11,8.5))
ax = plt.gca()
ax.hist(d[(d['Subject']==subject)&
(d['Treatment']=='No TPR')&
(d['Week 1-4 90%+ Mean']!=exclude)]['Score'],
bins=np.arange(-5,115,10),
alpha=0.625,
align='mid',
label = 'No TPR')
ax.hist(d[(d['Subject']==subject)&
(d['Treatment']=='TPR')&
(d['Week 1-4 90%+ Mean']!=exclude)]['Score'],
bins=np.arange(-5,115,10),
alpha=0.625,
align='mid',
label = 'TPR')
ax.set_xticks(np.arange(0,110,10));
ax.set_xlim([-5,105])
ymax = ax.get_ylim()[1]
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.tick_params(
axis='x',
bottom=False)
ax.tick_params(
axis='y',
left=False,
right=False,
labelleft=False)
rects = ax.patches
for n, r in enumerate(rects):
if n not in labels_to_skip:
height = r.get_height()
plt.gca().text(r.get_x() + r.get_width() / 2,
height + ymax/67,
str(int(height)),
ha='center',
va='center')
ax.legend(
fancybox=True,
loc=9,
ncol=4,
facecolor='white',
edgecolor='white',
framealpha=1,
fontsize=12)
title_str = subject + ' Weekly Scores Distribution'
if exclude == 'None':
title_str += '\nTotal Class'
else:
title_str += '\nExcludes Students who had Mastered the Material'
ax.set_title(title_str,
fontsize=14)
if save_fig:
plt.savefig('figures/' + subject.replace(' ','_').lower() \
+ '_excludes_' + exclude.lower() + '.pdf',
dpi=450)
Define Function to Create Bar Charts
def create_bar_charts(subject, exclude):
plt.figure(figsize=(11,8.5))
ax = plt.gca()
no_tpr_avg = (d[(d['Subject']==subject)&
(d['Treatment']=='No TPR')&
(d['Week 1-4 90%+ Mean']!=exclude)][['Week','Score']]
.groupby('Week')
.mean()
.round(0)
.astype(int)
.reset_index())
tpr_avg = (d[(d['Subject']==subject)&
(d['Treatment']=='TPR')&
(d['Week 1-4 90%+ Mean']!=exclude)][['Week','Score']]
.groupby('Week')
.mean()
.round(0)
.astype(int)
.reset_index())
tpr_avg.index += no_tpr_avg.shape[0]
for df, label in [(no_tpr_avg, 'No TPR'),
(tpr_avg, 'TPR')]:
position = df.index
scores = df['Score']
ax.bar(position,
scores,
align='center',
label=label,
alpha=0.625)
all_avgs = no_tpr_avg.append(tpr_avg)
position = all_avgs.index
labels = all_avgs['Week']
plt.xticks(position, labels)
plt.xlabel('Week',
fontsize=12)
plt.ylim([0,105])
plt.xlim([-0.4,7.4])
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.tick_params(
axis='x',
bottom=False)
ax.tick_params(
axis='y',
left=False,
right=False,
labelleft=False)
rects = ax.patches
for n, r in enumerate(rects):
height = r.get_height()
plt.gca().text(r.get_x() + r.get_width() / 2,
height + 1.5,
str(int(height)),
ha='center',
va='center')
ax.legend(
fancybox=True,
loc=9,
ncol=4,
facecolor='white',
edgecolor='white',
framealpha=1,
fontsize=12)
title_str = subject + ' Weekly Mean Scores'
if exclude == 'None':
title_str += '\nTotal Class'
else:
title_str += '\nExcludes Students who had Mastered the Material'
ax.set_title(title_str,
fontsize=14)
if save_fig:
plt.savefig('figures/' + subject.replace(' ','_').lower() \
+ '_excludes_' + exclude.lower() + '.pdf',
dpi=450)
Plot
save_fig = False
hist_params = [('Text Sequencing', 'None', [11,12,13,14,16,18,20]),
('Text Sequencing', 'Mastery', [11,12,13,14,20]),
('Vocab Development', 'None', [0,1,4,6,9,11,12,13,15,17,20]),
('Vocab Development', 'Mastery', [0,1,4,6,9,11,12,15,17,20])]
bar_params = [('Text Sequencing', 'None'),
('Text Sequencing', 'Mastery'),
('Vocab Development', 'None'),
('Vocab Development', 'Mastery')]
for subject, exclude, labels_to_skip in hist_params:
create_histograms(subject, exclude, labels_to_skip)
for subject, exclude in bar_params:
create_bar_charts(subject, exclude)







