Creating a Matplotlib Visual with Real-World Data

Import libraries and configure for high-definition display with large inline visuals.

%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
# Raise the figure resolution so the inline plots render in high definition.
mpl.rcParams['figure.dpi']= 900

# Request the 'retina' figure format from the inline backend.
%config InlineBackend.figure_format = 'retina'

Import Data and Clean

The data used for this visual is taken from The National Centers for Environmental Information (NCEI) Daily Global Historical Climatology Network (GHCN-Daily). The GHCN-Daily consists of daily climate records from thousands of weather stations from around the world.

  • id : station identification code
  • date : date in YYYY-MM-DD format (e.g. 2012-01-24 = January 24, 2012)
  • element : indicator of element type
    • TMAX : Maximum temperature (tenths of degrees C)
    • TMIN : Minimum temperature (tenths of degrees C)
  • value : data value for element (tenths of degrees C)
# Load the raw GHCN-Daily extract and rename the value column to 'Temp'.
filename = 'fb441e62df2d58994928907a91895ec62c2c42e6cd075c2700843b89.csv'
w = pd.read_csv('data/' + filename)
w = w.rename(columns={'Data_Value': 'Temp'})

# Sanity checks: column names, table dimensions, and the covered date range.
print(w.columns)
print(w.shape)
date_col = w['Date']
print(date_col.min())
print(date_col.max())
Index(['ID', 'Date', 'Element', 'Temp'], dtype='object')
(165085, 4)
2005-01-01
2015-12-31

Drop Leap Days and convert temperature to degrees Celsius, then to degrees Fahrenheit.

# Print the shape before and after so the number of dropped rows is visible.
print(w.shape)
# Dates are 'YYYY-MM-DD' strings, so anchoring on the '-02-29' suffix selects
# exactly the leap days; an unanchored regex substring match is looser than
# what is intended here.
w = w[~w['Date'].str.endswith('-02-29')].copy()
print(w.shape)
# Raw values are tenths of degrees C: scale to degrees C, then convert to F.
w['Temp'] = (w['Temp'] * 0.1 * (9./5.)) + 32.
w.head()
(165085, 4)
(165002, 4)

ID Date Element Temp
0 USW00094889 2014-11-12 TMAX 35.96
1 USC00208972 2009-04-29 TMIN 42.08
2 USC00200032 2008-05-26 TMAX 82.04
3 USC00205563 2005-11-11 TMAX 57.02
4 USC00200230 2014-02-27 TMAX 12.92
# Keep only the 2015 observations. Dates are 'YYYY-MM-DD' strings, so the
# year is matched as an anchored prefix rather than as a regex substring.
w_15 = w[w['Date'].str.startswith('2015-')].copy()
# Confirm the selection spans the expected date range.
print(w_15['Date'].min())
print(w_15['Date'].max())
2015-01-01
2015-12-31
# Keep every observation that is NOT from 2015 (the baseline years). Dates are
# 'YYYY-MM-DD' strings, so the year is matched as an anchored prefix rather
# than as a regex substring.
w_05_14 = w[~w['Date'].str.startswith('2015-')].copy()
# Confirm the selection spans the expected date range.
print(w_05_14['Date'].min())
print(w_05_14['Date'].max())
2005-01-01
2014-12-31
# Drop the leading 'YYYY-' (first five characters) so that rows from
# different years share the same 'MM-DD' key.
for frame in (w_05_14, w_15):
    frame['Date'] = frame['Date'].str[5:]
w_05_14.head()

ID Date Element Temp
0 USW00094889 11-12 TMAX 35.96
1 USC00208972 04-29 TMIN 42.08
2 USC00200032 05-26 TMAX 82.04
3 USC00205563 11-11 TMAX 57.02
4 USC00200230 02-27 TMAX 12.92
# Lowest TMIN reading for each calendar day across the baseline years.
tmin_05_14 = w_05_14.loc[w_05_14['Element'] == 'TMIN', ['Date', 'Temp']]
mi = (tmin_05_14.groupby('Date').min()
      .reset_index()
      .sort_values('Date')
      .rename(columns={'Temp': "05-14 Min"}))
print(mi.shape)

# Highest TMAX reading for each calendar day across the baseline years.
tmax_05_14 = w_05_14.loc[w_05_14['Element'] == 'TMAX', ['Date', 'Temp']]
ma = (tmax_05_14.groupby('Date').max()
      .reset_index()
      .sort_values('Date')
      .rename(columns={'Temp': "05-14 Max"}))
print(ma.shape)

# One row per calendar day with both extremes side by side.
t_05_14 = mi.merge(ma, on='Date')
t_05_14.head()
(365, 2)
(365, 2)

Date 05-14 Min 05-14 Max
0 01-01 3.20 60.08
1 01-02 -16.06 57.02
2 01-03 -16.06 55.94
3 01-04 -14.98 51.08
4 01-05 5.00 55.04
# Lowest TMIN reading for each calendar day in 2015.
tmin_15 = w_15.loc[w_15['Element'] == 'TMIN', ['Date', 'Temp']]
mi_15 = (tmin_15.groupby('Date').min()
         .reset_index()
         .sort_values('Date')
         .rename(columns={'Temp': "15 Min"}))
print(mi_15.shape)

# Highest TMAX reading for each calendar day in 2015.
tmax_15 = w_15.loc[w_15['Element'] == 'TMAX', ['Date', 'Temp']]
ma_15 = (tmax_15.groupby('Date').max()
         .reset_index()
         .sort_values('Date')
         .rename(columns={'Temp': "15 Max"}))
print(ma_15.shape)

# One row per calendar day with both 2015 extremes side by side.
t_15 = mi_15.merge(ma_15, on='Date')
t_15.head()
(365, 2)
(365, 2)

Date 15 Min 15 Max
0 01-01 8.06 33.98
1 01-02 10.04 39.02
2 01-03 19.94 39.02
3 01-04 16.16 39.92
4 01-05 4.10 37.04
# Join the baseline extremes with the 2015 extremes on the 'MM-DD' key,
# giving one row per calendar day.
t = t_05_14.merge(t_15, on='Date')
print(t.shape)
t.head()
(365, 5)

Date 05-14 Min 05-14 Max 15 Min 15 Max
0 01-01 3.20 60.08 8.06 33.98
1 01-02 -16.06 57.02 10.04 39.02
2 01-03 -16.06 55.94 19.94 39.02
3 01-04 -14.98 51.08 16.16 39.92
4 01-05 5.00 55.04 4.10 37.04
# A 2015 value only counts when it strictly beats the 2005-2014 extreme;
# everything else (ties included) is blanked out so it is not plotted.
not_record_low = t['15 Min'] >= t['05-14 Min']
not_record_high = t['15 Max'] <= t['05-14 Max']
t.loc[not_record_low, '15 Min'] = np.nan
t.loc[not_record_high, '15 Max'] = np.nan

# Preview the days on which 2015 set at least one record.
set_a_record = t['15 Min'].notna() | t['15 Max'].notna()
t[set_a_record].fillna('').head()

Date 05-14 Min 05-14 Max 15 Min 15 Max
4 01-05 5.00 55.04 4.1
10 01-11 -0.94 60.08 -4
33 02-03 -9.76 48.02 -10.84
39 02-09 -5.80 46.04 46.94
44 02-14 -7.06 51.08 -11.02
# Calendar days that get an x-axis tick, and the text shown for each.
x_label_dates = ['01-01', '04-01', '07-01', '10-01', '12-31']
x_label_labels = ['1/1', '4/1', '7/1', '10/1', '12/31']

# Row positions of those days in `t`; the styled plot uses t.index as its
# x-axis, so these are the tick locations.
is_tick_date = t['Date'].isin(x_label_dates)
x_label_indexes = t[is_tick_date].index.tolist()
t[is_tick_date]

Date 05-14 Min 05-14 Max 15 Min 15 Max
0 01-01 3.20 60.08 NaN NaN
90 04-01 17.06 80.96 NaN NaN
181 07-01 42.98 98.06 NaN NaN
273 10-01 26.24 80.96 NaN NaN
364 12-31 5.00 57.02 NaN NaN

Define Figure Creation

def create_default_plot():
    """Draw the temperature chart with Matplotlib's default styling.

    Reads the notebook-level DataFrame ``t`` and, on a new pyplot figure,
    plots the '05-14 Min' and '05-14 Max' columns as lines, fills the band
    between them, and overlays the '15 Min' and '15 Max' columns as scatter
    points. No styling arguments are passed, so every element uses the
    library defaults. Returns nothing.
    """
    plt.figure()

    # 2005-2014 extremes for each calendar day.
    plt.plot(t['Date'],
             t['05-14 Min'],
             label="Low '05-'14")
    plt.plot(t['Date'],
             t['05-14 Max'],
             label="High '05-'14")

    # Band between the 2005-2014 low and high.
    plt.gca().fill_between(t['Date'],
                           t['05-14 Min'],
                           t['05-14 Max'])

    # 2015 values.
    plt.scatter(t['Date'],
                t['15 Min'],
                label="Low '15")
    plt.scatter(t['Date'],
                t['15 Max'],
                label="High '15")

    plt.legend()

    plt.title('Days in 2015 that Set a Record High or Low for 2005-2015')
    plt.xlabel('Date')
    # Raw string: '\c' is not a valid Python escape sequence, so the mathtext
    # backslash must not be left to the string-literal parser.
    plt.ylabel(r'Temperature ($^\circ$F)')
def create_best_practices_plot():
    """Draw the temperature chart with reduced, deliberate styling.

    Reads the notebook-level DataFrame ``t`` and the tick definitions
    ``x_label_indexes`` / ``x_label_labels``. On a new pyplot figure it plots
    the '05-14 Min' / '05-14 Max' columns as thin lines with a grey band
    between them, overlays the '15 Min' / '15 Max' columns as small scatter
    points, hides the axes spines, and limits the x-axis to the predefined
    ticks. The figure is written to 'weather.png'. Returns nothing.
    """
    plt.figure()

    # 2005-2014 extremes as thin lines: blue for lows, red for highs.
    plt.plot(t.index,
             t['05-14 Min'],
             color='blue',
             linewidth=0.1,
             zorder=1,
             label="Low '05-'14")
    plt.plot(t.index,
             t['05-14 Max'],
             color='red',
             linewidth=0.1,
             zorder=1,
             label="High '05-'14")

    # Light grey band between the 2005-2014 low and high.
    plt.gca().fill_between(t.index,
                           t['05-14 Min'],
                           t['05-14 Max'],
                           color='#DFDFDF',
                           zorder=1)

    # 2015 values; the higher zorder draws them on top of the band.
    plt.scatter(t.index,
                t['15 Min'],
                color='blue',
                s=3,
                zorder=2,
                label="Low '15")
    plt.scatter(t.index,
                t['15 Max'],
                color='red',
                s=3,
                zorder=2,
                label="High '15")

    plt.title('Days in 2015 that Set a Record High or Low for 2005-2015',
              fontsize=8,
              alpha=0.8)
    plt.xlabel('Date',
               fontsize=8,
               alpha=0.8)
    # Raw string: '\c' is not a valid Python escape sequence, so the mathtext
    # backslash must not be left to the string-literal parser.
    plt.ylabel(r'Temperature ($^\circ$F)',
               fontsize=8,
               alpha=0.8)

    # Only the predefined ticks are shown on the x-axis.
    plt.xticks(x_label_indexes,
               x_label_labels,
               fontsize=6,
               alpha=0.8)
    plt.yticks(fontsize=6,
               alpha=0.8)

    plt.gca().tick_params(direction='out',
                          length=3,
                          width=0.25)

    # Remove the border around the plotting area.
    for spine in plt.gca().spines.values():
        spine.set_visible(False)

    # Reorder the legend entries. NOTE(review): assuming handles come back in
    # the order the labelled artists were added above, this lists
    # High '15, High '05-'14, Low '05-'14, Low '15 - confirm on the rendered
    # legend.
    handles, labels = plt.gca().get_legend_handles_labels()
    order = [3,1,0,2]
    plt.legend([handles[idx] for idx in order],
               [labels[idx] for idx in order],
               fontsize=6,
               frameon=False)

    plt.savefig('weather.png',
                bbox_inches='tight')

Matplotlib Default Plot

# Render the chart with no styling arguments, as the baseline for comparison.
create_default_plot()

png

Plot Made per Best Practices

# Render the styled chart; this call also writes the figure to 'weather.png'.
create_best_practices_plot()

png

Best Practices List

The second graph demonstrates several important improvements to bring it in line with visualization best practices:

  • The following all improve the Data-Ink Ratio:

    • Outer border is removed
    • Font sizes are decreased
    • Line weights and scatterplot marker size are decreased
    • Increased legibility of the x-axis label due to decreased number of tick marks/labels
    • More rational, less saturated, color choices