MPG Correlogram

A correlogram provides a quick overview of the entire dataset and makes it easy to analyse the relationship between each pair of numerical variables.

import numpy as np
import pandas as pd

from lets_plot import *

# Initialise Lets-Plot's HTML output before any plot is built.
LetsPlot.setup_html()

# Load the mpg dataset and discard the "Unnamed: 0" column
# (presumably a row index that was saved into the CSV).
MPG_CSV_URL = "https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/mpg.csv"
df = pd.read_csv(MPG_CSV_URL).drop(columns=["Unnamed: 0"])
print(df.shape)
df.head()

(234, 11)

	manufacturer	model	displ	year	cyl	trans	drv	cty	hwy	fl	class
0	audi	a4	1.8	1999	4	auto(l5)	f	18	29	p	compact
1	audi	a4	1.8	1999	4	manual(m5)	f	21	29	p	compact
2	audi	a4	2.0	2008	4	manual(m6)	f	20	31	p	compact
3	audi	a4	2.0	2008	4	auto(av)	f	21	30	p	compact
4	audi	a4	2.8	1999	6	auto(l5)	f	16	26	p	compact

# Pairwise correlations of the numeric columns, reshaped to long format:
# one row per (x, y) variable pair, with the coefficient in "corr".
corr_df = (
    df.corr(numeric_only=True)
    .stack()
    .rename_axis(["x", "y"])
    .reset_index(name="corr")
)
corr_df.head()

	x	y	corr
0	displ	displ	1.000000
1	displ	year	0.147843
2	displ	cyl	0.930227
3	displ	cty	-0.798524
4	displ	hwy	-0.766020

# Split the long-format correlation table by how the two variable names
# compare: equal names, x before y, and x after y (string comparison).
same_pair = corr_df["x"] == corr_df["y"]
x_before_y = corr_df["x"] < corr_df["y"]
x_after_y = corr_df["x"] > corr_df["y"]

# Rows where both names match; the plot uses them for the variable labels.
corr_df0 = corr_df[same_pair]

# Pie data: every pair appears twice, once with the correlation itself
# (half="corr") and once with what is left up to +1 or -1 (half="remainder").
pie_pairs = corr_df[x_before_y]
left_over = np.where(
    pie_pairs["corr"] > 0,
    1 - pie_pairs["corr"],
    -1 - pie_pairs["corr"],
)
corr_df1 = pd.concat(
    [
        pie_pairs.assign(half="corr"),
        pie_pairs.assign(corr=left_over, half="remainder"),
    ],
    ignore_index=True,
)

# Square data: the spoke angle is pi/4 for positive correlations and
# 3*pi/4 otherwise.
square_pairs = corr_df[x_after_y]
corr_df2 = square_pairs.assign(
    angle=np.where(square_pairs["corr"] > 0, np.pi / 4, 3 * np.pi / 4)
)

# Sorted variable names, used as axis breaks by the plot.
# NOTE: `vars` shadows the builtin; the name is kept because the plot reads it.
vars = sorted(set(corr_df["x"]))

The Correlogram

Let’s plot a correlogram of the mpg dataset variables. Here’s how it should be interpreted:

The filled portion of the pie shows the correlation magnitude.
The diagonal of the squares shows the sign of the correlation.
The depth of each figure's shading shows the correlation magnitude.
The names of the variables are on the diagonal.

# Build the correlogram as one parenthesised expression so that each layer
# can carry its own comment; the resulting plot is the cell's displayed value.
(
    ggplot()
    # Pies whose slices are the correlation and its remainder, filled by value.
    + geom_pie(
        aes("x", "y", slice="corr", paint_a="corr"),
        data=corr_df1, stat='identity',
        size=1, size_unit='x', spacer_width=1,
        fill_by='paint_a', tooltips='none',
    )
    # Same pies again, filled by the `half` category via the manual scale below
    # (presumably transparent over "corr" and light grey over "remainder").
    + geom_pie(
        aes("x", "y", slice="corr", paint_b="half"),
        data=corr_df1, stat='identity',
        size=1, size_unit='x', show_legend=False,
        fill_by='paint_b', tooltips='none',
    )
    # Unit squares (shape 22) with a white outline, filled by correlation value.
    + geom_point(
        aes("x", "y", paint_a="corr"),
        data=corr_df2, shape=22, size=1, stroke=1, size_unit='x',
        color="white", fill_by='paint_a', tooltips='none',
    )
    # White diagonal through each square; its angle encodes the sign.
    + geom_spoke(
        aes("x", "y", angle="angle"),
        radius=np.sqrt(2), data=corr_df2,
        pivot='middle', size=1, color="white",
    )
    # Variable names on the x == y cells.
    + geom_text(
        aes("x", "y", label="x"),
        data=corr_df0, size=1, size_unit='x',
    )
    # Same sorted variable order on both axes.
    + scale_x_discrete(breaks=vars, expand=[.1, 0])
    + scale_y_discrete(breaks=vars, expand=[.1, 0])
    # Diverging fill for correlation values: red (low), white (mid), blue (high).
    + scale_gradient2('paint_a', low="#a50026", mid="white", high="#313695")
    + scale_manual('paint_b', values=["rgba(0, 0, 0, 0)", "lightgrey"])
    + coord_fixed()
    + ggsize(660, 600)
    + theme_void()
)