Distributions#

from lets_plot import *

LetsPlot.setup_html()

# This example was found at: www.cookbook-r.com/Graphs/Plotting_distributions_(ggplot2)
def get_data():
    """Build the reproducible sample data set used throughout the examples.

    Returns a pandas DataFrame with 400 rows and two columns:
    ``cond`` (200 rows labelled "A" followed by 200 labelled "B") and
    ``rating`` (normal samples with mean 0 for "A" and mean .8 for "B",
    both with standard deviation 1).
    """
    import numpy as np
    import pandas as pd

    # Fixed seed so every run produces the same numbers.
    np.random.seed(123)

    group_size = 200
    # Draw group "A" first, then group "B" -- the draw order determines the values.
    ratings_a = np.random.normal(0, 1, group_size)
    ratings_b = np.random.normal(.8, 1, group_size)

    columns = {
        "cond": np.repeat(["A", "B"], group_size),
        "rating": np.concatenate((ratings_a, ratings_b)),
    }
    return pd.DataFrame(columns)

# Build the sample data set, then show its dimensions and its first rows
df = get_data()
print(df.shape)
df.head()

(400, 2)

	cond	rating
0	A	-1.085631
1	A	0.997345
2	A	0.282978
3	A	-1.506295
4	A	-0.578600

# Basic histogram of "rating"
# `p` is the base plot ("rating" on x, 500x250 size); later cells add layers to it
p = ggplot(df, aes(x="rating")) + ggsize(500, 250)
p + geom_histogram(binwidth=.5)

# Histogram with a kernel density curve drawn on top:
#  - the histogram maps density (not count) to the y-axis
#  - the density layer is filled with a low alpha so it stays transparent
(
    p
    + geom_histogram(aes(y='..density..'), binwidth=.5, colour="black", fill="white")
    + geom_density(alpha=.2, color="#de2d26", fill="#ff6666")
)

# Histogram of "rating" with a dashed red vertical line at the overall mean
(
    p
    + geom_histogram(binwidth=.5, colour="black", fill="white")
    + geom_vline(
        xintercept=df["rating"].mean(),
        color="red",
        linetype='dashed',
        size=1,
    )
)

Histogram and density plots with multiple groups#

# Base plot for the grouped histograms: "rating" on x, fill mapped to "cond"
p1 = ggplot(df, aes(x="rating", fill="cond")) + ggsize(500, 250)

# Default histogram (stacked)
p1 + geom_histogram(binwidth=.5)

# Overlaid histograms (semi-transparent so both groups remain visible)
p1 + geom_histogram(binwidth=.5, alpha=.7, position="identity")

# Interleaved histograms
p1 + geom_histogram(binwidth=.5, position="dodge")

# Density plot
# `p2` is the base plot for the remaining grouped plots: "rating" on x, color mapped to "cond"
p2 = ggplot(df, aes(x="rating", color="cond")) + ggsize(500, 250)
p2 + geom_density()

# Density plot with semi-transparent fill
p2 + geom_density(aes(fill="cond"), alpha=.7)

# Find the mean of each group
# as_index=False keeps "cond" as a regular column, so the result can be
# passed as `data` to layers that map "cond" and "rating"
cdf = df.groupby(["cond"], as_index=False).mean()
cdf.head()

	cond	rating
0	A	0.003787
1	B	0.685638

# Overlaid histograms with a dashed vertical line at each group's mean
(
    p2
    + geom_histogram(aes(fill="cond"), alpha=.5, position="identity", size=0)
    + geom_vline(
        data=cdf,
        mapping=aes(xintercept="rating", color="cond"),
        linetype='dashed',
        size=1,
    )
)

# Use freqpoly instead of histogram, with a dashed vertical line at each group's mean
# NOTE(review): the fill mapping probably has no visible effect on a line geom -- confirm
(
    p2
    + geom_freqpoly(aes(fill="cond"))
    + geom_vline(
        data=cdf,
        mapping=aes(xintercept="rating", color="cond"),
        linetype='dashed',
        size=1,
    )
)

# Density curves per group with a dashed vertical line at each group's mean
(
    p2
    + geom_density()
    + geom_vline(
        data=cdf,
        mapping=aes(xintercept="rating", color="cond"),
        linetype='dashed',
        size=1,
    )
)

Box plots#

# A basic box plot
# `p3` is the base plot for the box plots: "cond" on x, "rating" on y
p3 = ggplot(df, aes(x="cond", y="rating")) + ggsize(400, 300)
p3 + geom_boxplot()

# A basic box plot with the conditions colored
p3 + geom_boxplot(aes(fill="cond"))

# Style outliers (color, shape and size of the outlier points)
p3 + geom_boxplot(outlier_color="red", outlier_shape=8, outlier_size=1.5)

Distributions#

Histogram and density plots with multiple groups#

Using facets#

Box plots#