Download notebook (.ipynb)

Distributions

from lets_plot import *
LetsPlot.setup_html()
# This example was found at: www.cookbook-r.com/Graphs/Plotting_distributions_(ggplot2)
def get_data():
    """Return the demo frame: 400 random ratings split over two conditions.

    The frame has two columns:
      * ``cond``   - 200 rows labelled "A" followed by 200 rows labelled "B"
      * ``rating`` - normal draws with standard deviation 1; mean 0 for the
        first 200 rows and mean 0.8 for the last 200

    NumPy's global random state is seeded with 123 on every call, so the
    returned values are the same each time.
    """
    import numpy as np
    import pandas as pd

    # Seeding the global generator makes the sample reproducible.
    np.random.seed(123)

    group_size = 200
    labels = np.repeat(["A", "B"], group_size)
    # Draw order matters for reproducibility: group A first, then group B.
    ratings_a = np.random.normal(0, 1, group_size)
    ratings_b = np.random.normal(.8, 1, group_size)

    return pd.DataFrame({
        "cond": labels,
        "rating": np.concatenate((ratings_a, ratings_b)),
    })

# Build the sample data once; the plots below all read from this frame.
df = get_data()
# Sanity check of the frame dimensions as (rows, columns).
print(df.shape)
# Preview the first rows (shown as the cell result in the notebook).
df.head()
(400, 2)
cond rating
0 A -1.085631
1 A 0.997345
2 A 0.282978
3 A -1.506295
4 A -0.578600
# Base plot shared by the single-variable examples: "rating" on the x-axis,
# rendered at 500x250.
p = ggplot(df, aes(x="rating")) + ggsize(500, 250)

# Basic histogram of "rating" with bins 0.5 wide
p + geom_histogram(binwidth=.5)

# Histogram overlaid with kernel density curve
#  - the histogram uses density instead of count on the y-axis
#  - the density layer is drawn semi-transparent on top of it
(
    p
    + geom_histogram(aes(y='..density..'), binwidth=.5, colour="black", fill="white")
    + geom_density(alpha=.2, color="#de2d26", fill="#ff6666")
)

# Count histogram with a dashed red vertical line at the mean of "rating"
(
    p
    + geom_histogram(binwidth=.5, colour="black", fill="white")
    + geom_vline(xintercept=df["rating"].mean(),
                 color="red", linetype='dashed', size=1)
)

Histogram and density plots with multiple groups

# Base plot for the grouped histograms: "rating" on x, bars filled by "cond".
p1 = ggplot(df, aes(x="rating", fill="cond")) + ggsize(500, 250)

# Default histogram (stacked)
p1 + geom_histogram(binwidth=.5)
# Overlaid histograms: position="identity" with alpha so both groups stay visible
p1 + geom_histogram(binwidth=.5, alpha=.7, position="identity")
# Interleaved histograms: position="dodge"
p1 + geom_histogram(binwidth=.5, position="dodge")
# Density plot: base plot for the line-based examples, line color mapped to "cond"
p2 = ggplot(df, aes(x="rating", color="cond")) + ggsize(500, 250)
p2 + geom_density()
# Density plot with semi-transparent fill
p2 + geom_density(aes(fill="cond"), alpha=.7)
# Find the mean of each group; as_index=False keeps "cond" as a regular column
# so the result can be used as layer data by the mean-line plots further down.
cdf = df.groupby(["cond"], as_index=False).mean()
cdf.head()
cond rating
0 A 0.003787
1 B 0.685638
# Overlaid histograms with a dashed mean line per group.
# The per-group means come from `cdf`; size=0 is passed to the histogram layer
# while the base plot `p2` maps color to "cond".
(
    p2
    + geom_histogram(aes(fill="cond"), alpha=.5, position="identity", size=0)
    + geom_vline(data=cdf,
                 mapping=aes(xintercept="rating", color="cond"),
                 linetype='dashed', size=1)
)

# Use freqpoly instead of histogram
# NOTE(review): geom_freqpoly draws lines, so the fill mapping presumably has
# no visible effect here - confirm before relying on it.
(
    p2
    + geom_freqpoly(aes(fill="cond"))
    + geom_vline(data=cdf,
                 mapping=aes(xintercept="rating", color="cond"),
                 linetype='dashed', size=1)
)

# Density plots with a dashed mean line per group
(
    p2
    + geom_density()
    + geom_vline(data=cdf,
                 mapping=aes(xintercept="rating", color="cond"),
                 linetype='dashed', size=1)
)

Using facets

# One histogram panel per condition: "cond" is passed as the first
# facet_grid argument.
(
    ggplot(df, aes(x="rating"))
    + geom_histogram(binwidth=.5, colour="black", fill="white")
    + facet_grid("cond")
)

# With mean lines, using 'cdf' computed earlier.
# Here "cond" is passed as the second facet_grid argument, and the vline
# layer takes its data from `cdf`.
(
    ggplot(df, aes(x="rating"))
    + geom_histogram(binwidth=.5, colour="black", fill="white")
    + geom_vline(data=cdf,
                 mapping=aes(xintercept="rating"),
                 linetype='dashed', size=1, colour="red")
    + facet_grid(None, "cond")
    + ggsize(500, 250)
)

Box plots

# A basic box plot: one box per condition on x, "rating" on y, rendered at 400x300
p3 = ggplot(df, aes(x="cond", y="rating")) + ggsize(400, 300)
p3 + geom_boxplot()
# A basic box plot with the conditions colored (fill mapped to "cond")
p3 + geom_boxplot(aes(fill="cond"))
# Style outliers: red, point shape 8, size 1.5
p3 + geom_boxplot(outlier_color="red", outlier_shape=8, outlier_size=1.5)