Download notebook (.ipynb)

Netflix Movies

import pandas as pd

from lets_plot import *
# Initialise Lets-Plot for HTML output before any plot is built.
LetsPlot.setup_html()

# Read the Netflix catalogue and keep only titles released in 2000 or later.
df = pd.read_csv(
    "https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/NetFlix.csv"
).loc[lambda frame: frame["release_year"] >= 2000]

# Quick sanity check: (rows, columns) followed by a preview of the first rows.
print(df.shape)
df.head()
(7338, 12)
show_id type title director cast country date_added release_year rating duration genres description
0 s1 TV Show 3% NaN João Miguel, Bianca Comparato, Michel Gomes, R... Brazil 14-Aug-20 2020 TV-MA 4 International TV Shows, TV Dramas, TV Sci-Fi &... In a future where the elite inhabit an island ...
1 s10 Movie 1920 Vikram Bhatt Rajneesh Duggal, Adah Sharma, Indraneil Sengup... India 15-Dec-17 2008 TV-MA 143 Horror Movies, International Movies, Thrillers An architect and his wife move into a castle t...
2 s100 Movie 3 Heroines Iman Brotoseno Reza Rahadian, Bunga Citra Lestari, Tara Basro... Indonesia 05-Jan-19 2016 TV-PG 124 Dramas, International Movies, Sports Movies Three Indonesian women break records by becomi...
3 s1000 Movie Blue Mountain State: The Rise of Thadland Lev L. Spiro Alan Ritchson, Darin Brooks, James Cade, Rob R... United States 01-Mar-16 2016 R 90 Comedies New NFL star Thad buys his old teammates' belo...
4 s1001 TV Show Blue Planet II NaN David Attenborough United Kingdom 03-Dec-18 2017 TV-G 1 British TV Shows, Docuseries, Science & Nature TV This sequel to the award-winning nature series...
# Keep movies only, and drop rows whose genre list is just the generic
# "Movies" label.
movies_df = df[(df["type"] == "Movie") & (df["genres"] != "Movies")]

# Explode the comma-separated genre list into one row per (genre, duration):
# split into positional columns, carry the duration alongside, then melt the
# first three genre columns into a single "genre" column. Titles with fewer
# than three genres produce missing values, which are dropped.
by_genre_df = pd.melt(
    movies_df["genres"].str.split(", ", expand=True).assign(duration=movies_df["duration"]),
    id_vars=["duration"], value_vars=[0, 1, 2], value_name="genre"
)[["genre", "duration"]].dropna(subset=["genre"])

# Attach the per-genre mean duration to every row and order genres by it.
# `transform` returns a Series that shares by_genre_df's index, so `assign`
# lines the values up correctly. (Going through `merge` instead yields a fresh
# 0..m-1 index that no longer matches the gappy index left by `dropna`, which
# misassigns means and introduces NaN.)
by_genre_df = by_genre_df.assign(
    duration_mean=by_genre_df.groupby("genre")["duration"].transform("mean")
).sort_values(by="duration_mean", ascending=False)

# Ridgeline chart: one density ridge of movie durations per genre, filled by
# the genre's mean duration. The expression is left unassigned so the notebook
# renders it as the cell output.
(
    ggplot(by_genre_df, aes("duration", "genre"))
    + geom_area_ridges(
        aes(group="genre", fill="duration_mean"),
        scale=4,
        # Sample size equals the row count of the data frame.
        sampling=sampling_pick(len(by_genre_df)),
        tooltips=layer_tooltips().title("@genre").line("@|@duration"),
    )
    + scale_x_log10()  # durations are shown on a log10 x axis
    + scale_fill_viridis(name="mean duration", option="plasma")
    + ggsize(800, 600)
    + ggtitle("Mean Netflix movie duration")
    + theme(axis_line_x="blank")
)