Reddit Nationalpark Data Exploration

Alexander Dunkel, Institute of Cartography, TU Dresden

Out[1]:

Last updated: Aug-19-2023, Carto-Lab Docker Version 0.14.0

Visualization of temporal patterns for submissions and comments from National Park subreddits.

Preparations

In [2]:
import os, sys
from pathlib import Path
import psycopg2
import geopandas as gp
import pandas as pd
import seaborn as sns
import calendar
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as mticker
import matplotlib.patheffects as pe
from typing import List, Tuple, Dict, Optional
from IPython.display import clear_output, display, HTML
from datetime import datetime
In [3]:
module_path = str(Path.cwd().parents[0] / "py")
if module_path not in sys.path:
    sys.path.append(module_path)
from modules.base import tools, hll
from modules.base.hll import union_hll, cardinality_hll
In [4]:
%load_ext autoreload
%autoreload 2
In [5]:
OUTPUT = Path.cwd().parents[0] / "out"       # output directory for figures (etc.)
WORK_DIR = Path.cwd().parents[0] / "tmp"     # Working directory
In [6]:
(OUTPUT / "figures").mkdir(exist_ok=True)
(OUTPUT / "svg").mkdir(exist_ok=True)
WORK_DIR.mkdir(exist_ok=True)

Plot styling

In [7]:
plt.style.use('default')

Set global font

In [8]:
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = ['Times New Roman'] + plt.rcParams['font.serif']

Load HLL aggregate data

Load the data from CSV, generated in the previous notebook. Data is stored as aggregate HLL data (postcount, usercount) for each month.

In [9]:
REDDIT_ALL_SUBMISSIONS = OUTPUT / "reddit_all_months.csv"
REDDIT_ALL_COMMENTS = OUTPUT / "reddit_comments_all_months.csv"
In [10]:
%%time
data_files = {
    "REDDIT_ALL_SUBMISSIONS":REDDIT_ALL_SUBMISSIONS,
    "REDDIT_ALL_COMMENTS":REDDIT_ALL_COMMENTS,
    }
tools.display_file_stats(data_files)
name     REDDIT_ALL_SUBMISSIONS   REDDIT_ALL_COMMENTS
size     428.49 KB                1.16 MB
records  2,448                    2,221
CPU times: user 55 ms, sys: 5.33 ms, total: 60.3 ms
Wall time: 59.5 ms
In [11]:
df = pd.read_csv(REDDIT_ALL_SUBMISSIONS)
In [12]:
df.head(10)
Out[12]:
year month post_hll user_hll topic_group
0 2010 10 \x138b40c8a2 \x138b40c303 everglades
1 2011 2 \x138b40fd82 \x138b40c303 everglades
2 2011 2 \x138b40014238a147c16a8185618a648c83d3c1efc2 \x138b4018214861750197a1c303dee1e021 yosemite
3 2011 3 \x138b409481 \x138b405e44 yosemite
4 2011 4 \x138b4020c177c27be294a3 \x138b4076a197a1c303 yosemite
5 2011 5 \x138b4090e5c1e4 \x138b404961c303 grandcanyon
6 2011 5 \x138b4011815c826621a341bf81d505e1a2fc43 \x138b4036c3c303 hotsprings
7 2011 5 \x138b400d4212c19cc49e41ae64af41 \x138b4003e21ac25e44c303dee1 yosemite
8 2011 6 \x138b4004c316e42782286131013722488281e4b084c0... \x138b4008a10f835e448062c303dca1e1a2ffa2 yosemite
9 2011 7 \x138b403081 \x138b405543 hotsprings

Get distinct subreddits:

In [13]:
df['topic_group'].unique()
Out[13]:
array(['everglades', 'yosemite', 'grandcanyon', 'hotsprings',
       'deathvalley', 'joshuatree', 'glaciernationalpark',
       'virginislands', 'bigbendtx', 'glacier', 'shenandoah',
       'deathvalleynp', 'americansamoa', 'bigbend', 'cuyahogafalls',
       'arches', 'craterlake', 'grandtetonnatlpark', 'northcascades',
       'isleroyale', 'gsmnp', 'pinnaclesnp', 'mount_rainier',
       'acadianationalpark', 'sequoia', 'kenaipeninsula', 'canyonlands',
       'brycecanyon', 'redwoodnationalpark', 'archesnationalpark',
       'bigbendnationalpark', 'zionnationalpark', 'capitolreefnp',
       'olympicnationalpark', 'lassenvolcanic', 'zionnp',
       'carlsbadcavernsnp', 'grandteton', 'rockymountain',
       'greatbasinstories', 'newrivergorgenp', 'shenandoahpark',
       'mammothcave', 'grandcanyonhiking'], dtype=object)

Connect to HLL worker database

In [14]:
DB_USER = "hlluser"
DB_PASS = os.getenv('READONLY_USER_PASSWORD')
# set connection variables
DB_HOST = "hllworkerdb"
DB_PORT = "5432"
DB_NAME = "hllworkerdb"

Connect to empty Postgres database running HLL Extension:

In [15]:
DB_CONN = psycopg2.connect(
        host=DB_HOST,
        port=DB_PORT ,
        dbname=DB_NAME,
        user=DB_USER,
        password=DB_PASS
)
DB_CONN.set_session(
    readonly=True)
DB_CALC = tools.DbConn(
    DB_CONN)
CUR_HLL = DB_CONN.cursor()
In [16]:
db_conn = tools.DbConn(DB_CONN)

Test

In [17]:
results = union_hll([df["post_hll"][0], df["post_hll"][1]], CUR_HLL)
results
Out[17]:
'\\x138b40c8a2fd82'
In [18]:
cardinality_hll(results, CUR_HLL)
Out[18]:
2
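Under the hood, union_hll and cardinality_hll delegate the actual HLL operations to the Postgres HLL extension. A minimal sketch of equivalent raw SQL, assuming the citus postgresql-hll functions hll_union() and hll_cardinality() and the two example sketches from the test above (the exact SQL wrapped by modules.base.hll may differ):

CUR_HLL.execute("""
    SELECT hll_cardinality(hll_union(
        '\\x138b40c8a2'::hll,
        '\\x138b40fd82'::hll));
    """)
print(CUR_HLL.fetchone()[0])  # 2.0 (cardinality estimates are floats)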

Calculate HLL Cardinality per month

Define additional functions for reading and formatting CSV as pd.DataFrame

In [19]:
TIMESTAMP_FORMAT = '%Y %m'

def read_csv_datetime(csv: Path, timestamp_format: str = TIMESTAMP_FORMAT) -> pd.DataFrame:
    """Read CSV with parsing datetime index (months)
    
        First CSV column: Year
        Second CSV column: Month
    """
    date_cols = ["year", "month"]
    df = pd.read_csv(
        csv, index_col='datetime', 
        parse_dates={'datetime':date_cols},
        date_format=timestamp_format,
        keep_date_col=True)  # keep year/month columns; dropped explicitly below
    df.drop(columns=date_cols, inplace=True)
    return df
    
def append_cardinality_df(df: pd.DataFrame, hll_col: str = "post_hll", cardinality_col: str = 'postcount_est'):
    """Calculate cardinality from HLL and append to extra column in df"""
    df[cardinality_col] = df.apply(
        lambda x: cardinality_hll(
           x[hll_col], CUR_HLL),
        axis=1)
    df.drop(columns=[hll_col], inplace=True)
    return df

def filter_fill_time(
        df: pd.DataFrame, min_year: int, 
        max_year: int, val_col: str = "postcount_est",
        min_month: int = 1, max_month: int = 1):
    """Filter time values between min - max year and fill missing values"""
    min_date = pd.Timestamp(f'{min_year}-{min_month}-01')
    max_date = pd.Timestamp(f'{max_year}-{max_month}-01')
    # clip by start and end date
    if min_date not in df.index:
        df.loc[min_date, val_col] = 0
    if max_date not in df.index:
        df.loc[max_date, val_col] = 0
    df.sort_index(inplace=True)
    # mask min and max time
    time_mask = ((df.index >= min_date) & (df.index <= max_date))
    # fill missing months with 0;
    # resampling also sets the day to the last day of each month
    series = df.loc[time_mask][val_col].resample('M').sum().fillna(0)
    return series.to_frame()

Apply the functions to all data sets:

  • read from CSV
  • calculate cardinality
  • merge year and month into a single column
  • filter to the 2010 - 2023 range and fill missing values
In [20]:
%%time
df_post = read_csv_datetime(REDDIT_ALL_SUBMISSIONS)
df_post = append_cardinality_df(df_post, 'post_hll', 'postcount_est')
df_post = filter_fill_time(df_post, 2010, 2023, 'postcount_est')
CPU times: user 64.2 ms, sys: 52.5 ms, total: 117 ms
Wall time: 252 ms
In [21]:
df_post.head(5)
Out[21]:
postcount_est
datetime
2010-01-31 0.0
2010-02-28 0.0
2010-03-31 0.0
2010-04-30 0.0
2010-05-31 0.0
In [22]:
df_post.plot()
Out[22]:
<Axes: xlabel='datetime'>

Repeat for comments:

In [23]:
df = pd.read_csv(REDDIT_ALL_COMMENTS)
df.tail()
Out[23]:
year month post_hll user_hll referenced_post_hll topic_group
2215 2023 4 \x138b4013231921210225e129c13f2340a14be174c47e... \x138b4001a269019c61d041 \x138b4013c116218a01a721b8c3 shenandoah
2216 2023 4 \x138b400c8310e113e31be21ea224e12dc12e8137a23c... \x138b40348143e145a1d841e802e841f5c2 \x138b40002325c1270138835361eb67f181 virginislands
2217 2023 4 \x148b4000802104200040238000080020042101020280... \x138b40016101c108210ac40fc2112115c1174119611b... \x138b40000100430141018204810561064108a209e10c... yosemite
2218 2023 4 \x138b40004101030142028203010362038105c6060106... \x138b4016c120e42162226122e2276228e5290329212d... \x138b40016101c5060107440d8113e117822461260326... zionnationalpark
2219 2023 4 \x138b401924 \x138b40d221 \x138b4014e2 zionnp
In [24]:
%%time
df_comments = read_csv_datetime(REDDIT_ALL_COMMENTS)
CPU times: user 16.8 ms, sys: 2.28 ms, total: 19.1 ms
Wall time: 18.1 ms
In [25]:
df_comments.head()
Out[25]:
post_hll user_hll referenced_post_hll topic_group
datetime
2011-02-01 \x138b4005222b8151c15a226ce379a187038f8297c1b1... \x138b40182148617501c303dee1 \x138b40014247c185618c83efc2 yosemite
2011-03-01 \x138b40cce2 \x138b405e44 \x138b409481 yosemite
2011-04-01 \x138b4006218b618ee1b641f061f201f241 \x138b4076a197a1c303 \x138b4020c17be294a3 yosemite
2011-05-01 \x138b401c812b822d429fc5aa82ca61 \x138b401ac25e44dee1 \x138b4012c19cc49e41 yosemite
2011-06-01 \x138b402ee1 \x138b4036c3 \x138b401181 hotsprings
In [26]:
df_comments = append_cardinality_df(df_comments, 'post_hll', 'commentscount_est')
df_comments = filter_fill_time(df_comments, 2010, 2023, 'commentscount_est')
In [27]:
def plot_lines(
        df_list: List[pd.DataFrame], ylegend: str = "Post count", 
        xlegend: str = "Year", title: Optional[str] = None):
    """Plot lines from a list of DataFrames"""

    fig, ax = plt.subplots()
    fig.set_size_inches(15.7, 4.27)
    ylabel = f'{ylegend} (estimate)'
    for df in df_list:
        ax = df.plot(ax=ax)
    tick_loc = mticker.MultipleLocator(12)
    # x axis ticker formatting
    ax.xaxis.set_major_locator(tick_loc)
    ax.yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))
    ax.tick_params(axis='x', rotation=45, color='grey')
    ax.set(xlabel=xlegend, ylabel=ylabel)  # use the "(estimate)" label computed above
    ax.spines["left"].set_linewidth(0.25)
    ax.spines["bottom"].set_linewidth(0.25)
    ax.spines["top"].set_linewidth(0)
    ax.spines["right"].set_linewidth(0)
    ax.yaxis.set_tick_params(width=0.5)
    # add legend
    h, l = ax.get_legend_handles_labels()
    ax.legend(h, l, frameon=False, loc='best')
    if title:
        ax.set_title(title)
In [28]:
plot_lines([df_post, df_comments])
In [29]:
df_comments
Out[29]:
commentscount_est
datetime
2010-01-31 0.0
2010-02-28 0.0
2010-03-31 0.0
2010-04-30 0.0
2010-05-31 0.0
... ...
2022-09-30 4761.0
2022-10-31 4025.0
2022-11-30 3996.0
2022-12-31 3655.0
2023-01-31 4286.0

157 rows × 1 columns

Group temporal patterns by month

The growth in data contributions on Reddit distorts the curve and limits our ability to see temporal characteristics and repeating patterns. Below, we group data by month to visualize average, recurring seasonal trends.
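The grouping below relies on the DatetimeIndex: grouping by df.index.month collapses all years onto the twelve calendar months. A toy example with hypothetical values:

import pandas as pd

s = pd.Series(
    [10, 20, 30, 40],
    index=pd.to_datetime(
        ["2021-01-31", "2021-07-31", "2022-01-31", "2022-07-31"]))
# mean per calendar month, across all years
s.groupby(s.index.month).mean()
# month 1 -> 20.0, month 7 -> 30.0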

In [30]:
def plot_bars(
        df1: pd.DataFrame, df2: pd.DataFrame,
        ax: Optional[matplotlib.axes.Axes] = None, title: Optional[str] = None):
    """Plot bars from two DataFrames"""
    colors = sns.color_palette("vlag", as_cmap=True, n_colors=2)
    bar_param = {
        "width":1.0,
        "label":"Reddit total submission count aggregated for months",
        "edgecolor":"white",
        "linewidth":0.5,
        "alpha":0.7
    }
    # create figure
    if not ax:
        fig, ax = plt.subplots(1, 1, figsize=(3, 1.5))
    # plot
    df1.groupby(df1.index.month)["commentscount_est"] \
        .mean().plot.bar(ax=ax, color=colors([1.0]), y="commentscount_est", **bar_param)
    df2.groupby(df2.index.month)["postcount_est"] \
        .mean().plot.bar(ax=ax, color=colors([0.0]), y="postcount_est", **bar_param)
    # format
    ax.set_xlim(-0.5,11.5)
    month_names = ['Jan','Feb','Mar','Apr','May','Jun',
                   'Jul','Aug','Sep','Oct','Nov','Dec'] 
    ax.set_xticks(range(12))  # fix tick positions before labeling
    ax.set_xticklabels(month_names)
    ax.tick_params(axis='x', rotation=45, length=0) # length: of ticks
    ax.spines["left"].set_linewidth(0.25)
    ax.spines["bottom"].set_linewidth(0.25)
    ax.spines["top"].set_linewidth(0)
    ax.spines["right"].set_linewidth(0)
    ax.yaxis.set_tick_params(width=0.5)
    ax.set(xlabel="", ylabel="")
    if not title:
        title = "Post Count per Month (mean)"
    ax.set_title(title, y=-0.2, pad=-14)
    for item in (
        [ax.xaxis.label, ax.title, ax.yaxis.label] +
         ax.get_xticklabels() + ax.get_yticklabels()):
            item.set_fontsize(8)
In [31]:
plot_bars(df1=df_comments, df2=df_post)

Visualize each National Park separately.

In [32]:
national_parks = df['topic_group'].unique()
print(len(national_parks))
46

Get a summary of total distinct users per National Park, for sorting:

In [33]:
def replace_datetime_with_month(df: pd.DataFrame):
    """Extract month from datetime index, set as new composite index
    together with topic_group"""
    df.set_index([df.index.month, "topic_group"], inplace=True)
    df.index.rename(("month", "topic_group"), inplace=True)
In [34]:
df_post = read_csv_datetime(REDDIT_ALL_SUBMISSIONS)
df_comments = read_csv_datetime(REDDIT_ALL_COMMENTS)
df = pd.concat([df_post, df_comments])
replace_datetime_with_month(df)
cardinality_series = tools.union_hll_series(
    hll_series=df["user_hll"],
    db_conn=db_conn, multiindex=True)
df = cardinality_series.to_frame()
df.rename(columns={"hll_cardinality":"user_count"}, inplace=True)
national_parks_sorted = df.unstack(level=1).fillna(0).droplevel(0, axis=1).sum().sort_values(ascending=False)

Preview ranking order

In [35]:
national_parks_sorted.pow(1./2).plot.bar(figsize=(10, 3))
Out[35]:
<Axes: xlabel='topic_group'>

We can limit our analysis to the top 50%, as there is not enough data available below this threshold.

In [36]:
top_50_cnt = int(len(national_parks_sorted)/2)
top_50 = list(national_parks_sorted.keys())[:top_50_cnt]

Create bar plots individually for the top 50% of National Park subreddits in our dataset:

In [37]:
df_comments = read_csv_datetime(REDDIT_ALL_COMMENTS)
df_post = read_csv_datetime(REDDIT_ALL_SUBMISSIONS)
# create figure object with multiple subplots
fig, axes = plt.subplots(nrows=int(top_50_cnt/4), ncols=4, figsize=(12, 11))
fig.subplots_adjust(hspace=.5) # adjust vertical space, to allow title below plot
# iterate nationalparks
for ix, ax in enumerate(axes.reshape(-1)):
    np_str = top_50[ix]
    # filter np_str and calculate cardinality
    df_comments_filter = df_comments[df_comments["topic_group"]==np_str].copy()
    df_comments_filter = append_cardinality_df(df_comments_filter, 'post_hll', 'commentscount_est')
    df_comments_filter = filter_fill_time(df_comments_filter, 2010, 2023, 'commentscount_est')
    df_post_filter = df_post[df_post["topic_group"]==np_str].copy()
    df_post_filter = append_cardinality_df(df_post_filter, 'post_hll', 'postcount_est')
    df_post_filter = filter_fill_time(df_post_filter, 2010, 2023, 'postcount_est')
    # plot bars individually
    plot_bars(
        df1=df_comments_filter,
        df2=df_post_filter, title=np_str, ax=ax)
In [38]:
tools.save_fig(fig, output=OUTPUT, name="barplot_nationalparks")

Visualize using joyplot

Import Joypy. Install to worker_env, if not available.

In [39]:
avail = True
try:
    import joypy
except ImportError:
    avail = False
In [40]:
!if [ "$avail" = False ] ; then /opt/conda/envs/worker_env/bin/python -m pip install joypy >&- 2>&-; fi
import joypy

First, merge comment and post counts into one column, per National Park and month.

In [41]:
df_comments = read_csv_datetime(REDDIT_ALL_COMMENTS)
df_post = read_csv_datetime(REDDIT_ALL_SUBMISSIONS)
In [42]:
df_comments.head()
Out[42]:
post_hll user_hll referenced_post_hll topic_group
datetime
2011-02-01 \x138b4005222b8151c15a226ce379a187038f8297c1b1... \x138b40182148617501c303dee1 \x138b40014247c185618c83efc2 yosemite
2011-03-01 \x138b40cce2 \x138b405e44 \x138b409481 yosemite
2011-04-01 \x138b4006218b618ee1b641f061f201f241 \x138b4076a197a1c303 \x138b4020c17be294a3 yosemite
2011-05-01 \x138b401c812b822d429fc5aa82ca61 \x138b401ac25e44dee1 \x138b4012c19cc49e41 yosemite
2011-06-01 \x138b402ee1 \x138b4036c3 \x138b401181 hotsprings

Data cleanup: some topic_group references start with u_ (user profiles rather than subreddits). Drop these.

In [43]:
df_comments = df_comments[~df_comments.topic_group.str.startswith('u_')]

Update Index to include topic_group in composite index

In [44]:
for df in [df_post, df_comments]:
    replace_datetime_with_month(df)

Union HLL sets (user_hll)

Concat two pd.Series

In [45]:
df = pd.concat([df_post["user_hll"], df_comments["user_hll"]])

Now we have a series with duplicate indices. These need to be unioned with HLL, which also removes duplicate users that appear in both comments and posts for a given month/National Park.
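tools.union_hll_series performs this grouped union server-side in a single query. For illustration, a slower client-side equivalent, assuming the union_hll/cardinality_hll signatures demonstrated in the test above:

# union duplicate (month, topic_group) index pairs one group at a time
cardinality_check = df.groupby(level=["month", "topic_group"]).apply(
    lambda hlls: cardinality_hll(union_hll(list(hlls), CUR_HLL), CUR_HLL))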

In [46]:
%%time
cardinality_series = tools.union_hll_series(
    hll_series=df,
    db_conn=db_conn, multiindex=True)
CPU times: user 25.6 ms, sys: 6.41 ms, total: 32 ms
Wall time: 59.7 ms
In [47]:
df = cardinality_series.to_frame()
df.rename(columns={"hll_cardinality":"user_count"}, inplace=True)
In [48]:
df.head()
Out[48]:
user_count
month topic_group
1 acadianationalpark 19
americansamoa 23
arches 4
archesnationalpark 5
bigbend 3

We can also use unstack to turn the multiindex into a matrix

In [49]:
(df.unstack(level=0)  # unstack level 0 -> pivot index labels to columns
    .fillna(0)  # fill NA-values with 0
    .astype(int)  # float to integer
    .droplevel(0, axis=1)  # remove "user_count" multi-level x-column, after unstack
    .style.background_gradient(cmap='Blues', axis=1)) # colorize
Out[49]:
month 1 2 3 4 5 6 7 8 9 10 11 12
topic_group                        
acadianationalpark 19 40 36 49 52 77 93 86 93 97 23 15
americansamoa 23 27 23 23 15 9 12 14 17 14 22 20
arches 4 6 6 6 5 6 4 9 7 8 9 4
archesnationalpark 5 6 9 11 13 7 8 10 8 4 6 3
bigbend 3 2 5 3 3 3 1 0 4 7 2 4
bigbendnationalpark 15 8 6 10 5 2 3 1 1 5 4 6
bigbendtx 233 192 276 170 124 98 92 103 126 153 232 210
brycecanyon 14 13 13 15 14 14 13 13 10 16 8 8
canyonlands 5 2 9 8 7 4 4 5 4 5 5 6
capitolreefnp 6 3 7 8 5 2 7 4 7 6 2 6
carlsbadcavernsnp 1 1 5 8 5 3 1 2 0 0 3 2
craterlake 9 7 10 13 19 18 25 36 18 19 6 6
cuyahogafalls 9 10 9 9 8 9 12 9 10 12 11 8
deathvalley 22 15 28 18 10 9 15 9 11 13 10 14
deathvalleynp 88 90 125 58 36 23 37 46 43 56 65 87
everglades 40 44 48 41 22 20 16 18 23 24 25 33
glacier 46 56 69 82 103 186 139 149 74 32 29 34
glaciernationalpark 131 177 228 155 191 398 521 305 243 108 70 213
grandcanyon 271 272 346 305 264 271 209 224 247 269 258 261
grandcanyonhiking 0 1 1 0 0 0 1 0 0 0 0 2
grandteton 1 0 0 0 0 1 0 0 2 2 3 1
grandtetonnatlpark 68 69 63 92 88 78 128 144 88 56 34 33
greatbasinstories 0 0 0 5 1 3 1 0 0 0 1 1
gsmnp 43 55 85 75 92 72 72 82 58 64 55 30
hotsprings 54 59 77 83 75 72 77 65 57 55 57 56
isleroyale 38 32 52 43 46 62 67 74 39 26 17 24
joshuatree 390 417 479 397 244 203 197 230 242 272 349 346
kenaipeninsula 6 8 7 15 9 10 6 11 10 13 6 5
lassenvolcanic 0 3 2 1 0 1 0 1 2 0 3 1
mammothcave 1 2 6 5 6 2 10 1 2 4 3 0
mount_rainier 9 5 18 28 12 26 35 30 21 13 12 15
newrivergorgenp 5 1 2 2 2 2 3 6 2 6 0 1
northcascades 5 11 13 15 9 10 28 21 15 6 2 3
olympicnationalpark 37 44 69 80 63 72 92 81 70 50 39 43
pinnaclesnp 10 3 6 5 8 4 3 1 3 4 4 3
redwoodnationalpark 6 1 1 1 1 6 4 5 2 3 2 0
rockymountain 39 40 41 56 49 134 119 99 82 43 23 26
sequoia 17 18 22 18 9 20 28 32 16 17 15 8
shenandoah 12 11 8 11 5 5 9 15 19 35 9 6
shenandoahpark 2 4 7 8 6 5 13 8 7 10 8 2
virginislands 92 117 123 119 128 114 112 103 114 64 98 105
yosemite 778 961 1131 1164 1249 1550 1456 1267 971 849 750 645
zionnationalpark 88 82 180 188 147 131 67 108 118 107 38 57
zionnp 29 23 47 46 54 68 76 59 100 110 47 37

Flatten, clean up, and apply the fourth root, to compress the maximum values of the y-range.

In [50]:
df_plot = df.unstack(level=1).fillna(0).droplevel(0, axis=1).pow(1./4)

Create a sorted dictionary for explicit plotting order in joyplot. Note that as of Python 3.7, dictionaries are guaranteed to preserve insertion order.

In [51]:
d = {}
for np_ref in top_50:
    d[np_ref] = df_plot[np_ref]

Visualize

In [52]:
month_num = range(1, 13)
tick_pos = range(0, 12)
fig, ax = joypy.joyplot(
    d, kind="values", x_range=[0, 11], figsize=(4, 7),
    colormap=matplotlib.cm.autumn_r, fade=True, overlap=2, grid="y", legend=True)
ax[-1].set_xticks(tick_pos, labels=month_num)
plt.show()

Before analyzing chi, we run one last visualization test by calculating how individual parks' monthly patterns diverge from the norm (the average monthly pattern across all parks).

Calculate total averages and the divergence from average per park/month

In [53]:
df_plot = df.unstack(level=1).fillna(0).droplevel(0, axis=1)
In [54]:
averages = df_plot.mean()

Create plotting order/selection based on absolute user counts of peak month:

In [55]:
order_np: Dict[str, int] = {}
for park in top_50:
    peak_value = max(df_plot[park])
    # positional index is 0-based, while the month index of df is 1-based
    peak_month = list(df_plot[park]).index(peak_value) + 1
    order_np[park] = df.loc[peak_month, park].user_count
In [56]:
top_50_plot = list(pd.Series(order_np).sort_values(ascending=False).keys())

Apply a number of modifications, to optimize visual legibility:

  • visualize divergence from the mean, not absolute values
  • stretch these values to a 1-100 range
  • reduce the height of the 1-100 range, beginning with the less visited parks (prevents overlap)
  • order plots by the absolute user counts in each park's peak month
In [57]:
a = 1    # min of target range
b = 100  # max of target range
for ix, park in enumerate(top_50_plot):
    perc = ix/(len(top_50)/100)
    b_mod = abs(perc-b)
    x = df_plot[park]
    df_plot[park] = a + (x - x.min()) * (b_mod - a) / (x.max() - x.min())
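This is standard min-max scaling, x' = a + (x - x_min) · (b_mod - a) / (x_max - x_min), with the upper bound b_mod shrinking for each successive (less frequented) park. A quick check with toy values (hypothetical):

import pandas as pd

x = pd.Series([0, 50, 100])
a, b_mod = 1, 100
print(a + (x - x.min()) * (b_mod - a) / (x.max() - x.min()))
# 1.0, 50.5, 100.0 -> min maps to a, max maps to b_mod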

Create dictionary for plotting order:

In [58]:
d = {}
for np_ref in top_50_plot:
    d[np_ref] = df_plot[np_ref]

For annotating peaks, to avoid collisions, we manually define text offsets ahead of time.

In [59]:
xy_txt = {
    'yosemite':(20, 10),
    'joshuatree':(-70, 2),
    'grandcanyon':(5, 2),
    'glaciernationalpark':(5, 2),
    'bigbendtx':(-70, 2),
    'zionnationalpark':(2, 5),
    'virginislands':(-70, -20),
    'glacier':(5, 2),
    'grandtetonnatlpark':(5, 5),
    'hotsprings':(-70, -20),
    'gsmnp':(15, 2),
    'deathvalleynp':(5, 2),
    'rockymountain':(-80, -20),
    'olympicnationalpark':(5, 5),
    'zionnp':(5, 2),
    'acadianationalpark':(-30, -30),
    'isleroyale':(10, -10),
    'everglades':(-50, 2),
    'mount_rainier':(-50, 10),
    'sequoia':(5, 10),
    'craterlake':(5, 10),
    'americansamoa':(5, 10),
    'deathvalley':(5, 2),
}

Create plot

In [60]:
from adjustText import adjust_text
month_num = range(1, 13)
tick_pos = range(0, 12)
fig, ax = joypy.joyplot(
    d, kind="values", x_range=[0, 11], figsize=(8, 6),
    colormap=matplotlib.cm.autumn_r, fade=True, overlap=2, grid="y", legend=True)
ax[-1].set_xticks(tick_pos, labels=month_num)
for ix, park in enumerate(top_50_plot):
    peak_value = max(df_plot[park])
    peak_idx = list(df_plot[park]).index(peak_value)  # 0-based x position
    peak_month = peak_idx + 1  # 1-based month for index lookups
    ax[ix].plot(peak_idx, peak_value, marker='o', markerfacecolor='black', markeredgecolor="white", zorder=1000, markersize=5)
    xy_pos = xy_txt.get(park)
    ax[ix].annotate(
        f"r/{park}\n{calendar.month_name[peak_month][:3]}, {df.loc[peak_month, park].user_count} users",
        xy=(peak_idx, peak_value), xytext=xy_pos, zorder=1001,
        fontsize='medium', textcoords='offset points',
        path_effects=[pe.withStroke(linewidth=2, foreground="white")],
        arrowprops=dict(
            arrowstyle="-",
            color='black', lw=0.5, alpha=0.8, mutation_scale=4, 
            connectionstyle='arc3,rad=-0.3')
    )
fig.show()

Save as SVG and PNG

In [61]:
tools.save_fig(fig, output=OUTPUT, name="joyplot_nationalparks")

Chi per month

We want to compare temporal patterns for different parks. For instance, some parks may be particularly interesting in specific seasons (e.g. Yosemite in spring, when its waterfalls are most impressive; or Shenandoah National Park in autumn, for its colorful foliage).

Below, chi per month is used to compare a single National Park's visitation to the average visitation rate for that month across all parks.
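The chi computation itself is still open (see the TODO below). One common formulation is the signed chi value, (observed - expected) / sqrt(expected), with the expected value derived from the all-park monthly distribution. A minimal sketch of what this could look like here; this is an assumption, not the notebook's final method:

import numpy as np
import pandas as pd

def chi_month(observed: pd.Series, all_parks: pd.DataFrame) -> pd.Series:
    """Signed chi per month: positive where a park is over-represented
    relative to the all-park monthly distribution, negative otherwise."""
    # share of each month in the all-park totals
    month_share = all_parks.sum(axis=1) / all_parks.sum().sum()
    expected = observed.sum() * month_share
    return (observed - expected) / np.sqrt(expected)

# e.g.: chi_month(df_plot["yosemite"], df_plot), applied to the
# unscaled user-count matrix (months as rows, parks as columns)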

In [62]:
#TODO

Create notebook HTML

In [62]:
!jupyter nbconvert --to html_toc \
    --output-dir=../resources/html/ ./02_reddit_nationalparks.ipynb \
    --template=../nbconvert.tpl \
    --ExtractOutputPreprocessor.enabled=False >&- 2>&-