Alexander Dunkel, Institute of Cartography, TU Dresden
Visualization of temporal patterns for submissions and comments from Nationalpark-Subreddits.
import os, sys
from pathlib import Path
import psycopg2
import geopandas as gp
import pandas as pd
import seaborn as sns
import calendar
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as mticker
import matplotlib.patheffects as pe
from typing import List, Tuple, Dict, Optional
from IPython.display import clear_output, display, HTML
from datetime import datetime
module_path = str(Path.cwd().parents[0] / "py")
if module_path not in sys.path:
sys.path.append(module_path)
from modules.base import tools, hll
from modules.base.hll import union_hll, cardinality_hll
%load_ext autoreload
%autoreload 2
OUTPUT = Path.cwd().parents[0] / "out" # output directory for figures (etc.)
WORK_DIR = Path.cwd().parents[0] / "tmp" # Working directory
(OUTPUT / "figures").mkdir(exist_ok=True)
(OUTPUT / "svg").mkdir(exist_ok=True)
WORK_DIR.mkdir(exist_ok=True)
Plot styling
plt.style.use('default')
Set global font
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = ['Times New Roman'] + plt.rcParams['font.serif']
Load the data from CSV, generated in the previous notebook. Data is stored as aggregate HLL data (postcount, usercount) for each month.
REDDIT_ALL_SUBMISSIONS = OUTPUT / "reddit_all_months.csv"
REDDIT_ALL_COMMENTS = OUTPUT / "reddit_comments_all_months.csv"
%%time
data_files = {
"REDDIT_ALL_SUBMISSIONS":REDDIT_ALL_SUBMISSIONS,
"REDDIT_ALL_COMMENTS":REDDIT_ALL_COMMENTS,
}
tools.display_file_stats(data_files)
df = pd.read_csv(REDDIT_ALL_SUBMISSIONS)
df.head(10)
Get distinct subreddits:
df['topic_group'].unique()
DB_USER = "hlluser"
DB_PASS = os.getenv('READONLY_USER_PASSWORD')
# set connection variables
DB_HOST = "hllworkerdb"
DB_PORT = "5432"
DB_NAME = "hllworkerdb"
Connect to empty Postgres database running HLL Extension:
DB_CONN = psycopg2.connect(
host=DB_HOST,
port=DB_PORT ,
dbname=DB_NAME,
user=DB_USER,
password=DB_PASS
)
DB_CONN.set_session(
readonly=True)
DB_CALC = tools.DbConn(
DB_CONN)
CUR_HLL = DB_CONN.cursor()
db_conn = tools.DbConn(DB_CONN)
test
results = union_hll([df["post_hll"][0], df["post_hll"][1]], CUR_HLL)
results
cardinality_hll(results, CUR_HLL)
Define additional functions for reading and formatting CSV as pd.DataFrame
TIMESTAMP_FORMAT = '%Y %m'
def read_csv_datetime(csv: Path, timestamp_format: str = TIMESTAMP_FORMAT) -> pd.DataFrame:
"""Read CSV with parsing datetime index (months)
First CSV column: Year
Second CSV column: Month
"""
date_cols = ["year", "month"]
df = pd.read_csv(
csv, index_col='datetime',
parse_dates={'datetime':date_cols},
date_format=timestamp_format,
keep_date_col='False')
df.drop(columns=date_cols, inplace=True)
return df
def append_cardinality_df(df: pd.DataFrame, hll_col: str = "post_hll", cardinality_col: str = 'postcount_est'):
"""Calculate cardinality from HLL and append to extra column in df"""
df[cardinality_col] = df.apply(
lambda x: cardinality_hll(
x[hll_col], CUR_HLL),
axis=1)
df.drop(columns=[hll_col], inplace=True)
return df
def filter_fill_time(
df: pd.DataFrame, min_year: int,
max_year: int, val_col: str = "postcount_est",
min_month: int = 1, max_month: int = 1):
"""Filter time values between min - max year and fill missing values"""
min_date = pd.Timestamp(f'{min_year}-{min_month}-01')
max_date = pd.Timestamp(f'{max_year}-{max_month}-01')
# clip by start and end date
if not min_date in df.index:
df.loc[min_date, val_col] = 0
if not max_date in df.index:
df.loc[max_date, val_col] = 0
df.sort_index(inplace=True)
# mask min and max time
time_mask = ((df.index >= min_date) & (df.index <= max_date))
# fill missing months with 0
# this will also set the day to max of month
series = df.loc[time_mask][val_col].resample('M').sum().fillna(0)
return series.to_frame()
Apply functions to all data sets.
%%time
df_post = read_csv_datetime(REDDIT_ALL_SUBMISSIONS)
df_post = append_cardinality_df(df_post, 'post_hll', 'postcount_est')
df_post = filter_fill_time(df_post, 2010, 2023, 'postcount_est')
df_post.head(5)
df_post.plot()
Repeat for comments:
df = pd.read_csv(REDDIT_ALL_COMMENTS)
df.tail()
%%time
df_comments = read_csv_datetime(REDDIT_ALL_COMMENTS)
df_comments.head()
df_comments = append_cardinality_df(df_comments, 'post_hll', 'commentscount_est')
df_comments = filter_fill_time(df_comments, 2010, 2023, 'commentscount_est')
def plot_lines(
df_list: List[pd.DataFrame], ylegend: str = "Post count",
xlegend: str = "Year", title: Optional[str] = None):
"""Plot lines from a list of DataFrames"""
fig, ax = plt.subplots()
fig.set_size_inches(15.7, 4.27)
ylabel = f'{ylegend} (estimate)'
for df in df_list:
ax = df.plot(ax=ax)
tick_loc = mticker.MultipleLocator(12)
# x axis ticker formatting
ax.xaxis.set_major_locator(tick_loc)
ax.yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))
ax.tick_params(axis='x', rotation=45, color='grey')
ax.set(xlabel=xlegend, ylabel=ylegend)
ax.spines["left"].set_linewidth(0.25)
ax.spines["bottom"].set_linewidth(0.25)
ax.spines["top"].set_linewidth(0)
ax.spines["right"].set_linewidth(0)
ax.yaxis.set_tick_params(width=0.5)
# add legend
h, l = ax.get_legend_handles_labels()
ax.legend(h, l, frameon=False, loc='best')
if title:
ax.set_title(title)
plot_lines([df_post, df_comments])
df_comments
The growth in data contribution on Reddit distorts the curve and limits our ability to view temporal characteristic and repeating patterns. Below, we group data by month, to visualize average and repeating seasonal trends.
def plot_bars(
df1: pd.DataFrame, df2: pd.DataFrame, ax: matplotlib.axes = None, title: str = None):
"""Plot bars from two DataFrames"""
colors = sns.color_palette("vlag", as_cmap=True, n_colors=2)
bar_param = {
"width":1.0,
"label":"Reddit total submission count aggregated for months",
"edgecolor":"white",
"linewidth":0.5,
"alpha":0.7
}
# create figure
if not ax:
fig, ax = plt.subplots(1, 1, figsize=(3, 1.5))
# plot
df1.groupby(df1.index.month)["commentscount_est"] \
.mean().plot.bar(ax=ax, color=colors([1.0]), y="commentscount_est", **bar_param)
df2.groupby(df2.index.month)["postcount_est"] \
.mean().plot.bar(ax=ax, color=colors([0.0]), y="postcount_est", **bar_param)
# format
ax.set_xlim(-0.5,11.5)
month_names = ['Jan','Feb','Mar','Apr','May','Jun',
'Jul','Aug','Sep','Oct','Nov','Dec']
ax.set_xticklabels(month_names)
ax.tick_params(axis='x', rotation=45, length=0) # length: of ticks
ax.spines["left"].set_linewidth(0.25)
ax.spines["bottom"].set_linewidth(0.25)
ax.spines["top"].set_linewidth(0)
ax.spines["right"].set_linewidth(0)
ax.yaxis.set_tick_params(width=0.5)
ax.set(xlabel="", ylabel="")
if not title:
title = "Post Count per Month (mean)"
ax.set_title(title, y=-0.2, pad=-14)
for item in (
[ax.xaxis.label, ax.title, ax.yaxis.label] +
ax.get_xticklabels() + ax.get_yticklabels()):
item.set_fontsize(8)
plot_bars(df1=df_comments, df2=df_post)
Visualize for all Nationalparks separately.
national_parks = df['topic_group'].unique()
print(len(national_parks))
Get summary of total posts per Nationalpark, for sorting:
def replace_datetime_with_month(df: pd.DataFrame):
"""Extract month from datetime index, set as new composite index
together with topic_group"""
df.set_index([df.index.month, "topic_group"], inplace=True)
df.index.rename(("month", "topic_group"), inplace=True)
df_post = read_csv_datetime(REDDIT_ALL_SUBMISSIONS)
df_comments = read_csv_datetime(REDDIT_ALL_COMMENTS)
df = pd.concat([df_post, df_comments])
replace_datetime_with_month(df)
cardinality_series = tools.union_hll_series(
hll_series=df["user_hll"],
db_conn=db_conn, multiindex=True)
df = cardinality_series.to_frame()
df.rename(columns={"hll_cardinality":"user_count"}, inplace=True)
national_parks_sorted = df.unstack(level=1).fillna(0).droplevel(0, axis=1).sum().sort_values(ascending=False)
Preview ranking order
national_parks_sorted.pow(1./2).plot.bar(figsize=(10, 3))
We can limit our analysis to the top 50%, as below there is not enough data available.
top_50_cnt = int(len(national_parks_sorted)/2)
top_50 = list(national_parks_sorted.keys())[:top_50_cnt]
Create bar plots individually for the top 50% Nationalparks subreddits in our dataset
df_comments = read_csv_datetime(REDDIT_ALL_COMMENTS)
df_post = read_csv_datetime(REDDIT_ALL_SUBMISSIONS)
# create figure object with multiple subplots
fig, axes = plt.subplots(nrows=int(top_50_cnt/4), ncols=4, figsize=(12, 11))
fig.subplots_adjust(hspace=.5) # adjust vertical space, to allow title below plot
# iterate nationalparks
for ix, ax in enumerate(axes.reshape(-1)):
np_str = top_50[ix]
# filter np_str and calculate cardinality
df_comments_filter = df_comments[df_comments["topic_group"]==np_str].copy()
df_comments_filter = append_cardinality_df(df_comments_filter, 'post_hll', 'commentscount_est')
df_comments_filter = filter_fill_time(df_comments_filter, 2010, 2023, 'commentscount_est')
df_post_filter = df_post[df_post["topic_group"]==np_str].copy()
df_post_filter = append_cardinality_df(df_post_filter, 'post_hll', 'postcount_est')
df_post_filter = filter_fill_time(df_post_filter, 2010, 2023, 'postcount_est')
# plot bars individually
plot_bars(
df1=df_comments_filter,
df2=df_post_filter, title=np_str, ax=ax)
tools.save_fig(fig, output=OUTPUT, name="barplot_nationalparks")
Import Joypy. Install to worker_env
, if not available.
avail = True
try:
import joypy
except:
avail = False
!if [ "$avail" = False ] ; then /opt/conda/envs/worker_env/bin/python -m pip install joypy >&- 2>&-; fi
import joypy
First, merge comments+post count into one column, per Nationalpark/month.
df_comments = read_csv_datetime(REDDIT_ALL_COMMENTS)
df_post = read_csv_datetime(REDDIT_ALL_SUBMISSIONS)
df_comments.head()
Data cleanup: There were some topic_group references included that start with u_
(user-topic_groups). Drop these.
df_comments = df_comments[~df_comments.topic_group.str.startswith('u_')]
Update Index to include topic_group
in composite index
for df in [df_post, df_comments]:
replace_datetime_with_month(df)
Union HLL sets (user_hll)
Concat two pd.Series
df = pd.concat([df_post["user_hll"], df_comments["user_hll"]])
Now we have a series with duplicate indexes. These need to be unioned with HLL. This will also remove duplicate users appearing in both comments and posts for a given month/National Park.
%%time
cardinality_series = tools.union_hll_series(
hll_series=df,
db_conn=db_conn, multiindex=True)
df = cardinality_series.to_frame()
df.rename(columns={"hll_cardinality":"user_count"}, inplace=True)
df.head()
We can also use unstack
to turn the multiindex into a matrix
(df.unstack(level=0) # unstack level 0 -> pivot index labels to columns
.fillna(0) # fill NA-values with 0
.astype(int) # float to integer
.droplevel(0, axis=1) # remove "user_count" multi-level x-column, after unstack
.style.background_gradient(cmap='Blues', axis=1)) # colorize
Flatten, clean-up and use the square-root, to compress y-range max values.
df_plot = df.unstack(level=1).fillna(0).droplevel(0, axis=1).pow(1./4)
Create a sorted dictionary for explicit plotting order in joyplot. Note that as of Python 3.7, dictionaries are guarnteed to be ordered.
d = {}
for np_ref in top_50:
d[np_ref] = df_plot[np_ref]
visualize
month_num = range(1, 13)
tick_pos = range(0, 12)
fig, ax = joypy.joyplot(
d, kind="values", x_range=[0, 11], figsize=(4, 7),
colormap=matplotlib.cm.autumn_r, fade=True, overlap=2, grid="y", legend=True)
ax[-1].set_xticks(tick_pos, labels=month_num)
plt.show()
Before analyzing chi
, we do a last visualization test by calculating how individual parks' monthly patterns diverge from the norm (the average monthly patterns for all parks).
Calculate total averages and "diverge from average" per park/month
df_plot = df.unstack(level=1).fillna(0).droplevel(0, axis=1)
averages = df_plot.mean()
Create plotting order/selection based on absolute user counts of peak month:
order_np: Dict[str, int] = {}
for ix, park in enumerate(top_50):
peak_value = max(df_plot[park])
peak_month = list(df_plot[park]).index(peak_value)
order_np[park] = df.loc[peak_month, park].user_count
top_50_plot = list(pd.Series(order_np).sort_values(ascending=False).keys())
Apply a number of modifications, to optimize visual legibility:
a = 1 # min
b = 100 # max'
for ix, park in enumerate(top_50_plot):
perc = ix/(len(top_50)/100)
b_mod = abs(perc-b)
x = df_plot[park]
df_plot[park] = a + (x - x.min()) * (b_mod - a) / (x.max() - x.min())
Create dictionary for plotting order:
d = {}
for np_ref in top_50_plot:
d[np_ref] = df_plot[np_ref]
For annotating peaks, to avoid collision, we manually define text offset ahead of time.
xy_txt = {
'yosemite':(20, 10),
'joshuatree':(-70, 2),
'grandcanyon':(5, 2),
'glaciernationalpark':(5, 2),
'bigbendtx':(-70, 2),
'zionnationalpark':(2, 5),
'virginislands':(-70, -20),
'glacier':(5, 2),
'grandtetonnatlpark':(5, 5),
'hotsprings':(-70, -20),
'gsmnp':(15, 2),
'deathvalleynp':(5, 2),
'rockymountain':(-80, -20),
'olympicnationalpark':(5, 5),
'zionnp':(5, 2),
'acadianationalpark':(-30, -30),
'isleroyale':(10, -10),
'everglades':(-50, 2),
'mount_rainier':(-50, 10),
'sequoia':(5, 10),
'craterlake':(5, 10),
'americansamoa':(5, 10),
'deathvalley':(5, 2),
}
Create plot
from adjustText import adjust_text
month_num = range(1, 13)
tick_pos = range(0, 12)
fig, ax = joypy.joyplot(
d, kind="values", x_range=[0, 11], figsize=(8, 6),
colormap=matplotlib.cm.autumn_r, fade=True, overlap=2, grid="y", legend=True)
ax[-1].set_xticks(tick_pos, labels=month_num)
for ix, park in enumerate(top_50_plot):
peak_value = max(df_plot[park])
peak_month = list(df_plot[park]).index(peak_value)
ax[ix].plot(peak_month, peak_value, marker='o', markerfacecolor='black', markeredgecolor="white", zorder=1000, markersize=5)
xy_pos = xy_txt.get(park)
ax[ix].annotate(
f"r/{park}\n{calendar.month_name[peak_month][:3]}, {df.loc[peak_month, park].user_count} user",
xy=(peak_month, peak_value), xytext=xy_pos, zorder=1001,
fontsize='medium', textcoords='offset points',
path_effects=[pe.withStroke(linewidth=2, foreground="white")],
arrowprops=dict(
arrowstyle="-",
color='black', lw=0.5, alpha=0.8, mutation_scale=4,
connectionstyle=f'arc3,rad=-0.3')
)
fig.show()
save as svg and png
tools.save_fig(fig, output=OUTPUT, name="joyplot_nationalparks")
We want to compare temporal patterns for different parks. For instance, some parks may be particularly interesting during Spring (e.g. Yosemite, for Waterfalls being most impressive; or Shanandoah National Park, for colorfull autumn foliage).
Below, use chi per month is used to compare a single National Park's visitation to the average visitation rate for that month for all Parks.
#TODO
!jupyter nbconvert --to html_toc \
--output-dir=../resources/html/ ./02_reddit_nationalparks.ipynb \
--template=../nbconvert.tpl \
--ExtractOutputPreprocessor.enabled=False >&- 2>&-