Reddit Nationalpark Data Exploration

Alexander Dunkel, Institute of Cartography, TU Dresden

Out[1]:

Last updated: Aug-19-2023, Carto-Lab Docker Version 0.14.0

Visualization of temporal patterns for submissions and comments from National Park subreddits.

Preparations

In [2]:
import os, sys
from pathlib import Path
import psycopg2
import geopandas as gp
import pandas as pd
import seaborn as sns
import calendar
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as mticker
import matplotlib.patheffects as pe
from typing import List, Tuple, Dict, Optional
from IPython.display import clear_output, display, HTML
from datetime import datetime
In [3]:
module_path = str(Path.cwd().parents[0] / "py")
if module_path not in sys.path:
    sys.path.append(module_path)
from modules.base import tools, hll
from modules.base.hll import union_hll, cardinality_hll
In [4]:
%load_ext autoreload
%autoreload 2
In [5]:
OUTPUT = Path.cwd().parents[0] / "out"       # output directory for figures (etc.)
WORK_DIR = Path.cwd().parents[0] / "tmp"     # Working directory
In [6]:
(OUTPUT / "figures").mkdir(exist_ok=True)
(OUTPUT / "svg").mkdir(exist_ok=True)
WORK_DIR.mkdir(exist_ok=True)

Plot styling

In [7]:
plt.style.use('default')

Set global font

In [8]:
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = ['Times New Roman'] + plt.rcParams['font.serif']

Load HLL aggregate data

Load the data from CSV, generated in the previous notebook. Data is stored as aggregate HLL data (postcount, usercount) for each month.

In [9]:
REDDIT_ALL_SUBMISSIONS = OUTPUT / "reddit_all_months.csv"
REDDIT_ALL_COMMENTS = OUTPUT / "reddit_comments_all_months.csv"
In [10]:
%%time
data_files = {
    "REDDIT_ALL_SUBMISSIONS":REDDIT_ALL_SUBMISSIONS,
    "REDDIT_ALL_COMMENTS":REDDIT_ALL_COMMENTS,
    }
tools.display_file_stats(data_files)
name     REDDIT_ALL_SUBMISSIONS   REDDIT_ALL_COMMENTS
size     428.49 KB                1.16 MB
records  2,448                    2,221
CPU times: user 55 ms, sys: 5.33 ms, total: 60.3 ms
Wall time: 59.5 ms
In [11]:
df = pd.read_csv(REDDIT_ALL_SUBMISSIONS)
In [12]:
df.head(10)
Out[12]:
year month post_hll user_hll topic_group
0 2010 10 \x138b40c8a2 \x138b40c303 everglades
1 2011 2 \x138b40fd82 \x138b40c303 everglades
2 2011 2 \x138b40014238a147c16a8185618a648c83d3c1efc2 \x138b4018214861750197a1c303dee1e021 yosemite
3 2011 3 \x138b409481 \x138b405e44 yosemite
4 2011 4 \x138b4020c177c27be294a3 \x138b4076a197a1c303 yosemite
5 2011 5 \x138b4090e5c1e4 \x138b404961c303 grandcanyon
6 2011 5 \x138b4011815c826621a341bf81d505e1a2fc43 \x138b4036c3c303 hotsprings
7 2011 5 \x138b400d4212c19cc49e41ae64af41 \x138b4003e21ac25e44c303dee1 yosemite
8 2011 6 \x138b4004c316e42782286131013722488281e4b084c0... \x138b4008a10f835e448062c303dca1e1a2ffa2 yosemite
9 2011 7 \x138b403081 \x138b405543 hotsprings

Get distinct subreddits:

In [13]:
df['topic_group'].unique()
Out[13]:
array(['everglades', 'yosemite', 'grandcanyon', 'hotsprings',
       'deathvalley', 'joshuatree', 'glaciernationalpark',
       'virginislands', 'bigbendtx', 'glacier', 'shenandoah',
       'deathvalleynp', 'americansamoa', 'bigbend', 'cuyahogafalls',
       'arches', 'craterlake', 'grandtetonnatlpark', 'northcascades',
       'isleroyale', 'gsmnp', 'pinnaclesnp', 'mount_rainier',
       'acadianationalpark', 'sequoia', 'kenaipeninsula', 'canyonlands',
       'brycecanyon', 'redwoodnationalpark', 'archesnationalpark',
       'bigbendnationalpark', 'zionnationalpark', 'capitolreefnp',
       'olympicnationalpark', 'lassenvolcanic', 'zionnp',
       'carlsbadcavernsnp', 'grandteton', 'rockymountain',
       'greatbasinstories', 'newrivergorgenp', 'shenandoahpark',
       'mammothcave', 'grandcanyonhiking'], dtype=object)

Connect to HLL worker database

In [14]:
DB_USER = "hlluser"
DB_PASS = os.getenv('READONLY_USER_PASSWORD')
# set connection variables
DB_HOST = "hllworkerdb"
DB_PORT = "5432"
DB_NAME = "hllworkerdb"

Connect to empty Postgres database running HLL Extension:

In [15]:
DB_CONN = psycopg2.connect(
        host=DB_HOST,
        port=DB_PORT ,
        dbname=DB_NAME,
        user=DB_USER,
        password=DB_PASS
)
DB_CONN.set_session(
    readonly=True)
DB_CALC = tools.DbConn(
    DB_CONN)
CUR_HLL = DB_CONN.cursor()
In [16]:
db_conn = tools.DbConn(DB_CONN)

Test

In [17]:
results = union_hll([df["post_hll"][0], df["post_hll"][1]], CUR_HLL)
results
Out[17]:
'\\x138b40c8a2fd82'
In [18]:
cardinality_hll(results, CUR_HLL)
Out[18]:
2
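Under the hood, union_hll and cardinality_hll delegate the actual HLL operations to the Postgres HLL extension. A minimal sketch of equivalent raw SQL, assuming the citus postgresql-hll functions hll_union() and hll_cardinality() and the two example sketches from the test above (the exact SQL wrapped by modules.base.hll may differ):

CUR_HLL.execute("""
    SELECT hll_cardinality(hll_union(
        '\\x138b40c8a2'::hll,
        '\\x138b40fd82'::hll));
    """)
print(CUR_HLL.fetchone()[0])  # 2.0 (cardinality estimates are floats)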

Calculate HLL Cardinality per month

Define additional functions for reading and formatting CSV as pd.DataFrame

In [19]:
TIMESTAMP_FORMAT = '%Y %m'

def read_csv_datetime(csv: Path, timestamp_format: str = TIMESTAMP_FORMAT) -> pd.DataFrame:
    """Read CSV with parsing datetime index (months)
    
        First CSV column: Year
        Second CSV column: Month
    """
    date_cols = ["year", "month"]
    df = pd.read_csv(
        csv, index_col='datetime', 
        parse_dates={'datetime':date_cols},
        date_format=timestamp_format,
        keep_date_col=True)  # keep year/month columns; dropped explicitly below
    df.drop(columns=date_cols, inplace=True)
    return df
    
def append_cardinality_df(df: pd.DataFrame, hll_col: str = "post_hll", cardinality_col: str = 'postcount_est'):
    """Calculate cardinality from HLL and append to extra column in df"""
    df[cardinality_col] = df.apply(
        lambda x: cardinality_hll(
           x[hll_col], CUR_HLL),
        axis=1)
    df.drop(columns=[hll_col], inplace=True)
    return df

def filter_fill_time(
        df: pd.DataFrame, min_year: int, 
        max_year: int, val_col: str = "postcount_est",
        min_month: int = 1, max_month: int = 1):
    """Filter time values between min - max year and fill missing values"""
    min_date = pd.Timestamp(f'{min_year}-{min_month}-01')
    max_date = pd.Timestamp(f'{max_year}-{max_month}-01')
    # clip by start and end date
    if min_date not in df.index:
        df.loc[min_date, val_col] = 0
    if max_date not in df.index:
        df.loc[max_date, val_col] = 0
    df.sort_index(inplace=True)
    # mask min and max time
    time_mask = ((df.index >= min_date) & (df.index <= max_date))
    # fill missing months with 0;
    # resampling also sets the day to the last day of each month
    series = df.loc[time_mask][val_col].resample('M').sum().fillna(0)
    return series.to_frame()

Apply the functions to all data sets:

  • read from CSV
  • calculate cardinality
  • merge year and month into a single column
  • filter to the 2010 - 2023 range and fill missing values
In [20]:
%%time
df_post = read_csv_datetime(REDDIT_ALL_SUBMISSIONS)
df_post = append_cardinality_df(df_post, 'post_hll', 'postcount_est')
df_post = filter_fill_time(df_post, 2010, 2023, 'postcount_est')
CPU times: user 64.2 ms, sys: 52.5 ms, total: 117 ms
Wall time: 252 ms
In [21]:
df_post.head(5)
Out[21]:
postcount_est
datetime
2010-01-31 0.0
2010-02-28 0.0
2010-03-31 0.0
2010-04-30 0.0
2010-05-31 0.0
In [22]:
df_post.plot()
Out[22]:
<Axes: xlabel='datetime'>

Repeat for comments:

In [23]:
df = pd.read_csv(REDDIT_ALL_COMMENTS)
df.tail()
Out[23]:
year month post_hll user_hll referenced_post_hll topic_group
2215 2023 4 \x138b4013231921210225e129c13f2340a14be174c47e... \x138b4001a269019c61d041 \x138b4013c116218a01a721b8c3 shenandoah
2216 2023 4 \x138b400c8310e113e31be21ea224e12dc12e8137a23c... \x138b40348143e145a1d841e802e841f5c2 \x138b40002325c1270138835361eb67f181 virginislands
2217 2023 4 \x148b4000802104200040238000080020042101020280... \x138b40016101c108210ac40fc2112115c1174119611b... \x138b40000100430141018204810561064108a209e10c... yosemite
2218 2023 4 \x138b40004101030142028203010362038105c6060106... \x138b4016c120e42162226122e2276228e5290329212d... \x138b40016101c5060107440d8113e117822461260326... zionnationalpark
2219 2023 4 \x138b401924 \x138b40d221 \x138b4014e2 zionnp
In [24]:
%%time
df_comments = read_csv_datetime(REDDIT_ALL_COMMENTS)
CPU times: user 16.8 ms, sys: 2.28 ms, total: 19.1 ms
Wall time: 18.1 ms
In [25]:
df_comments.head()
Out[25]:
post_hll user_hll referenced_post_hll topic_group
datetime
2011-02-01 \x138b4005222b8151c15a226ce379a187038f8297c1b1... \x138b40182148617501c303dee1 \x138b40014247c185618c83efc2 yosemite
2011-03-01 \x138b40cce2 \x138b405e44 \x138b409481 yosemite
2011-04-01 \x138b4006218b618ee1b641f061f201f241 \x138b4076a197a1c303 \x138b4020c17be294a3 yosemite
2011-05-01 \x138b401c812b822d429fc5aa82ca61 \x138b401ac25e44dee1 \x138b4012c19cc49e41 yosemite
2011-06-01 \x138b402ee1 \x138b4036c3 \x138b401181 hotsprings
In [26]:
df_comments = append_cardinality_df(df_comments, 'post_hll', 'commentscount_est')
df_comments = filter_fill_time(df_comments, 2010, 2023, 'commentscount_est')
In [27]:
def plot_lines(
        df_list: List[pd.DataFrame], ylegend: str = "Post count", 
        xlegend: str = "Year", title: Optional[str] = None):
    """Plot lines from a list of DataFrames"""

    fig, ax = plt.subplots()
    fig.set_size_inches(15.7, 4.27)
    ylabel = f'{ylegend} (estimate)'
    for df in df_list:
        ax = df.plot(ax=ax)
    tick_loc = mticker.MultipleLocator(12)
    # x axis ticker formatting
    ax.xaxis.set_major_locator(tick_loc)
    ax.yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))
    ax.tick_params(axis='x', rotation=45, color='grey')
    ax.set(xlabel=xlegend, ylabel=ylabel)  # use the "(estimate)" label computed above
    ax.spines["left"].set_linewidth(0.25)
    ax.spines["bottom"].set_linewidth(0.25)
    ax.spines["top"].set_linewidth(0)
    ax.spines["right"].set_linewidth(0)
    ax.yaxis.set_tick_params(width=0.5)
    # add legend
    h, l = ax.get_legend_handles_labels()
    ax.legend(h, l, frameon=False, loc='best')
    if title:
        ax.set_title(title)
In [28]:
plot_lines([df_post, df_comments])
In [29]:
df_comments
Out[29]:
commentscount_est
datetime
2010-01-31 0.0
2010-02-28 0.0
2010-03-31 0.0
2010-04-30 0.0
2010-05-31 0.0
... ...
2022-09-30 4761.0
2022-10-31 4025.0
2022-11-30 3996.0
2022-12-31 3655.0
2023-01-31 4286.0

157 rows × 1 columns

Group temporal patterns by month

The growth in data contributions on Reddit distorts the curve and limits our ability to see temporal characteristics and repeating patterns. Below, we group data by month to visualize average, recurring seasonal trends.
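The grouping below relies on the DatetimeIndex: grouping by df.index.month collapses all years onto the twelve calendar months. A toy example with hypothetical values:

import pandas as pd

s = pd.Series(
    [10, 20, 30, 40],
    index=pd.to_datetime(
        ["2021-01-31", "2021-07-31", "2022-01-31", "2022-07-31"]))
# mean per calendar month, across all years
s.groupby(s.index.month).mean()
# month 1 -> 20.0, month 7 -> 30.0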

In [30]:
def plot_bars(
        df1: pd.DataFrame, df2: pd.DataFrame,
        ax: Optional[matplotlib.axes.Axes] = None, title: Optional[str] = None):
    """Plot bars from two DataFrames"""
    colors = sns.color_palette("vlag", as_cmap=True, n_colors=2)
    bar_param = {
        "width":1.0,
        "label":"Reddit total submission count aggregated for months",
        "edgecolor":"white",
        "linewidth":0.5,
        "alpha":0.7
    }
    # create figure
    if not ax:
        fig, ax = plt.subplots(1, 1, figsize=(3, 1.5))
    # plot
    df1.groupby(df1.index.month)["commentscount_est"] \
        .mean().plot.bar(ax=ax, color=colors([1.0]), y="commentscount_est", **bar_param)
    df2.groupby(df2.index.month)["postcount_est"] \
        .mean().plot.bar(ax=ax, color=colors([0.0]), y="postcount_est", **bar_param)
    # format
    ax.set_xlim(-0.5,11.5)
    month_names = ['Jan','Feb','Mar','Apr','May','Jun',
                   'Jul','Aug','Sep','Oct','Nov','Dec'] 
    ax.set_xticks(range(12))  # fix tick positions before labeling
    ax.set_xticklabels(month_names)
    ax.tick_params(axis='x', rotation=45, length=0) # length: of ticks
    ax.spines["left"].set_linewidth(0.25)
    ax.spines["bottom"].set_linewidth(0.25)
    ax.spines["top"].set_linewidth(0)
    ax.spines["right"].set_linewidth(0)
    ax.yaxis.set_tick_params(width=0.5)
    ax.set(xlabel="", ylabel="")
    if not title:
        title = "Post Count per Month (mean)"
    ax.set_title(title, y=-0.2, pad=-14)
    for item in (
        [ax.xaxis.label, ax.title, ax.yaxis.label] +
         ax.get_xticklabels() + ax.get_yticklabels()):
            item.set_fontsize(8)
In [31]:
plot_bars(df1=df_comments, df2=df_post)

Visualize each National Park separately.

In [32]:
national_parks = df['topic_group'].unique()
print(len(national_parks))
46

Get a summary of total distinct users per National Park, for sorting:

In [33]:
def replace_datetime_with_month(df: pd.DataFrame):
    """Extract month from datetime index, set as new composite index
    together with topic_group"""
    df.set_index([df.index.month, "topic_group"], inplace=True)
    df.index.rename(("month", "topic_group"), inplace=True)
In [34]:
df_post = read_csv_datetime(REDDIT_ALL_SUBMISSIONS)
df_comments = read_csv_datetime(REDDIT_ALL_COMMENTS)
df = pd.concat([df_post, df_comments])
replace_datetime_with_month(df)
cardinality_series = tools.union_hll_series(
    hll_series=df["user_hll"],
    db_conn=db_conn, multiindex=True)
df = cardinality_series.to_frame()
df.rename(columns={"hll_cardinality":"user_count"}, inplace=True)
national_parks_sorted = df.unstack(level=1).fillna(0).droplevel(0, axis=1).sum().sort_values(ascending=False)

Preview ranking order

In [35]:
national_parks_sorted.pow(1./2).plot.bar(figsize=(10, 3))
Out[35]:
<Axes: xlabel='topic_group'>

We can limit our analysis to the top 50%, as there is not enough data available below this threshold.

In [36]:
top_50_cnt = int(len(national_parks_sorted)/2)
top_50 = list(national_parks_sorted.keys())[:top_50_cnt]

Create bar plots individually for the top 50% of National Park subreddits in our dataset:

In [37]:
df_comments = read_csv_datetime(REDDIT_ALL_COMMENTS)
df_post = read_csv_datetime(REDDIT_ALL_SUBMISSIONS)
# create figure object with multiple subplots
fig, axes = plt.subplots(nrows=int(top_50_cnt/4), ncols=4, figsize=(12, 11))
fig.subplots_adjust(hspace=.5) # adjust vertical space, to allow title below plot
# iterate nationalparks
for ix, ax in enumerate(axes.reshape(-1)):
    np_str = top_50[ix]
    # filter np_str and calculate cardinality
    df_comments_filter = df_comments[df_comments["topic_group"]==np_str].copy()
    df_comments_filter = append_cardinality_df(df_comments_filter, 'post_hll', 'commentscount_est')
    df_comments_filter = filter_fill_time(df_comments_filter, 2010, 2023, 'commentscount_est')
    df_post_filter = df_post[df_post["topic_group"]==np_str].copy()
    df_post_filter = append_cardinality_df(df_post_filter, 'post_hll', 'postcount_est')
    df_post_filter = filter_fill_time(df_post_filter, 2010, 2023, 'postcount_est')
    # plot bars individually
    plot_bars(
        df1=df_comments_filter,
        df2=df_post_filter, title=np_str, ax=ax)
In [38]:
tools.save_fig(fig, output=OUTPUT, name="barplot_nationalparks")

Visualize using joyplot

Import Joypy. Install to worker_env, if not available.

In [39]:
avail = True
try:
    import joypy
except ImportError:
    avail = False
In [40]:
!if [ "$avail" = False ] ; then /opt/conda/envs/worker_env/bin/python -m pip install joypy >&- 2>&-; fi
import joypy

First, merge comment and post counts into one column, per National Park and month.

In [41]:
df_comments = read_csv_datetime(REDDIT_ALL_COMMENTS)
df_post = read_csv_datetime(REDDIT_ALL_SUBMISSIONS)
In [42]:
df_comments.head()
Out[42]:
post_hll user_hll referenced_post_hll topic_group
datetime
2011-02-01 \x138b4005222b8151c15a226ce379a187038f8297c1b1... \x138b40182148617501c303dee1 \x138b40014247c185618c83efc2 yosemite
2011-03-01 \x138b40cce2 \x138b405e44 \x138b409481 yosemite
2011-04-01 \x138b4006218b618ee1b641f061f201f241 \x138b4076a197a1c303 \x138b4020c17be294a3 yosemite
2011-05-01 \x138b401c812b822d429fc5aa82ca61 \x138b401ac25e44dee1 \x138b4012c19cc49e41 yosemite
2011-06-01 \x138b402ee1 \x138b4036c3 \x138b401181 hotsprings

Data cleanup: some topic_group references start with u_ (user profiles rather than subreddits). Drop these.

In [43]:
df_comments = df_comments[~df_comments.topic_group.str.startswith('u_')]

Update Index to include topic_group in composite index

In [44]:
for df in [df_post, df_comments]:
    replace_datetime_with_month(df)

Union HLL sets (user_hll)

Concat two pd.Series

In [45]:
df = pd.concat([df_post["user_hll"], df_comments["user_hll"]])

Now we have a series with duplicate indices. These need to be unioned with HLL, which also removes duplicate users that appear in both comments and posts for a given month/National Park.
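tools.union_hll_series performs this grouped union server-side in a single query. For illustration, a slower client-side equivalent, assuming the union_hll/cardinality_hll signatures demonstrated in the test above:

# union duplicate (month, topic_group) index pairs one group at a time
cardinality_check = df.groupby(level=["month", "topic_group"]).apply(
    lambda hlls: cardinality_hll(union_hll(list(hlls), CUR_HLL), CUR_HLL))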

In [46]:
%%time
cardinality_series = tools.union_hll_series(
    hll_series=df,
    db_conn=db_conn, multiindex=True)
CPU times: user 25.6 ms, sys: 6.41 ms, total: 32 ms
Wall time: 59.7 ms
In [47]:
df = cardinality_series.to_frame()
df.rename(columns={"hll_cardinality":"user_count"}, inplace=True)
In [48]:
df.head()
Out[48]:
user_count
month topic_group
1 acadianationalpark 19
americansamoa 23
arches 4
archesnationalpark 5
bigbend 3

We can also use unstack to turn the multiindex into a matrix

In [49]:
(df.unstack(level=0)  # unstack level 0 -> pivot index labels to columns
    .fillna(0)  # fill NA-values with 0
    .astype(int)  # float to integer
    .droplevel(0, axis=1)  # remove "user_count" multi-level x-column, after unstack
    .style.background_gradient(cmap='Blues', axis=1)) # colorize
Out[49]:
month 1 2 3 4 5 6 7 8 9 10 11 12
topic_group                        
acadianationalpark 19 40 36 49 52 77 93 86 93 97 23 15
americansamoa 23 27 23 23 15 9 12 14 17 14 22 20
arches 4 6 6 6 5 6 4 9 7 8 9 4
archesnationalpark 5 6 9 11 13 7 8 10 8 4 6 3
bigbend 3 2 5 3 3 3 1 0 4 7 2 4
bigbendnationalpark 15 8 6 10 5 2 3 1 1 5 4 6
bigbendtx 233 192 276 170 124 98 92 103 126 153 232 210
brycecanyon 14 13 13 15 14 14 13 13 10 16 8 8
canyonlands 5 2 9 8 7 4 4 5 4 5 5 6
capitolreefnp 6 3 7 8 5 2 7 4 7 6 2 6
carlsbadcavernsnp 1 1 5 8 5 3 1 2 0 0 3 2
craterlake 9 7 10 13 19 18 25 36 18 19 6 6
cuyahogafalls 9 10 9 9 8 9 12 9 10 12 11 8
deathvalley 22 15 28 18 10 9 15 9 11 13 10 14
deathvalleynp 88 90 125 58 36 23 37 46 43 56 65 87
everglades 40 44 48 41 22 20 16 18 23 24 25 33
glacier 46 56 69 82 103 186 139 149 74 32 29 34
glaciernationalpark 131 177 228 155 191 398 521 305 243 108 70 213
grandcanyon 271 272 346 305 264 271 209 224 247 269 258 261
grandcanyonhiking 0 1 1 0 0 0 1 0 0 0 0 2
grandteton 1 0 0 0 0 1 0 0 2 2 3 1
grandtetonnatlpark 68 69 63 92 88 78 128 144 88 56 34 33
greatbasinstories 0 0 0 5 1 3 1 0 0 0 1 1
gsmnp 43 55 85 75 92 72 72 82 58 64 55 30
hotsprings 54 59 77 83 75 72 77 65 57 55 57 56
isleroyale 38 32 52 43 46 62 67 74 39 26 17 24
joshuatree 390 417 479 397 244 203 197 230 242 272 349 346
kenaipeninsula 6 8 7 15 9 10 6 11 10 13 6 5
lassenvolcanic 0 3 2 1 0 1 0 1 2 0 3 1
mammothcave 1 2 6 5 6 2 10 1 2 4 3 0
mount_rainier 9 5 18 28 12 26 35 30 21 13 12 15
newrivergorgenp 5 1 2 2 2 2 3 6 2 6 0 1
northcascades 5 11 13 15 9 10 28 21 15 6 2 3
olympicnationalpark 37 44 69 80 63 72 92 81 70 50 39 43
pinnaclesnp 10 3 6 5 8 4 3 1 3 4 4 3
redwoodnationalpark 6 1 1 1 1 6 4 5 2 3 2 0
rockymountain 39 40 41 56 49 134 119 99 82 43 23 26
sequoia 17 18 22 18 9 20 28 32 16 17 15 8
shenandoah 12 11 8 11 5 5 9 15 19 35 9 6
shenandoahpark 2 4 7 8 6 5 13 8 7 10 8 2
virginislands 92 117 123 119 128 114 112 103 114 64 98 105
yosemite 778 961 1131 1164 1249 1550 1456 1267 971 849 750 645
zionnationalpark 88 82 180 188 147 131 67 108 118 107 38 57
zionnp 29 23 47 46 54 68 76 59 100 110 47 37

Flatten, clean up, and apply the fourth root, to compress the maximum values of the y-range.

In [50]:
df_plot = df.unstack(level=1).fillna(0).droplevel(0, axis=1).pow(1./4)

Create a sorted dictionary for explicit plotting order in joyplot. Note that as of Python 3.7, dictionaries are guaranteed to preserve insertion order.

In [51]:
d = {}
for np_ref in top_50:
    d[np_ref] = df_plot[np_ref]

Visualize

In [52]:
month_num = range(1, 13)
tick_pos = range(0, 12)
fig, ax = joypy.joyplot(
    d, kind="values", x_range=[0, 11], figsize=(4, 7),
    colormap=matplotlib.cm.autumn_r, fade=True, overlap=2, grid="y", legend=True)
ax[-1].set_xticks(tick_pos, labels=month_num)
plt.show()

Before analyzing chi, we run one last visualization test by calculating how individual parks' monthly patterns diverge from the norm (the average monthly pattern across all parks).

Calculate total averages and the divergence from average per park/month

In [53]:
df_plot = df.unstack(level=1).fillna(0).droplevel(0, axis=1)
In [54]:
averages = df_plot.mean()

Create plotting order/selection based on absolute user counts of peak month:

In [55]:
order_np: Dict[str, int] = {}
for park in top_50:
    peak_value = max(df_plot[park])
    # positional index is 0-based, while the month index of df is 1-based
    peak_month = list(df_plot[park]).index(peak_value) + 1
    order_np[park] = df.loc[peak_month, park].user_count
In [56]:
top_50_plot = list(pd.Series(order_np).sort_values(ascending=False).keys())

Apply a number of modifications, to optimize visual legibility:

  • visualize divergence from the mean, not absolute values
  • stretch these values to a 1-100 range
  • reduce the height of the 1-100 range, beginning with the less visited parks (prevents overlap)
  • order plots by the absolute user counts in each park's peak month
In [57]:
a = 1    # min of target range
b = 100  # max of target range
for ix, park in enumerate(top_50_plot):
    perc = ix/(len(top_50)/100)
    b_mod = abs(perc-b)
    x = df_plot[park]
    df_plot[park] = a + (x - x.min()) * (b_mod - a) / (x.max() - x.min())
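This is standard min-max scaling, x' = a + (x - x_min) · (b_mod - a) / (x_max - x_min), with the upper bound b_mod shrinking for each successive (less frequented) park. A quick check with toy values (hypothetical):

import pandas as pd

x = pd.Series([0, 50, 100])
a, b_mod = 1, 100
print(a + (x - x.min()) * (b_mod - a) / (x.max() - x.min()))
# 1.0, 50.5, 100.0 -> min maps to a, max maps to b_mod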

Create dictionary for plotting order:

In [58]:
d = {}
for np_ref in top_50_plot:
    d[np_ref] = df_plot[np_ref]

For annotating peaks, to avoid collisions, we manually define text offsets ahead of time.

In [59]:
xy_txt = {
    'yosemite':(20, 10),
    'joshuatree':(-70, 2),
    'grandcanyon':(5, 2),
    'glaciernationalpark':(5, 2),
    'bigbendtx':(-70, 2),
    'zionnationalpark':(2, 5),
    'virginislands':(-70, -20),
    'glacier':(5, 2),
    'grandtetonnatlpark':(5, 5),
    'hotsprings':(-70, -20),
    'gsmnp':(15, 2),
    'deathvalleynp':(5, 2),
    'rockymountain':(-80, -20),
    'olympicnationalpark':(5, 5),
    'zionnp':(5, 2),
    'acadianationalpark':(-30, -30),
    'isleroyale':(10, -10),
    'everglades':(-50, 2),
    'mount_rainier':(-50, 10),
    'sequoia':(5, 10),
    'craterlake':(5, 10),
    'americansamoa':(5, 10),
    'deathvalley':(5, 2),
}

Create plot

In [60]:
from adjustText import adjust_text
month_num = range(1, 13)
tick_pos = range(0, 12)
fig, ax = joypy.joyplot(
    d, kind="values", x_range=[0, 11], figsize=(8, 6),
    colormap=matplotlib.cm.autumn_r, fade=True, overlap=2, grid="y", legend=True)
ax[-1].set_xticks(tick_pos, labels=month_num)
for ix, park in enumerate(top_50_plot):
    peak_value = max(df_plot[park])
    peak_idx = list(df_plot[park]).index(peak_value)  # 0-based x position
    peak_month = peak_idx + 1  # 1-based month for index lookups
    ax[ix].plot(peak_idx, peak_value, marker='o', markerfacecolor='black', markeredgecolor="white", zorder=1000, markersize=5)
    xy_pos = xy_txt.get(park)
    ax[ix].annotate(
        f"r/{park}\n{calendar.month_name[peak_month][:3]}, {df.loc[peak_month, park].user_count} users",
        xy=(peak_idx, peak_value), xytext=xy_pos, zorder=1001,
        fontsize='medium', textcoords='offset points',
        path_effects=[pe.withStroke(linewidth=2, foreground="white")],
        arrowprops=dict(
            arrowstyle="-",
            color='black', lw=0.5, alpha=0.8, mutation_scale=4, 
            connectionstyle='arc3,rad=-0.3')
    )
fig.show()

Save as SVG and PNG

In [61]:
tools.save_fig(fig, output=OUTPUT, name="joyplot_nationalparks")

Chi per month

We want to compare temporal patterns for different parks. For instance, some parks may be particularly interesting in specific seasons (e.g. Yosemite in spring, when its waterfalls are most impressive; or Shenandoah National Park in autumn, for its colorful foliage).

Below, chi per month is used to compare a single National Park's visitation to the average visitation rate for that month across all parks.
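The chi computation itself is still open (see the TODO below). One common formulation is the signed chi value, (observed - expected) / sqrt(expected), with the expected value derived from the all-park monthly distribution. A minimal sketch of what this could look like here; this is an assumption, not the notebook's final method:

import numpy as np
import pandas as pd

def chi_month(observed: pd.Series, all_parks: pd.DataFrame) -> pd.Series:
    """Signed chi per month: positive where a park is over-represented
    relative to the all-park monthly distribution, negative otherwise."""
    # share of each month in the all-park totals
    month_share = all_parks.sum(axis=1) / all_parks.sum().sum()
    expected = observed.sum() * month_share
    return (observed - expected) / np.sqrt(expected)

# e.g.: chi_month(df_plot["yosemite"], df_plot), applied to the
# unscaled user-count matrix (months as rows, parks as columns)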

In [62]:
#TODO

Create notebook HTML

In [62]:
!jupyter nbconvert --to html_toc \
    --output-dir=../resources/html/ ./02_reddit_nationalparks.ipynb \
    --template=../nbconvert.tpl \
    --ExtractOutputPreprocessor.enabled=False >&- 2>&-