Alexander Dunkel, Institute of Cartography, TU Dresden
Chi visualization of temporal patterns for ephemeral events. This notebook is a continuation of a previous publication.
Data sources used: Flickr and iNaturalist metadata, stored as aggregate HLL data generated in the previous notebook.
import sys, os
import math
import numpy as np
import pandas as pd
import psycopg2
import statsmodels.api as sm
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
from matplotlib.axes import Axes
from matplotlib import cm
from typing import Tuple, Dict, Any
from pathlib import Path
from python_hll.hll import HLL
from python_hll.util import NumberUtil
from shapely.geometry import box
module_path = str(Path.cwd().parents[0] / "py")
if module_path not in sys.path:
    sys.path.append(module_path)
from modules.base import tools, hll
OUTPUT = Path.cwd().parents[0] / "out" # output directory for figures (etc.)
WORK_DIR = Path.cwd().parents[0] / "tmp" # Working directory
(OUTPUT / "figures").mkdir(exist_ok=True)
(OUTPUT / "svg").mkdir(exist_ok=True)
WORK_DIR.mkdir(exist_ok=True)
%load_ext autoreload
%autoreload 2
Select `M` for monthly aggregation, `Y` for yearly aggregation.
AGG_BASE = "Y"
First, define whether to study usercount or postcount
# METRIC = 'user'
METRIC = 'post'
Set global font
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = ['Times New Roman'] + plt.rcParams['font.serif']
Load the data from CSV, generated in the previous notebook. Data is stored as aggregate HLL data (postcount, usercount) for each month.
FLICKR_ALL = OUTPUT / "flickr_all_months.csv"
INATURALIST_ALL = OUTPUT / "inaturalist_all_months.csv"
INATURALIST_ALL_MILVUSRANGE = OUTPUT / "milvus_range_inat_all_months.csv"
INATURALIST_FLICKR_ALL_FOCUS = OUTPUT / "milvus_focus_flickr_inat_all_months.csv"
INATURALIST_FLICKR_MILVUS_FOCUS = Path.cwd().parents[0] \
/ "00_data" / "milvus" / "2023-08-18_milvusmilvus_flickr_focus.csv"
%%time
data_files = {
    "FLICKR_ALL": FLICKR_ALL,
    "INATURALIST_ALL": INATURALIST_ALL,
}
tools.display_file_stats(data_files)
pd.read_csv(FLICKR_ALL, nrows=10)
DB_USER = "hlluser"
DB_PASS = os.getenv('READONLY_USER_PASSWORD')
# set connection variables
DB_HOST = "hllworkerdb"
DB_PORT = "5432"
DB_NAME = "hllworkerdb"
Connect to an empty Postgres database running the HLL extension:
DB_CONN = psycopg2.connect(
    host=DB_HOST,
    port=DB_PORT,
    dbname=DB_NAME,
    user=DB_USER,
    password=DB_PASS
)
DB_CONN.set_session(
    readonly=True)
DB_CALC = tools.DbConn(
    DB_CONN)
CUR_HLL = DB_CONN.cursor()
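For orientation, a minimal sketch of what the `hll.cardinality_hll()` helper used below presumably does with this connection, assuming the postgresql-hll extension is installed (illustrative only, not the actual module code):
def cardinality_from_hll(hll_set: str, cur=CUR_HLL) -> int:
    """Estimate the cardinality of a single hex-encoded HLL sketch
    by casting it to the hll type (illustrative sketch)."""
    cur.execute("SELECT hll_cardinality(CAST(%s AS hll))", (hll_set,))
    return int(cur.fetchone()[0])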
Define additional functions for reading and formatting CSV as pd.DataFrame
from datetime import datetime
def read_csv_datetime(csv: Path) -> pd.DataFrame:
    """Read CSV with parsing datetime index (months)

    First CSV column: Year
    Second CSV column: Month
    """
    date_cols = ["year", "month"]
    df = pd.read_csv(
        csv, index_col='datetime',
        parse_dates={'datetime': date_cols},
        date_format='%Y %m',
        keep_date_col=True)  # keep source columns; they are dropped explicitly below
    df.drop(columns=date_cols, inplace=True)
    return df
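The expected CSV layout, with illustrative values (actual HLL sketches are long hex strings):
year,month,post_hll,user_hll
2007,1,\x138b40...,\x138b40...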
def append_cardinality_df(df: pd.DataFrame, hll_col: str = "post_hll", cardinality_col: str = 'postcount_est'):
    """Calculate cardinality from HLL and append to extra column in df"""
    df[cardinality_col] = df.apply(
        lambda x: hll.cardinality_hll(
            x[hll_col], CUR_HLL),
        axis=1)
    df.drop(columns=[hll_col], inplace=True)
    return df
def filter_fill_time(
        df: pd.DataFrame, min_year: int,
        max_year: int, val_col: str = "postcount_est",
        min_month: str = "01", max_month: str = "01", agg_base: str = None,
        agg_method=None):
    """Filter time values between min and max year and fill missing values"""
    max_day = "01"
    if agg_base is None:
        agg_base = "M"
    elif agg_base == "Y":
        max_month = "12"
        max_day = "31"
    min_date = pd.Timestamp(f'{min_year}-{min_month}-01')
    max_date = pd.Timestamp(f'{max_year}-{max_month}-{max_day}')
    # clip by start and end date
    if min_date not in df.index:
        df.loc[min_date, val_col] = 0
    if max_date not in df.index:
        df.loc[max_date, val_col] = 0
    df.sort_index(inplace=True)
    # mask min and max time
    time_mask = ((df.index >= min_date) & (df.index <= max_date))
    resampled = df.loc[time_mask][val_col].resample(agg_base)
    if agg_method is None:
        series = resampled.sum()
    elif agg_method == "count":
        series = resampled.count()
    # fill missing months with 0
    # this will also set the day to max of month
    return series.fillna(0).to_frame()
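To make the fill behavior concrete, a small illustration with hypothetical data, a series with posts in 2008 and 2010 but none in 2009:
# hypothetical two-row input; 2009 is missing entirely
_df = pd.DataFrame(
    {"postcount_est": [5, 3]},
    index=pd.to_datetime(["2008-06-01", "2010-02-01"]))
# yearly aggregation inserts zero-rows at the range bounds;
# the empty year 2009 sums to 0
filter_fill_time(_df, 2008, 2010, agg_base="Y")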
Select dataset to process below
Apply functions to all data sets.
def process_dataset(
        dataset: Path = None, metric: str = None, df_post: pd.DataFrame = None,
        min_year: int = None, max_year: int = None, agg_base: str = None) -> pd.DataFrame:
    """Apply temporal filter/pre-processing to all data sets."""
    if metric is None:
        metric = 'post_hll'
    if metric == 'post_hll':
        cardinality_col = 'postcount_est'
    else:
        cardinality_col = 'usercount_est'
    if min_year is None:
        min_year = 2007
    if max_year is None:
        max_year = 2022
    if df_post is None:
        df_post = read_csv_datetime(dataset)
    df_post = append_cardinality_df(df_post, metric, cardinality_col)
    return filter_fill_time(df_post, min_year, max_year, cardinality_col, agg_base=agg_base)
%%time
df_post = process_dataset(FLICKR_ALL, agg_base=AGG_BASE)
df_post.head(5)
%%time
df_user = process_dataset(FLICKR_ALL, metric='user_hll', agg_base=AGG_BASE)
df_user.head(5)
Define plot function.
def bar_plot_time(
        df: pd.DataFrame, ax: Axes, color: str,
        label: str, val_col: str = "postcount_est") -> Axes:
    """Matplotlib barplot with time axis formatting"""
    ax = df.set_index(
        df.index.map(lambda s: s.strftime('%Y'))).plot.bar(
            ax=ax, y=val_col, color=color, width=1.0,
            label=label, edgecolor="white", linewidth=0.5, alpha=0.6)
    return ax
def plot_time(
        df: pd.DataFrame, title, color, filename=None,
        output=OUTPUT, legend: str = "Postcount", val_col: str = None,
        trend: bool = None, seasonal: bool = None, residual: bool = None,
        agg_base: str = None):
    """Create dataframe time plot"""
    x_ticks_every = 12
    fig_x = 15.7
    fig_y = 4.27
    font_mod = False
    x_label = "Month"
    linewidth = 3
    if agg_base and agg_base == "Y":
        x_ticks_every = 1
        fig_x = 3
        fig_y = 1.5
        font_mod = True
        x_label = "Year"
        linewidth = 1
    fig, ax = plt.subplots()
    fig.set_size_inches(fig_x, fig_y)
    ylabel = f'{legend}'
    if val_col is None:
        val_col = f'{legend.lower()}_est'
    ax = bar_plot_time(
        df=df, ax=ax, color=color, val_col=val_col, label=legend)
    # x axis ticker formatting
    tick_loc = mticker.MultipleLocator(x_ticks_every)
    ax.xaxis.set_major_locator(tick_loc)
    ax.tick_params(axis='x', rotation=45, length=0)
    ax.yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))
    ax.set(xlabel=x_label, ylabel=ylabel)
    ax.spines["left"].set_linewidth(0.25)
    ax.spines["bottom"].set_linewidth(0.25)
    ax.spines["top"].set_linewidth(0)
    ax.spines["right"].set_linewidth(0)
    ax.yaxis.set_tick_params(width=0.5)
    # remove legend
    ax.get_legend().remove()
    ax.set_title(title)
    ax.set_xlim(-0.5, len(df)-0.5)
    if font_mod:
        for item in (
                [ax.xaxis.label, ax.title, ax.yaxis.label] +
                ax.get_xticklabels() + ax.get_yticklabels()):
            item.set_fontsize(8)
    if any([trend, seasonal, residual]):
        # seasonal decompose
        decomposition = sm.tsa.seasonal_decompose(
            df[val_col], model='additive')
        # plot trend part only
        if trend:
            plt.plot(list(decomposition.trend), color='black',
                     label="Trend", linewidth=linewidth, alpha=0.8)
        if seasonal:
            plt.plot(list(decomposition.seasonal), color='black', linestyle='dotted',
                     label="Seasonal", linewidth=1, alpha=0.8)
        if residual:
            plt.plot(list(decomposition.resid), color='black', linestyle='dashed',
                     label="Residual", linewidth=1, alpha=0.8)
    # store figure to file
    if filename:
        fig.savefig(
            output / "figures" / f"{filename}.png", dpi=300, format='PNG',
            bbox_inches='tight', pad_inches=1, facecolor="white")
        # also save as svg
        fig.savefig(
            output / "svg" / f"{filename}.svg", format='svg',
            bbox_inches='tight', pad_inches=1, facecolor="white")
def load_and_plot(
        dataset: Path = None, metric: str = None, src_ref: str = "flickr", colors: cm.colors.ListedColormap = None,
        agg_base: str = None, trend: bool = None, return_df: bool = None, df_post: pd.DataFrame = None):
    """Load data and plot"""
    if metric is None:
        metric = 'post_hll'
    if metric == 'post_hll':
        metric_label = 'postcount'
    else:
        metric_label = 'usercount'
    if colors is None:
        colors = sns.color_palette("vlag", as_cmap=True, n_colors=2)
        colors = colors([1.0])
    df = process_dataset(dataset, metric=metric, agg_base=agg_base, df_post=df_post)
    plot_time(
        df, legend=metric_label.capitalize(), color=colors,
        title=f'{src_ref.capitalize()} {metric_label} over time',
        filename=f"temporal_{metric_label}_{src_ref}_absolute", trend=trend, agg_base=agg_base)
    if return_df:
        return df
colors = sns.color_palette("vlag", as_cmap=True, n_colors=2)
load_and_plot(FLICKR_ALL, src_ref=f"flickr_{AGG_BASE}", colors=colors([1.0]), agg_base=AGG_BASE, trend=False)
Plot Flickr user count
load_and_plot(FLICKR_ALL, src_ref=f"flickr_{AGG_BASE}", metric="user_hll", colors=colors([0.0]), agg_base=AGG_BASE, trend=False)
Repeat for iNaturalist data
load_and_plot(INATURALIST_ALL, src_ref=f"inaturalist_{AGG_BASE}", colors=colors([1.0]), agg_base=AGG_BASE, trend=False)
load_and_plot(INATURALIST_ALL, src_ref=f"inaturalist_{AGG_BASE}", metric="user_hll", colors=colors([0.0]), agg_base=AGG_BASE, trend=False)
source_zip="https://opara.zih.tu-dresden.de/xmlui/bitstream/handle/123456789/5793/S10.zip?sequence=1&isAllowed=y"
FLICKR_SUNRISE = WORK_DIR / "flickr-sunrise-months.csv"
FLICKR_SUNSET = WORK_DIR / "flickr-sunset-months.csv"
if not FLICKR_SUNRISE.exists():
    tools.get_zip_extract(
        uri=source_zip, filename="S10.zip", output_path=WORK_DIR,
        filter_files=["flickr-sunrise-months.csv", "flickr-sunset-months.csv"])
df_sunrise = read_csv_datetime(FLICKR_SUNRISE)
df_sunrise = append_cardinality_df(df_sunrise, f'{METRIC}_hll', f'{METRIC}count_est')
df_sunrise = filter_fill_time(df_sunrise, 2007, 2018, f'{METRIC}count_est', agg_base=AGG_BASE)
plot_time(
df_sunrise, legend=f"{METRIC.capitalize()}count", color=colors([0.0]),
title=f'Flickr {METRIC}count over time for sunrise related posts',
filename=f"temporal_{METRIC}count_flickr_sunrise_{AGG_BASE}", trend=False, agg_base=AGG_BASE)
At the time of writing the sunset-sunrise paper, the data was only collected up to 2018.
Still, the graphic above shows the same seasonal patterns as the graph for all Flickr photographs. Below, we normalize the sunrise graph using chi.
Limit the dataframe for all posts over time to 2018, too.
if METRIC == 'post':
    df_expected = df_post
else:
    df_expected = df_user
df_expected = filter_fill_time(df_expected, 2007, 2018, f'{METRIC}count_est', agg_base=AGG_BASE)
This is adapted from notebook three of the original publication.
First, define the input parameters:
DOF = 1
CHI_CRIT_VAL = 3.84
CHI_COLUMN: str = f"{METRIC}count_est"
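The chi computation below follows the signed chi statistic. With the normalization factor $f_{norm} = \sum n_{expected} / \sum n_{observed}$ (see `calc_norm()` below),

$$\chi = \frac{n_{observed} \cdot f_{norm} - n_{expected}}{\sqrt{n_{expected}}}$$

A value is flagged as significant if $|\chi| > 3.84$, the critical value of the chi-square distribution for $p = 0.05$ at one degree of freedom (`DOF = 1`).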
def calc_norm(
        df_expected: pd.DataFrame,
        df_observed: pd.DataFrame,
        chi_column: str = CHI_COLUMN):
    """Fetch the number of data points for the observed and
    expected dataset by the relevant column
    and calculate the normalisation value
    """
    v_expected = df_expected[chi_column].sum()
    v_observed = df_observed[chi_column].sum()
    norm_val = (v_expected / v_observed)
    return norm_val
norm_val = calc_norm(df_expected, df_sunrise)
print(norm_val)
rename_expected = {
'postcount_est':'postcount_est_expected',
'usercount_est':'usercount_est_expected',
}
df_expected.rename(
columns=rename_expected,
inplace=True)
merge_cols = [f'{METRIC}count_est']
df_expected_observed = df_expected.merge(
df_sunrise[merge_cols],
left_index=True, right_index=True)
Preview
df_expected_observed.head()
def chi_calc(x_observed: float, x_expected: float, x_normalized: float) -> float:
    """Apply chi calculation based on observed (normalized) and expected value"""
    value_observed_normalised = x_observed * x_normalized
    a = value_observed_normalised - x_expected
    b = math.sqrt(x_expected)
    # native division with division by zero protection
    chi_value = a / b if b else 0
    return chi_value
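As a quick sanity check with made-up numbers: 120 observed, 100 expected, and a normalization factor of 1 give $\chi = (120 - 100)/\sqrt{100} = 2.0$, well below the significance threshold of 3.84.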
def apply_chi_calc(
        df: pd.DataFrame, norm_val: float,
        chi_column: str = CHI_COLUMN, chi_crit_val: float = CHI_CRIT_VAL):
    """Calculate chi-values from expected and observed columns
    and flag significant values (modifies df in place)"""
    # lambda: apply function chi_calc() to each row
    df['chi_value'] = df.apply(
        lambda x: chi_calc(
            x[chi_column],
            x[f'{chi_column}_expected'],
            norm_val),
        axis=1)
    # add significant column, default False
    df['significant'] = False
    # calculate significance for both negative and positive chi_values
    df.loc[np.abs(df['chi_value']) > chi_crit_val, 'significant'] = True
Apply calculation
%%time
apply_chi_calc(
df=df_expected_observed,
norm_val=norm_val)
df_expected_observed.head()
plot_time(
df_expected_observed, legend="Signed Chi", val_col="chi_value", color=colors([0.0]),
title=f'Flickr Chi for "Sunrise"-related {METRIC}count over time',
filename=f"temporal_chi_flickr_{METRIC}count_sunrise_{AGG_BASE}", trend=False, agg_base=AGG_BASE)
Repeat the same for sunset posts worldwide.
df_sunset = read_csv_datetime(FLICKR_SUNSET)
df_sunset = append_cardinality_df(df_sunset, f'{METRIC}_hll', f'{METRIC}count_est')
df_sunset = filter_fill_time(df_sunset, 2007, 2018, f'{METRIC}count_est', agg_base=AGG_BASE)
plot_time(
df_sunset, legend=f"{METRIC.capitalize()}count", color=colors([1.0]),
title=f'Flickr {METRIC}count over time for sunset related posts',
filename=f"temporal_{METRIC}count_flickr_sunset_{AGG_BASE}", trend=False, agg_base=AGG_BASE)
norm_val = calc_norm(
df_expected.rename(columns={f'{METRIC}count_est_expected':f'{METRIC}count_est'}, inplace=False),
df_sunset)
print(norm_val)
merge_cols = [f'{METRIC}count_est']
df_expected_observed = df_expected.merge(
df_sunset[merge_cols],
left_index=True, right_index=True)
%%time
apply_chi_calc(
df=df_expected_observed,
norm_val=norm_val)
plot_time(
df_expected_observed, legend="Signed Chi", val_col="chi_value", color=colors([1.0]),
title=f'Flickr Chi for "Sunset"-related {METRIC}count over time',
filename=f"temporal_chi_flickr_{METRIC}count_sunset_{AGG_BASE}", trend=False, agg_base=AGG_BASE)
Select topic parameter below
topic = "milvusmilvus"
# topic = "bloom"
FLICKR_SUBQUERY = OUTPUT / f"flickr_{topic}_months.csv"
%%time
df_post = read_csv_datetime(FLICKR_ALL)
df_post = append_cardinality_df(df_post, 'post_hll', 'postcount_est')
df_post = filter_fill_time(df_post, 2007, 2020, 'postcount_est', max_month=8, agg_base=AGG_BASE)
df_expected = df_post
df_subquery= read_csv_datetime(FLICKR_SUBQUERY)
df_subquery = append_cardinality_df(df_subquery, f'{METRIC}_hll', f'{METRIC}count_est')
df_subquery = filter_fill_time(df_subquery, 2007, 2020, f'{METRIC}count_est', max_month=8, agg_base=AGG_BASE)
plot_time(
df_subquery, legend=f"{METRIC.capitalize()}count", color=colors([1.0]),
title=f'Flickr {METRIC}count over time for {topic.capitalize()} related posts',
filename=f"temporal_{METRIC}count_flickr_{topic}_{AGG_BASE}", trend=False, agg_base=AGG_BASE)
df_expected.rename(columns={f'{METRIC}count_est_expected':f'{METRIC}count_est'}, inplace=True)
norm_val = calc_norm(
df_expected,
df_subquery)
print(norm_val)
df_expected.rename(columns={f'{METRIC}count_est':f'{METRIC}count_est_expected'}, inplace=True)
df_expected.head()
merge_cols = [f'{METRIC}count_est']
df_expected_observed_flickr = df_expected.merge(
df_subquery[merge_cols],
left_index=True, right_index=True)
%%time
apply_chi_calc(
df=df_expected_observed_flickr,
norm_val=norm_val)
plot_time(
df_expected_observed_flickr, legend="Signed Chi", val_col="chi_value", color=colors([1.0]),
title=f'Flickr Chi for "{topic.capitalize()}"-related {METRIC}count over time',
filename=f"temporal_chi_flickr_{METRIC}count_{topic}_{AGG_BASE}", trend=False, seasonal=False, residual=False, agg_base=AGG_BASE)
src = Path.cwd().parents[0] / "00_data" / "milvus" / "observations-350501.csv"
load_inat_kwargs = {
    "filepath_or_buffer": src,
    "index_col": 'datetime',
    "parse_dates": {'datetime': ["observed_on"]},
    "date_format": '%Y-%m-%d',
    "keep_date_col": True,  # keep source column; dropped explicitly below
    "usecols": ["id", "observed_on"]
}
df = pd.read_csv(**load_inat_kwargs)
df.drop(columns=['observed_on'], inplace=True)
df.head()
df_milvus = filter_fill_time(
df, 2007, 2022, val_col="id", agg_base=AGG_BASE, agg_method="count")
df_milvus.rename(columns={"id": "observations"}, inplace=True)
metric_label="observations"
src_ref="iNaturalist Milvus milvus"
plot_time(
df_milvus, legend=metric_label.capitalize(), color=colors([1.0]),
title=f'{src_ref.capitalize()} {metric_label} over time', val_col=metric_label,
filename=f"temporal_iNaturalist_milvusmilvus_absolute", trend=False, agg_base=AGG_BASE)
df_expected = load_and_plot(
INATURALIST_ALL, src_ref=f"inaturalist_{AGG_BASE}", colors=colors([1.0]),
agg_base=AGG_BASE, trend=False, return_df=True)
df_expected.head()
norm_val = calc_norm(
df_expected.rename(columns={f'{METRIC}count_est':f'observations'}, inplace=False),
df_milvus, chi_column = "observations")
print(norm_val)
df_expected.rename(columns={f'{METRIC}count_est':f'observations_expected'}, inplace=True)
merge_cols = [f'{METRIC}count_est']
df_expected_observed_inat = df_expected.merge(
df_milvus["observations"],
left_index=True, right_index=True)
%%time
apply_chi_calc(
df=df_expected_observed_inat,
norm_val=norm_val, chi_column="observations")
plot_time(
df_expected_observed_inat, legend="Signed Chi", val_col="chi_value", color=colors([1.0]),
title=f'iNaturalist Chi for "{topic.capitalize()}"-related {METRIC}count over time',
filename=f"temporal_chi_inaturalist_observations_milvus_{AGG_BASE}", trend=False,
seasonal=False, residual=False, agg_base=AGG_BASE)
Temporarily fill missing values, so that the Flickr chi series (which ends in 2020) covers the same years as the iNaturalist series in the correlation below.
df_expected_observed_flickr.loc["2021-12-31", "chi_value"]=3059.0
df_expected_observed_flickr.loc["2022-12-31", "chi_value"]=3059.0
x = df_expected_observed_inat[f"chi_value"]
y = df_expected_observed_flickr[f"chi_value"]
from scipy import stats
r, p = stats.pearsonr(x, y)
# covariance
cov = np.cov(x, y)[0][1]
correlation_matrix = np.corrcoef(x, y)
correlation_xy = correlation_matrix[0, 1]
r_squared = correlation_xy**2
print(f"Statistics:\nr={r:.2f},")
print(f"p={p:.2g},")
print(f"cov={cov:.2f}")
print(f"r²={r_squared:.2f}")
nas = np.logical_or(x.isna(), y.isna())
r, p = stats.spearmanr(x[~nas], y[~nas])
print(f"Statistics:\nr={r:.2f},")
print(f"p={p:.2g},")
This is a fairly weak relationship between the iNaturalist and Flickr chi values, for both the Pearson and the ranked (Spearman) statistics.
CRS_WGS = "epsg:4326" # WGS1984
CRS_PROJ = "esri:54009" # Mollweide
load_inat_kwargs["usecols"] = ["id", "observed_on", "longitude", "latitude"]
df = pd.read_csv(**load_inat_kwargs)
df.dropna(subset=['longitude', 'latitude'], inplace=True)
import geopandas as gp
milvus_range = gp.read_file(
    OUTPUT / 'Milvusmilvus_range.gpkg', layer='Milvus milvus')
gdf = gp.GeoDataFrame(
df, geometry=gp.points_from_xy(df.longitude, df.latitude), crs=CRS_WGS)
Intersect, keep only observations within range.
gdf_overlay = gp.overlay(
gdf, milvus_range,
how='intersection')
ax = gdf_overlay.plot(markersize=.1)
ax.axis('off')
plt.show()
Calculate chi
gdf_overlay
gdf_overlay['datetime'] = pd.to_datetime(gdf_overlay["observed_on"], format=load_inat_kwargs.get("date_format"))
gdf_overlay.set_index('datetime', inplace=True)
gdf_cleaned = tools.drop_cols_except(gdf_overlay, ["id"], inplace=False)
df_milvus_range = filter_fill_time(
gdf_cleaned, 2007, 2022, val_col="id", agg_base=AGG_BASE, agg_method="count")
df_milvus_range.rename(columns={"id": "observations"}, inplace=True)
metric_label="observations"
src_ref="iNaturalist Milvus milvus"
plot_time(
df_milvus_range, legend=metric_label.capitalize(), color=colors([1.0]),
title=f'{src_ref.capitalize()} {metric_label} over time', val_col=metric_label,
filename=f"temporal_iNaturalist_milvusmilvus_absolute", trend=False, agg_base=AGG_BASE)
df_expected = load_and_plot(
INATURALIST_ALL_MILVUSRANGE, src_ref=f"inaturalist_{AGG_BASE}", colors=colors([1.0]),
agg_base=AGG_BASE, trend=False, return_df=True)
norm_val = calc_norm(
df_expected.rename(columns={f'{METRIC}count_est':f'observations'}, inplace=False),
df_milvus_range, chi_column = "observations")
print(norm_val)
df_expected.rename(columns={f'{METRIC}count_est':f'observations_expected'}, inplace=True)
merge_cols = [f'{METRIC}count_est']
df_expected_observed_inat = df_expected.merge(
df_milvus_range["observations"],
left_index=True, right_index=True)
%%time
apply_chi_calc(
df=df_expected_observed_inat,
norm_val=norm_val, chi_column="observations")
plot_time(
df_expected_observed_inat, legend="Signed Chi", val_col="chi_value", color=colors([1.0]),
title=f'iNaturalist Chi for "{topic.capitalize()}"-related {METRIC}count over time',
filename=f"temporal_chi_inaturalist_observations_milvus_{AGG_BASE}", trend=False,
seasonal=False, residual=False, agg_base=AGG_BASE)
Let's check the total iNat and Flickr data for the focus region
df_expected = load_and_plot(
INATURALIST_FLICKR_ALL_FOCUS, src_ref=f"iNaturalist + Flickr", colors=colors([1.0]),
agg_base=AGG_BASE, trend=False, return_df=True)
df_post = read_csv_datetime(INATURALIST_FLICKR_ALL_FOCUS)
df_post.head()
Process each origin (Flickr, iNaturalist) independently
source_names = {
2: "Flickr", 23: "iNaturalist"}
vis_df = []
for idx, name in source_names.items():
    sel_df = df_post[df_post["origin_id"] == idx].copy()
    vis_df.append(process_dataset(df_post=sel_df, agg_base=AGG_BASE))
vis_df[0].tail(5)
Merge into a single dataframe
stacked_df = pd.DataFrame()
for ix, df in enumerate(vis_df):
    source_name = source_names.get(list(source_names)[ix])
    stacked_df[source_name] = df
    stacked_df.index = df.index
stacked_df.head()
stacked_df.tail()
The method below is largely adapted from 05_hotspots.ipynb.
BAR_PARAM = {
"width":1.0,
"label":"Flickr (comparison)",
"edgecolor":"black",
"linewidth":0.5,
"alpha":0.6,
}
def plot_bars(
        df: pd.DataFrame, ax: plt.axes = None, title: str = None,
        ytitle: float = None, padtitle: float = None, legend: bool = None,
        bar_param: Dict[str, Any] = BAR_PARAM, title_legend: str = None,
        xlegend: float = None, ylegend: float = None, lang: str = None):
    """Plot bars for two data sources (Flickr, iNaturalist) on twin y-axes"""
    colors = sns.color_palette("vlag", as_cmap=True, n_colors=2)
    if lang is None:
        lang = "en"
    # create figure, unless an axis was passed in
    fig = None
    if not ax:
        fig, ax = plt.subplots(1, 1, figsize=(3, 1.5))
    # create another axis that shares the same x-axis as ax
    ax2 = ax.twinx()
    # plot; exclude months with no data from the sum calculation
    df.Flickr.replace(0, np.nan) \
        .groupby(df.index.year, dropna=True) \
        .sum() \
        .plot(kind='bar', ax=ax, color="white", **bar_param)
    bar_param["label"] = "iNaturalist"
    bar_param["edgecolor"] = "white"
    df.iNaturalist.replace(0, np.nan) \
        .groupby(df.index.year, dropna=True) \
        .sum() \
        .plot(kind='bar', ax=ax2, color=colors([1.0]), **bar_param)
    # format
    ax.set_xlim(-0.5, len(df)-0.5)
    for axis in [ax, ax2]:
        axis.tick_params(axis='x', rotation=45, length=0)  # length: of ticks
        axis.spines["left"].set_linewidth(0.25)
        axis.spines["bottom"].set_linewidth(0.25)
        axis.spines["top"].set_linewidth(0)
        axis.spines["right"].set_linewidth(0.25)
        axis.yaxis.set_tick_params(width=0.5)
        axis.set(xlabel="")
    ax.set(ylabel="Flickr")
    ax2.set(ylabel="iNaturalist")
    leg_kwarg = {
        "bbox_to_anchor": (0, 1.2),
        "loc": 'upper left',
        "fontsize": 8,
        "frameon": False,
        "title": title_legend,
        "title_fontsize": 8
    }
    ax.legend(**leg_kwarg)
    if not title:
        title = "Year"
    if ytitle is None:
        ytitle = -0.2
    if padtitle is None:
        padtitle = -14
    ax.set_title(title, y=ytitle, pad=padtitle)
    for axis in [ax, ax2]:
        for item in (
                [axis.xaxis.label, axis.title, axis.yaxis.label] +
                axis.get_xticklabels() + axis.get_yticklabels()):
            item.set_fontsize(8)
    if fig is not None:
        tools.save_fig(fig, output=OUTPUT, name="focus_region_all")
plot_bars(df=stacked_df, legend=True)
Get expected:
df_post = read_csv_datetime(INATURALIST_FLICKR_ALL_FOCUS)
Limit to iNaturalist
mask = df_post["origin_id"]==23
df_post.loc[mask].head()
df_expected = load_and_plot(
df_post=df_post.loc[mask].copy(), src_ref=f"iNaturalist + Flickr", colors=colors([1.0]),
agg_base=AGG_BASE, trend=False, return_df=True)
Get observed:
geom_saxonya_bbox = box(10.52659544, 50.90971941, 13.30916778, 53.0603523)
gdf_focus_inat = gp.clip(gdf_overlay, mask=geom_saxonya_bbox)
ax = gdf_focus_inat.plot(markersize=.1, color="red")
ax.axis('off')
plt.show()
Cleanup: drop all columns except `id`
gdf_cleaned = tools.drop_cols_except(gdf_focus_inat, ["id"], inplace=False)
df_milvus_focus_inat = filter_fill_time(
gdf_cleaned, 2007, 2022, val_col="id", agg_base=AGG_BASE, agg_method="count")
df_milvus_focus_inat.rename(columns={"id": "observations"}, inplace=True)
df_milvus_focus_inat.head()
Get observed for Flickr
Union Flickr + iNat
df_milvus_focus = df_milvus_focus_inat
df_milvus_focus.head(10)
norm_val = calc_norm(
df_expected.rename(columns={f'{METRIC}count_est':f'observations'}, inplace=False),
df_milvus_focus, chi_column = "observations")
print(norm_val)
df_expected.rename(columns={f'{METRIC}count_est':f'observations_expected'}, inplace=True)
merge_cols = [f'{METRIC}count_est']
df_expected_observed = df_expected.merge(
df_milvus_focus["observations"],
left_index=True, right_index=True)
%%time
apply_chi_calc(
df=df_expected_observed,
norm_val=norm_val, chi_column="observations")
df_expected_observed
plot_time(
df_expected_observed, legend="Signed Chi", val_col="chi_value", color=colors([1.0]),
title=f'Chi for "{topic.capitalize()}"-related {METRIC}count over time (Flickr+iNat)',
filename=f"temporal_chi_flickr-inat_observations_milvus_focus_{AGG_BASE}", trend=False,
seasonal=False, residual=False, agg_base=AGG_BASE)
!jupyter nbconvert --to html_toc \
--output-dir=../resources/html/ ./01_temporal_chi.ipynb \
--output 01_temporal_chi_{AGG_BASE.lower()} \
--template=../nbconvert.tpl \
--ExtractOutputPreprocessor.enabled=False >&- 2>&-