Temporal chi visualization

Alexander Dunkel, Institute of Cartography, TU Dresden

Out[1]:

Last updated: Aug-19-2023, Carto-Lab Docker Version 0.14.0

Chi visualization of temporal patterns for ephemeral events. This notebook is a continuation of a previous publication.

Data sources used:

  • Flickr
  • iNaturalist
  • Reddit

Preparations

In [2]:
import sys, os
import math
import numpy as np
import pandas as pd
import psycopg2
import statsmodels.api as sm
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
from matplotlib.axes import Axes 
from matplotlib import cm
from typing import Dict, Any
from pathlib import Path
from python_hll.hll import HLL
from python_hll.util import NumberUtil
from shapely.geometry import box
module_path = str(Path.cwd().parents[0] / "py")
if module_path not in sys.path:
    sys.path.append(module_path)
from modules.base import tools, hll
In [3]:
OUTPUT = Path.cwd().parents[0] / "out"       # output directory for figures (etc.)
WORK_DIR = Path.cwd().parents[0] / "tmp"     # Working directory
In [4]:
(OUTPUT / "figures").mkdir(exist_ok=True)
(OUTPUT / "svg").mkdir(exist_ok=True)
WORK_DIR.mkdir(exist_ok=True)
In [5]:
%load_ext autoreload
%autoreload 2

Select M for monthly aggregation, Y for yearly aggregation

In [6]:
AGG_BASE = "Y"

First, define whether to study usercount or postcount

In [7]:
# METRIC = 'user'
METRIC = 'post'

Set global font

In [8]:
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = ['Times New Roman'] + plt.rcParams['font.serif']

Load HLL aggregate data

Load the data from CSV, generated in the previous notebook. Data is stored as aggregate HLL data (postcount, usercount) for each month.
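As a minimal sketch of what the cardinality estimation in this notebook does (assuming the postgresql-hll extension's hll_cardinality() function; the actual implementation used below is modules.base.hll.cardinality_hll()):

def cardinality_sketch(hll_str: str, cur) -> int:
    """Estimate the distinct count for a single HLL hex string by
    delegating the calculation to the postgresql-hll extension
    (illustrative helper, not this notebook's API)."""
    cur.execute("SELECT hll_cardinality(%s::hll)", (hll_str,))
    return int(cur.fetchone()[0])

Applied to a single post_hll value with the HLL worker cursor (CUR_HLL, connected below), this returns the estimated number of distinct posts for that month.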

In [9]:
FLICKR_ALL = OUTPUT / "flickr_all_months.csv"
INATURALIST_ALL = OUTPUT / "inaturalist_all_months.csv"
INATURALIST_ALL_MILVUSRANGE = OUTPUT / "milvus_range_inat_all_months.csv"
INATURALIST_FLICKR_ALL_FOCUS = OUTPUT / "milvus_focus_flickr_inat_all_months.csv"
INATURALIST_FLICKR_MILVUS_FOCUS = Path.cwd().parents[0] \
    / "00_data" / "milvus" / "2023-08-18_milvusmilvus_flickr_focus.csv"
In [10]:
%%time
data_files = {
    "FLICKR_ALL":FLICKR_ALL,
    "INATURALIST_ALL":INATURALIST_ALL,
    }
tools.display_file_stats(data_files)
name FLICKR_ALL INATURALIST_ALL
size 941.67 KB 989.88 KB
records 208 198
CPU times: user 49.6 ms, sys: 8.48 ms, total: 58.1 ms
Wall time: 57.4 ms
In [11]:
pd.read_csv(FLICKR_ALL, nrows=10)
Out[11]:
year month post_hll user_hll
0 1970 1 \x138b4006e432653ea153e15a835e4168e279e1ad21dd... \x138b40da21
1 2005 3 \x138b40308549657da28201ee41f1e1 \x138b40e1e1
2 2005 6 \x138b4046c1 \x138b4025a1
3 2005 7 \x138b406ea3baa1 \x138b4000041ea2
4 2005 8 \x138b4027625181b9e1 \x138b4039a1cd23f101
5 2005 9 \x138b4027a23564506157638482ae43b8a2 \x138b4000041722a9e3
6 2005 10 \x138b4013a11921206130e53101372143c4538158816d... \x138b4017229943ac61b541ba62e541e663
7 2005 11 \x138b4002c21ae32501330236c1474347c24c624f6363... \x138b402d813de145e28621b841
8 2005 12 \x138b4002c103a1054306a207010bc60ee111c2168217... \x138b40000406e12d813de15ee1a862cda4e541
9 2006 1 \x138b400041018105c307010ec31ac12b612cc22dc13b... \x138b402b212d813a01b841cbe1d624e541ea01f945fcc2

Connect to HLL worker database

In [12]:
DB_USER = "hlluser"
DB_PASS = os.getenv('READONLY_USER_PASSWORD')
# set connection variables
DB_HOST = "hllworkerdb"
DB_PORT = "5432"
DB_NAME = "hllworkerdb"

Connect to empty Postgres database running HLL Extension:

In [13]:
DB_CONN = psycopg2.connect(
        host=DB_HOST,
        port=DB_PORT ,
        dbname=DB_NAME,
        user=DB_USER,
        password=DB_PASS
)
DB_CONN.set_session(
    readonly=True)
DB_CALC = tools.DbConn(
    DB_CONN)
CUR_HLL = DB_CONN.cursor()


Calculate HLL Cardinality per month

Define additional functions for reading and formatting CSV as pd.DataFrame

In [14]:
from datetime import datetime

def read_csv_datetime(csv: Path) -> pd.DataFrame:
    """Read CSV with parsing datetime index (months)
    
        First CSV column: Year
        Second CSV column: Month
    """
    date_cols = ["year", "month"]
    df = pd.read_csv(
        csv, index_col='datetime', 
        parse_dates={'datetime':date_cols},
        date_format='%Y %m',
        keep_date_col=True)  # source year/month columns are dropped explicitly below
    df.drop(columns=date_cols, inplace=True)
    return df
    
def append_cardinality_df(df: pd.DataFrame, hll_col: str = "post_hll", cardinality_col: str = 'postcount_est'):
    """Calculate cardinality from HLL and append to extra column in df"""
    df[cardinality_col] = df.apply(
        lambda x: hll.cardinality_hll(
           x[hll_col], CUR_HLL),
        axis=1)
    df.drop(columns=[hll_col], inplace=True)
    return df

def filter_fill_time(
        df: pd.DataFrame, min_year: int, 
        max_year: int, val_col: str = "postcount_est",
        min_month: str = "01", max_month: str = "01", agg_base: str = None,
        agg_method = None):
    """Filter time values between min - max year and fill missing values"""
    max_day = "01"
    if agg_base is None:
        agg_base = "M"
    elif agg_base == "Y":
        max_month = "12"
        max_day = "31"
    min_date = pd.Timestamp(f'{min_year}-{min_month}-01')
    max_date = pd.Timestamp(f'{max_year}-{max_month}-{max_day}')
    # clip by start and end date
    if min_date not in df.index:
        df.loc[min_date, val_col] = 0
    if max_date not in df.index:
        df.loc[max_date, val_col] = 0
    df.sort_index(inplace=True)
    # mask min and max time
    time_mask = ((df.index >= min_date) & (df.index <= max_date))
    resampled = df.loc[time_mask][val_col].resample(agg_base)
    if agg_method is None:
        series = resampled.sum()
    elif agg_method == "count":
        series = resampled.count()
    # fill missing months with 0
    # this will also set the day to max of month
    return series.fillna(0).to_frame()

Select dataset to process below

Apply functions to all data sets:

  • Read from CSV
  • Calculate cardinality
  • Merge year and month into a single column
  • Filter to the selected year range (default 2007–2022) and fill missing values
In [15]:
def process_dataset(
        dataset: Path = None, metric: str = None, df_post: pd.DataFrame = None,
        min_year: int = None, max_year: int = None, agg_base: str = None) -> pd.DataFrame:
    """Apply temporal filter/pre-processing to all data sets."""
    if metric is None:
        metric = 'post_hll'
    if metric == 'post_hll':
        cardinality_col = 'postcount_est'
    else:
        cardinality_col = 'usercount_est'
    if min_year is None:
        min_year = 2007
    if max_year is None:
        max_year = 2022
    if df_post is None:
        df_post = read_csv_datetime(dataset)
    df_post = append_cardinality_df(df_post, metric, cardinality_col)
    return filter_fill_time(df_post, min_year, max_year, cardinality_col, agg_base=agg_base)
In [16]:
%%time
df_post = process_dataset(FLICKR_ALL, agg_base=AGG_BASE)
CPU times: user 24.3 ms, sys: 8.09 ms, total: 32.4 ms
Wall time: 46.4 ms
In [17]:
df_post.head(5)
Out[17]:
postcount_est
datetime
2007-12-31 16919501.0
2008-12-31 21894775.0
2009-12-31 24366233.0
2010-12-31 27459331.0
2011-12-31 33458926.0
In [18]:
%%time
df_user = process_dataset(FLICKR_ALL, metric='user_hll', agg_base=AGG_BASE)
CPU times: user 18.3 ms, sys: 14.8 ms, total: 33.1 ms
Wall time: 49.5 ms
In [19]:
df_user.head(5)
Out[19]:
usercount_est
datetime
2007-12-31 623044.0
2008-12-31 827512.0
2009-12-31 983209.0
2010-12-31 1343740.0
2011-12-31 1692940.0

Visualize Cardinality

Define plot function.

In [20]:
def bar_plot_time(
        df: pd.DataFrame, ax: Axes, color: str,
        label: str, val_col: str = "postcount_est") -> Axes:
    """Matplotlib Barplot with time axis formatting"""
    ax = df.set_index(
        df.index.map(lambda s: s.strftime('%Y'))).plot.bar(
            ax=ax, y=val_col, color=color, width=1.0,
            label=label, edgecolor="white", linewidth=0.5, alpha=0.6)
    return ax

def plot_time(
        df: pd.DataFrame, title, color, filename = None, 
        output = OUTPUT, legend: str = "Postcount", val_col: str = None,
        trend: bool = None, seasonal: bool = None, residual: bool = None,
        agg_base: str = None):
    """Create dataframe(s) time plot"""
    x_ticks_every = 12
    fig_x = 15.7
    fig_y = 4.27
    font_mod = False
    x_label = "Month"
    linewidth = 3
    if agg_base and agg_base == "Y":
        x_ticks_every = 1
        fig_x = 3
        fig_y = 1.5
        font_mod = True
        x_label = "Year"
        linewidth = 1
    fig, ax = plt.subplots()
    fig.set_size_inches(fig_x, fig_y)
    ylabel = f'{legend}'
    if val_col is None:
        val_col = f'{legend.lower()}_est'
    ax = bar_plot_time(
        df=df, ax=ax, color=color, val_col=val_col, label=legend)
    # x axis ticker formatting
    tick_loc = mticker.MultipleLocator(x_ticks_every)
    ax.xaxis.set_major_locator(tick_loc)
    ax.tick_params(axis='x', rotation=45, length=0)
    ax.yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))
    ax.set(xlabel=x_label, ylabel=ylabel)
    ax.spines["left"].set_linewidth(0.25)
    ax.spines["bottom"].set_linewidth(0.25)
    ax.spines["top"].set_linewidth(0)
    ax.spines["right"].set_linewidth(0)
    ax.yaxis.set_tick_params(width=0.5)
    # remove legend
    ax.get_legend().remove()
    ax.set_title(title)
    ax.set_xlim(-0.5,len(df)-0.5)
    if font_mod:
        for item in (
            [ax.xaxis.label, ax.title, ax.yaxis.label] +
             ax.get_xticklabels() + ax.get_yticklabels()):
                item.set_fontsize(8)
    if any([trend, seasonal, residual]):
        # seasonal decompose
        decomposition = sm.tsa.seasonal_decompose(
            df[val_col], model='additive')
        # plot trend part only
        if trend:
            plt.plot(list(decomposition.trend), color='black',
                label="Trend", linewidth=linewidth, alpha=0.8)
        if seasonal:
            plt.plot(list(decomposition.seasonal), color='black', linestyle='dotted',
                label="Seasonal", linewidth=1, alpha=0.8)
        if residual:
            plt.plot(list(decomposition.resid), color='black', linestyle='dashed',
                label="Residual", linewidth=1, alpha=0.8)
        # trend.plot(ax=ax)
    # store figure to file
    if filename:
        fig.savefig(
            output / "figures" / f"{filename}.png", dpi=300, format='PNG',
            bbox_inches='tight', pad_inches=1, facecolor="white")
        # also save as svg
        fig.savefig(
            output / "svg" / f"{filename}.svg", format='svg',
            bbox_inches='tight', pad_inches=1, facecolor="white")
In [132]:
def load_and_plot(
        dataset: Path = None, metric: str = None, src_ref: str = "flickr", colors: cm.colors.ListedColormap = None,
        agg_base: str = None, trend: bool = None, return_df: bool = None, df_post: pd.DataFrame = None,):
    """Load data and plot"""
    if metric is None:
        metric = 'post_hll'
    if metric == 'post_hll':
        metric_label = 'postcount'
    else:
        metric_label = 'usercount'
    if colors is None:
        colors = sns.color_palette("vlag", as_cmap=True, n_colors=2)
        colors = colors([1.0])
    df = process_dataset(dataset, metric=metric, agg_base=agg_base, df_post=df_post)
    plot_time(
        df, legend=metric_label.capitalize(), color=colors,
        title=f'{src_ref.capitalize()} {metric_label} over time', 
        filename=f"temporal_{metric_label}_{src_ref}_absolute", trend=trend, agg_base=agg_base)
    if return_df:
        return df
In [22]:
colors = sns.color_palette("vlag", as_cmap=True, n_colors=2)
In [23]:
load_and_plot(FLICKR_ALL, src_ref=f"flickr_{AGG_BASE}", colors=colors([1.0]), agg_base=AGG_BASE, trend=False)

Plot Flickr user count

In [24]:
load_and_plot(FLICKR_ALL, src_ref=f"flickr_{AGG_BASE}", metric="user_hll", colors=colors([0.0]), agg_base=AGG_BASE, trend=False)

Repeat for iNaturalist data

In [25]:
load_and_plot(INATURALIST_ALL, src_ref=f"inaturalist_{AGG_BASE}", colors=colors([1.0]), agg_base=AGG_BASE, trend=False)
load_and_plot(INATURALIST_ALL, src_ref=f"inaturalist_{AGG_BASE}", metric="user_hll", colors=colors([0.0]), agg_base=AGG_BASE, trend=False)

Visualize chi for subquery

Load benchmark data

In [26]:
source_zip="https://opara.zih.tu-dresden.de/xmlui/bitstream/handle/123456789/5793/S10.zip?sequence=1&isAllowed=y"
FLICKR_SUNRISE = WORK_DIR / "flickr-sunrise-months.csv"
FLICKR_SUNSET = WORK_DIR / "flickr-sunset-months.csv"
In [27]:
if not (FLICKR_SUNRISE).exists():
    tools.get_zip_extract(
        uri=source_zip, filename="S10.zip", output_path=WORK_DIR,
        filter_files=["flickr-sunrise-months.csv", "flickr-sunset-months.csv"])
In [28]:
df_sunrise = read_csv_datetime(FLICKR_SUNRISE)
df_sunrise = append_cardinality_df(df_sunrise, f'{METRIC}_hll', f'{METRIC}count_est')
df_sunrise = filter_fill_time(df_sunrise, 2007, 2018, f'{METRIC}count_est', agg_base=AGG_BASE)
In [29]:
plot_time(
    df_sunrise, legend=f"{METRIC.capitalize()}count", color=colors([0.0]),
    title=f'Flickr {METRIC}count over time for sunrise related posts', 
    filename=f"temporal_{METRIC}count_flickr_sunrise_{AGG_BASE}", trend=False, agg_base=AGG_BASE)

At the time of writing the sunset-sunrise paper, the data was only collected up to 2018.

Still, the graphic above shows the same seasonal patterns as the graph for all Flickr photographs. Below, we normalize the sunrise graph using chi.
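The normalization factor used for this is the ratio of total expected to total observed counts, norm_val = Σ expected / Σ observed (see calc_norm() below); for the 2007–2018 sunrise subset this yields norm_val ≈ 345.6, meaning each observed sunrise count is scaled up by that factor before it is compared with the expected counts.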

Limit the dataframe for all posts over time to 2018, too.

In [30]:
if METRIC == 'post':
    df_expected = df_post
else:
    df_expected = df_user
In [31]:
df_expected = filter_fill_time(df_expected, 2007, 2018, f'{METRIC}count_est', agg_base=AGG_BASE)

Prepare Chi

This is adapted from notebook three of the original publication.

First, define the input parameters:

  • dof: the degrees of freedom, calculated as (variables − 1), with the variables being observed and expected posts
  • chi_crit_val: given a dof of 1 and a significance level of 0.05, the critical value for rejecting the H0 is 3.84
  • chi_column: the chi calculation is based on the column selected via METRIC above (postcount here), but this notebook can also be run for usercount or userdays
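Putting these together, the signed chi value computed in chi_calc() below is

chi_value = (observed · norm_val − expected) / √expected

with norm_val as defined above; a time bin is flagged as significant when |chi_value| > CHI_CRIT_VAL.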
In [32]:
DOF = 1
CHI_CRIT_VAL = 3.84
CHI_COLUMN: str = f"{METRIC}count_est"
In [33]:
def calc_norm(
    df_expected: pd.DataFrame,
    df_observed: pd.DataFrame,
    chi_column: str = CHI_COLUMN):
    """Fetch the number of data points for the observed and 
    expected dataset by the relevant column
    and calculate the normalisation value
    """
    v_expected = df_expected[chi_column].sum()
    v_observed = df_observed[chi_column].sum()
    norm_val = (v_expected / v_observed)
    return norm_val
In [34]:
norm_val = calc_norm(df_expected, df_sunrise)
print(norm_val)
345.5817441940252

Merge dataframes

In [35]:
rename_expected = {
    'postcount_est':'postcount_est_expected',
    'usercount_est':'usercount_est_expected',
    }
df_expected.rename(
    columns=rename_expected,
    inplace=True)
In [36]:
merge_cols = [f'{METRIC}count_est']
df_expected_observed = df_expected.merge(
    df_sunrise[merge_cols],
    left_index=True, right_index=True)

Preview

In [37]:
df_expected_observed.head()
Out[37]:
postcount_est_expected postcount_est
datetime
2007-12-31 16919501.0 38767.0
2008-12-31 21894775.0 50264.0
2009-12-31 24366233.0 60505.0
2010-12-31 27459331.0 82934.0
2011-12-31 33458926.0 97462.0

Calculate the Chi-value

In [38]:
def chi_calc(x_observed: float, x_expected: float, x_normalized: float) -> pd.Series:
    """Apply chi calculation based on observed (normalized) and expected value"""
    value_observed_normalised = x_observed * x_normalized
    a = value_observed_normalised - x_expected
    b = math.sqrt(x_expected)
    # native division with division by zero protection
    chi_value = a / b if b else 0
    return chi_value
    
def apply_chi_calc(
        df: pd.DataFrame, norm_val: float,
        chi_column: str = CHI_COLUMN, chi_crit_val: float = CHI_CRIT_VAL):
    """Calculate chi-values based on two GeoDataFrames (expected and observed values)
    and return new grid with results"""
    # lambda: apply function chi_calc() to each item
    df['chi_value'] = df.apply(
        lambda x: chi_calc(
           x[chi_column],
           x[f'{chi_column}_expected'],
           norm_val),
        axis=1)
    # add significant column, default False
    df['significant'] = False
    # calculate significance for both negative and positive chi_values
    df.loc[np.abs(df['chi_value'])>chi_crit_val, 'significant'] = True

Apply calculation

In [39]:
%%time
apply_chi_calc(
    df=df_expected_observed,
    norm_val=norm_val)
CPU times: user 1.45 ms, sys: 737 µs, total: 2.18 ms
Wall time: 2.14 ms

Visualize chi

In [40]:
df_expected_observed.head()
Out[40]:
postcount_est_expected postcount_est chi_value significant
datetime
2007-12-31 16919501.0 38767.0 -856.321210 True
2008-12-31 21894775.0 50264.0 -966.932046 True
2009-12-31 24366233.0 60505.0 -700.295376 True
2010-12-31 27459331.0 82934.0 229.218991 True
2011-12-31 33458926.0 97462.0 38.407292 True
In [41]:
plot_time(
    df_expected_observed, legend="Signed Chi", val_col="chi_value", color=colors([0.0]),
    title=f'Flickr Chi for "Sunrise"-related {METRIC}count over time', 
    filename=f"temporal_chi_flickr_{METRIC}count_sunrise_{AGG_BASE}", trend=False, agg_base=AGG_BASE)

Repeat the same for sunset posts worldwide

In [42]:
df_sunset = read_csv_datetime(FLICKR_SUNSET)
df_sunset = append_cardinality_df(df_sunset, f'{METRIC}_hll', f'{METRIC}count_est')
df_sunset = filter_fill_time(df_sunset, 2007, 2018, f'{METRIC}count_est', agg_base=AGG_BASE)
In [43]:
plot_time(
    df_sunset, legend=f"{METRIC.capitalize()}count", color=colors([1.0]),
    title=f'Flickr {METRIC}count over time for sunset related posts', 
    filename=f"temporal_{METRIC}count_flickr_sunset_{AGG_BASE}", trend=False, agg_base=AGG_BASE)
In [44]:
norm_val = calc_norm(
    df_expected.rename(columns={f'{METRIC}count_est_expected':f'{METRIC}count_est'}, inplace=False),
    df_sunset)
print(norm_val)
120.94355348791775
In [45]:
merge_cols = [f'{METRIC}count_est']
df_expected_observed = df_expected.merge(
    df_sunset[merge_cols],
    left_index=True, right_index=True)
In [46]:
%%time
apply_chi_calc(
    df=df_expected_observed,
    norm_val=norm_val)
CPU times: user 2.09 ms, sys: 0 ns, total: 2.09 ms
Wall time: 2.05 ms
In [47]:
plot_time(
    df_expected_observed, legend="Signed Chi", val_col="chi_value", color=colors([1.0]),
    title=f'Flickr Chi for "Sunset"-related {METRIC}count over time', 
    filename=f"temporal_chi_flickr_{METRIC}count_sunset_{AGG_BASE}", trend=False, agg_base=AGG_BASE)

Visualize Chi for subquery "Milvus milvus" and "Bloom"

Select topic parameter below

In [48]:
topic = "milvusmilvus"
# topic = "bloom"
In [49]:
FLICKR_SUBQUERY = OUTPUT / f"flickr_{topic}_months.csv"
In [50]:
%%time
df_post = read_csv_datetime(FLICKR_ALL)
df_post = append_cardinality_df(df_post, 'post_hll', 'postcount_est')
df_post = filter_fill_time(df_post, 2007, 2020, 'postcount_est', max_month="08", agg_base=AGG_BASE)
df_expected = df_post
CPU times: user 23.3 ms, sys: 9.67 ms, total: 32.9 ms
Wall time: 49.5 ms
In [51]:
df_subquery = read_csv_datetime(FLICKR_SUBQUERY)
df_subquery = append_cardinality_df(df_subquery, f'{METRIC}_hll', f'{METRIC}count_est')
df_subquery = filter_fill_time(df_subquery, 2007, 2020, f'{METRIC}count_est', max_month="08", agg_base=AGG_BASE)
In [52]:
plot_time(
    df_subquery, legend=f"{METRIC.capitalize()}count", color=colors([1.0]),
    title=f'Flickr {METRIC}count over time for {topic.capitalize()} related posts', 
    filename=f"temporal_{METRIC}count_flickr_{topic}_{AGG_BASE}", trend=False, agg_base=AGG_BASE)
In [53]:
df_expected.rename(columns={f'{METRIC}count_est_expected':f'{METRIC}count_est'}, inplace=True)
norm_val = calc_norm(
    df_expected,
    df_subquery)
print(norm_val)
df_expected.rename(columns={f'{METRIC}count_est':f'{METRIC}count_est_expected'}, inplace=True)
14384.199366802352
In [54]:
df_expected.head()
Out[54]:
postcount_est_expected
datetime
2007-12-31 16919501.0
2008-12-31 21894775.0
2009-12-31 24366233.0
2010-12-31 27459331.0
2011-12-31 33458926.0
In [55]:
merge_cols = [f'{METRIC}count_est']
df_expected_observed_flickr = df_expected.merge(
    df_subquery[merge_cols],
    left_index=True, right_index=True)
In [56]:
%%time
apply_chi_calc(
    df=df_expected_observed_flickr,
    norm_val=norm_val)
CPU times: user 2.21 ms, sys: 0 ns, total: 2.21 ms
Wall time: 2.17 ms
In [57]:
plot_time(
    df_expected_observed_flickr, legend="Signed Chi", val_col="chi_value", color=colors([1.0]),
    title=f'Flickr Chi for "{topic.capitalize()}"-related {METRIC}count over time', 
    filename=f"temporal_chi_flickr_{METRIC}count_{topic}_{AGG_BASE}", trend=False, seasonal=False, residual=False, agg_base=AGG_BASE)

iNaturalist Milvus milvus chi

In [58]:
src = Path.cwd().parents[0] / "00_data" / "milvus" / "observations-350501.csv"
In [59]:
load_inat_kwargs = {
    "filepath_or_buffer":src,
    "index_col":'datetime', 
    "parse_dates":{'datetime':["observed_on"]},
    "date_format":'%Y-%m-%d',
    "keep_date_col":'False',
    "usecols":["id", "observed_on"]
}
df = pd.read_csv(**load_inat_kwargs)
In [60]:
df.drop(columns=['observed_on'], inplace=True)
In [61]:
df.head()
Out[61]:
id
datetime
2010-05-22 7793
2011-01-01 9495
2011-01-01 9496
2011-04-17 14694
2011-10-05 50134
In [62]:
df_milvus = filter_fill_time(
    df, 2007, 2022, val_col="id", agg_base=AGG_BASE, agg_method="count")
In [63]:
df_milvus.rename(columns={"id": "observations"}, inplace=True)
In [64]:
metric_label="observations"
src_ref="iNaturalist Milvus milvus"
In [65]:
plot_time(
        df_milvus, legend=metric_label.capitalize(), color=colors([1.0]),
        title=f'{src_ref.capitalize()} {metric_label} over time', val_col=metric_label,
        filename=f"temporal_iNaturalist_milvusmilvus_absolute", trend=False, agg_base=AGG_BASE)
In [66]:
df_expected = load_and_plot(
    INATURALIST_ALL, src_ref=f"inaturalist_{AGG_BASE}", colors=colors([1.0]),
    agg_base=AGG_BASE, trend=False, return_df=True)
In [67]:
df_expected.head()
Out[67]:
postcount_est
datetime
2007-12-31 114001.0
2008-12-31 136471.0
2009-12-31 168795.0
2010-12-31 202430.0
2011-12-31 245601.0
In [68]:
norm_val = calc_norm(
    df_expected.rename(columns={f'{METRIC}count_est':f'observations'}, inplace=False),
    df_milvus, chi_column = "observations")
print(norm_val)
3098.4659662808253
In [69]:
df_expected.rename(columns={f'{METRIC}count_est':f'observations_expected'}, inplace=True)
In [70]:
df_expected_observed_inat = df_expected.merge(
    df_milvus["observations"],
    left_index=True, right_index=True)
In [71]:
%%time
apply_chi_calc(
    df=df_expected_observed_inat,
    norm_val=norm_val, chi_column="observations")
CPU times: user 1.44 ms, sys: 717 µs, total: 2.15 ms
Wall time: 2.13 ms
In [72]:
plot_time(
    df_expected_observed_inat, legend="Signed Chi", val_col="chi_value", color=colors([1.0]),
    title=f'iNaturalist Chi for "{topic.capitalize()}"-related {METRIC}count over time', 
    filename=f"temporal_chi_inaturalist_observations_milvus_{AGG_BASE}", trend=False, 
    seasonal=False, residual=False, agg_base=AGG_BASE)

Get correlation between time and chi

The Flickr chi series was only computed through 2020; temporarily fill the two missing years so that both series cover the same 2007–2022 range.

In [73]:
# placeholder chi values for the two years missing from the Flickr series
df_expected_observed_flickr.loc["2021-12-31", "chi_value"] = 3059.0
df_expected_observed_flickr.loc["2022-12-31", "chi_value"] = 3059.0
In [74]:
x = df_expected_observed_inat[f"chi_value"]
y = df_expected_observed_flickr[f"chi_value"]
In [75]:
from scipy import stats
r, p = stats.pearsonr(x, y)
# covariance
cov = np.cov(x, y)[0][1]
correlation_matrix = np.corrcoef(x, y)
correlation_xy = correlation_matrix[0, 1]
r_squared = correlation_xy**2
print(f"Statistics:\nr={r:.2f},")
print(f"p={p:.2g},")
print(f"cov={cov:.2f}")
print(f"r²={r_squared:.2f}")
Statistics:
r=0.15,
p=0.57,
cov=107430.15
r²=0.02
In [76]:
nas = np.logical_or(x.isna(), y.isna())
r, p = stats.spearmanr(x[~nas], y[~nas])
print(f"Statistics:\nr={r:.2f},")
print(f"p={p:.2g},")
Statistics:
r=0.13,
p=0.64,

This is a weak and statistically non-significant relationship between the iNaturalist and Flickr chi values; the ranked (Spearman) statistics above confirm the product-moment result.

iNaturalist Milvus milvus chi (range)

In [77]:
CRS_WGS = "epsg:4326" # WGS1984
CRS_PROJ = "esri:54009" # Mollweide
In [78]:
load_inat_kwargs["usecols"] = ["id", "observed_on", "longitude", "latitude"]
df = pd.read_csv(**load_inat_kwargs)
In [79]:
df.dropna(subset=['longitude', 'latitude'], inplace=True)
In [80]:
import geopandas as gp
milvus_range = gp.read_file(
    OUTPUT/ 'Milvusmilvus_range.gpkg', layer='Milvus milvus')
In [81]:
gdf = gp.GeoDataFrame(
    df, geometry=gp.points_from_xy(df.longitude, df.latitude), crs=CRS_WGS)

Intersect, keep only observations within range.

In [82]:
gdf_overlay = gp.overlay(
    gdf, milvus_range,
    how='intersection')
In [83]:
ax = gdf_overlay.plot(markersize=.1)
ax.axis('off')
plt.show()

Calculate chi

In [84]:
gdf_overlay
Out[84]:
id observed_on latitude longitude Name Description geometry
0 7793 2010-05-22 52.440923 -3.953872 POINT (-3.95387 52.44092)
1 9495 2011-01-01 51.185595 -1.212616 POINT (-1.21262 51.18559)
2 9496 2011-01-01 51.280226 -0.925083 POINT (-0.92508 51.28023)
3 14694 2011-04-17 51.814146 -0.192910 POINT (-0.19291 51.81415)
4 50348 2011-08-03 50.406329 8.873863 POINT (8.87386 50.40633)
... ... ... ... ... ... ... ...
19067 178281936 2023-07-02 47.661184 8.854844 POINT (8.85484 47.66118)
19068 178325879 2023-07-29 51.047883 10.996382 POINT (10.99638 51.04788)
19069 178326364 2013-06-09 51.454250 -0.884408 POINT (-0.88441 51.45425)
19070 178333330 2023-08-12 47.961332 16.769473 POINT (16.76947 47.96133)
19071 178333356 2023-08-13 45.129853 7.695633 POINT (7.69563 45.12985)

19072 rows × 7 columns

In [85]:
gdf_overlay['datetime'] = pd.to_datetime(gdf_overlay["observed_on"], format=load_inat_kwargs.get("date_format"))
gdf_overlay.set_index('datetime', inplace=True)
gdf_cleaned = tools.drop_cols_except(gdf_overlay, ["id"], inplace=False)
In [86]:
df_milvus_range = filter_fill_time(
    gdf_cleaned, 2007, 2022, val_col="id", agg_base=AGG_BASE, agg_method="count")
In [87]:
df_milvus_range.rename(columns={"id": "observations"}, inplace=True)
In [88]:
metric_label="observations"
src_ref="iNaturalist Milvus milvus"
In [89]:
plot_time(
        df_milvus_range, legend=metric_label.capitalize(), color=colors([1.0]),
        title=f'{src_ref.capitalize()} {metric_label} over time', val_col=metric_label,
        filename=f"temporal_iNaturalist_milvusmilvus_absolute", trend=False, agg_base=AGG_BASE)
In [90]:
df_expected = load_and_plot(
    INATURALIST_ALL_MILVUSRANGE, src_ref=f"inaturalist_{AGG_BASE}", colors=colors([1.0]),
    agg_base=AGG_BASE, trend=False, return_df=True)
In [91]:
norm_val = calc_norm(
    df_expected.rename(columns={f'{METRIC}count_est':f'observations'}, inplace=False),
    df_milvus_range, chi_column = "observations")
print(norm_val)
383.41553242594074
In [92]:
df_expected.rename(columns={f'{METRIC}count_est':f'observations_expected'}, inplace=True)
In [93]:
df_expected_observed_inat = df_expected.merge(
    df_milvus_range["observations"],
    left_index=True, right_index=True)
In [94]:
%%time
apply_chi_calc(
    df=df_expected_observed_inat,
    norm_val=norm_val, chi_column="observations")
CPU times: user 1.82 ms, sys: 644 µs, total: 2.46 ms
Wall time: 3.01 ms
In [95]:
plot_time(
    df_expected_observed_inat, legend="Signed Chi", val_col="chi_value", color=colors([1.0]),
    title=f'iNaturalist Chi for "{topic.capitalize()}"-related {METRIC}count over time', 
    filename=f"temporal_chi_inaturalist_observations_milvus_{AGG_BASE}", trend=False, 
    seasonal=False, residual=False, agg_base=AGG_BASE)

Focus region

Let's check the total iNaturalist and Flickr data for the focus region.

In [96]:
df_expected = load_and_plot(
    INATURALIST_FLICKR_ALL_FOCUS, src_ref=f"iNaturalist + Flickr", colors=colors([1.0]),
    agg_base=AGG_BASE, trend=False, return_df=True)
In [97]:
df_post = read_csv_datetime(INATURALIST_FLICKR_ALL_FOCUS)
In [98]:
df_post.head()
Out[98]:
origin_id post_hll user_hll
datetime
2007-01-01 2 \x148b7f00c24000410800218441100000048208422020... \x138b7f00610223028202c2046504a2058206c2094109...
2007-02-01 2 \x148b7f190a608c801080308c8210c81084401908300c... \x138b7f00040465062106e209c10a430aa10b040d220e...
2007-03-01 2 \x148b7f20c21184031942208481210461086120021094... \x138b7f00040043008101c1028203c107c109c10aa30b...
2007-03-01 23 \x128b7fbf7dc7d226e191ba4f4af095ed1ac23b \x128b7fb6f0f487e76c9f29
2007-04-01 2 \x148b7f18c422896300863204240a0e01902200023100... \x138b7f000401e10282032203c103e20404080808c409...

Process independently

In [99]:
source_names = {
    2: "Flickr", 23: "iNaturalist"}
In [100]:
vis_df = []
for idx, name in source_names.items():
    sel_df = df_post[df_post["origin_id"]==idx].copy()
    vis_df.append(process_dataset(df_post=sel_df, agg_base=AGG_BASE))
In [101]:
vis_df[0].tail(5)
Out[101]:
postcount_est
datetime
2018-12-31 49715.0
2019-12-31 42560.0
2020-12-31 38108.0
2021-12-31 38546.0
2022-12-31 13878.0

Merge into single dataframe

In [102]:
stacked_df = pd.DataFrame()
for source_name, df in zip(source_names.values(), vis_df):
    stacked_df[source_name] = df
    stacked_df.index = df.index
In [103]:
stacked_df.head()
Out[103]:
Flickr iNaturalist
datetime
2007-12-31 45862.0 94.0
2008-12-31 73364.0 25.0
2009-12-31 75420.0 286.0
2010-12-31 82369.0 151.0
2011-12-31 107369.0 179.0
In [104]:
stacked_df.tail()
Out[104]:
Flickr iNaturalist
datetime
2018-12-31 49715.0 9514.0
2019-12-31 42560.0 17669.0
2020-12-31 38108.0 33545.0
2021-12-31 38546.0 30029.0
2022-12-31 13878.0 37064.0

The method below is largely adapted from 05_hotspots.ipynb.

In [117]:
BAR_PARAM = {
    "width":1.0,
    "label":"Flickr (comparison)",
    "edgecolor":"black",
    "linewidth":0.5,
    "alpha":0.6,
}

def plot_bars(
        df: pd.DataFrame, ax: plt.axes = None, title: str = None, 
        ytitle: float = None, padtitle: float = None, legend: bool = None,
        bar_param: Dict[str, Any] = BAR_PARAM, title_legend: str = None,
        xlegend: float = None, ylegend: float = None, lang: str = None):
    """Plot stacked bars from a DataFrame with multiple columns"""
    colors = sns.color_palette("vlag", as_cmap=True, n_colors=2)
    if lang is None:
        lang = "en"
    # create figure (or reuse the figure of a provided axis)
    if not ax:
        fig, ax = plt.subplots(1, 1, figsize=(3, 1.5))
    else:
        fig = ax.get_figure()
    # colors = sns.color_palette("bright", as_cmap=True, n_colors=2)
    # Create another axis that shares the same x-axis as ax.
    ax2 = ax.twinx()
    # plot yearly sums;
    # zeros are replaced with NaN to exclude empty months from the aggregation
    df.Flickr.replace(0, np.NaN) \
        .groupby(df.index.year, dropna=True) \
        .sum() \
        .plot(kind='bar', ax=ax, color="white", **bar_param)
    # bar_param["alpha"] = 1
    bar_param["label"] = "iNaturalist"
    bar_param["edgecolor"] = "white"
    df.iNaturalist.replace(0, np.NaN) \
        .groupby(df.index.year, dropna=True) \
        .sum() \
        .plot(kind='bar', ax=ax2, color=colors([1.0]), **bar_param)    
    # format
    ax.set_xlim(-0.5,len(df)-0.5)
    for axis in [ax, ax2]:
        axis.tick_params(axis='x', rotation=45, length=0) # length: of ticks
        axis.spines["left"].set_linewidth(0.25)
        axis.spines["bottom"].set_linewidth(0.25)
        axis.spines["top"].set_linewidth(0)
        axis.spines["right"].set_linewidth(0.25)
        axis.yaxis.set_tick_params(width=0.5)
        axis.set(xlabel="")
    ax.set(ylabel="Flickr")
    ax2.set(ylabel="iNaturalist")
    leg_kwarg = {
        "bbox_to_anchor":(0,1.2),
        "loc":'upper left', 
        "fontsize":8, 
        "frameon":False, 
        "title":title_legend, 
        "title_fontsize":8
        }
    ax.legend(**leg_kwarg)
    leg_kwarg["bbox_to_anchor"]=(0.53,1.2)
    # ax2.legend(**leg_kwarg)
    if not title:
        title = "Year"
    if ytitle is None:
        ytitle =-0.2
    if padtitle is None:
        padtitle=-14
    ax.set_title(title, y=ytitle, pad=padtitle)
    for axis in [ax, ax2]:
        for item in (
            [axis.xaxis.label, axis.title, axis.yaxis.label] +
             axis.get_xticklabels() + axis.get_yticklabels()):
                item.set_fontsize(8)
    tools.save_fig(fig, output=OUTPUT, name="focus_region_all")
In [118]:
plot_bars(df=stacked_df, legend=True)

Focus Region Chi

Get expected:

In [134]:
df_post = read_csv_datetime(INATURALIST_FLICKR_ALL_FOCUS)

Limit to iNaturalist (origin_id 23)

In [135]:
mask = df_post["origin_id"]==23
In [123]:
df_post.loc[mask].head()
Out[123]:
origin_id post_hll user_hll
datetime
2007-03-01 23 \x128b7fbf7dc7d226e191ba4f4af095ed1ac23b \x128b7fb6f0f487e76c9f29
2007-04-01 23 \x128b7f86f3dc507a1d08e3b32ffece828a24cf1420d9... \x128b7f8435ffad9e0a5bad57fa9b70f8063f22
2007-05-01 23 \x128b7face0cb648f192b57b57f5ee3464eb317cd0dd2... \x128b7fb6f0f487e76c9f29bfbf495bc9e6f7323a8805...
2007-06-01 23 \x128b7fb849f8f7280bc6bac6e3fcbb5cfa3b5ae2aace... \x128b7fb555d813ab76dee6fc4ee09b0d014f9e
2007-07-01 23 \x128b7f83d715533b4e504f88ae402108b4f9d6891b19... \x128b7fb6f0f487e76c9f29d4d6fd03d4c929f2f5b994...
In [137]:
df_expected = load_and_plot(
    df_post=df_post.loc[mask].copy(), src_ref=f"iNaturalist + Flickr", colors=colors([1.0]),
    agg_base=AGG_BASE, trend=False, return_df=True)

Get observed:

In [124]:
geom_saxonya_bbox = box(10.52659544, 50.90971941, 13.30916778, 53.0603523)  # shapely box(minx, miny, maxx, maxy), WGS84
In [125]:
gdf_focus_inat = gp.clip(gdf_overlay, mask=geom_saxonya_bbox)
In [126]:
ax = gdf_focus_inat.plot(markersize=.1, color="red")
ax.axis('off')
plt.show()

Cleanup: Drop all cols except id

In [127]:
gdf_cleaned = tools.drop_cols_except(gdf_focus_inat, ["id"], inplace=False)
In [128]:
df_milvus_focus_inat = filter_fill_time(
    gdf_cleaned, 2007, 2022, val_col="id", agg_base=AGG_BASE, agg_method="count")
In [129]:
df_milvus_focus_inat.rename(columns={"id": "observations"}, inplace=True)
df_milvus_focus_inat.head()
Out[129]:
observations
datetime
2007-12-31 4
2008-12-31 0
2009-12-31 0
2010-12-31 2
2011-12-31 1

Get observed for Flickr

df_post = read_csv_datetime(INATURALIST_FLICKR_MILVUS_FOCUS)
df_post.head()
df_milvus_focus_flickr = filter_fill_time(
    df_post, 2007, 2022, val_col="post_guid", agg_base=AGG_BASE, agg_method="count")
df_milvus_focus_flickr.rename(columns={"post_guid": "observations"}, inplace=True)

Union Flickr + iNat (kept for reference; only the iNaturalist observations are used below)

df_milvus_focus = pd.concat(
    [df_milvus_focus_inat, df_milvus_focus_flickr],
    axis=1).sum(axis=1).rename("observations").to_frame()
In [138]:
df_milvus_focus = df_milvus_focus_inat
In [146]:
df_milvus_focus.head(10)
Out[146]:
observations
datetime
2007-12-31 4
2008-12-31 0
2009-12-31 0
2010-12-31 2
2011-12-31 1
2012-12-31 9
2013-12-31 5
2014-12-31 7
2015-12-31 7
2016-12-31 34
In [140]:
norm_val = calc_norm(
    df_expected.rename(columns={f'{METRIC}count_est':f'observations'}, inplace=False),
    df_milvus_focus, chi_column = "observations")
print(norm_val)
184.80053547523428
In [141]:
df_expected.rename(columns={f'{METRIC}count_est':f'observations_expected'}, inplace=True)
In [142]:
df_expected_observed = df_expected.merge(
    df_milvus_focus["observations"],
    left_index=True, right_index=True)
In [143]:
%%time
apply_chi_calc(
    df=df_expected_observed,
    norm_val=norm_val, chi_column="observations")
CPU times: user 1.64 ms, sys: 625 µs, total: 2.27 ms
Wall time: 2.23 ms
In [144]:
df_expected_observed
Out[144]:
observations_expected observations chi_value significant
datetime
2007-12-31 94.0 4 66.547520 True
2008-12-31 25.0 0 -5.000000 True
2009-12-31 286.0 0 -16.911535 True
2010-12-31 151.0 2 17.789503 True
2011-12-31 179.0 1 0.433552 False
2012-12-31 189.0 9 107.232622 True
2013-12-31 185.0 5 54.332557 True
2014-12-31 710.0 7 21.902258 True
2015-12-31 1713.0 7 -10.133182 True
2016-12-31 2848.0 34 64.370123 True
2017-12-31 3845.0 40 57.202583 True
2018-12-31 9514.0 51 -0.914219 False
2019-12-31 17669.0 92 -5.020514 True
2020-12-31 33545.0 166 -15.659651 True
2021-12-31 30029.0 154 -9.058391 True
2022-12-31 37064.0 175 -24.537207 True
In [145]:
plot_time(
    df_expected_observed, legend="Signed Chi", val_col="chi_value", color=colors([1.0]),
    title=f'Chi for "{topic.capitalize()}"-related {METRIC}count over time (Flickr+iNat)', 
    filename=f"temporal_chi_flickr-inat_observations_milvus_focus_{AGG_BASE}", trend=False, 
    seasonal=False, residual=False, agg_base=AGG_BASE)

Create notebook HTML

In [363]:
!jupyter nbconvert --to html_toc \
    --output-dir=../resources/html/ ./01_temporal_chi.ipynb \
    --output 01_temporal_chi_{AGG_BASE.lower()} \
    --template=../nbconvert.tpl \
    --ExtractOutputPreprocessor.enabled=False >&- 2>&-