Temporal chi visualization

Alexander Dunkel, Institute of Cartography, TU Dresden

Out[57]:

Last updated: Aug-08-2023, Carto-Lab Docker Version 0.14.0

Chi visualization of temporal patterns for ephemeral events. This notebook is a continuation of a previous publication.

Data sources used:

  • Flickr
  • iNaturalist
  • Reddit

Preparations

In [34]:
import sys, os
import math
import numpy as np
import pandas as pd
import psycopg2
import statsmodels.api as sm
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
from matplotlib.axes import Axes 
from matplotlib import cm
from typing import Tuple
from pathlib import Path
from python_hll.hll import HLL
from python_hll.util import NumberUtil
module_path = str(Path.cwd().parents[0] / "py")
if module_path not in sys.path:
    sys.path.append(module_path)
from modules.base import tools, hll
In [3]:
OUTPUT = Path.cwd().parents[0] / "out"       # output directory for figures (etc.)
WORK_DIR = Path.cwd().parents[0] / "tmp"     # Working directory
In [4]:
(OUTPUT / "figures").mkdir(exist_ok=True)
(OUTPUT / "svg").mkdir(exist_ok=True)
WORK_DIR.mkdir(exist_ok=True)
In [5]:
%load_ext autoreload
%autoreload 2

Load HLL aggregate data

Load the data from CSV files generated in the previous notebook. The data is stored as aggregate HLL data (postcount, usercount) for each month.

In [6]:
FLICKR_ALL = OUTPUT / "flickr_all_months.csv"
INATURALIST_ALL = OUTPUT / "inaturalist_all_months.csv"
In [7]:
%%time
data_files = {
    "FLICKR_ALL":FLICKR_ALL,
    "INATURALIST_ALL":INATURALIST_ALL,
    }
tools.display_file_stats(data_files)
name       FLICKR_ALL   INATURALIST_ALL
size       941.67 KB    989.88 KB
records    208          198
CPU times: user 377 ms, sys: 29.7 ms, total: 407 ms
Wall time: 405 ms
In [8]:
pd.read_csv(FLICKR_ALL, nrows=10)
Out[8]:
year month post_hll user_hll
0 1970 1 \x138b4006e432653ea153e15a835e4168e279e1ad21dd... \x138b40da21
1 2005 3 \x138b40308549657da28201ee41f1e1 \x138b40e1e1
2 2005 6 \x138b4046c1 \x138b4025a1
3 2005 7 \x138b406ea3baa1 \x138b4000041ea2
4 2005 8 \x138b4027625181b9e1 \x138b4039a1cd23f101
5 2005 9 \x138b4027a23564506157638482ae43b8a2 \x138b4000041722a9e3
6 2005 10 \x138b4013a11921206130e53101372143c4538158816d... \x138b4017229943ac61b541ba62e541e663
7 2005 11 \x138b4002c21ae32501330236c1474347c24c624f6363... \x138b402d813de145e28621b841
8 2005 12 \x138b4002c103a1054306a207010bc60ee111c2168217... \x138b40000406e12d813de15ee1a862cda4e541
9 2006 1 \x138b400041018105c307010ec31ac12b612cc22dc13b... \x138b402b212d813a01b841cbe1d624e541ea01f945fcc2
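
The post_hll and user_hll columns store hex-encoded HLL sets. Cardinality estimation below is delegated to a Postgres HLL worker; as an illustrative aside, the python_hll package imported above could also decode a single value client-side. A minimal sketch, assuming python_hll's HLL/NumberUtil API:

# illustration only: client-side cardinality estimate for one HLL value
df_raw = pd.read_csv(FLICKR_ALL, nrows=1)
hll_hex = df_raw.loc[0, "user_hll"][2:]  # strip the leading "\x"
hll_set = HLL.from_bytes(NumberUtil.from_hex(hll_hex, 0, len(hll_hex)))
print(hll_set.cardinality())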

Connect to hll worker db

In [9]:
DB_USER = "hlluser"
DB_PASS = os.getenv('READONLY_USER_PASSWORD')
# set connection variables
DB_HOST = "hllworkerdb"
DB_PORT = "5432"
DB_NAME = "hllworkerdb"

Connect to empty Postgres database running HLL Extension:

In [10]:
DB_CONN = psycopg2.connect(
        host=DB_HOST,
        port=DB_PORT,
        dbname=DB_NAME,
        user=DB_USER,
        password=DB_PASS
)
DB_CONN.set_session(readonly=True)
DB_CALC = tools.DbConn(DB_CONN)
CUR_HLL = DB_CONN.cursor()

Test connection
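
A quick sanity check, assuming the hll extension is installed in the connected database (this cell is an illustrative addition, not part of the original workflow):

# hll_empty() returns an empty HLL set; its cardinality should be 0
CUR_HLL.execute("SELECT hll_cardinality(hll_empty());")
print(CUR_HLL.fetchone()[0])  # expected output: 0.0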

Calculate HLL Cardinality per month

Define additional functions for reading and formatting CSV data as a pd.DataFrame.

In [15]:
from datetime import datetime

def read_csv_datetime(csv: Path) -> pd.DataFrame:
    """Read CSV and parse datetime index (months)
    
        First CSV column: Year
        Second CSV column: Month
    """
    date_cols = ["year", "month"]
    df = pd.read_csv(
        csv, index_col='datetime', 
        parse_dates={'datetime':date_cols},
        date_format='%Y %m',
        keep_date_col=True)  # expects a bool, not the string 'False'
    # drop the original year/month columns; only the parsed index is needed
    df.drop(columns=date_cols, inplace=True)
    return df
    
def append_cardinality_df(
        df: pd.DataFrame, hll_col: str = "post_hll",
        cardinality_col: str = 'postcount_est') -> pd.DataFrame:
    """Calculate cardinality from HLL and append as extra column to df"""
    # each row triggers one hll_cardinality() query on the worker db
    df[cardinality_col] = df.apply(
        lambda x: hll.cardinality_hll(
           x[hll_col], CUR_HLL),
        axis=1)
    df.drop(columns=[hll_col], inplace=True)
    return df

def filter_fill_time(
        df: pd.DataFrame, min_year: int, 
        max_year: int, val_col: str = "postcount_est",
        min_month: int = 1, max_month: int = 1):
    """Filter time values between min and max year and fill missing values"""
    min_date = pd.Timestamp(f'{min_year}-{min_month}-01')
    max_date = pd.Timestamp(f'{max_year}-{max_month}-01')
    # clip by start and end date
    if min_date not in df.index:
        df.loc[min_date, val_col] = 0
    if max_date not in df.index:
        df.loc[max_date, val_col] = 0
    df.sort_index(inplace=True)
    # mask between min and max date
    time_mask = ((df.index >= min_date) & (df.index <= max_date))
    # fill missing months with 0
    # this will also set the day to the max of each month
    series = df.loc[time_mask][val_col].resample('M').sum().fillna(0)
    return series.to_frame()
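
A minimal sketch (toy values, not from the dataset) of how filter_fill_time() handles gaps: resample('M') snaps the index to month ends and inserts the missing February, which the subsequent sum/fill turns into 0.

# toy frame with January and March present, February missing
toy = pd.DataFrame(
    {"postcount_est": [5, 7]},
    index=[pd.Timestamp("2007-01-01"), pd.Timestamp("2007-03-01")])
filter_fill_time(toy, 2007, 2007, max_month=3)
# -> 2007-01-31: 5, 2007-02-28: 0, 2007-03-31: 7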

Select dataset to process below

Apply functions to all data sets:

  • read from CSV
  • calculate cardinality
  • merge year and month into a single column
  • filter to the default 2007–2022 range and fill missing values
In [29]:
def process_dataset(
        dataset: Path, metric: str = None, 
        min_year: int = None, max_year: int = None) -> pd.DataFrame:
    """Apply temporal filter/pre-processing to all data sets."""
    if metric is None:
        metric = 'post_hll'
    if metric == 'post_hll':
        cardinality_col = 'postcount_est'
    else:
        cardinality_col = 'usercount_est'
    if min_year is None:
        min_year = 2007
    if max_year is None:
        max_year = 2022
    df_post = read_csv_datetime(dataset)
    df_post = append_cardinality_df(df_post, metric, cardinality_col)
    return filter_fill_time(df_post, min_year, max_year, cardinality_col)
In [30]:
%%time
df_post = process_dataset(FLICKR_ALL)
CPU times: user 26.8 ms, sys: 3.63 ms, total: 30.4 ms
Wall time: 47.3 ms
In [31]:
df_post.head(5)
Out[31]:
postcount_est
datetime
2007-01-31 1018143
2007-02-28 897307
2007-03-31 1073342
2007-04-30 1227645
2007-05-31 1423686
In [32]:
%%time
df_user = process_dataset(FLICKR_ALL, metric='user_hll')
CPU times: user 28 ms, sys: 1.21 ms, total: 29.2 ms
Wall time: 47.6 ms
In [33]:
df_user.head(5)
Out[33]:
usercount_est
datetime
2007-01-31 40686
2007-02-28 40400
2007-03-31 45052
2007-04-30 46801
2007-05-31 50510

Visualize Cardinality

Define plot function.

In [20]:
def bar_plot_time(
        df: pd.DataFrame, ax: Axes, color: str,
        label: str, val_col: str = "postcount_est") -> Axes:
    """Matplotlib Barplot with time axis formatting"""
    ax = df.set_index(
        df.index.map(lambda s: s.strftime('%Y'))).plot.bar(
            ax=ax, y=val_col, color=color, width=1.0,
            label=label, edgecolor="white", linewidth=0.5, alpha=0.6)
    return ax

def plot_time(
        df: Tuple[pd.DataFrame, pd.DataFrame], title, color, filename = None, 
        output = OUTPUT, legend: str = "Postcount", val_col: str = None,
        trend: bool = None, seasonal: bool = None, residual: bool = None):
    """Create dataframe(s) time plot"""
    fig, ax = plt.subplots()
    fig.set_size_inches(15.7, 4.27)
    ylabel = f'{legend} (estimate)'
    if val_col is None:
        val_col = f'{legend.lower()}_est'
    ax = bar_plot_time(
        df=df, ax=ax, color=color, val_col=val_col, label=legend)
    # x axis ticker formatting
    tick_loc = mticker.MultipleLocator(12)
    ax.xaxis.set_major_locator(tick_loc)
    ax.tick_params(axis='x', rotation=45)
    ax.yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))
    ax.set(xlabel="Month", ylabel=ylabel)
    ax.spines["left"].set_linewidth(0.25)
    ax.spines["bottom"].set_linewidth(0.25)
    ax.spines["top"].set_linewidth(0)
    ax.spines["right"].set_linewidth(0)
    ax.yaxis.set_tick_params(width=0.5)
    # remove legend
    ax.get_legend().remove()
    ax.set_title(title)
    if any([trend, seasonal, residual]):
        # seasonal decompose
        decomposition = sm.tsa.seasonal_decompose(
            df[val_col], model='additive')
        # plot trend part only
        if trend:
            plt.plot(list(decomposition.trend), color='black',
                label="Trend", linewidth=3, alpha=0.8)
        if seasonal:
            plt.plot(list(decomposition.seasonal), color='black', linestyle='dotted',
                label="Seasonal", linewidth=1, alpha=0.8)
        if residual:
            plt.plot(list(decomposition.resid), color='black', linestyle='dashed',
                label="Residual", linewidth=1, alpha=0.8)
        # trend.plot(ax=ax)
    # store figure to file
    if filename:
        fig.savefig(
            output / "figures" / f"{filename}.png", dpi=300, format='PNG',
            bbox_inches='tight', pad_inches=1, facecolor="white")
        # also save as svg
        fig.savefig(
            output / "svg" / f"{filename}.svg", format='svg',
            bbox_inches='tight', pad_inches=1, facecolor="white")
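
For reference, the trend line drawn by plot_time() comes from statsmodels' additive seasonal decomposition; a minimal standalone call on the monthly series computed earlier would look like this (illustrative addition, not part of the original notebook):

# period=12 is inferred from the monthly DatetimeIndex; the trend component
# is a centered 12-month moving average (NaN at both edges)
decomposition = sm.tsa.seasonal_decompose(
    df_post["postcount_est"], model='additive')
decomposition.trend.tail()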
In [49]:
def load_and_plot(dataset: Path, metric: str = None, src_ref: str = "flickr", colors: cm.colors.ListedColormap = None):
    """Load data and plot"""
    if metric is None:
        metric = 'post_hll'
    if metric == 'post_hll':
        metric_label = 'postcount'
    else:
        metric_label = 'usercount'
    if colors is None:
        colors = sns.color_palette("vlag", as_cmap=True, n_colors=2)
        colors = colors([1.0])
    df = process_dataset(dataset, metric=metric)
    plot_time(
        df, legend=metric_label.capitalize(), color=colors,
        title=f'{src_ref.capitalize()} {metric_label} over time [absolute estimates]', 
        filename=f"temporal_{metric_label}_{src_ref}_absolute", trend=True)
In [50]:
colors = sns.color_palette("vlag", as_cmap=True, n_colors=2)
In [51]:
load_and_plot(FLICKR_ALL, src_ref="flickr", colors=colors([1.0]))

Plot Flickr user count

In [54]:
load_and_plot(FLICKR_ALL, src_ref="flickr", metric="user_hll", colors=colors([0.0]))

Repeat for iNaturalist data

In [55]:
load_and_plot(INATURALIST_ALL, src_ref="inaturalist", colors=colors([1.0]))
load_and_plot(INATURALIST_ALL, src_ref="inaturalist", metric="user_hll", colors=colors([0.0]))

Visualize chi for subquery

First, define whether to study usercount or postcount:

In [29]:
# METRIC = 'user'
METRIC = 'post'

Load benchmark data

In [24]:
source_zip="https://opara.zih.tu-dresden.de/xmlui/bitstream/handle/123456789/5793/S10.zip?sequence=1&isAllowed=y"
FLICKR_SUNRISE = WORK_DIR / "flickr-sunrise-months.csv"
FLICKR_SUNSET = WORK_DIR / "flickr-sunset-months.csv"
In [25]:
if not (FLICKR_SUNRISE).exists():
    tools.get_zip_extract(
        uri=source_zip, filename="S10.zip", output_path=WORK_DIR,
        filter_files=["flickr-sunrise-months.csv", "flickr-sunset-months.csv"])
In [26]:
df_sunrise = read_csv_datetime(FLICKR_SUNRISE)
df_sunrise = append_cardinality_df(df_sunrise, f'{METRIC}_hll', f'{METRIC}count_est')
df_sunrise = filter_fill_time(df_sunrise, 2007, 2018, f'{METRIC}count_est')
In [27]:
plot_time(
    df_sunrise, legend=f"{METRIC.capitalize()}count", color=colors([0.0]),
    title=f'Flickr {METRIC}count over time for sunrise related posts', 
    filename=f"temporal_{METRIC}count_flickr_sunrise", trend=True)

At the time of writing the sunset-sunrise paper, the data had only been collected up to 2018.

Still, the graph above shows the same seasonal patterns as the graph for all Flickr photographs. Below, we normalize the sunrise graph using chi: observed counts are first scaled by a normalization factor (the ratio of total expected to total observed counts), and the signed chi value is then calculated per month.

Limit the dataframe for all posts over time to 2018, too.

In [28]:
if METRIC == 'post':
    df_expected = df_post
else:
    df_expected = df_user
In [29]:
df_expected = filter_fill_time(df_expected, 2007, 2018, f'{METRIC}count_est')

Prepare Chi

This is adapted from notebook three of the original publication.

First, define the input parameters:

  • dof: the degrees of freedom (dof), calculated as (variables - 1), with the variables being: observed posts, expected posts
  • chi_crit_val: given a dof of 1 and a significance level of 0.05, the critical value to accept or reject the H0 is 3.84
  • chi_column: the chi calculation is based on the column selected via METRIC above (postcount here), but this notebook can also be run for usercount or userdays
In [34]:
DOF = 1
CHI_CRIT_VAL = 3.84
CHI_COLUMN: str = f"{METRIC}count_est"
In [35]:
def calc_norm(
    df_expected: pd.DataFrame,
    df_observed: pd.DataFrame,
    chi_column: str = CHI_COLUMN):
    """Sum the counts of the observed and expected dataset
    in the relevant column and calculate the normalization
    value (expected / observed)
    """
    v_expected = df_expected[chi_column].sum()
    v_observed = df_observed[chi_column].sum()
    norm_val = (v_expected / v_observed)
    return norm_val
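
An illustration with hypothetical totals: if the expected series sums to 1,000,000 counts and the observed series to 10,000, calc_norm() returns 100, i.e. each observed count is weighted by a factor of 100 before comparison.

# toy frames, columns named after CHI_COLUMN ("postcount_est" here)
toy_expected = pd.DataFrame({CHI_COLUMN: [600_000, 400_000]})
toy_observed = pd.DataFrame({CHI_COLUMN: [6_000, 4_000]})
calc_norm(toy_expected, toy_observed)
# -> 100.0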
In [32]:
norm_val = calc_norm(df_expected, df_sunrise)
print(norm_val)
343.96221425896687

Merge dataframes

In [33]:
rename_expected = {
    'postcount_est':'postcount_est_expected',
    'usercount_est':'usercount_est_expected',
    }
df_expected.rename(
    columns=rename_expected,
    inplace=True)
In [34]:
merge_cols = [f'{METRIC}count_est']
df_expected_observed = df_expected.merge(
    df_sunrise[merge_cols],
    left_index=True, right_index=True)

Preview

In [35]:
df_expected_observed.head()
Out[35]:
postcount_est_expected postcount_est
datetime
2007-01-31 1018143.0 2788
2007-02-28 897307.0 2301
2007-03-31 1073342.0 3386
2007-04-30 1227645.0 2630
2007-05-31 1423686.0 2408

Calculate the Chi-value

In [40]:
def chi_calc(x_observed: float, x_expected: float, x_normalized: float) -> pd.Series:
    """Apply chi calculation based on observed (normalized) and expected value"""
    value_observed_normalised = x_observed * x_normalized
    a = value_observed_normalised - x_expected
    b = math.sqrt(x_expected)
    # native division with division by zero protection
    chi_value = a / b if b else 0
    return chi_value
    
def apply_chi_calc(
        df: pd.DataFrame, norm_val: float,
        chi_column: str = CHI_COLUMN, chi_crit_val: float = CHI_CRIT_VAL):
    """Calculate chi-values based on observed and expected columns in df
    and append the results as new columns (in place)"""
    # lambda: apply function chi_calc() to each row
    df['chi_value'] = df.apply(
        lambda x: chi_calc(
           x[chi_column],
           x[f'{chi_column}_expected'],
           norm_val),
        axis=1)
    # add significance column, default False
    df['significant'] = False
    # calculate significance for both negative and positive chi_values
    df.loc[np.abs(df['chi_value']) > chi_crit_val, 'significant'] = True
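
A worked toy example (hypothetical numbers): with norm_val = 100, an observed count of 30 scales to 3,000; measured against an expected count of 2,500 this gives chi = (3000 - 2500) / sqrt(2500) = 10.0, which exceeds CHI_CRIT_VAL and would be flagged significant.

chi = chi_calc(x_observed=30, x_expected=2500, x_normalized=100)
print(chi, abs(chi) > CHI_CRIT_VAL)  # 10.0 True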

Apply calculation

In [37]:
%%time
apply_chi_calc(
    df=df_expected_observed,
    norm_val=norm_val)
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 4.11 ms

Visualize chi

In [38]:
df_expected_observed.head()
Out[38]:
postcount_est_expected postcount_est chi_value significant
datetime
2007-01-31 1018143.0 2788 -58.646724 True
2007-02-28 897307.0 2301 -111.742944 True
2007-03-31 1073342.0 3386 88.139093 True
2007-04-30 1227645.0 2630 -291.540502 True
2007-05-31 1423686.0 2408 -499.022295 True
In [39]:
plot_time(
    df_expected_observed, legend="Signed Chi", val_col="chi_value", color=colors([0.0]),
    title=f'Flickr Chi for "Sunrise"-related {METRIC}count over time', 
    filename=f"temporal_chi_flickr_{METRIC}count_sunrise", trend=True)

Repeat the same for the observation of sunset posts worldwide

In [40]:
df_sunset = read_csv_datetime(FLICKR_SUNSET)
df_sunset = append_cardinality_df(df_sunset, f'{METRIC}_hll', f'{METRIC}count_est')
df_sunset = filter_fill_time(df_sunset, 2007, 2018, f'{METRIC}count_est')
In [41]:
plot_time(
    df_sunset, legend=f"{METRIC.capitalize()}count", color=colors([1.0]),
    title=f'Flickr {METRIC}count over time for sunset related posts', 
    filename=f"temporal_{METRIC}count_flickr_sunset", trend=True)
In [42]:
norm_val = calc_norm(
    df_expected.rename(columns={f'{METRIC}count_est_expected':f'{METRIC}count_est'}, inplace=False),
    df_sunset)
print(norm_val)
119.17084615675292
In [43]:
merge_cols = [f'{METRIC}count_est']
df_expected_observed = df_expected.merge(
    df_sunset[merge_cols],
    left_index=True, right_index=True)
In [44]:
%%time
apply_chi_calc(
    df=df_expected_observed,
    norm_val=norm_val)
CPU times: user 3.83 ms, sys: 0 ns, total: 3.83 ms
Wall time: 3.79 ms
In [45]:
plot_time(
    df_expected_observed, legend="Signed Chi", val_col="chi_value", color=colors([1.0]),
    title=f'Flickr Chi for "Sunset"-related {METRIC}count over time', 
    filename=f"temporal_chi_flickr_{METRIC}count_sunset", trend=True)

Visualize Chi for subquery "Milvus milvus" and "Bloom"

Select topic parameter below

In [43]:
# topic = "milvusmilvus"
topic = "bloom"
In [44]:
FLICKR_SUBQUERY = OUTPUT / f"flickr_{topic}_months.csv"
In [45]:
%%time
df_post = read_csv_datetime(FLICKR_ALL)
df_post = append_cardinality_df(df_post, 'post_hll', 'postcount_est')
df_post = filter_fill_time(df_post, 2007, 2020, 'postcount_est', max_month=8)
df_expected = df_post
CPU times: user 19.2 ms, sys: 8.96 ms, total: 28.2 ms
Wall time: 44.2 ms
In [46]:
df_subquery = read_csv_datetime(FLICKR_SUBQUERY)
df_subquery = append_cardinality_df(df_subquery, f'{METRIC}_hll', f'{METRIC}count_est')
df_subquery = filter_fill_time(df_subquery, 2007, 2020, f'{METRIC}count_est', max_month=8)
In [47]:
plot_time(
    df_subquery, legend=f"{METRIC.capitalize()}count", color=colors([1.0]),
    title=f'Flickr {METRIC}count over time for {topic.capitalize()} related posts', 
    filename=f"temporal_{METRIC}count_flickr_{topic}", trend=True)
In [48]:
df_expected.rename(columns={f'{METRIC}count_est_expected':f'{METRIC}count_est'}, inplace=True)
norm_val = calc_norm(
    df_expected,
    df_subquery)
print(norm_val)
df_expected.rename(columns={f'{METRIC}count_est':f'{METRIC}count_est_expected'}, inplace=True)
98.03292334064963
In [49]:
df_expected.head()
Out[49]:
postcount_est_expected
datetime
2007-01-31 1018143
2007-02-28 897307
2007-03-31 1073342
2007-04-30 1227645
2007-05-31 1423686
In [50]:
merge_cols = [f'{METRIC}count_est']
df_expected_observed = df_expected.merge(
    df_subquery[merge_cols],
    left_index=True, right_index=True)
In [51]:
%%time
apply_chi_calc(
    df=df_expected_observed,
    norm_val=norm_val)
CPU times: user 3.85 ms, sys: 0 ns, total: 3.85 ms
Wall time: 3.79 ms
In [67]:
plot_time(
    df_expected_observed, legend="Signed Chi", val_col="chi_value", color=colors([1.0]),
    title=f'Flickr Chi for "{topic.capitalize()}"-related {METRIC}count over time', 
    filename=f"temporal_chi_flickr_{METRIC}count_{topic}", trend=True, seasonal=False, residual=False)

Create notebook HTML

In [69]:
!jupyter nbconvert --to html_toc \
    --output-dir=../resources/html/ ./01_temporal_chi.ipynb \
    --template=../nbconvert.tpl \
    --ExtractOutputPreprocessor.enabled=False >&- 2>&-