Alexander Dunkel, Institute of Cartography, TU Dresden
Visualization of events (for Nevada example) queried from Wikidata using SPARQL.
Create environment
!python -m venv /envs/wikidata_venv
Install qwikidata in a venv
and link the Python Kernel to Jupyter Lab.
%%bash
# Install qwikidata (plus ipykernel and pandas) into the dedicated venv,
# unless the package directory already exists (idempotent re-runs).
if [ ! -d "/envs/wikidata_venv/lib/python3.10/site-packages/qwikidata" ]; then
/envs/wikidata_venv/bin/python -m pip install qwikidata ipykernel pandas > /dev/null 2>&1
else
echo "Already installed."
fi
# link: register the venv as a Jupyter kernel named "qwikidata",
# unless the kernel spec directory already exists.
if [ ! -d "/root/.local/share/jupyter/kernels/qwikidata" ]; then
echo "Linking environment to jupyter"
/envs/wikidata_venv/bin/python -m ipykernel install --user --name=qwikidata
else
echo "Already linked."
fi
Hit F5 and select the qwikidata
Kernel on the top-right corner of Jupyter Lab.
See the package versions used below.
import dependencies
import csv
import pandas as pd
from qwikidata.sparql import return_sparql_query_results
Two parameters need modification: the entity name, which is used to get the centroid (location), and the geodistance, which is used for filtering events by distance from that centroid.
## Example 1:
# loc_name = "Nevada"
# entity = "Q1227"
# geodistance = 400
## Example 2:
# Human-readable label; used in the query title and in output file names.
loc_name = "Leipzig"
# Maximum distance from the entity's centroid for the FILTER below
# (geof:distance on the Wikidata Query Service returns kilometers).
geodistance = 80
# Wikidata Q-ID whose coordinate (P625) serves as the reference point.
entity = "Q2079" # Leipzig, Germany
# Query: all entities that are (transitive) instances of "occurrence"
# (Q1190554), have a coordinate, and lie within `geodistance` of the
# reference point; labels resolved via the wikibase:label service.
sparql_query = f"""
#title: All events in {loc_name}, based on distance query ({geodistance})
SELECT ?event ?eventLabel ?date ?location ?eventDescription
WITH {{
SELECT DISTINCT ?event ?date ?location
WHERE {{
# find events
wd:{entity} wdt:P625 ?loc_ref.
?event wdt:P31/wdt:P279* wd:Q1190554.
# wdt:P17 wd:Q30;
# with a point in time or start date
OPTIONAL {{ ?event wdt:P585 ?date. }}
OPTIONAL {{ ?event wdt:P580 ?date. }}
?event wdt:P625 ?location.
FILTER(geof:distance(?location, ?loc_ref) < {geodistance}).
}}
LIMIT 5000
}} AS %i
WHERE {{
INCLUDE %i
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en,de" .}}
}}
"""
%%time
# Execute the query against the Wikidata SPARQL endpoint; %%time reports
# the duration of the (potentially slow) remote request.
result = return_sparql_query_results(sparql_query)
Format and convert to pandas DataFrame
# NOTE(review): dateutil is no longer used by this cell after the fix below;
# the import is kept in case other cells rely on it being loaded.
import dateutil.parser

# Flatten the SPARQL JSON bindings into tuples and build a DataFrame.
# 'date' and 'eventDescription' are OPTIONAL in the query and may be absent.
event_list = []
for event in result["results"]["bindings"]:
    date_val = event.get('date')
    if date_val:
        # Parse the ISO 8601 literal directly with pandas. Previously the
        # value went through dateutil.parser.parse() first, which raises on
        # malformed or out-of-range Wikidata dates *before* errors='coerce'
        # could apply; to_datetime(..., errors='coerce') yields NaT instead.
        date_val = pd.to_datetime(date_val.get('value'), errors='coerce')
    event_desc = event.get('eventDescription')
    if event_desc:
        event_desc = event_desc['value']
    event_list.append((
        event['event']['value'],
        event['eventLabel']['value'],
        date_val,
        event['location']['value'],
        event_desc))
# Column order matches the SELECT clause of the query (head.vars).
df = pd.DataFrame(event_list, columns=result['head']['vars'])
df.head()
print(len(df))
Store to disk
from pathlib import Path

# Persist raw results to ../out/ so the visualization section
# (which runs in a different kernel) can load them from disk.
OUTPUT = Path.cwd().parents[0] / "out"
df.to_pickle(OUTPUT / f"wikidata_events_{loc_name.lower()}.pkl")
Select worker_env
as the visualization environment.
%load_ext autoreload
%autoreload 2
import sys
import pandas as pd
import geopandas as gp
from pathlib import Path
from shapely.geometry import Point
from shapely import wkt
# Make the project's ../py/modules package importable from the notebook.
module_path = str(Path.cwd().parents[0] / "py")
if module_path not in sys.path:
sys.path.append(module_path)
from modules.base import tools
# Same output folder as in the query section above.
OUTPUT = Path.cwd().parents[0] / "out"
# NOTE(review): loc_name is defined in the query section, which runs in a
# different kernel — re-define it here before running this cell, or the
# lookup below fails with a NameError.
df = pd.read_pickle(OUTPUT / f"wikidata_events_{loc_name.lower()}.pkl")
# WGS84 geographic coordinates.
CRS_WGS = "epsg:4326"
# 'location' holds WKT point literals (e.g. "Point(lon lat)" from Wikidata
# P625); parse them into shapely geometries — presumably lon/lat order,
# which is the WKT convention.
df['geometry'] = df.location.apply(wkt.loads)
gdf = gp.GeoDataFrame(df, crs=CRS_WGS)
Get Shapefile for US States/ Germany
# Map the queried region to its boundary shapefile source:
# US states (census.gov) for Nevada, German federal states (BKG) for Leipzig.
if loc_name == "Nevada":
    source_zip = "https://www2.census.gov/geo/tiger/GENZ2018/shp/"
    filename = "cb_2018_us_state_5m.zip"
    shapes_name = "cb_2018_us_state_5m.shp"
elif loc_name == "Leipzig":
    source_zip = "https://daten.gdz.bkg.bund.de/produkte/vg/vg2500/aktuell/"
    filename = "vg2500_12-31.utm32s.shape.zip"
    shapes_name = "vg2500_12-31.utm32s.shape/vg2500/VG2500_LAN.shp"
else:
    # Fail early with a clear message instead of a NameError further down.
    raise ValueError(f"No shapefile source configured for loc_name={loc_name!r}")
SHAPE_DIR = (OUTPUT / "shapes")
# parents=True also creates OUTPUT when running from a fresh checkout.
SHAPE_DIR.mkdir(parents=True, exist_ok=True)
# Download and extract the zip only once; skip if the shapefile exists.
if not (SHAPE_DIR / shapes_name).exists():
    tools.get_zip_extract(uri=source_zip, filename=filename, output_path=SHAPE_DIR)
else:
    print("Already exists")
# Load boundaries and reproject to WGS84 to match the event coordinates.
shapes = gp.read_file(SHAPE_DIR / shapes_name)
shapes = shapes.to_crs("EPSG:4326")
# Overview plot: thin black region outlines with all queried event points
# overlaid, cropped to the events' bounding box plus a small margin.
margin = 0.5
x_min, y_min, x_max, y_max = gdf.total_bounds
axis = shapes.plot(color='none', edgecolor='black', linewidth=0.5)
axis = gdf.plot(ax=axis)
axis.set_axis_off()
axis.set_xlim(x_min - margin, x_max + margin)
axis.set_ylim(y_min - margin, y_max + margin)
We want to filter those events whose location falls within the state boundary (Nevada, Saxony)
# Select the containing state polygon: Nevada itself, or Sachsen (Saxony)
# for the Leipzig example; the name column differs between the two sources.
if loc_name == "Nevada":
    state_name = "Nevada"
    col_name = "NAME"
elif loc_name == "Leipzig":
    state_name = "Sachsen"
    col_name = "GEN"
else:
    # Fail early with a clear message instead of a NameError below.
    raise ValueError(f"No state mapping configured for loc_name={loc_name!r}")
sel_geom = shapes[shapes[col_name]==state_name].copy()
# Keep only geometry plus the state-name column, renamed to a
# source-independent label.
tools.drop_cols_except(df=sel_geom, columns_keep=["geometry", col_name])
sel_geom.rename(columns={col_name: "country"}, inplace=True)
# Spatial intersection: keep only events whose point lies inside the state.
gdf_overlay = gp.overlay(
    gdf, sel_geom,
    how='intersection')
# Result plot: all events (default color) vs. events inside the selected
# state (red), cropped to the events' extent plus a margin.
ax = shapes.plot(color='none', edgecolor='black', linewidth=0.5)
ax = gdf.plot(ax=ax)
ax = gdf_overlay.plot(ax=ax, color='red')
ax.set_axis_off()
buffer = 1
minx, miny, maxx, maxy = gdf.total_bounds
ax.set_xlim(minx-buffer, maxx+buffer)
ax.set_ylim(miny-buffer, maxy+buffer)
# Report the actual selected state instead of the hard-coded "Nevada":
# the message was wrong for the Leipzig/Sachsen example.
print(f'{len(gdf_overlay)} events queried from wikidata that are located in {state_name}')
gdf_overlay.head(20)
Store results as CSV
gdf_overlay.to_csv(OUTPUT / f"wikidata_events_{loc_name.lower()}.csv")
# Convert this notebook to HTML (with table of contents) into the
# resources folder; >&- 2>&- silences nbconvert's output streams.
!jupyter nbconvert --to html_toc \
--output-dir=../resources/html/ ./03_wikidata_event_query.ipynb \
--output 03_wikidata_event_query_{loc_name.lower()} \
--template=../nbconvert.tpl \
--ExtractOutputPreprocessor.enabled=False >&- 2>&-