diff --git a/binder/requirements.txt b/binder/requirements.txt
index 1970f89..310aa3e 100644
--- a/binder/requirements.txt
+++ b/binder/requirements.txt
@@ -1,10 +1,17 @@
-# Binder environment requirements
-# Pin lonboard to version that supports _height parameter
-lonboard>=0.10.0
-ipydatagrid
-geopandas
-duckdb
+# Binder environment for examples/basic/isamples_explorer.ipynb
+# (and the adjacent basic/ notebooks). Tracks what the notebooks
+# actually import. Keep minimal — Binder's base image provides
+# JupyterLab, ipykernel, IPython.
+
+# Data + query
+duckdb>=0.10
pandas>=2.0.0
numpy
-ipywidgets
pyarrow>=12.0.0
+geopandas
+shapely
+h3>=4  # h3-py v4; geoparquet.ipynb H3 cells use LatLngPoly / geo_to_cells
+# Visualization
+lonboard>=0.10.0 # _height + MaplibreBasemap
+ipydatagrid
+ipywidgets
diff --git a/examples/basic/geoparquet.ipynb b/examples/basic/geoparquet.ipynb
index eb978cc..72faa33 100644
--- a/examples/basic/geoparquet.ipynb
+++ b/examples/basic/geoparquet.ipynb
@@ -534,7 +534,7 @@
"\n",
"* [iSamples Complete Export Dataset - April 2025](https://zenodo.org/records/15278211)\n",
"\n",
- "* [Open Context Database SQL Dump and Parquet Exports](https://zenodo.org/records/15732000) -- [https://zenodo.org/records/15732000](https://zenodo.org/records/15732000) \n",
+ "* [Open Context Database SQL Dump and Parquet Exports](https://zenodo.org/records/15732000) -- [https://zenodo.org/records/15732000](https://zenodo.org/records/15732000)\n",
"\n"
]
},
@@ -1479,6 +1479,59 @@
"print(result)"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Joining vocabulary URIs to human-readable labels\n",
+ "\n",
+ "iSamples concepts are referenced as SKOS URIs throughout the wide and\n",
+ "narrow parquets \u2014 `IdentifiedConcept.label` currently holds the URI\n",
+ "(e.g. `.../material/1.0/earthmaterial`) rather than the prefLabel.\n",
+ "We publish a small lookup at\n",
+ "`data.isamples.org/vocab_labels.parquet` (~60 KB, built from the\n",
+ "canonical TTLs at `isamplesorg/vocabularies`) so notebooks can render\n",
+ "`Natural Solid Material` instead. The `vocab_labels.py` helper in this\n",
+ "directory wraps it.\n",
+ "\n",
+ "See [issue #148](https://github.com/isamplesorg/isamplesorg.github.io/issues/148)\n",
+ "for background \u2014 this is a temporary client-side workaround until\n",
+ "`IdentifiedConcept.label` is populated with prefLabels at PQG-build time.\n",
+ "URIs that no TTL declares (a known set of 4, ~169/6M samples) fall back\n",
+ "to the last URL segment.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from vocab_labels import load_vocab_labels, pretty_label\n",
+ "import duckdb, pandas as pd\n",
+ "\n",
+ "labels = load_vocab_labels() # one HTTP fetch, ~60 KB\n",
+ "print(f'Loaded {len(labels):,} URI -> prefLabel entries')\n",
+ "\n",
+ "# Count how many concept entities exist for each material URI in the\n",
+ "# IdentifiedConcept slice of the wide parquet, then join to prefLabels.\n",
+ "# (The wide parquet stores concept URIs in IdentifiedConcept.label \u2014\n",
+ "# the #148 data-quality issue.)\n",
+ "url = 'https://data.isamples.org/current/wide.parquet'\n",
+ "con = duckdb.connect(); con.execute('INSTALL httpfs; LOAD httpfs;')\n",
+ "concept_uris = con.sql(f'''\n",
+ " SELECT label AS uri\n",
+ " FROM read_parquet(\\'{url}\\')\n",
+ " WHERE otype = \\'IdentifiedConcept\\'\n",
+ " AND label LIKE \\'%/material/%\\'\n",
+ " LIMIT 15\n",
+ "''').df()\n",
+ "concept_uris['pref_label'] = concept_uris['uri'].map(\n",
+ " lambda u: pretty_label(u, labels)\n",
+ ")\n",
+ "concept_uris[['pref_label', 'uri']]\n"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 19,
@@ -1717,7 +1770,7 @@
" \n",
" \n",
"\n",
- "
6680932 rows × 3 columns
\n",
+ "6680932 rows \u00d7 3 columns
\n",
""
],
"text/plain": [
@@ -1957,7 +2010,7 @@
"version_minor": 1
},
"text/plain": [
- "Map(custom_attribution='', layers=(BitmapTileLayer(data='https://tile.openstreetmap.org/{z}/{x}/{y}.png', max_…"
+ "Map(custom_attribution='', layers=(BitmapTileLayer(data='https://tile.openstreetmap.org/{z}/{x}/{y}.png', max_\u2026"
]
},
"metadata": {},
@@ -2805,7 +2858,7 @@
"\n",
"The central idea is to create a \"control panel\" of widgets that are dynamically generated based on the schema of your Ibis table. This panel allows a user to build up a complex filter expression interactively, and then Ibis executes the final, filtered query.\n",
"\n",
- "Here’s a step-by-step approach to implementing your vision:\n",
+ "Here\u2019s a step-by-step approach to implementing your vision:\n",
"\n",
"---\n",
"\n",
@@ -2946,6 +2999,224 @@
"metadata": {},
"outputs": [],
"source": []
+ },
+ {
+ "cell_type": "markdown",
+ "id": "h3-acceleration-header",
+ "metadata": {},
+ "source": [
+ "## H3-Accelerated Spatial Filtering\n",
+ "\n",
+ "The [H3 geospatial indexing system](https://h3geo.org/) partitions the Earth into hexagonal cells at\n",
+ "multiple resolutions. By pre-computing H3 cell indices for each sample's coordinates, we can\n",
+ "replace expensive lat/lon range scans with fast integer lookups.\n",
+ "\n",
+ "The iSamples wide parquet file with H3 indices adds three BIGINT columns \u2014 `h3_res4`, `h3_res6`,\n",
+ "`h3_res8` \u2014 covering ~11.96M of 20.7M rows (those with valid coordinates).\n",
+ "\n",
+ "Below we benchmark **baseline lat/lon filtering** vs **H3 res4 pre-filtering** for a bounding-box\n",
+ "query, show H3 cell statistics, and render the H3-indexed data on a Lonboard map."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "h3-setup-cell",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import duckdb\n",
+ "import time\n",
+ "\n",
+ "# Data URLs\n",
+ "WIDE_H3_URL = \"https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide_h3.parquet\"\n",
+ "WIDE_URL = \"https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet\"\n",
+ "\n",
+ "con_h3 = duckdb.connect()\n",
+ "con_h3.execute(\"INSTALL h3 FROM community; LOAD h3;\")\n",
+ "print(\"DuckDB H3 extension loaded.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "h3-stats-header",
+ "metadata": {},
+ "source": [
+ "### H3 Cell Distribution Statistics\n",
+ "\n",
+ "How many distinct hexagonal cells exist at each resolution, and what fraction of rows carry H3 values?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "h3-stats-cell",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "h3_stats = con_h3.sql(f\"\"\"\n",
+ " SELECT\n",
+ " COUNT(*) AS total_rows,\n",
+ " COUNT(h3_res4) AS rows_with_h3,\n",
+ " ROUND(100.0 * COUNT(h3_res4) / COUNT(*), 1) AS pct_with_h3,\n",
+ " COUNT(DISTINCT h3_res4) AS distinct_res4,\n",
+ " COUNT(DISTINCT h3_res6) AS distinct_res6,\n",
+ " COUNT(DISTINCT h3_res8) AS distinct_res8\n",
+ " FROM read_parquet('{WIDE_H3_URL}')\n",
+ "\"\"\").df()\n",
+ "\n",
+ "print(\"H3 Cell Distribution\")\n",
+ "print(\"=\" * 50)\n",
+ "print(f\"Total rows: {h3_stats['total_rows'].iloc[0]:>12,}\")\n",
+ "print(f\"Rows with H3: {h3_stats['rows_with_h3'].iloc[0]:>12,} ({h3_stats['pct_with_h3'].iloc[0]}%)\")\n",
+ "print(f\"Distinct res4 cells: {h3_stats['distinct_res4'].iloc[0]:>12,} (~1,770 km\u00b2 hex)\")\n",
+ "print(f\"Distinct res6 cells: {h3_stats['distinct_res6'].iloc[0]:>12,} (~3.2 km hex)\")\n",
+ "print(f\"Distinct res8 cells: {h3_stats['distinct_res8'].iloc[0]:>12,} (~0.46 km hex)\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "h3-benchmark-header",
+ "metadata": {},
+ "source": [
+ "### Bbox Benchmark: Lat/Lon Range Scan vs H3 Pre-Filter\n",
+ "\n",
+ "We query samples inside a bounding box (San Francisco Bay Area) two ways:\n",
+ "\n",
+ "1. **Baseline** \u2014 filter on `latitude BETWEEN ... AND longitude BETWEEN ...`\n",
+ "2. **H3 pre-filter** \u2014 find the set of res4 cells that overlap the bbox, then filter by those cells\n",
+ " before applying the exact lat/lon check.\n",
+ "\n",
+ "The H3 approach narrows the scan to a small number of hexagonal cells, reducing I/O on remote\n",
+ "parquet files."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "h3-benchmark-cell",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import h3\n",
+ "\n",
+ "# Bay Area bounding box\n",
+ "BBOX = dict(min_lat=37.2, max_lat=37.9, min_lon=-122.6, max_lon=-121.8)\n",
+ "\n",
+ "# --- Baseline: raw lat/lon range scan ---\n",
+ "t0 = time.time()\n",
+ "baseline = con_h3.sql(f\"\"\"\n",
+ " SELECT COUNT(*) AS n\n",
+ " FROM read_parquet('{WIDE_H3_URL}')\n",
+ " WHERE otype = 'MaterialSampleRecord'\n",
+ " AND latitude BETWEEN {BBOX['min_lat']} AND {BBOX['max_lat']}\n",
+ " AND longitude BETWEEN {BBOX['min_lon']} AND {BBOX['max_lon']}\n",
+ "\"\"\").df()\n",
+ "baseline_ms = (time.time() - t0) * 1000\n",
+ "\n",
+ "# --- H3 pre-filter: compute covering cells mathematically (no data scan) ---\n",
+ "t0 = time.time()\n",
+ "\n",
+ "# Use h3 Python library to compute all res4 cells covering the bbox.\n",
+ "# This is O(1) relative to dataset size \u2014 pure geometry, no I/O.\n",
+ "bbox_polygon = h3.LatLngPoly([\n",
+ " (BBOX['min_lat'], BBOX['min_lon']),\n",
+ " (BBOX['min_lat'], BBOX['max_lon']),\n",
+ " (BBOX['max_lat'], BBOX['max_lon']),\n",
+ " (BBOX['max_lat'], BBOX['min_lon']),\n",
+ "])\n",
+ "covering_cells = h3.geo_to_cells(bbox_polygon, res=4)\n",
+ "# Convert to signed int64 to match DuckDB BIGINT storage\n",
+ "def h3_to_signed(cell_hex):\n",
+ " val = h3.str_to_int(cell_hex)\n",
+ " return val if val < 2**63 else val - 2**64\n",
+ "\n",
+ "cell_list = [str(h3_to_signed(c)) for c in covering_cells]\n",
+ "print(f'Bbox covered by {len(cell_list)} res4 cells (computed mathematically)')\n",
+ "\n",
+ "if not cell_list:\n",
+ " print('No H3 cells cover this bbox.')\n",
+ " h3_ms = 0\n",
+ " h3_result = None\n",
+ "else:\n",
+ " cell_sql = ', '.join(cell_list)\n",
+ " h3_result = con_h3.sql(f\"\"\"\n",
+ " SELECT COUNT(*) AS n\n",
+ " FROM read_parquet('{WIDE_H3_URL}')\n",
+ " WHERE otype = 'MaterialSampleRecord'\n",
+ " AND h3_res4 IN ({cell_sql})\n",
+ " AND latitude BETWEEN {BBOX['min_lat']} AND {BBOX['max_lat']}\n",
+ " AND longitude BETWEEN {BBOX['min_lon']} AND {BBOX['max_lon']}\n",
+ " \"\"\").df()\n",
+ " h3_ms = (time.time() - t0) * 1000\n",
+ "\n",
+ "# --- Results ---\n",
+ "print('Bounding-Box Query Benchmark (Bay Area)')\n",
+ "print('=' * 50)\n",
+ "print(f'Baseline (lat/lon scan): {baseline_ms:>8.0f} ms | {baseline[\"n\"].iloc[0]:,} rows')\n",
+ "if h3_result is not None:\n",
+ " print(f'H3 res4 pre-filter: {h3_ms:>8.0f} ms | {h3_result[\"n\"].iloc[0]:,} rows')\n",
+ " speedup = baseline_ms / h3_ms if h3_ms > 0 else float('inf')\n",
+ " print(f'Speedup: {speedup:>7.1f}x')\n",
+ " print(f'\\nRow counts match: {baseline[\"n\"].iloc[0] == h3_result[\"n\"].iloc[0]}')\n",
+ "else:\n",
+ " print('H3 result: no covering cells')\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "h3-lonboard-header",
+ "metadata": {},
+ "source": [
+ "### Lonboard Visualization with H3-Indexed Data\n",
+ "\n",
+ "Render a sample of the H3-indexed dataset to confirm the visualization pipeline works with\n",
+ "the enriched file."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "h3-lonboard-cell",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from lonboard import Map, ScatterplotLayer, BitmapTileLayer\n",
+ "\n",
+ "# Sample 100K rows with coordinates from the H3 file\n",
+ "sample_df = con_h3.sql(f\"\"\"\n",
+ " SELECT latitude, longitude, n AS source, h3_res4, h3_res6\n",
+ " FROM read_parquet('{WIDE_H3_URL}')\n",
+ " WHERE latitude IS NOT NULL AND longitude IS NOT NULL\n",
+ " USING SAMPLE 100000\n",
+ "\"\"\").df()\n",
+ "\n",
+ "gdf_h3 = gpd.GeoDataFrame(\n",
+ " sample_df,\n",
+ " geometry=gpd.points_from_xy(sample_df.longitude, sample_df.latitude),\n",
+ " crs=\"EPSG:4326\"\n",
+ ")\n",
+ "\n",
+ "# Color by source\n",
+ "h3_color_map = {\n",
+ " 'SESAR': [51, 102, 204, 200],\n",
+ " 'OPENCONTEXT': [220, 57, 18, 200],\n",
+ " 'GEOME': [16, 150, 24, 200],\n",
+ " 'SMITHSONIAN': [255, 153, 0, 200],\n",
+ "}\n",
+ "default_c = [128, 128, 128, 200]\n",
+ "colors_arr = np.array([h3_color_map.get(s, default_c) for s in gdf_h3['source']], dtype=np.uint8)\n",
+ "\n",
+ "layer = ScatterplotLayer.from_geopandas(gdf_h3, get_fill_color=colors_arr, get_radius=500)\n",
+ "tile_layer = BitmapTileLayer(\n",
+ " data=\"https://tile.openstreetmap.org/{z}/{x}/{y}.png\",\n",
+ " min_zoom=0, max_zoom=19,\n",
+ ")\n",
+ "\n",
+ "m = Map([tile_layer, layer], view_state={\"zoom\": 2, \"latitude\": 20, \"longitude\": 0})\n",
+ "print(f\"Rendering {len(gdf_h3):,} points from H3-indexed file\")\n",
+ "m"
+ ]
}
],
"metadata": {
@@ -2969,4 +3240,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
-}
+}
\ No newline at end of file
diff --git a/examples/basic/h3_clustering.ipynb b/examples/basic/h3_clustering.ipynb
new file mode 100644
index 0000000..e901d89
--- /dev/null
+++ b/examples/basic/h3_clustering.ipynb
@@ -0,0 +1,374 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "h3-intro",
+ "metadata": {},
+ "source": [
+ "# H3 Clustering for iSamples Geospatial Data\n",
+ "\n",
+ "## What is H3?\n",
+ "\n",
+ "[H3](https://h3geo.org/) is Uber's **hierarchical hexagonal indexing system** that partitions the\n",
+ "entire Earth's surface into hexagonal cells at 16 resolutions (0-15). Key properties:\n",
+ "\n",
+ "- **Hierarchical**: Each parent cell contains ~7 children at the next resolution\n",
+ "- **Hexagonal**: Hexagons tile the plane with equal-area cells and uniform neighbor distances\n",
+ "- **Multi-resolution**: Zoom from continent-scale (res 0, ~4.4M km2) to sub-meter (res 15)\n",
+ "\n",
+ "| Resolution | Hex Edge Length | Approx. Area | Use Case |\n",
+ "|-----------|----------------|-------------|----------|\n",
+ "| 4 | ~22 km | ~1,770 km2 | Country/region overview |\n",
+ "| 6 | ~3.2 km | ~36 km2 | City-level clustering |\n",
+ "| 8 | ~460 m | ~0.74 km2 | Neighborhood detail |\n",
+ "\n",
+ "The iSamples wide parquet with H3 columns (`h3_res4`, `h3_res6`, `h3_res8`) covers\n",
+ "~11.96M of 20.7M rows \u2014 those with valid latitude/longitude coordinates.\n",
+ "\n",
+ "This notebook demonstrates:\n",
+ "1. H3 cell statistics at multiple resolutions\n",
+ "2. Cluster visualization colored by dominant source\n",
+ "3. Multi-resolution comparison\n",
+ "4. Performance gains of clustering vs raw points\n",
+ "5. Hierarchical drill-down from coarse to fine cells"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "h3-setup",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import duckdb\n",
+ "import time\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import geopandas as gpd\n",
+ "from shapely.geometry import Point\n",
+ "\n",
+ "# Data URL\n",
+ "WIDE_H3_URL = \"https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide_h3.parquet\"\n",
+ "\n",
+ "# Initialize DuckDB with H3 extension\n",
+ "con = duckdb.connect()\n",
+ "con.execute(\"INSTALL h3 FROM community; LOAD h3;\")\n",
+ "\n",
+ "# Source colors (RGBA)\n",
+ "SOURCE_COLORS = {\n",
+ " 'SESAR': [51, 102, 204, 200],\n",
+ " 'OPENCONTEXT': [220, 57, 18, 200],\n",
+ " 'GEOME': [16, 150, 24, 200],\n",
+ " 'SMITHSONIAN': [255, 153, 0, 200],\n",
+ "}\n",
+ "DEFAULT_COLOR = [128, 128, 128, 200]\n",
+ "\n",
+ "print(\"Setup complete. DuckDB H3 extension loaded.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "h3-stats-md",
+ "metadata": {},
+ "source": [
+ "## H3 Cell Statistics\n",
+ "\n",
+ "How many distinct hexagonal cells exist at each resolution, and how does\n",
+ "the point distribution look?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "h3-stats-code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Overall H3 coverage stats\n",
+ "stats = con.sql(f\"\"\"\n",
+ " SELECT\n",
+ " COUNT(*) AS total_rows,\n",
+ " COUNT(h3_res4) AS rows_with_h3,\n",
+ " ROUND(100.0 * COUNT(h3_res4) / COUNT(*), 1) AS pct_with_h3,\n",
+ " COUNT(DISTINCT h3_res4) AS distinct_res4,\n",
+ " COUNT(DISTINCT h3_res6) AS distinct_res6,\n",
+ " COUNT(DISTINCT h3_res8) AS distinct_res8\n",
+ " FROM read_parquet('{WIDE_H3_URL}')\n",
+ "\"\"\").df()\n",
+ "\n",
+ "print(\"iSamples H3 Coverage\")\n",
+ "print(\"=\" * 50)\n",
+ "print(f\"Total rows: {stats['total_rows'].iloc[0]:>12,}\")\n",
+ "print(f\"Rows with H3: {stats['rows_with_h3'].iloc[0]:>12,} ({stats['pct_with_h3'].iloc[0]}%)\")\n",
+ "print(f\"Distinct res4 cells: {stats['distinct_res4'].iloc[0]:>12,}\")\n",
+ "print(f\"Distinct res6 cells: {stats['distinct_res6'].iloc[0]:>12,}\")\n",
+ "print(f\"Distinct res8 cells: {stats['distinct_res8'].iloc[0]:>12,}\")\n",
+ "\n",
+ "# Res6 cluster aggregation (~3.2 km hexagons)\n",
+ "print(\"\\nComputing res6 clusters...\")\n",
+ "t0 = time.time()\n",
+ "clusters = con.sql(f\"\"\"\n",
+ " SELECT\n",
+ " h3_res6,\n",
+ " COUNT(*) AS sample_count,\n",
+ " AVG(latitude) AS lat,\n",
+ " AVG(longitude) AS lng,\n",
+ " MODE(n) AS dominant_source,\n",
+ " COUNT(DISTINCT n) AS source_count\n",
+ " FROM read_parquet('{WIDE_H3_URL}')\n",
+ " WHERE h3_res6 IS NOT NULL\n",
+ " GROUP BY h3_res6\n",
+ " ORDER BY sample_count DESC\n",
+ "\"\"\").df()\n",
+ "cluster_ms = (time.time() - t0) * 1000\n",
+ "\n",
+ "print(f\"Computed {len(clusters):,} res6 clusters in {cluster_ms:.0f} ms\")\n",
+ "print(f\"\\nTop 10 clusters by sample count:\")\n",
+ "print(clusters[['h3_res6', 'sample_count', 'dominant_source', 'lat', 'lng']].head(10).to_string(index=False))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "h3-viz-md",
+ "metadata": {},
+ "source": [
+ "## Cluster Visualization with Lonboard\n",
+ "\n",
+ "Each cluster is rendered as a circle at its hexagon centroid. The radius is proportional\n",
+ "to `log(count)` so that both sparse and dense regions are visible. Colors indicate the\n",
+ "dominant source within each hex cell."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "h3-viz-code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from lonboard import Map, ScatterplotLayer, BitmapTileLayer\n",
+ "\n",
+ "# Build GeoDataFrame from clusters\n",
+ "gdf_clusters = gpd.GeoDataFrame(\n",
+ " clusters,\n",
+ " geometry=gpd.points_from_xy(clusters.lng, clusters.lat),\n",
+ " crs=\"EPSG:4326\"\n",
+ ")\n",
+ "\n",
+ "# Colors based on dominant source\n",
+ "colors = np.array([\n",
+ " SOURCE_COLORS.get(s, DEFAULT_COLOR) for s in gdf_clusters['dominant_source']\n",
+ "], dtype=np.uint8)\n",
+ "\n",
+ "# Radius proportional to log(count), scaled for visibility\n",
+ "radii = (np.log1p(gdf_clusters['sample_count'].values) * 800).astype(np.float32)\n",
+ "\n",
+ "layer = ScatterplotLayer.from_geopandas(\n",
+ " gdf_clusters,\n",
+ " get_fill_color=colors,\n",
+ " get_radius=radii,\n",
+ " opacity=0.7,\n",
+ " radius_min_pixels=2,\n",
+ ")\n",
+ "\n",
+ "tile_layer = BitmapTileLayer(\n",
+ " data=\"https://tile.openstreetmap.org/{z}/{x}/{y}.png\",\n",
+ " min_zoom=0, max_zoom=19,\n",
+ ")\n",
+ "\n",
+ "m = Map([tile_layer, layer], view_state={\"zoom\": 2, \"latitude\": 20, \"longitude\": 0})\n",
+ "print(f\"Rendering {len(gdf_clusters):,} hex clusters (from {clusters['sample_count'].sum():,} samples)\")\n",
+ "print(f\"Color legend: Blue=SESAR, Red=OpenContext, Green=GEOME, Orange=Smithsonian\")\n",
+ "m"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "h3-multires-md",
+ "metadata": {},
+ "source": [
+ "## Multi-Resolution Comparison\n",
+ "\n",
+ "Compare clustering at res4 (regional), res6 (city), and res8 (neighborhood) to understand\n",
+ "how granularity affects cluster count and aggregation behavior."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "h3-multires-code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "resolutions = []\n",
+ "\n",
+ "for res, col in [(4, 'h3_res4'), (6, 'h3_res6'), (8, 'h3_res8')]:\n",
+ " t0 = time.time()\n",
+ " res_stats = con.sql(f\"\"\"\n",
+ " SELECT\n",
+ " {res} AS resolution,\n",
+ " COUNT(DISTINCT {col}) AS num_clusters,\n",
+ "            SUM(ct) AS total_points,\n",
+ "            ROUND(SUM(ct) * 1.0 / COUNT(DISTINCT {col}), 1) AS avg_points_per_cluster,\n",
+ " MAX(ct) AS max_cluster_size,\n",
+ " MEDIAN(ct) AS median_cluster_size\n",
+ " FROM (\n",
+ " SELECT {col}, COUNT(*) AS ct\n",
+ " FROM read_parquet('{WIDE_H3_URL}')\n",
+ " WHERE {col} IS NOT NULL\n",
+ " GROUP BY {col}\n",
+ " )\n",
+ " \"\"\").df()\n",
+ " elapsed = (time.time() - t0) * 1000\n",
+ " res_stats['query_ms'] = round(elapsed)\n",
+ " resolutions.append(res_stats)\n",
+ "\n",
+ "comparison = pd.concat(resolutions, ignore_index=True)\n",
+ "print(\"Multi-Resolution Comparison\")\n",
+ "print(\"=\" * 70)\n",
+ "print(comparison.to_string(index=False))\n",
+ "\n",
+ "print(\"\\nGuidance:\")\n",
+ "print(\" res4 \u2014 Best for global/continental overviews (few clusters, fast)\")\n",
+ "print(\" res6 \u2014 Good balance for city-level exploration (~3.2 km hexagons)\")\n",
+ "print(\" res8 \u2014 Detailed neighborhood view (~460 m hexagons, more clusters)\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "h3-perf-md",
+ "metadata": {},
+ "source": [
+ "## Performance: Clusters vs Full Points\n",
+ "\n",
+ "Rendering all ~12M points individually overwhelms both the query engine and the browser.\n",
+ "Clustering via H3 reduces data volume by orders of magnitude while preserving spatial patterns."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "h3-perf-code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Benchmark: clustered query vs full point query\n",
+ "\n",
+ "# Full points (limited to count to avoid browser crash)\n",
+ "t0 = time.time()\n",
+ "full_count = con.sql(f\"\"\"\n",
+ " SELECT COUNT(*) AS n\n",
+ " FROM read_parquet('{WIDE_H3_URL}')\n",
+ " WHERE latitude IS NOT NULL\n",
+ "\"\"\").fetchone()[0]\n",
+ "full_ms = (time.time() - t0) * 1000\n",
+ "\n",
+ "# Clustered at res6\n",
+ "t0 = time.time()\n",
+ "cluster_count = con.sql(f\"\"\"\n",
+ " SELECT COUNT(*) AS n FROM (\n",
+ " SELECT h3_res6, COUNT(*) AS ct,\n",
+ " AVG(latitude) AS lat, AVG(longitude) AS lng\n",
+ " FROM read_parquet('{WIDE_H3_URL}')\n",
+ " WHERE h3_res6 IS NOT NULL\n",
+ " GROUP BY h3_res6\n",
+ " )\n",
+ "\"\"\").fetchone()[0]\n",
+ "cluster_ms = (time.time() - t0) * 1000\n",
+ "\n",
+ "reduction = full_count / cluster_count if cluster_count > 0 else 0\n",
+ "\n",
+ "print(\"Performance Comparison\")\n",
+ "print(\"=\" * 50)\n",
+ "print(f\"Full points: {full_count:>12,} rows ({full_ms:>6.0f} ms)\")\n",
+ "print(f\"Res6 clusters: {cluster_count:>12,} rows ({cluster_ms:>6.0f} ms)\")\n",
+ "print(f\"Data reduction: {reduction:>11.0f}x fewer rows to render\")\n",
+ "print(f\"\\nClustering at res6 reduces rendering payload from ~{full_count/1e6:.1f}M to\")\n",
+ "print(f\"~{cluster_count/1e3:.0f}K points \u2014 enabling smooth interactive maps.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "h3-drilldown-md",
+ "metadata": {},
+ "source": [
+ "## Hierarchical Drill-Down: res4 -> res6 -> res8\n",
+ "\n",
+ "H3's hierarchy means every res4 cell contains ~49 res6 children, and each res6 cell\n",
+ "contains ~49 res8 children. This enables progressive drill-down \u2014 start with a coarse\n",
+ "view, then zoom into regions of interest at finer resolution."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "h3-drilldown-code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Pick the largest res4 cell and drill down\n",
+ "top_res4 = con.sql(f\"\"\"\n",
+ " SELECT h3_res4, COUNT(*) AS cnt\n",
+ " FROM read_parquet('{WIDE_H3_URL}')\n",
+ " WHERE h3_res4 IS NOT NULL\n",
+ " GROUP BY h3_res4\n",
+ " ORDER BY cnt DESC\n",
+ " LIMIT 1\n",
+ "\"\"\").fetchone()\n",
+ "\n",
+ "parent_cell = top_res4[0]\n",
+ "parent_count = top_res4[1]\n",
+ "print(f\"Largest res4 cell: {parent_cell} ({parent_count:,} samples)\")\n",
+ "\n",
+ "# Drill into res6 children\n",
+ "res6_children = con.sql(f\"\"\"\n",
+ " SELECT h3_res6, COUNT(*) AS cnt, MODE(n) AS dominant_source,\n",
+ " AVG(latitude) AS lat, AVG(longitude) AS lng\n",
+ " FROM read_parquet('{WIDE_H3_URL}')\n",
+ " WHERE h3_res4 = {parent_cell}\n",
+ " AND h3_res6 IS NOT NULL\n",
+ " GROUP BY h3_res6\n",
+ " ORDER BY cnt DESC\n",
+ "\"\"\").df()\n",
+ "\n",
+ "print(f\"\\nRes4 -> Res6: {len(res6_children)} child cells\")\n",
+ "print(res6_children.head(10).to_string(index=False))\n",
+ "\n",
+ "# Pick the top res6 child and drill into res8\n",
+ "if len(res6_children) > 0:\n",
+ " top_res6 = res6_children.iloc[0]\n",
+ " res8_children = con.sql(f\"\"\"\n",
+ " SELECT h3_res8, COUNT(*) AS cnt, MODE(n) AS dominant_source,\n",
+ " AVG(latitude) AS lat, AVG(longitude) AS lng\n",
+ " FROM read_parquet('{WIDE_H3_URL}')\n",
+ " WHERE h3_res6 = {int(top_res6['h3_res6'])}\n",
+ " AND h3_res8 IS NOT NULL\n",
+ " GROUP BY h3_res8\n",
+ " ORDER BY cnt DESC\n",
+ " \"\"\").df()\n",
+ "\n",
+ " print(f\"\\nRes6 -> Res8: {len(res8_children)} child cells\")\n",
+ " print(f\"(from res6 cell {int(top_res6['h3_res6'])} with {int(top_res6['cnt']):,} samples)\")\n",
+ " print(res8_children.head(10).to_string(index=False))\n",
+ "\n",
+ " print(f\"\\nDrill-down summary:\")\n",
+ " print(f\" res4: 1 cell -> {parent_count:,} samples\")\n",
+ " print(f\" res6: {len(res6_children)} cells -> {res6_children['cnt'].sum():,} samples\")\n",
+ " print(f\" res8: {len(res8_children)} cells -> {res8_children['cnt'].sum():,} samples (from top res6)\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.11.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
\ No newline at end of file
diff --git a/examples/basic/isamples_explorer.ipynb b/examples/basic/isamples_explorer.ipynb
index 6e4ac8a..53de73d 100644
--- a/examples/basic/isamples_explorer.ipynb
+++ b/examples/basic/isamples_explorer.ipynb
@@ -3,12 +3,37 @@
{
"cell_type": "markdown",
"metadata": {},
- "source": "# iSamples Interactive Explorer\n\nAn interactive interface for exploring iSamples data across all sources.\n\n**Features:**\n- Map view with 6M+ samples (lonboard WebGL)\n- Interactive table with filtering (ipydatagrid)\n- Sample cards on selection\n- **Faceted filtering**: Filter by source, material type, and time period\n - Hierarchical material types with rollup\n - **Decade quick-select** for time filtering\n- **Fulltext search**: Search label, description, place name with ranked results\n- **Bidirectional selection sync**: Click map → highlights table row; click table → recenters map\n- **Viewport Mode**: Dynamic loading based on pan/zoom (with loading indicator)\n- **Adaptive sampling**: More points when zoomed in, fewer when zoomed out\n\n**Data:** Zenodo wide parquet (~282 MB, 20M rows)"
+ "source": [
+ "# iSamples Interactive Explorer\n",
+ "\n",
+ "An interactive interface for exploring iSamples data across all sources.\n",
+ "\n",
+ "**Features:**\n",
+ "- Map view with 6M+ samples (lonboard WebGL)\n",
+ "- Interactive table with filtering (ipydatagrid)\n",
+ "- Sample cards on selection\n",
+ "- **Faceted filtering**: Filter by source, material type, and time period\n",
+ " - Hierarchical material types with rollup\n",
+ " - **Decade quick-select** for time filtering\n",
+ "- **Fulltext search**: Search label, description, place name with ranked results\n",
+ "- **Bidirectional selection sync**: Click map \u2192 highlights table row; click table \u2192 recenters map\n",
+ "- **Viewport Mode**: Dynamic loading based on pan/zoom (with loading indicator)\n",
+ "- **Adaptive sampling**: More points when zoomed in, fewer when zoomed out\n",
+ "\n",
+ "**Data:** Zenodo wide parquet (~282 MB, 20M rows)"
+ ]
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-04-24T15:12:49.646526Z",
+ "iopub.status.busy": "2026-04-24T15:12:49.646391Z",
+ "iopub.status.idle": "2026-04-24T15:12:50.301042Z",
+ "shell.execute_reply": "2026-04-24T15:12:50.300613Z"
+ }
+ },
"outputs": [],
"source": [
"# Imports\n",
@@ -30,28 +55,135 @@
"from IPython.display import display, HTML"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Running this notebook\n",
+ "\n",
+ "This notebook reads parquet data with **DuckDB over HTTP range requests**, so\n",
+ "no bulk download is required. It works in three environments:\n",
+ "\n",
+ "| Environment | What happens |\n",
+ "|---|---|\n",
+ "| **Local (Raymond's setup)** | If `~/Data/iSample/pqg_refining/` contains the parquet files, they are read from disk. |\n",
+ "| **mybinder.org** | Launch \u2192 Binder builds the image from `binder/requirements.txt` \u2192 parquets stream from `https://data.isamples.org/` (Cloudflare Worker in front of R2). |\n",
+ "| **Google Colab** | Open the `.ipynb` from GitHub, uncomment the `!pip install` cell below, then Run All. Parquets stream as in Binder. |\n",
+ "\n",
+ "Only the small facet-summary file (~2 KB) and the H3 tier files (\u2264 2.4 MB each)\n",
+ "are fully fetched. The 282 MB wide parquet is range-scanned \u2014 typical query\n",
+ "pulls a few MB at most."
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-04-24T15:12:50.302199Z",
+ "iopub.status.busy": "2026-04-24T15:12:50.302104Z",
+ "iopub.status.idle": "2026-04-24T15:12:50.303620Z",
+ "shell.execute_reply": "2026-04-24T15:12:50.303339Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Uncomment on Google Colab (not needed on mybinder or local dev envs)\n",
+ "# !pip install -q lonboard ipydatagrid duckdb geopandas pyarrow ipywidgets\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-04-24T15:12:50.304532Z",
+ "iopub.status.busy": "2026-04-24T15:12:50.304482Z",
+ "iopub.status.idle": "2026-04-24T15:12:50.591188Z",
+ "shell.execute_reply": "2026-04-24T15:12:50.590696Z"
+ }
+ },
"outputs": [],
"source": [
- "# Data paths\n",
- "LOCAL_WIDE = os.path.expanduser(\"~/Data/iSample/pqg_refining/zenodo_wide_2026-01-09.parquet\")\n",
- "REMOTE_WIDE = \"https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet\"\n",
+ "# Data paths \u2014 local-first with remote fallback\n",
+ "#\n",
+ "# Canonical URL base is https://data.isamples.org (Cloudflare Worker in\n",
+ "# front of the isamples-ry R2 bucket). Two URL layers:\n",
+ "# /current/ -> 302, short cache, tracks latest snapshot\n",
+ "# / -> 1-yr immutable cache, pin-safe for papers\n",
+ "#\n",
+ "# Local cache convention: snapshots live under LOCAL_BASE named exactly\n",
+ "# as their versioned R2 counterparts (e.g. isamples_202604_wide.parquet).\n",
+ "# If the local file exists, it is used; otherwise DuckDB streams from R2\n",
+ "# via httpfs (HTTP range requests \u2014 no full download needed).\n",
+ "\n",
+ "LOCAL_BASE = os.path.expanduser(\"~/Data/iSample/pqg_refining\")\n",
+ "REMOTE_BASE = \"https://data.isamples.org\"\n",
+ "\n",
+ "\n",
+ "def resolve_data_url(local_filename, remote_path):\n",
+ " \"\"\"Pick local if cached, else remote URL. DuckDB's read_parquet\n",
+ " accepts either, so downstream SQL is identical.\n",
+ "\n",
+ " Args:\n",
+ " local_filename: name under LOCAL_BASE.\n",
+ " remote_path: path relative to REMOTE_BASE (may be /current/...\n",
+ " or isamples_YYYYMM_*.parquet).\n",
+ "\n",
+ " Returns:\n",
+ " Either an absolute local path or an https URL.\n",
+ " \"\"\"\n",
+ " local_path = os.path.join(LOCAL_BASE, local_filename)\n",
+ " if os.path.exists(local_path):\n",
+ " return local_path\n",
+ " return f\"{REMOTE_BASE}/{remote_path}\"\n",
+ "\n",
+ "\n",
+ "# Wide parquet \u2014 local copy is 202604; /current/ alias also 202604 today.\n",
+ "PARQUET_PATH = resolve_data_url(\n",
+ " \"isamples_202604_wide.parquet\", \"current/wide.parquet\"\n",
+ ")\n",
"\n",
- "# Use local if available\n",
- "PARQUET_PATH = LOCAL_WIDE if os.path.exists(LOCAL_WIDE) else REMOTE_WIDE\n",
- "print(f\"Using: {PARQUET_PATH}\")\n",
+ "# Pre-computed facet summaries (2KB \u2014 always fetched remote; file too\n",
+ "# small to bother caching locally).\n",
+ "FACET_SUMMARIES_URL = f\"{REMOTE_BASE}/isamples_202601_facet_summaries.parquet\"\n",
"\n",
- "# Connect to DuckDB\n",
- "con = duckdb.connect()"
+ "# Single-filter cross-filter cache (6KB). Unused in this notebook today;\n",
+ "# populate if you add cross-filtering UI (see QUERY_SPEC.md \u00a73.3).\n",
+ "# FACET_CROSS_FILTER_URL = f\"{REMOTE_BASE}/isamples_202601_facet_cross_filter.parquet\"\n",
+ "\n",
+ "print(f\"Wide parquet: {PARQUET_PATH}\")\n",
+ "print(f\"Facet summaries: {FACET_SUMMARIES_URL}\")\n",
+ "\n",
+ "# Connect to DuckDB and explicitly load httpfs so HTTPS reads work on\n",
+ "# fresh containers (mybinder, Colab) where the extension is not cached.\n",
+ "con = duckdb.connect()\n",
+ "con.sql(\"INSTALL httpfs; LOAD httpfs;\")\n",
+ "\n",
+ "# Load facet summaries at startup (instant \u2014 only 2KB)\n",
+ "import time as _time\n",
+ "_t0 = _time.time()\n",
+ "FACET_SUMMARIES_DF = con.sql(f\"\"\"\n",
+ " SELECT facet_type, facet_value, scheme, count\n",
+ " FROM read_parquet('{FACET_SUMMARIES_URL}')\n",
+ " ORDER BY facet_type, count DESC\n",
+ "\"\"\").df()\n",
+ "_summary_ms = (_time.time() - _t0) * 1000\n",
+ "print(f\"Facet summaries loaded in {_summary_ms:.0f} ms ({len(FACET_SUMMARIES_DF)} rows)\")\n",
+ "print(f\"Available facets: {sorted(FACET_SUMMARIES_DF['facet_type'].unique())}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-04-24T15:12:50.592352Z",
+ "iopub.status.busy": "2026-04-24T15:12:50.592259Z",
+ "iopub.status.idle": "2026-04-24T15:12:50.594304Z",
+ "shell.execute_reply": "2026-04-24T15:12:50.593913Z"
+ }
+ },
"outputs": [],
"source": [
"# Source color scheme (consistent across iSamples)\n",
@@ -77,9 +209,619 @@
{
"cell_type": "code",
"execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-04-24T15:12:50.595451Z",
+ "iopub.status.busy": "2026-04-24T15:12:50.595362Z",
+ "iopub.status.idle": "2026-04-24T15:12:51.623877Z",
+ "shell.execute_reply": "2026-04-24T15:12:51.623424Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def load_samples(max_per_source=12500, source_filter=None, bbox=None, search_term=None,\n",
+ " source_filters=None, material_filters=None, year_range=None):\n",
+ " \"\"\"\n",
+ " Load samples with coordinates from the wide parquet.\n",
+ "\n",
+ " Args:\n",
+ " max_per_source: Maximum samples per source (for balanced representation)\n",
+ " source_filter: Optional single source name to filter (e.g., 'OPENCONTEXT') - DEPRECATED\n",
+ " bbox: Optional bounding box dict with min_lat, max_lat, min_lon, max_lon\n",
+ " search_term: Optional search string to filter and rank results\n",
+ " source_filters: Set of source names to include (empty = all)\n",
+ " material_filters: Set of material labels to include (empty = all)\n",
+ " year_range: Tuple of (min_year, max_year) or None for no filter\n",
+ "\n",
+ " Returns:\n",
+ " GeoDataFrame with sample data (includes search_score if search_term provided)\n",
+ " \"\"\"\n",
+ " # Build WHERE clause with optional table prefix for material filter queries\n",
+ " def build_where_clause(prefix=\"\"):\n",
+ " p = f\"{prefix}.\" if prefix else \"\"\n",
+ " clause = f\"WHERE {p}otype = 'MaterialSampleRecord' AND {p}latitude IS NOT NULL\"\n",
+ "\n",
+ " # Handle source filtering (new multi-select takes precedence)\n",
+ " if source_filters:\n",
+ " sources_sql = \", \".join(f\"'{s}'\" for s in source_filters)\n",
+ " clause += f\" AND {p}n IN ({sources_sql})\"\n",
+ " elif source_filter:\n",
+ " clause += f\" AND {p}n = '{source_filter}'\"\n",
+ "\n",
+ " if bbox:\n",
+ " clause += f\" AND {p}latitude BETWEEN {bbox['min_lat']} AND {bbox['max_lat']}\"\n",
+ " clause += f\" AND {p}longitude BETWEEN {bbox['min_lon']} AND {bbox['max_lon']}\"\n",
+ "\n",
+ " # Year range filter - cast result_time to TIMESTAMP first\n",
+ " if year_range and (year_range[0] is not None or year_range[1] is not None):\n",
+ " if year_range[0] is not None and year_range[1] is not None:\n",
+ " clause += f\" AND EXTRACT(YEAR FROM TRY_CAST({p}result_time AS TIMESTAMP)) BETWEEN {year_range[0]} AND {year_range[1]}\"\n",
+ " elif year_range[0] is not None:\n",
+ " clause += f\" AND EXTRACT(YEAR FROM TRY_CAST({p}result_time AS TIMESTAMP)) >= {year_range[0]}\"\n",
+ " elif year_range[1] is not None:\n",
+ " clause += f\" AND EXTRACT(YEAR FROM TRY_CAST({p}result_time AS TIMESTAMP)) <= {year_range[1]}\"\n",
+ "\n",
+ " return clause\n",
+ "\n",
+ " # Search filtering and scoring\n",
+ " def build_search_expr(prefix=\"\"):\n",
+ " p = f\"{prefix}.\" if prefix else \"\"\n",
+ " if not search_term or not search_term.strip():\n",
+ " return \"0 AS search_score\", \"\", \"ORDER BY RANDOM()\"\n",
+ "\n",
+ " # Escape single quotes in search term\n",
+ " term = search_term.strip().replace(\"'\", \"''\")\n",
+ "\n",
+ " # Weighted scoring: label (10) > description (5) > place_name (3)\n",
+ " score_expr = f\"\"\"\n",
+ " (CASE WHEN {p}label ILIKE '%{term}%' THEN 10 ELSE 0 END +\n",
+ " CASE WHEN {p}description ILIKE '%{term}%' THEN 5 ELSE 0 END +\n",
+ " CASE WHEN CAST({p}place_name AS VARCHAR) ILIKE '%{term}%' THEN 3 ELSE 0 END) AS search_score\n",
+ " \"\"\"\n",
+ "\n",
+ " # Filter to only matching records\n",
+ " search_filter = f\"\"\"\n",
+ " AND ({p}label ILIKE '%{term}%' \n",
+ " OR {p}description ILIKE '%{term}%' \n",
+ " OR CAST({p}place_name AS VARCHAR) ILIKE '%{term}%')\n",
+ " \"\"\"\n",
+ "\n",
+ " # Sort by score (highest first), then random within same score\n",
+ " order_by = \"ORDER BY search_score DESC, RANDOM()\"\n",
+ "\n",
+ " return score_expr, search_filter, order_by\n",
+ "\n",
+ " # Query with balanced sampling across sources\n",
+ " if material_filters:\n",
+ " # Material filter requires a CTE with join\n",
+ " material_labels_sql = \", \".join(f\"'{m}'\" for m in material_filters)\n",
+ " where_clause = build_where_clause(\"base\")\n",
+ " search_score_expr, search_filter, order_by = build_search_expr(\"base\")\n",
+ "\n",
+ " query = f\"\"\"\n",
+ " WITH material_matches AS (\n",
+ " SELECT DISTINCT msr.row_id\n",
+ " FROM read_parquet('{PARQUET_PATH}') msr\n",
+ " CROSS JOIN UNNEST(msr.p__has_material_category) AS t(mat_id)\n",
+ " JOIN read_parquet('{PARQUET_PATH}') ic ON ic.row_id = mat_id\n",
+ " WHERE msr.otype = 'MaterialSampleRecord'\n",
+ " AND ic.label IN ({material_labels_sql})\n",
+ " ),\n",
+ " scored AS (\n",
+ " SELECT \n",
+ " base.row_id, base.pid, base.label, base.description, \n",
+ " base.latitude, base.longitude, base.n as source,\n",
+ " base.place_name, base.result_time,\n",
+ " {search_score_expr}\n",
+ " FROM read_parquet('{PARQUET_PATH}') base\n",
+ " {where_clause}\n",
+ " {search_filter}\n",
+ " AND base.row_id IN (SELECT row_id FROM material_matches)\n",
+ " ),\n",
+ " ranked AS (\n",
+ " SELECT *,\n",
+ " ROW_NUMBER() OVER (PARTITION BY source {order_by.replace('ORDER BY', 'ORDER BY')}) as rn\n",
+ " FROM scored\n",
+ " )\n",
+ " SELECT row_id, pid, label, description, latitude, longitude, source, place_name, result_time, search_score\n",
+ " FROM ranked\n",
+ " WHERE rn <= {max_per_source}\n",
+ " {order_by}\n",
+ " \"\"\"\n",
+ " else:\n",
+ " where_clause = build_where_clause()\n",
+ " search_score_expr, search_filter, order_by = build_search_expr()\n",
+ "\n",
+ " query = f\"\"\"\n",
+ " WITH scored AS (\n",
+ " SELECT \n",
+ " row_id, pid, label, description, latitude, longitude, n as source,\n",
+ " place_name, result_time,\n",
+ " {search_score_expr}\n",
+ " FROM read_parquet('{PARQUET_PATH}')\n",
+ " {where_clause}\n",
+ " {search_filter}\n",
+ " ),\n",
+ " ranked AS (\n",
+ " SELECT *,\n",
+ " ROW_NUMBER() OVER (PARTITION BY source {order_by.replace('ORDER BY', 'ORDER BY')}) as rn\n",
+ " FROM scored\n",
+ " )\n",
+ " SELECT row_id, pid, label, description, latitude, longitude, source, place_name, result_time, search_score\n",
+ " FROM ranked\n",
+ " WHERE rn <= {max_per_source}\n",
+ " {order_by}\n",
+ " \"\"\"\n",
+ "\n",
+ " df = con.sql(query).df()\n",
+ "\n",
+ " # Convert to GeoDataFrame\n",
+ " geometry = [Point(lon, lat) for lon, lat in zip(df['longitude'], df['latitude'])]\n",
+ " gdf = gpd.GeoDataFrame(df, geometry=geometry, crs=\"EPSG:4326\")\n",
+ "\n",
+ " return gdf\n",
+ "\n",
+ "\n",
+ "def view_state_to_bbox(view_state, buffer_factor=1.5, aspect_ratio=1.5):\n",
+ " \"\"\"\n",
+ " Calculate bounding box from lonboard view_state.\n",
+ "\n",
+ " The view_state contains latitude, longitude, and zoom level.\n",
+ " We calculate the visible extent using Web Mercator projection math.\n",
+ "\n",
+ " Args:\n",
+ " view_state: lonboard MapViewState with latitude, longitude, zoom\n",
+ " buffer_factor: Multiply bbox by this to load slightly more data (default 1.5)\n",
+ " aspect_ratio: Width/height ratio of map container (default 1.5 for wider maps)\n",
+ "\n",
+ " Returns:\n",
+ " dict with min_lat, max_lat, min_lon, max_lon\n",
+ " \"\"\"\n",
+ " lat = view_state.latitude\n",
+ " lon = view_state.longitude\n",
+ " zoom = view_state.zoom\n",
+ "\n",
+ " # At zoom 0, entire world visible (~360 degrees longitude)\n",
+ " # Each zoom level halves the visible area\n",
+ " # Approximate degrees visible at zoom level\n",
+ " degrees_visible = 360 / (2 ** zoom)\n",
+ "\n",
+ " # Latitude visible area - apply buffer\n",
+ " lat_degrees = degrees_visible * buffer_factor / 2\n",
+ "\n",
+ " # Longitude visible area - wider due to aspect ratio and Mercator at higher latitudes\n",
+ " # Mercator stretches longitude at higher latitudes, so we need more buffer\n",
+ " lat_rad = math.radians(abs(lat))\n",
+ " mercator_stretch = 1 / max(math.cos(lat_rad), 0.1) # Avoid division by zero near poles\n",
+ " lon_degrees = degrees_visible * buffer_factor * aspect_ratio * mercator_stretch / 2\n",
+ "\n",
+ " # Clamp latitude to valid range\n",
+ " min_lat = max(-90, lat - lat_degrees)\n",
+ " max_lat = min(90, lat + lat_degrees)\n",
+ " min_lon = max(-180, lon - lon_degrees)\n",
+ " max_lon = min(180, lon + lon_degrees)\n",
+ "\n",
+ " return {\n",
+ " 'min_lat': min_lat,\n",
+ " 'max_lat': max_lat,\n",
+ " 'min_lon': min_lon,\n",
+ " 'max_lon': max_lon\n",
+ " }\n",
+ "\n",
+ "\n",
+ "def adaptive_sample_size(zoom, base_size=50000):\n",
+ " \"\"\"\n",
+ " Calculate sample size based on zoom level.\n",
+ "\n",
+ " At low zoom (world view), sample aggressively to avoid overwhelming.\n",
+ " At high zoom (local view), show all available points.\n",
+ "\n",
+ " Args:\n",
+ " zoom: Current zoom level (0-20)\n",
+ " base_size: Base sample size per source\n",
+ "\n",
+ " Returns:\n",
+ " Sample size to use per source\n",
+ " \"\"\"\n",
+ " if zoom < 2:\n",
+ " return min(base_size, 10000) # World view: max 10K per source\n",
+ " elif zoom < 5:\n",
+ " return min(base_size, 25000) # Continent view: max 25K\n",
+ " elif zoom < 8:\n",
+ " return min(base_size, 50000) # Country view: max 50K\n",
+ " elif zoom < 12:\n",
+ " return min(base_size, 100000) # Region view: max 100K\n",
+ " else:\n",
+ " return base_size # Local view: use full base_size\n",
+ "\n",
+ "\n",
+ "# Load initial data\n",
+ "print(\"Loading samples...\")\n",
+ "samples_gdf = load_samples(max_per_source=12500)\n",
+ "print(f\"Loaded {len(samples_gdf):,} samples\")\n",
+ "print(f\"\\nBy source:\")\n",
+ "print(samples_gdf['source'].value_counts())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
"metadata": {},
+ "source": [
+ "## H3-tier loader (scaffold \u2014 port of Cesium progressive strategy)\n",
+ "\n",
+ "The `load_samples()` cell above is the **crude sampler**: it pulls a balanced\n",
+ "random-per-source slice (~50K points) from the 282 MB wide parquet on every\n",
+ "call. Every viewport change re-hits the wide file.\n",
+ "\n",
+ "The Cesium `progressive_globe.qmd` frontend uses a different strategy: three\n",
+ "pre-computed H3 summary parquets (res 4 / 6 / 8) already on R2 \u2014 a few MB\n",
+ "combined. At low zoom, one row per H3 cell is rendered as a circle sized by\n",
+ "`log(sample_count)` and colored by `dominant_source`. Individual points are\n",
+ "only fetched at high zoom from the 60 MB lite parquet.\n",
+ "\n",
+ "Schema of each `isamples_202601_h3_summary_res{4,6,8}.parquet`:\n",
+ "\n",
+ "| column | type | notes |\n",
+ "|---|---|---|\n",
+ "| `h3_cell` | uint64 | H3 index |\n",
+ "| `sample_count` | int64 | total samples in cell |\n",
+ "| `center_lat`, `center_lng` | double | cell centroid |\n",
+ "| `dominant_source` | varchar | arg-max source for the cell |\n",
+ "| `source_count` | int64 | distinct sources in cell |\n",
+ "| `resolution` | int32 | 4, 6, or 8 |\n",
+ "\n",
+ "Row counts: res4 = 38,406 \u00b7 res6 = 111,681 \u00b7 res8 = 175,653.\n",
+ "\n",
+ "The cells below scaffold an H3-tier loader, a lonboard layer builder, and a\n",
+ "lite-parquet point loader. They sit alongside `load_samples()` for comparison;\n",
+ "they are NOT yet wired into the widget viewport observer \u2014 that is the next step."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-04-24T15:12:51.624876Z",
+ "iopub.status.busy": "2026-04-24T15:12:51.624818Z",
+ "iopub.status.idle": "2026-04-24T15:12:51.627031Z",
+ "shell.execute_reply": "2026-04-24T15:12:51.626635Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# H3-tier loader (scaffold) \u2014 constants\n",
+ "\n",
+ "# Three pre-computed H3 summary parquets \u2014 identical schema, different\n",
+ "# resolutions. Already backing the Cesium progressive_globe frontend.\n",
+ "# Uses the same local-first / remote-fallback resolver as cell 2, so a\n",
+ "# Raymond-on-MBP run uses `~/Data/iSample/pqg_refining/` copies if\n",
+ "# present; mybinder/Colab runs always stream from R2 (tiny files: 600\n",
+ "# KB / 1.6 MB / 2.4 MB).\n",
+ "H3_TIER_URLS = {\n",
+ " r: resolve_data_url(\n",
+ " f\"isamples_202601_h3_summary_res{r}.parquet\",\n",
+ " f\"isamples_202601_h3_summary_res{r}.parquet\",\n",
+ " )\n",
+ " for r in (4, 6, 8)\n",
+ "}\n",
+ "\n",
+ "# Zoom \u2192 resolution mapping. Breakpoints mirror the Cesium distance-based\n",
+ "# tiers (roughly: <120 km switches to individual points). For the notebook\n",
+ "# we start with pure-zoom thresholds; revisit if the UX feels off.\n",
+ "#\n",
+ "# zoom | tier | source\n",
+ "# 0-3 | H3 res 4 | h3_summary_res4.parquet (~600 KB, 38K cells)\n",
+ "# 4-6 | H3 res 6 | h3_summary_res6.parquet (~1.6 MB, 112K cells)\n",
+ "# 7-9 | H3 res 8 | h3_summary_res8.parquet (~2.4 MB, 176K cells)\n",
+ "# 10+ | individual points | lite parquet + viewport bbox\n",
+ "def zoom_to_h3_resolution(zoom):\n",
+ " if zoom < 4:\n",
+ " return 4\n",
+ " elif zoom < 7:\n",
+ " return 6\n",
+ " elif zoom < 10:\n",
+ " return 8\n",
+ " return None # hand off to individual-points tier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-04-24T15:12:51.627934Z",
+ "iopub.status.busy": "2026-04-24T15:12:51.627874Z",
+ "iopub.status.idle": "2026-04-24T15:12:51.630259Z",
+ "shell.execute_reply": "2026-04-24T15:12:51.629904Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# H3-tier loader (scaffold) \u2014 query function\n",
+ "\n",
+ "import numpy as np\n",
+ "\n",
+ "def load_h3_tier(zoom, source_filters=None, bbox=None):\n",
+ " \"\"\"Load H3-aggregate rows for the tier matching the given zoom.\n",
+ "\n",
+ " Mirrors the argument shape of `load_samples()` so callers can swap\n",
+ " between the crude sampler and the H3 tier without rewriting widget\n",
+ " glue. Unlike `load_samples()`, this returns *aggregate* rows \u2014 one\n",
+ " per H3 cell \u2014 not individual sample records.\n",
+ "\n",
+ " Args:\n",
+ " zoom: lonboard view_state.zoom (float). Determines which H3 tier.\n",
+ " source_filters: set of source names (e.g. {'SESAR', 'GEOME'}) or None.\n",
+ " Filters cells on `dominant_source`. **IMPORTANT ACCURACY CAVEATS**:\n",
+ " (a) a cell with 100 SESAR + 1 GEOME appears ONLY when SESAR is\n",
+ " selected \u2014 dropping the one GEOME sample\n",
+ " (b) the returned `sample_count` is the TOTAL sample count in\n",
+ " the cell, not the count for the selected source\n",
+ " So filtering by GEOME can *under*count (drops cells with some\n",
+ " GEOME but dominated by another source) AND *over*count (shows\n",
+ " cell totals for GEOME-dominated cells). For exact source-\n",
+ " specific counts, a per-(cell, source) aggregate file is needed\n",
+ " (not yet shipped on R2; see frontend_bundle_v2/h3_cache.parquet\n",
+ " for the required shape).\n",
+ " The scaffold UI should flag this imprecision when tier mode is\n",
+ " active with a source filter.\n",
+ " bbox: optional dict {min_lat, max_lat, min_lon, max_lon}.\n",
+ "\n",
+ " Returns:\n",
+ " DataFrame with columns: h3_cell, sample_count, center_lat,\n",
+ " center_lng, dominant_source, source_count, resolution.\n",
+ "        None if zoom is in the individual-points tier.\n",
+ " \"\"\"\n",
+ " resolution = zoom_to_h3_resolution(zoom)\n",
+ " if resolution is None:\n",
+ " return None # caller should fall back to individual-points loader\n",
+ "\n",
+ " url = H3_TIER_URLS[resolution]\n",
+ " where = []\n",
+ " if source_filters:\n",
+ " sources_sql = ', '.join(f\"'{s}'\" for s in source_filters)\n",
+ " where.append(f\"dominant_source IN ({sources_sql})\")\n",
+ " if bbox:\n",
+ " where.append(f\"center_lat BETWEEN {bbox['min_lat']} AND {bbox['max_lat']}\")\n",
+ " where.append(f\"center_lng BETWEEN {bbox['min_lon']} AND {bbox['max_lon']}\")\n",
+ " where_sql = ('WHERE ' + ' AND '.join(where)) if where else ''\n",
+ "\n",
+ " query = f\"\"\"\n",
+ " SELECT h3_cell, sample_count, center_lat, center_lng,\n",
+ " dominant_source, source_count, resolution\n",
+ " FROM read_parquet('{url}')\n",
+ " {where_sql}\n",
+ " \"\"\"\n",
+ " return con.sql(query).df()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-04-24T15:12:51.631040Z",
+ "iopub.status.busy": "2026-04-24T15:12:51.630984Z",
+ "iopub.status.idle": "2026-04-24T15:12:51.633243Z",
+ "shell.execute_reply": "2026-04-24T15:12:51.632947Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# H3-tier loader (scaffold) \u2014 lonboard layer builder\n",
+ "\n",
+ "def make_h3_tier_layer(tier_df, radius_scale=3000,\n",
+ " radius_min_pixels=2, radius_max_pixels=12):\n",
+ " \"\"\"Build a lonboard ScatterplotLayer from an H3 tier DataFrame.\n",
+ "\n",
+ " Circle radius scales with log1p(sample_count). Colors come from\n",
+ " SOURCE_COLORS (cell 3), which is already RGBA.\n",
+ "\n",
+ " Sizing note: radius is in meters but clamped to a tight pixel range\n",
+ " (2-12 px by default) so bubbles stay readable at both world and\n",
+ " country zoom levels. Lower radius_scale \u2192 smaller meters-radii \u2192\n",
+ " the pixel clamp kicks in only for the highest-count cells.\n",
+ "\n",
+ " Args:\n",
+ " tier_df: output of load_h3_tier().\n",
+ " radius_scale: meters per log-unit of sample count. Defaults to\n",
+ " 3000, so log1p(100)\u00b73000 \u2248 14 km \u2192 typically < 10 px at\n",
+ " country zoom, near the pixel floor at world zoom.\n",
+ " radius_min_pixels / radius_max_pixels: pixel clamps, tight by\n",
+ " default so density reads as density, not \"big vs huge\".\n",
+ "\n",
+ " Returns:\n",
+ " lonboard.ScatterplotLayer, ready to drop into a Map.\n",
+ " \"\"\"\n",
+ " from lonboard import ScatterplotLayer\n",
+ " import geopandas as gpd\n",
+ " from shapely.geometry import Point\n",
+ "\n",
+ " geometry = [Point(lng, lat) for lng, lat in\n",
+ " zip(tier_df['center_lng'], tier_df['center_lat'])]\n",
+ " gdf = gpd.GeoDataFrame(tier_df, geometry=geometry, crs='EPSG:4326')\n",
+ "\n",
+ " radii = np.log1p(gdf['sample_count'].to_numpy()) * radius_scale\n",
+ "\n",
+ " colors = np.array([\n",
+ " SOURCE_COLORS.get(src, DEFAULT_COLOR)\n",
+ " for src in gdf['dominant_source']\n",
+ " ], dtype=np.uint8)\n",
+ "\n",
+ " return ScatterplotLayer.from_geopandas(\n",
+ " gdf,\n",
+ " get_radius=radii,\n",
+ " get_fill_color=colors,\n",
+ " radius_units='meters',\n",
+ " radius_min_pixels=radius_min_pixels,\n",
+ " radius_max_pixels=radius_max_pixels,\n",
+ " stroked=True,\n",
+ " get_line_color=[255, 255, 255, 180],\n",
+ " line_width_min_pixels=1,\n",
+ " pickable=True,\n",
+ " )\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-04-24T15:12:51.633969Z",
+ "iopub.status.busy": "2026-04-24T15:12:51.633920Z",
+ "iopub.status.idle": "2026-04-24T15:12:53.708649Z",
+ "shell.execute_reply": "2026-04-24T15:12:53.708231Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# H3-tier loader (scaffold) \u2014 demo\n",
+ "#\n",
+ "# Side-by-side comparison with the crude sampler above. Load the three\n",
+ "# tiers and show how tiny the payload is compared to the 282 MB\n",
+ "# wide-parquet round-trip load_samples() does.\n",
+ "\n",
+ "import time\n",
+ "\n",
+ "for demo_zoom in [1.5, 5.0, 8.5]:\n",
+ " t0 = time.time()\n",
+ " tier = load_h3_tier(demo_zoom)\n",
+ " dt = time.time() - t0\n",
+ " res = zoom_to_h3_resolution(demo_zoom)\n",
+ " print(f'zoom={demo_zoom:4.1f} \u2192 H3 res {res} \u00b7 '\n",
+ " f'{len(tier):>6,} cells \u00b7 {dt*1000:>5.0f} ms')\n",
+ "\n",
+ "# And the individual-points handoff:\n",
+ "print(f'zoom=12.0 \u2192 individual-points tier (load_h3_tier returns None)')\n",
+ "\n",
+ "# Render the mid tier as a standalone map. Uses the same MaplibreBasemap\n",
+ "# as the main explorer map; defined locally here because the scaffold\n",
+ "# runs before the Map Component section (cell 15).\n",
+ "from lonboard import Map\n",
+ "from lonboard.basemap import CartoStyle, MaplibreBasemap\n",
+ "\n",
+ "_TIER_BASEMAP = MaplibreBasemap(style=CartoStyle.Voyager)\n",
+ "tier_mid = load_h3_tier(5.0)\n",
+ "tier_layer = make_h3_tier_layer(tier_mid)\n",
+ "tier_map = Map(layers=[tier_layer], basemap=_TIER_BASEMAP)\n",
+ "tier_map\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-04-24T15:12:53.712918Z",
+ "iopub.status.busy": "2026-04-24T15:12:53.712830Z",
+ "iopub.status.idle": "2026-04-24T15:12:53.715882Z",
+ "shell.execute_reply": "2026-04-24T15:12:53.715426Z"
+ }
+ },
"outputs": [],
- "source": "def load_samples(max_per_source=12500, source_filter=None, bbox=None, search_term=None,\n source_filters=None, material_filters=None, year_range=None):\n \"\"\"\n Load samples with coordinates from the wide parquet.\n\n Args:\n max_per_source: Maximum samples per source (for balanced representation)\n source_filter: Optional single source name to filter (e.g., 'OPENCONTEXT') - DEPRECATED\n bbox: Optional bounding box dict with min_lat, max_lat, min_lon, max_lon\n search_term: Optional search string to filter and rank results\n source_filters: Set of source names to include (empty = all)\n material_filters: Set of material labels to include (empty = all)\n year_range: Tuple of (min_year, max_year) or None for no filter\n\n Returns:\n GeoDataFrame with sample data (includes search_score if search_term provided)\n \"\"\"\n # Build WHERE clause with optional table prefix for material filter queries\n def build_where_clause(prefix=\"\"):\n p = f\"{prefix}.\" if prefix else \"\"\n clause = f\"WHERE {p}otype = 'MaterialSampleRecord' AND {p}latitude IS NOT NULL\"\n\n # Handle source filtering (new multi-select takes precedence)\n if source_filters:\n sources_sql = \", \".join(f\"'{s}'\" for s in source_filters)\n clause += f\" AND {p}n IN ({sources_sql})\"\n elif source_filter:\n clause += f\" AND {p}n = '{source_filter}'\"\n\n if bbox:\n clause += f\" AND {p}latitude BETWEEN {bbox['min_lat']} AND {bbox['max_lat']}\"\n clause += f\" AND {p}longitude BETWEEN {bbox['min_lon']} AND {bbox['max_lon']}\"\n\n # Year range filter - cast result_time to TIMESTAMP first\n if year_range and (year_range[0] is not None or year_range[1] is not None):\n if year_range[0] is not None and year_range[1] is not None:\n clause += f\" AND EXTRACT(YEAR FROM TRY_CAST({p}result_time AS TIMESTAMP)) BETWEEN {year_range[0]} AND {year_range[1]}\"\n elif year_range[0] is not None:\n clause += f\" AND EXTRACT(YEAR FROM TRY_CAST({p}result_time AS TIMESTAMP)) >= {year_range[0]}\"\n elif 
year_range[1] is not None:\n clause += f\" AND EXTRACT(YEAR FROM TRY_CAST({p}result_time AS TIMESTAMP)) <= {year_range[1]}\"\n\n return clause\n\n # Search filtering and scoring\n def build_search_expr(prefix=\"\"):\n p = f\"{prefix}.\" if prefix else \"\"\n if not search_term or not search_term.strip():\n return \"0 AS search_score\", \"\", \"ORDER BY RANDOM()\"\n\n # Escape single quotes in search term\n term = search_term.strip().replace(\"'\", \"''\")\n\n # Weighted scoring: label (10) > description (5) > place_name (3)\n score_expr = f\"\"\"\n (CASE WHEN {p}label ILIKE '%{term}%' THEN 10 ELSE 0 END +\n CASE WHEN {p}description ILIKE '%{term}%' THEN 5 ELSE 0 END +\n CASE WHEN CAST({p}place_name AS VARCHAR) ILIKE '%{term}%' THEN 3 ELSE 0 END) AS search_score\n \"\"\"\n\n # Filter to only matching records\n search_filter = f\"\"\"\n AND ({p}label ILIKE '%{term}%' \n OR {p}description ILIKE '%{term}%' \n OR CAST({p}place_name AS VARCHAR) ILIKE '%{term}%')\n \"\"\"\n\n # Sort by score (highest first), then random within same score\n order_by = \"ORDER BY search_score DESC, RANDOM()\"\n\n return score_expr, search_filter, order_by\n\n # Query with balanced sampling across sources\n if material_filters:\n # Material filter requires a CTE with join\n material_labels_sql = \", \".join(f\"'{m}'\" for m in material_filters)\n where_clause = build_where_clause(\"base\")\n search_score_expr, search_filter, order_by = build_search_expr(\"base\")\n\n query = f\"\"\"\n WITH material_matches AS (\n SELECT DISTINCT msr.row_id\n FROM read_parquet('{PARQUET_PATH}') msr\n CROSS JOIN UNNEST(msr.p__has_material_category) AS t(mat_id)\n JOIN read_parquet('{PARQUET_PATH}') ic ON ic.row_id = mat_id\n WHERE msr.otype = 'MaterialSampleRecord'\n AND ic.label IN ({material_labels_sql})\n ),\n scored AS (\n SELECT \n base.row_id, base.pid, base.label, base.description, \n base.latitude, base.longitude, base.n as source,\n base.place_name, base.result_time,\n {search_score_expr}\n FROM 
read_parquet('{PARQUET_PATH}') base\n {where_clause}\n {search_filter}\n AND base.row_id IN (SELECT row_id FROM material_matches)\n ),\n ranked AS (\n SELECT *,\n ROW_NUMBER() OVER (PARTITION BY source {order_by.replace('ORDER BY', 'ORDER BY')}) as rn\n FROM scored\n )\n SELECT row_id, pid, label, description, latitude, longitude, source, place_name, result_time, search_score\n FROM ranked\n WHERE rn <= {max_per_source}\n {order_by}\n \"\"\"\n else:\n where_clause = build_where_clause()\n search_score_expr, search_filter, order_by = build_search_expr()\n\n query = f\"\"\"\n WITH scored AS (\n SELECT \n row_id, pid, label, description, latitude, longitude, n as source,\n place_name, result_time,\n {search_score_expr}\n FROM read_parquet('{PARQUET_PATH}')\n {where_clause}\n {search_filter}\n ),\n ranked AS (\n SELECT *,\n ROW_NUMBER() OVER (PARTITION BY source {order_by.replace('ORDER BY', 'ORDER BY')}) as rn\n FROM scored\n )\n SELECT row_id, pid, label, description, latitude, longitude, source, place_name, result_time, search_score\n FROM ranked\n WHERE rn <= {max_per_source}\n {order_by}\n \"\"\"\n\n df = con.sql(query).df()\n\n # Convert to GeoDataFrame\n geometry = [Point(lon, lat) for lon, lat in zip(df['longitude'], df['latitude'])]\n gdf = gpd.GeoDataFrame(df, geometry=geometry, crs=\"EPSG:4326\")\n\n return gdf\n\n\ndef view_state_to_bbox(view_state, buffer_factor=1.5, aspect_ratio=1.5):\n \"\"\"\n Calculate bounding box from lonboard view_state.\n\n The view_state contains latitude, longitude, and zoom level.\n We calculate the visible extent using Web Mercator projection math.\n\n Args:\n view_state: lonboard MapViewState with latitude, longitude, zoom\n buffer_factor: Multiply bbox by this to load slightly more data (default 1.5)\n aspect_ratio: Width/height ratio of map container (default 1.5 for wider maps)\n\n Returns:\n dict with min_lat, max_lat, min_lon, max_lon\n \"\"\"\n lat = view_state.latitude\n lon = view_state.longitude\n zoom = 
view_state.zoom\n\n # At zoom 0, entire world visible (~360 degrees longitude)\n # Each zoom level halves the visible area\n # Approximate degrees visible at zoom level\n degrees_visible = 360 / (2 ** zoom)\n\n # Latitude visible area - apply buffer\n lat_degrees = degrees_visible * buffer_factor / 2\n\n # Longitude visible area - wider due to aspect ratio and Mercator at higher latitudes\n # Mercator stretches longitude at higher latitudes, so we need more buffer\n lat_rad = math.radians(abs(lat))\n mercator_stretch = 1 / max(math.cos(lat_rad), 0.1) # Avoid division by zero near poles\n lon_degrees = degrees_visible * buffer_factor * aspect_ratio * mercator_stretch / 2\n\n # Clamp latitude to valid range\n min_lat = max(-90, lat - lat_degrees)\n max_lat = min(90, lat + lat_degrees)\n min_lon = max(-180, lon - lon_degrees)\n max_lon = min(180, lon + lon_degrees)\n\n return {\n 'min_lat': min_lat,\n 'max_lat': max_lat,\n 'min_lon': min_lon,\n 'max_lon': max_lon\n }\n\n\ndef adaptive_sample_size(zoom, base_size=50000):\n \"\"\"\n Calculate sample size based on zoom level.\n\n At low zoom (world view), sample aggressively to avoid overwhelming.\n At high zoom (local view), show all available points.\n\n Args:\n zoom: Current zoom level (0-20)\n base_size: Base sample size per source\n\n Returns:\n Sample size to use per source\n \"\"\"\n if zoom < 2:\n return min(base_size, 10000) # World view: max 10K per source\n elif zoom < 5:\n return min(base_size, 25000) # Continent view: max 25K\n elif zoom < 8:\n return min(base_size, 50000) # Country view: max 50K\n elif zoom < 12:\n return min(base_size, 100000) # Region view: max 100K\n else:\n return base_size # Local view: use full base_size\n\n\n# Load initial data\nprint(\"Loading samples...\")\nsamples_gdf = load_samples(max_per_source=12500)\nprint(f\"Loaded {len(samples_gdf):,} samples\")\nprint(f\"\\nBy source:\")\nprint(samples_gdf['source'].value_counts())"
+ "source": [
+ "# H3-tier loader (scaffold) \u2014 lite parquet loader for zoom >= 10\n",
+ "\n",
+ "# At high zoom H3 aggregates are too coarse; we want individual points.\n",
+ "# The samples_map_lite.parquet is a 60 MB projection (pid/label/source/\n",
+ "# latitude/longitude/place_name/result_time/h3_res8) \u2014 smaller than the 282 MB\n",
+ "# wide parquet and already filtered to MaterialSampleRecord rows with\n",
+ "# coordinates. Use it when H3 Tier Mode is on AND zoom >= 10 AND no\n",
+ "# tier-incompatible filter is active.\n",
+ "\n",
+ "LITE_PARQUET_URL = resolve_data_url(\n",
+ " \"isamples_202601_samples_map_lite.parquet\",\n",
+ " \"isamples_202601_samples_map_lite.parquet\",\n",
+ ")\n",
+ "\n",
+ "def load_samples_from_lite(bbox=None, source_filters=None, max_samples=50000):\n",
+ " \"\"\"Load individual sample points from the lite parquet.\n",
+ "\n",
+ " Returns a GeoDataFrame in the same shape that `update_map_and_table()`\n",
+ " expects, so the caller can swap it in for `load_samples()` at zoom\n",
+ " >= 10 under tier mode.\n",
+ "\n",
+ " Caveats vs the crude sampler:\n",
+ " - no description column (lite doesn't carry it) \u2014 search over\n",
+ " description won't work here\n",
+ " - no material/context/object_type filters \u2014 caller must verify\n",
+ " filter compatibility before calling\n",
+ " - returns search_score=0 for shape compatibility with existing\n",
+ " update_map_and_table() branch that expects the column\n",
+ "\n",
+ " Args:\n",
+ " bbox: dict with min_lat, max_lat, min_lon, max_lon (or None).\n",
+ " source_filters: set of source names (or None).\n",
+ " max_samples: total rows cap (not per-source; lite is smaller).\n",
+ "\n",
+ " Returns:\n",
+ " GeoDataFrame with columns: pid, label, description, latitude,\n",
+ " longitude, source, place_name, result_time, search_score, geometry.\n",
+ " \"\"\"\n",
+ " where = [\"latitude IS NOT NULL\"]\n",
+ " if source_filters:\n",
+ " sources_sql = \", \".join(f\"'{s}'\" for s in source_filters)\n",
+ " where.append(f\"source IN ({sources_sql})\")\n",
+ " if bbox:\n",
+ " where.append(f\"latitude BETWEEN {bbox['min_lat']} AND {bbox['max_lat']}\")\n",
+ " where.append(f\"longitude BETWEEN {bbox['min_lon']} AND {bbox['max_lon']}\")\n",
+ " where_sql = \" AND \".join(where)\n",
+ "\n",
+ " query = f\"\"\"\n",
+ " SELECT pid, label, source, latitude, longitude,\n",
+ " place_name, result_time\n",
+ " FROM read_parquet('{LITE_PARQUET_URL}')\n",
+ " WHERE {where_sql}\n",
+ " ORDER BY RANDOM()\n",
+ " LIMIT {max_samples}\n",
+ " \"\"\"\n",
+ " df = con.sql(query).df()\n",
+ "\n",
+ " # Lite has no description \u2014 supply an empty column so downstream\n",
+ " # code that expects description (e.g., sample card renderer) gets\n",
+ " # something reasonable instead of KeyError.\n",
+ " df['description'] = ''\n",
+ " df['search_score'] = 0\n",
+ "\n",
+ " # Build the geometry column that update_map_and_table() expects.\n",
+ " geometry = [Point(lon, lat) for lon, lat in\n",
+ " zip(df['longitude'], df['latitude'])]\n",
+ " return gpd.GeoDataFrame(df, geometry=geometry, crs='EPSG:4326')\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### What the scaffold demonstrates\n",
+ "\n",
+ "- **Payload**: world-view tier is ~600 KB vs 282 MB for the wide-parquet crude\n",
+ " sample. Country-view is ~1.6 MB. Even the near-zoom res 8 tier is 2.4 MB.\n",
+ "- **Latency**: after the first fetch the HTTP cache (Cloudflare, 1 yr immutable)\n",
+ " makes repeat tier queries effectively free.\n",
+ "- **Consistency with the web**: the notebook and the Cesium globe now read the\n",
+ " exact same H3 files. QUERY_SPEC.md \u00a72.4's \"H3 tier 4/6/8\" convention has\n",
+ " one concrete binding instead of two parallel implementations.\n",
+ "\n",
+ "### Still to do (not in this scaffold)\n",
+ "\n",
+ "1. **Wire into the viewport observer**: the widget layer (cell 13+) currently\n",
+ " calls `load_samples()` with bbox + `adaptive_sample_size(zoom)`. Replace\n",
+ " that path with `load_h3_tier(zoom, ...)` at low/mid zoom and keep\n",
+ " `load_samples()` (or better, a lite-parquet loader) for zoom >= 10.\n",
+ "2. **Two-layer map**: keep an \"aggregate\" ScatterplotLayer and an \"individual\"\n",
+ " ScatterplotLayer on the Map simultaneously, toggle visibility on tier\n",
+ " crossings. Avoids tearing down/rebuilding the map on every zoom step.\n",
+ "3. **Facet integration**: `source_filters` is implemented here; material /\n",
+ " specimen / year filters still need per-cell counts (the R2 tier files\n",
+ " don't carry those dimensions). Either pre-compute filtered tier files\n",
+ " or fall back to the wide parquet when non-source facets are active.\n",
+ "4. **Click-to-expand**: clicking an H3 cell on the map should re-query the\n",
+ " wide parquet for the cell's member samples and show them in the table.\n",
+ "5. **Lite-parquet loader**: scaffolded above as `load_samples_from_lite()`;\n",
+ "   wiring it into the zoom >= 10 widget path is what remains."
+ ]
},
{
"cell_type": "markdown",
@@ -91,7 +833,14 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-04-24T15:12:53.716685Z",
+ "iopub.status.busy": "2026-04-24T15:12:53.716632Z",
+ "iopub.status.idle": "2026-04-24T15:12:53.720325Z",
+ "shell.execute_reply": "2026-04-24T15:12:53.719926Z"
+ }
+ },
"outputs": [],
"source": [
"def render_sample_card(row):\n",
@@ -184,7 +933,14 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-04-24T15:12:53.721269Z",
+ "iopub.status.busy": "2026-04-24T15:12:53.721210Z",
+ "iopub.status.idle": "2026-04-24T15:12:53.807316Z",
+ "shell.execute_reply": "2026-04-24T15:12:53.806949Z"
+ }
+ },
"outputs": [],
"source": [
"def get_colors_for_sources(sources):\n",
@@ -243,7 +999,14 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-04-24T15:12:53.813343Z",
+ "iopub.status.busy": "2026-04-24T15:12:53.813263Z",
+ "iopub.status.idle": "2026-04-24T15:12:53.869865Z",
+ "shell.execute_reply": "2026-04-24T15:12:53.869452Z"
+ }
+ },
"outputs": [],
"source": [
"def create_table(gdf):\n",
@@ -280,58 +1043,1879 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-04-24T15:12:53.875737Z",
+ "iopub.status.busy": "2026-04-24T15:12:53.875652Z",
+ "iopub.status.idle": "2026-04-24T15:12:53.927064Z",
+ "shell.execute_reply": "2026-04-24T15:12:53.926540Z"
+ }
+ },
"outputs": [],
- "source": "# State management\nclass ExplorerState:\n def __init__(self):\n self.selected_index = None\n self.selected_row = None\n self.current_gdf = None\n self.viewport_mode = False\n self.debounce_timer = None\n self.loading = False\n self.syncing_selection = False # Prevent infinite loops\n self.current_search = \"\" # Current search term\n # Facet filter state\n self.source_filters = set() # Selected sources (empty = all)\n self.material_filters = set() # Selected material URIs (full URIs for filtering)\n self.year_range = (None, None) # (min_year, max_year) or None for no filter\n self.selected_decades = set() # Selected decades for quick filter\n self.material_rollup = True # Whether to include children when parent selected\n # Facet counts cache\n self.facet_counts_cache = {}\n self.facet_cache_time = 0\n\nstate = ExplorerState()\nstate.current_gdf = samples_gdf\n\n\n# =============================================================================\n# Material Hierarchy Definition\n# =============================================================================\n\n# iSamples material vocabulary hierarchy (3 levels)\n# Structure: (display_name, uri_suffix, children_list)\n# uri_suffix maps to actual URIs in the data\n\nMATERIAL_HIERARCHY = [\n (\"Material\", \"material\", [\n (\"Natural Solid Material\", \"naturalsolidmaterial\", [\n (\"Earth Material\", \"earthmaterial\", [\n (\"Rock\", \"rock\", []),\n (\"Sediment\", \"sediment\", []),\n (\"Soil\", \"soil\", []),\n (\"Mineral\", \"mineral\", []),\n (\"Mixed Soil/Sediment/Rock\", \"mixedsoilsedimentrock\", []),\n ]),\n (\"Biogenic Non-organic\", \"biogenicnonorganicmaterial\", []),\n ]),\n (\"Organic Material\", \"organicmaterial\", [\n (\"Plant Material\", \"plantmaterial\", []),\n (\"Animal Product\", \"organicanimalproduct\", []),\n ]),\n (\"Anthropogenic Material\", \"anyanthropogenicmaterial\", [\n (\"Anthropogenic Metal\", \"anthropogenicmetal\", []),\n (\"Ceramic Clay\", \"ceramicclay\", []),\n ]),\n 
(\"Fluid Material\", \"fluidmaterial\", [\n (\"Liquid Water\", \"liquidwater\", []),\n (\"Gas\", \"gas\", []),\n (\"Non-aqueous Liquid\", \"nonaqueousliquid\", []),\n ]),\n (\"Dispersed Media\", \"dispersedmedia\", [\n (\"Particulate\", \"particulate\", []),\n ]),\n (\"Any Ice\", \"anyice\", []),\n ])\n]\n\n\ndef build_hierarchy_mappings():\n \"\"\"\n Build mappings for the material hierarchy.\n\n Returns:\n - suffix_to_children: dict mapping uri_suffix -> list of all descendant suffixes\n - display_order: list of (display_label, uri_suffix, indent_level) in tree order\n \"\"\"\n suffix_to_children = {}\n display_order = []\n\n def collect_descendants(nodes):\n \"\"\"Get all descendant suffixes from a list of hierarchy nodes.\"\"\"\n descendants = []\n for name, suffix, children in nodes:\n descendants.append(suffix)\n descendants.extend(collect_descendants(children))\n return descendants\n\n def walk(nodes, level=0):\n \"\"\"Walk hierarchy building mappings.\"\"\"\n for name, suffix, children in nodes:\n # All descendants (not including self)\n suffix_to_children[suffix] = collect_descendants(children)\n\n # Add to display order with indentation\n display_order.append((name, suffix, level))\n\n # Recurse into children\n walk(children, level + 1)\n\n walk(MATERIAL_HIERARCHY)\n return suffix_to_children, display_order\n\n\n# Build hierarchy mappings at module load\nSUFFIX_TO_CHILDREN, HIERARCHY_DISPLAY_ORDER = build_hierarchy_mappings()\n\n\n# =============================================================================\n# Facet Query Functions\n# =============================================================================\n\nimport re\nimport time as time_module\n\ndef uri_to_display_name(uri):\n \"\"\"\n Convert a vocabulary URI to a human-readable display name.\n\n Examples:\n https://w3id.org/isample/vocabulary/material/1.0/rock -> Rock\n https://w3id.org/isample/opencontext/material/0.1/ceramicclay -> Ceramic Clay\n \"\"\"\n if not uri or not isinstance(uri, 
str):\n return str(uri)\n\n # Extract last path segment\n name = uri.rstrip('/').split('/')[-1]\n\n # Insert spaces before uppercase letters (camelCase -> Camel Case)\n name = re.sub(r'([a-z])([A-Z])', r'\\1 \\2', name)\n\n # Capitalize first letter of each word\n name = name.title()\n\n return name\n\n\ndef uri_to_suffix(uri):\n \"\"\"Extract the suffix (last path segment) from a URI, lowercased.\"\"\"\n if not uri or not isinstance(uri, str):\n return \"\"\n return uri.rstrip('/').split('/')[-1].lower()\n\n\ndef get_source_counts(additional_filters=None):\n \"\"\"\n Get counts of samples by source.\n\n Args:\n additional_filters: Dict with material_filters, year_range keys\n\n Returns:\n Dict of {source_name: count}\n \"\"\"\n where_clause = \"WHERE otype = 'MaterialSampleRecord' AND latitude IS NOT NULL\"\n\n # Apply year filter if present (cast result_time to TIMESTAMP)\n if additional_filters and additional_filters.get('year_range'):\n yr = additional_filters['year_range']\n if yr[0] is not None and yr[1] is not None:\n where_clause += f\" AND EXTRACT(YEAR FROM TRY_CAST(result_time AS TIMESTAMP)) BETWEEN {yr[0]} AND {yr[1]}\"\n elif yr[0] is not None:\n where_clause += f\" AND EXTRACT(YEAR FROM TRY_CAST(result_time AS TIMESTAMP)) >= {yr[0]}\"\n elif yr[1] is not None:\n where_clause += f\" AND EXTRACT(YEAR FROM TRY_CAST(result_time AS TIMESTAMP)) <= {yr[1]}\"\n\n # Note: We don't filter by material here to show all source options\n query = f\"\"\"\n SELECT n as source, COUNT(*) as count\n FROM read_parquet('{PARQUET_PATH}')\n {where_clause}\n GROUP BY n ORDER BY count DESC\n \"\"\"\n\n result = con.sql(query).df()\n return dict(zip(result['source'], result['count']))\n\n\ndef get_all_material_counts():\n \"\"\"\n Get counts for ALL materials in the hierarchy (not just top N).\n Used for building the hierarchical display.\n\n Returns:\n Dict of {uri_suffix: {'uri': full_uri, 'count': count}}\n \"\"\"\n query = f\"\"\"\n WITH samples AS (\n SELECT msr.row_id, 
UNNEST(msr.p__has_material_category) as material_id\n FROM read_parquet('{PARQUET_PATH}') msr\n WHERE msr.otype = 'MaterialSampleRecord' AND msr.latitude IS NOT NULL\n )\n SELECT ic.label as material_uri, COUNT(*) as count\n FROM samples s\n JOIN read_parquet('{PARQUET_PATH}') ic ON ic.row_id = s.material_id\n WHERE ic.label IS NOT NULL\n GROUP BY ic.label\n ORDER BY count DESC\n \"\"\"\n\n try:\n result = con.sql(query).df()\n # Build dict keyed by suffix\n materials = {}\n for _, row in result.iterrows():\n uri = row['material_uri']\n suffix = uri_to_suffix(uri)\n if suffix:\n materials[suffix] = {\n 'uri': uri,\n 'count': row['count']\n }\n return materials\n except Exception as e:\n print(f\"Material count error: {e}\")\n return {}\n\n\n# =============================================================================\n# Accurate Rollup Counts (with caching)\n# =============================================================================\n\n# Cache for accurate rollup counts\n_rollup_cache = {\n 'direct_counts': {}, # suffix -> direct count\n 'rollup_counts': {}, # suffix -> accurate distinct rollup count\n 'computed_at': None, # timestamp\n 'existing_suffixes': set() # suffixes that exist in data\n}\n\n\ndef compute_accurate_rollup_counts(force_refresh=False):\n \"\"\"\n Compute accurate rollup counts using DISTINCT sample counting.\n \n This avoids double-counting samples tagged at multiple hierarchy levels.\n Results are cached for performance.\n \n Args:\n force_refresh: If True, recompute even if cache exists\n \n Returns:\n Tuple of (direct_counts, rollup_counts, elapsed_time)\n - direct_counts: Dict of {suffix: count} for samples tagged exactly at that level\n - rollup_counts: Dict of {suffix: count} for samples tagged at that level OR any descendant\n \"\"\"\n global _rollup_cache\n \n # Return cached results if available and not forcing refresh\n if not force_refresh and _rollup_cache['computed_at'] is not None:\n return (_rollup_cache['direct_counts'], 
\n _rollup_cache['rollup_counts'], \n 0.0)\n \n print(\"Computing accurate rollup counts (this takes ~3 seconds)...\")\n t0 = time_module.time()\n \n # Build temp table with sample-material pairs\n build_query = f\"\"\"\n CREATE OR REPLACE TEMP TABLE sample_materials AS\n SELECT DISTINCT \n msr.row_id as sample_id,\n LOWER(SPLIT_PART(ic.label, '/', -1)) as suffix,\n ic.label as uri\n FROM read_parquet('{PARQUET_PATH}') msr\n CROSS JOIN UNNEST(msr.p__has_material_category) AS t(mat_id)\n JOIN read_parquet('{PARQUET_PATH}') ic ON ic.row_id = mat_id\n WHERE msr.otype = 'MaterialSampleRecord' \n AND msr.latitude IS NOT NULL\n AND ic.label IS NOT NULL\n \"\"\"\n con.sql(build_query)\n \n # Get existing suffixes\n suffix_df = con.sql(\"SELECT DISTINCT suffix FROM sample_materials\").df()\n existing_suffixes = set(suffix_df['suffix'].tolist())\n \n # Get direct counts\n direct_df = con.sql(\"\"\"\n SELECT suffix, COUNT(*) as direct_count\n FROM sample_materials\n GROUP BY suffix\n \"\"\").df()\n direct_counts = dict(zip(direct_df['suffix'], direct_df['direct_count']))\n \n # Compute accurate rollup for each suffix\n rollup_counts = {}\n for suffix in existing_suffixes:\n if suffix not in SUFFIX_TO_CHILDREN:\n # Unknown suffix (not in our hierarchy), just use direct count\n rollup_counts[suffix] = direct_counts.get(suffix, 0)\n else:\n # Include self + all descendants that exist in data\n all_suffixes = [suffix] + [s for s in SUFFIX_TO_CHILDREN[suffix] if s in existing_suffixes]\n suffixes_sql = \", \".join(f\"'{s}'\" for s in all_suffixes)\n count_query = f\"SELECT COUNT(DISTINCT sample_id) FROM sample_materials WHERE suffix IN ({suffixes_sql})\"\n count = con.sql(count_query).fetchone()[0]\n rollup_counts[suffix] = count\n \n elapsed = time_module.time() - t0\n \n # Update cache\n _rollup_cache['direct_counts'] = direct_counts\n _rollup_cache['rollup_counts'] = rollup_counts\n _rollup_cache['computed_at'] = time_module.time()\n _rollup_cache['existing_suffixes'] = 
existing_suffixes\n \n print(f\" Computed {len(rollup_counts)} rollup counts in {elapsed:.2f}s\")\n \n return direct_counts, rollup_counts, elapsed\n\n\ndef get_cached_rollup_counts():\n \"\"\"Get cached rollup counts, computing if necessary.\"\"\"\n if _rollup_cache['computed_at'] is None:\n compute_accurate_rollup_counts()\n return _rollup_cache['direct_counts'], _rollup_cache['rollup_counts']\n\n\ndef get_material_counts(additional_filters=None, limit=50):\n \"\"\"\n Get counts of samples by material category (requires join for labels).\n\n Args:\n additional_filters: Dict with source_filters, year_range keys\n limit: Max number of materials to return\n\n Returns:\n Dict of {display_name: {'uri': full_uri, 'count': count}}\n \"\"\"\n where_clause = \"WHERE msr.otype = 'MaterialSampleRecord' AND msr.latitude IS NOT NULL\"\n\n # Apply source filter if present\n if additional_filters and additional_filters.get('source_filters'):\n sources_sql = \", \".join(f\"'{s}'\" for s in additional_filters['source_filters'])\n where_clause += f\" AND msr.n IN ({sources_sql})\"\n\n # Apply year filter if present (cast result_time to TIMESTAMP)\n if additional_filters and additional_filters.get('year_range'):\n yr = additional_filters['year_range']\n if yr[0] is not None and yr[1] is not None:\n where_clause += f\" AND EXTRACT(YEAR FROM TRY_CAST(msr.result_time AS TIMESTAMP)) BETWEEN {yr[0]} AND {yr[1]}\"\n elif yr[0] is not None:\n where_clause += f\" AND EXTRACT(YEAR FROM TRY_CAST(msr.result_time AS TIMESTAMP)) >= {yr[0]}\"\n elif yr[1] is not None:\n where_clause += f\" AND EXTRACT(YEAR FROM TRY_CAST(msr.result_time AS TIMESTAMP)) <= {yr[1]}\"\n\n query = f\"\"\"\n WITH samples AS (\n SELECT msr.row_id, UNNEST(msr.p__has_material_category) as material_id\n FROM read_parquet('{PARQUET_PATH}') msr\n {where_clause}\n )\n SELECT ic.label as material_uri, COUNT(*) as count\n FROM samples s\n JOIN read_parquet('{PARQUET_PATH}') ic ON ic.row_id = s.material_id\n WHERE ic.label IS 
NOT NULL\n GROUP BY ic.label\n ORDER BY count DESC\n LIMIT {limit}\n \"\"\"\n\n try:\n result = con.sql(query).df()\n # Return dict with display name as key, containing uri and count\n materials = {}\n for _, row in result.iterrows():\n uri = row['material_uri']\n display_name = uri_to_display_name(uri)\n materials[display_name] = {\n 'uri': uri,\n 'count': row['count']\n }\n return materials\n except Exception as e:\n print(f\"Material count error: {e}\")\n return {}\n\n\ndef get_year_range_stats():\n \"\"\"\n Get min/max years and decade counts for time facet.\n\n Returns:\n Dict with 'min_year', 'max_year', 'decades' (dict of decade: count)\n \"\"\"\n # Cast result_time to TIMESTAMP before extracting year\n query = f\"\"\"\n SELECT\n MIN(EXTRACT(YEAR FROM TRY_CAST(result_time AS TIMESTAMP)))::INT as min_year,\n MAX(EXTRACT(YEAR FROM TRY_CAST(result_time AS TIMESTAMP)))::INT as max_year\n FROM read_parquet('{PARQUET_PATH}')\n WHERE otype = 'MaterialSampleRecord'\n AND latitude IS NOT NULL\n AND result_time IS NOT NULL\n AND TRY_CAST(result_time AS TIMESTAMP) IS NOT NULL\n \"\"\"\n\n stats = con.sql(query).df().iloc[0]\n\n # Get decade counts\n decade_query = f\"\"\"\n SELECT\n (EXTRACT(YEAR FROM TRY_CAST(result_time AS TIMESTAMP))::INT / 10) * 10 as decade,\n COUNT(*) as count\n FROM read_parquet('{PARQUET_PATH}')\n WHERE otype = 'MaterialSampleRecord'\n AND latitude IS NOT NULL\n AND result_time IS NOT NULL\n AND TRY_CAST(result_time AS TIMESTAMP) IS NOT NULL\n GROUP BY decade\n ORDER BY decade\n \"\"\"\n\n decades_df = con.sql(decade_query).df()\n decades = dict(zip(decades_df['decade'].astype(int), decades_df['count']))\n\n return {\n 'min_year': int(stats['min_year']) if pd.notna(stats['min_year']) else 1900,\n 'max_year': int(stats['max_year']) if pd.notna(stats['max_year']) else 2025,\n 'decades': decades\n }\n\n\ndef expand_material_filters_with_rollup(selected_uris, all_material_counts):\n \"\"\"\n Expand selected URIs to include all descendant URIs 
(rollup).\n\n Args:\n selected_uris: Set of user-selected material URIs\n all_material_counts: Dict from get_all_material_counts()\n\n Returns:\n Set of URIs including selected + all descendants\n \"\"\"\n expanded = set(selected_uris)\n\n for uri in selected_uris:\n suffix = uri_to_suffix(uri)\n if suffix in SUFFIX_TO_CHILDREN:\n # Add all descendant URIs\n for child_suffix in SUFFIX_TO_CHILDREN[suffix]:\n # Find the full URI for this suffix\n if child_suffix in all_material_counts:\n expanded.add(all_material_counts[child_suffix]['uri'])\n\n return expanded\n\n\n# =============================================================================\n# Facet Widgets\n# =============================================================================\n\n# Get initial counts for facets\nprint(\"Loading facet counts...\")\ninitial_source_counts = get_source_counts()\nall_material_counts = get_all_material_counts() # Get all for hierarchy\ndirect_counts, rollup_counts = get_cached_rollup_counts() # Compute accurate rollup\nyear_stats = get_year_range_stats()\n\nprint(f\"Sources: {list(initial_source_counts.keys())}\")\nprint(f\"Materials: {len(all_material_counts)} types found in data\")\nprint(f\"Years: {year_stats['min_year']} - {year_stats['max_year']}\")\nprint(f\"Decades with data: {sorted(year_stats['decades'].keys())}\")\n\n\ndef build_hierarchical_material_options(material_counts, direct_counts_dict, rollup_counts_dict, use_rollup=True):\n \"\"\"\n Build SelectMultiple options showing material hierarchy with indentation.\n\n Args:\n material_counts: Dict from get_all_material_counts() keyed by suffix (for URIs)\n direct_counts_dict: Dict of {suffix: direct_count}\n rollup_counts_dict: Dict of {suffix: accurate_rollup_count}\n use_rollup: If True, show rollup totals; if False, show direct counts\n\n Returns:\n - options: List of option labels (with indentation and counts)\n - uri_map: Dict mapping option label -> full URI\n \"\"\"\n options = []\n uri_map = {}\n\n # 
Indentation characters\n INDENT = \" \" # 4 spaces per level\n\n for display_name, suffix, level in HIERARCHY_DISPLAY_ORDER:\n if suffix in material_counts:\n data = material_counts[suffix]\n uri = data['uri']\n\n # Choose count based on rollup setting\n if use_rollup:\n count = rollup_counts_dict.get(suffix, direct_counts_dict.get(suffix, 0))\n else:\n count = direct_counts_dict.get(suffix, 0)\n\n # Create indented label\n indent = INDENT * level\n label = f\"{indent}{display_name} ({count:,})\"\n\n options.append(label)\n uri_map[label] = uri\n\n return options, uri_map\n\n\n# Build initial hierarchical material options (with rollup ON by default)\nmaterial_options, material_uri_map = build_hierarchical_material_options(\n all_material_counts, direct_counts, rollup_counts, use_rollup=True\n)\nprint(f\"Hierarchy has {len(material_options)} materials with data\")\n\n\ndef create_source_checkboxes(counts):\n \"\"\"Create checkboxes for source facet.\"\"\"\n checkboxes = []\n for source, count in counts.items():\n cb = widgets.Checkbox(\n value=False,\n description=f\"{source} ({count:,})\",\n indent=False,\n layout=widgets.Layout(width='100%', margin='2px 0')\n )\n cb.source_name = source # Store source name for easy access\n checkboxes.append(cb)\n return checkboxes\n\n\n# Create source checkboxes\nsource_checkboxes = create_source_checkboxes(initial_source_counts)\nsource_facet_box = widgets.VBox(\n source_checkboxes,\n layout=widgets.Layout(max_height='180px', overflow_y='auto', padding='5px')\n)\n\n# Use SelectMultiple for materials - hierarchical display with indentation\nmaterial_select = widgets.SelectMultiple(\n options=material_options,\n value=[],\n rows=12, # More rows to see hierarchy\n description='',\n layout=widgets.Layout(width='100%', height='280px'),\n style={'description_width': '0px'}\n)\n\n# Rollup toggle\nrollup_toggle = widgets.Checkbox(\n value=True,\n description='Include subcategories',\n indent=False,\n layout=widgets.Layout(margin='5px 
0'),\n style={'description_width': 'initial'}\n)\n\n# Refresh counts button\nrefresh_counts_btn = widgets.Button(\n description='',\n button_style='',\n icon='refresh',\n tooltip='Recalculate material counts',\n layout=widgets.Layout(width='32px', height='26px')\n)\n\nmaterial_header = widgets.HBox([\n widgets.HTML(\"Ctrl/Cmd+click to multi-select\"),\n refresh_counts_btn\n], layout=widgets.Layout(justify_content='space-between', align_items='center'))\n\nmaterial_facet_box = widgets.VBox([\n material_header,\n rollup_toggle,\n material_select\n], layout=widgets.Layout(padding='5px'))\n\n\n# =============================================================================\n# Time Facet Widgets (with decade checkboxes)\n# =============================================================================\n\ndef create_decade_checkboxes(decade_counts):\n \"\"\"Create checkboxes for decade quick-select.\"\"\"\n checkboxes = []\n # Sort decades and filter to reasonable range (1800-2030)\n sorted_decades = sorted([d for d in decade_counts.keys() if 1800 <= d <= 2030])\n\n for decade in sorted_decades:\n count = decade_counts.get(decade, 0)\n if count > 0: # Only show decades with data\n cb = widgets.Checkbox(\n value=False,\n description=f\"{decade}s ({count:,})\",\n indent=False,\n layout=widgets.Layout(width='100%', margin='1px 0')\n )\n cb.decade_value = decade # Store decade value for easy access\n checkboxes.append(cb)\n return checkboxes\n\n\ndecade_checkboxes = create_decade_checkboxes(year_stats['decades'])\n\n# Container for decade checkboxes (scrollable if many decades)\ndecade_box = widgets.VBox(\n decade_checkboxes,\n layout=widgets.Layout(max_height='140px', overflow_y='auto', padding='2px')\n)\n\ndecade_label = widgets.HTML(\"Quick select decades:\")\n\nyear_slider = widgets.IntRangeSlider(\n value=[year_stats['min_year'], year_stats['max_year']],\n min=year_stats['min_year'],\n max=year_stats['max_year'],\n step=1,\n description='Range:',\n 
continuous_update=False,\n layout=widgets.Layout(width='100%'),\n style={'description_width': '50px'}\n)\n\n# Enable/disable time filter\ntime_filter_enabled = widgets.Checkbox(\n value=False,\n description='Filter by time',\n indent=False,\n layout=widgets.Layout(margin='5px 0')\n)\n\n# Sync mode: when True, decade checkboxes auto-update the slider\nsync_decades_to_slider = widgets.Checkbox(\n value=True,\n description='Sync decades to slider',\n indent=False,\n layout=widgets.Layout(margin='2px 0'),\n style={'description_width': 'initial'}\n)\n\ntime_facet_box = widgets.VBox([\n time_filter_enabled,\n decade_label,\n decade_box,\n widgets.HTML(\"Fine-tune range:\"),\n year_slider,\n sync_decades_to_slider\n], layout=widgets.Layout(padding='5px'))\n\n\n# Create accordion\nfacet_accordion = widgets.Accordion(\n children=[source_facet_box, material_facet_box, time_facet_box],\n titles=['Sources', 'Material Type', 'Time Period'],\n layout=widgets.Layout(width='100%')\n)\nfacet_accordion.selected_index = None # Start collapsed\n\n# Clear all filters button\nclear_filters_btn = widgets.Button(\n description='Clear All Filters',\n button_style='warning',\n icon='times-circle',\n layout=widgets.Layout(width='100%', margin='10px 0')\n)\n\n\n# =============================================================================\n# Original Widgets (preserved)\n# =============================================================================\n\nsource_filter = widgets.Dropdown(\n options=['All Sources', 'SESAR', 'OPENCONTEXT', 'GEOME', 'SMITHSONIAN'],\n value='All Sources',\n description='Source:',\n style={'description_width': '60px'},\n layout=widgets.Layout(display='none') # Hidden - replaced by facet\n)\n\nsearch_input = widgets.Text(\n value='',\n placeholder='Search label, description, place...',\n description='Search:',\n style={'description_width': '60px'},\n layout=widgets.Layout(width='280px')\n)\n\nsearch_btn = widgets.Button(\n description='',\n button_style='',\n 
icon='search',\n tooltip='Search (or press Enter)',\n layout=widgets.Layout(width='40px')\n)\n\nclear_search_btn = widgets.Button(\n description='',\n button_style='',\n icon='times',\n tooltip='Clear search',\n layout=widgets.Layout(width='40px')\n)\n\nsample_count = widgets.IntSlider(\n value=12500,\n min=1000,\n max=500000, # 500K per source - plenty for 128GB RAM\n step=5000,\n description='Per source:',\n style={'description_width': '80px'}\n)\n\nviewport_toggle = widgets.ToggleButton(\n value=False,\n description='Viewport Mode',\n tooltip='When enabled, automatically loads data for current map view',\n icon='map',\n button_style='' # 'success' when active\n)\n\nrefresh_btn = widgets.Button(\n description='Refresh Data',\n button_style='primary',\n icon='refresh'\n)\n\n# Loading indicator with spinner\nloading_indicator = widgets.HTML(value=\"\")\n\nstatus_label = widgets.HTML(value=f\"Loaded: {len(samples_gdf):,} samples\")\n\ncard_output = widgets.HTML(value=render_sample_card(None))\n\n# Active filters display\nactive_filters_html = widgets.HTML(value=\"\")\n\n\ndef update_active_filters_display():\n \"\"\"Update the display of currently active filters.\"\"\"\n filters = []\n\n if state.source_filters:\n filters.append(f\"Sources: {', '.join(state.source_filters)}\")\n\n if state.material_filters:\n # Get display names from selected options\n mat_names = [uri_to_display_name(uri) for uri in state.material_filters]\n mat_display = mat_names[:2]\n if len(mat_names) > 2:\n mat_display.append(f\"+{len(mat_names)-2} more\")\n rollup_indicator = \" (+sub)\" if state.material_rollup else \"\"\n filters.append(f\"Materials: {', '.join(mat_display)}{rollup_indicator}\")\n\n if state.year_range[0] is not None or state.year_range[1] is not None:\n yr_str = f\"{state.year_range[0] or 'any'} - {state.year_range[1] or 'any'}\"\n # Show selected decades if any\n if state.selected_decades:\n decades_str = ', '.join(f\"{d}s\" for d in sorted(state.selected_decades))\n 
filters.append(f\"Time: {decades_str} ({yr_str})\")\n else:\n filters.append(f\"Years: {yr_str}\")\n\n if filters:\n active_filters_html.value = f\"Active: {''.join(filters)}
\"\n else:\n active_filters_html.value = \"\"\n\n\ndef show_loading(message=\"Loading...\"):\n \"\"\"Show loading indicator.\"\"\"\n state.loading = True\n loading_indicator.value = f\"\"\"\n \n \n \n {message}\n
\n \"\"\"\n\n\ndef hide_loading():\n \"\"\"Hide loading indicator.\"\"\"\n state.loading = False\n loading_indicator.value = \"\"\n\n\ndef select_sample(idx, source='map'):\n \"\"\"\n Select a sample by index and sync map/table/card.\n\n Args:\n idx: Row index in current_gdf\n source: 'map' or 'table' - which triggered the selection\n \"\"\"\n if idx is None or idx >= len(state.current_gdf):\n return\n\n state.selected_index = idx\n state.selected_row = state.current_gdf.iloc[idx]\n\n # Update sample card\n card_output.value = render_sample_card(state.selected_row)\n\n if source == 'map':\n # Map click -> highlight table row\n # Column count depends on whether we're showing search_score\n col_count = 4 if state.current_search else 3\n sample_table.selections = [{'r1': idx, 'c1': 0, 'r2': idx, 'c2': col_count}]\n\n elif source == 'table':\n # Table click -> recenter map (keep current zoom)\n lat = state.selected_row['latitude']\n lon = state.selected_row['longitude']\n if not pd.isna(lat) and not pd.isna(lon):\n sample_map.set_view_state(latitude=float(lat), longitude=float(lon))\n\n\ndef on_map_point_click(change):\n \"\"\"Handle click on a map point - highlight corresponding table row.\"\"\"\n if state.syncing_selection:\n return\n\n idx = change.get('new')\n if idx is None:\n return\n\n state.syncing_selection = True\n try:\n select_sample(idx, source='map')\n finally:\n state.syncing_selection = False\n\n\ndef setup_layer_observer(layer):\n \"\"\"Setup the selected_index observer on a layer.\"\"\"\n layer.observe(on_map_point_click, names=['selected_index'])\n\n\ndef update_map_and_table(new_gdf, search_active=False):\n \"\"\"Update map and table with new data.\"\"\"\n state.current_gdf = new_gdf\n state.current_search = search_input.value.strip() if search_active else \"\"\n\n # Update map with new layer\n new_layer = create_map_layer(new_gdf)\n\n # Setup observer on new layer BEFORE adding to map\n setup_layer_observer(new_layer)\n\n sample_map.layers = 
[new_layer]\n\n # Update table - include score column if searching\n if search_active and 'search_score' in new_gdf.columns:\n display_cols = ['search_score', 'source', 'label', 'latitude', 'longitude']\n df_display = new_gdf[display_cols].copy()\n df_display = df_display.rename(columns={'search_score': 'score'})\n else:\n display_cols = ['source', 'label', 'latitude', 'longitude']\n df_display = new_gdf[display_cols].copy()\n\n df_display['latitude'] = df_display['latitude'].round(4)\n df_display['longitude'] = df_display['longitude'].round(4)\n sample_table.data = df_display\n\n # Update status\n if search_active:\n status_label.value = f\"Found: {len(new_gdf):,} matches for '{state.current_search}'\"\n else:\n status_label.value = f\"Loaded: {len(new_gdf):,} samples\"\n\n # Update active filters display\n update_active_filters_display()\n\n\ndef get_effective_material_filters():\n \"\"\"\n Get the effective set of material URIs to filter by,\n applying rollup expansion if enabled.\n \"\"\"\n if not state.material_filters:\n return None\n\n if state.material_rollup:\n # Expand to include descendants\n return expand_material_filters_with_rollup(state.material_filters, all_material_counts)\n else:\n # Exact match only\n return state.material_filters\n\n\ndef update_material_widget_options():\n \"\"\"\n Update material SelectMultiple options based on rollup toggle.\n Preserves current selection when possible.\n \"\"\"\n global material_uri_map\n\n # Remember current selection (by URI)\n current_uris = set()\n for option_label in material_select.value:\n if option_label in material_uri_map:\n current_uris.add(material_uri_map[option_label])\n\n # Get cached counts\n direct_cts, rollup_cts = get_cached_rollup_counts()\n\n # Rebuild options with new rollup setting\n new_options, new_uri_map = build_hierarchical_material_options(\n all_material_counts, direct_cts, rollup_cts, use_rollup=rollup_toggle.value\n )\n\n # Update the global uri_map\n material_uri_map = 
new_uri_map\n\n # Update widget options\n material_select.options = new_options\n\n # Restore selection (find labels for remembered URIs)\n new_selection = []\n for label, uri in new_uri_map.items():\n if uri in current_uris:\n new_selection.append(label)\n\n material_select.value = tuple(new_selection)\n\n\ndef do_search():\n \"\"\"Execute search with current parameters.\"\"\"\n show_loading(\"Searching...\")\n\n try:\n # Use facet filters instead of single source dropdown\n source_filters_set = state.source_filters if state.source_filters else None\n material_filters_set = get_effective_material_filters() # Apply rollup if enabled\n year_range = state.year_range if (state.year_range[0] is not None or state.year_range[1] is not None) else None\n\n search_term = search_input.value.strip()\n\n if state.viewport_mode:\n # Search within current viewport\n view_state = sample_map.view_state\n zoom = view_state.zoom if hasattr(view_state, 'zoom') else 1\n bbox = view_state_to_bbox(view_state)\n\n # When searching, use slider value directly (no adaptive reduction)\n # When browsing, use adaptive sampling based on zoom\n if search_term:\n max_samples = sample_count.value\n else:\n max_samples = adaptive_sample_size(zoom, base_size=sample_count.value)\n\n new_gdf = load_samples(\n max_per_source=max_samples,\n bbox=bbox,\n search_term=search_term if search_term else None,\n source_filters=source_filters_set,\n material_filters=material_filters_set,\n year_range=year_range\n )\n\n zoom_info = f\" (zoom {zoom:.1f})\"\n else:\n # Search globally\n new_gdf = load_samples(\n max_per_source=sample_count.value,\n search_term=search_term if search_term else None,\n source_filters=source_filters_set,\n material_filters=material_filters_set,\n year_range=year_range\n )\n zoom_info = \"\"\n\n update_map_and_table(new_gdf, search_active=bool(search_term))\n\n if search_term:\n status_label.value = f\"Found: {len(new_gdf):,} matches for '{search_term}'{zoom_info}\"\n else:\n 
status_label.value = f\"Loaded: {len(new_gdf):,} samples{zoom_info}\"\n\n except Exception as e:\n status_label.value = f\"Error: {str(e)[:50]}\"\n import traceback\n traceback.print_exc()\n finally:\n hide_loading()\n\n\ndef on_search_click(b):\n \"\"\"Handle search button click.\"\"\"\n do_search()\n\n\ndef on_search_submit(change):\n \"\"\"Handle Enter key in search box.\"\"\"\n do_search()\n\n\ndef on_clear_search(b):\n \"\"\"Clear search and reload data.\"\"\"\n search_input.value = ''\n do_search()\n\n\nsearch_btn.on_click(on_search_click)\nsearch_input.on_submit(on_search_submit)\nclear_search_btn.on_click(on_clear_search)\n\n\ndef load_viewport_data():\n \"\"\"Load data for current viewport with adaptive sampling.\"\"\"\n if state.loading:\n return\n\n show_loading(\"Loading viewport data...\")\n\n try:\n # Get current view state\n view_state = sample_map.view_state\n zoom = view_state.zoom if hasattr(view_state, 'zoom') else 1\n\n # Calculate bounding box\n bbox = view_state_to_bbox(view_state)\n\n # Get facet filters and search term\n source_filters_set = state.source_filters if state.source_filters else None\n material_filters_set = get_effective_material_filters() # Apply rollup if enabled\n year_range = state.year_range if (state.year_range[0] is not None or state.year_range[1] is not None) else None\n search_term = search_input.value.strip() if search_input.value.strip() else None\n\n # When searching, use slider value directly (no adaptive reduction)\n # When browsing, use adaptive sampling based on zoom\n if search_term:\n max_samples = sample_count.value\n else:\n max_samples = adaptive_sample_size(zoom, base_size=sample_count.value)\n\n # Load data\n new_gdf = load_samples(\n max_per_source=max_samples,\n bbox=bbox,\n search_term=search_term,\n source_filters=source_filters_set,\n material_filters=material_filters_set,\n year_range=year_range\n )\n\n update_map_and_table(new_gdf, search_active=bool(search_term))\n\n # Show zoom info in status\n if 
search_term:\n status_label.value = f\"Found: {len(new_gdf):,} matches for '{search_term}' (zoom {zoom:.1f})\"\n else:\n status_label.value = f\"Loaded: {len(new_gdf):,} samples (zoom {zoom:.1f}, {max_samples:,}/source max)\"\n\n except Exception as e:\n status_label.value = f\"Error: {str(e)[:50]}\"\n finally:\n hide_loading()\n\n\ndef debounced_viewport_load():\n \"\"\"Debounced viewport loading - waits for user to stop panning/zooming.\"\"\"\n # Cancel any existing timer\n if state.debounce_timer is not None:\n state.debounce_timer.cancel()\n\n # Set new timer (500ms delay)\n state.debounce_timer = threading.Timer(0.5, load_viewport_data)\n state.debounce_timer.start()\n\n\ndef on_view_state_change(change):\n \"\"\"Handle map pan/zoom changes.\"\"\"\n if state.viewport_mode and not state.loading:\n debounced_viewport_load()\n\n\ndef on_viewport_toggle(change):\n \"\"\"Handle viewport mode toggle.\"\"\"\n state.viewport_mode = change['new']\n if change['new']:\n viewport_toggle.button_style = 'success'\n viewport_toggle.description = 'Viewport Mode ON'\n # Immediately load viewport data\n load_viewport_data()\n else:\n viewport_toggle.button_style = ''\n viewport_toggle.description = 'Viewport Mode'\n\n\nviewport_toggle.observe(on_viewport_toggle, names=['value'])\n\n\n# Event handlers\ndef on_refresh_click(b):\n do_search() # Refresh now uses same logic as search\n\nrefresh_btn.on_click(on_refresh_click)\n\n\ndef on_table_selection(change):\n \"\"\"Handle table row selection - recenter map on selected point.\"\"\"\n if state.syncing_selection:\n return\n\n # selections is a LIST of selection dicts\n selections = change.get('new', [])\n if selections and len(selections) > 0:\n # Get the first selection\n sel = selections[0]\n row_idx = sel.get('r1')\n if row_idx is not None and row_idx < len(state.current_gdf):\n state.syncing_selection = True\n try:\n select_sample(row_idx, source='table')\n finally:\n state.syncing_selection = 
False\n\nsample_table.observe(on_table_selection, names=['selections'])\n\n# Register view_state observer on the map\nsample_map.observe(on_view_state_change, names=['view_state'])\n\n# Setup observer on initial layer\nsetup_layer_observer(sample_map.layers[0])\n\n\n# =============================================================================\n# Facet Event Handlers\n# =============================================================================\n\ndef on_source_checkbox_change(change):\n \"\"\"Handle source checkbox changes.\"\"\"\n # Rebuild source_filters from all checkboxes\n state.source_filters = set()\n for cb in source_checkboxes:\n if cb.value:\n state.source_filters.add(cb.source_name)\n\n # Trigger data reload\n do_search()\n\n\ndef on_material_select_change(change):\n \"\"\"Handle material selection changes.\"\"\"\n # Convert selected option labels to URIs\n state.material_filters = set()\n for option_label in material_select.value:\n if option_label in material_uri_map:\n state.material_filters.add(material_uri_map[option_label])\n\n # Trigger data reload\n do_search()\n\n\ndef on_rollup_toggle_change(change):\n \"\"\"Handle rollup toggle changes.\"\"\"\n state.material_rollup = rollup_toggle.value\n\n # Update the widget options to show correct counts\n update_material_widget_options()\n\n # If materials are selected, reload with new rollup setting\n if state.material_filters:\n do_search()\n\n\ndef on_refresh_counts_click(b):\n \"\"\"Handle refresh counts button click - recalculate material counts.\"\"\"\n global direct_counts, rollup_counts\n \n show_loading(\"Recalculating counts...\")\n try:\n # Force refresh of counts\n direct_counts, rollup_counts, elapsed = compute_accurate_rollup_counts(force_refresh=True)\n \n # Update widget display\n update_material_widget_options()\n \n status_label.value = f\"Counts refreshed in {elapsed:.1f}s\"\n except Exception as e:\n status_label.value = f\"Error: {str(e)[:50]}\"\n finally:\n hide_loading()\n\n\n# 
Flag to prevent infinite loops when syncing decades/slider\n_syncing_time_widgets = False\n\n\ndef sync_slider_from_decades():\n \"\"\"Update year slider based on selected decades.\"\"\"\n global _syncing_time_widgets\n if _syncing_time_widgets:\n return\n \n _syncing_time_widgets = True\n try:\n if state.selected_decades:\n # Calculate min/max from selected decades\n min_year = min(state.selected_decades)\n max_year = max(state.selected_decades) + 9 # End of decade (e.g., 2010 -> 2019)\n year_slider.value = [min_year, max_year]\n finally:\n _syncing_time_widgets = False\n\n\ndef on_decade_checkbox_change(change):\n \"\"\"Handle decade checkbox changes.\"\"\"\n global _syncing_time_widgets\n if _syncing_time_widgets:\n return\n \n # Rebuild selected_decades from all checkboxes\n state.selected_decades = set()\n for cb in decade_checkboxes:\n if cb.value:\n state.selected_decades.add(cb.decade_value)\n\n # Auto-enable time filter if any decade selected\n if state.selected_decades and not time_filter_enabled.value:\n _syncing_time_widgets = True\n time_filter_enabled.value = True\n _syncing_time_widgets = False\n\n # Sync slider if enabled\n if sync_decades_to_slider.value:\n sync_slider_from_decades()\n\n # Update state year_range from slider (which may have been synced)\n if time_filter_enabled.value:\n state.year_range = (year_slider.value[0], year_slider.value[1])\n else:\n state.year_range = (None, None)\n\n # Trigger data reload\n do_search()\n\n\ndef on_time_filter_change(change):\n \"\"\"Handle time filter enable/disable.\"\"\"\n global _syncing_time_widgets\n if _syncing_time_widgets:\n return\n \n if time_filter_enabled.value:\n state.year_range = (year_slider.value[0], year_slider.value[1])\n else:\n state.year_range = (None, None)\n # Also clear decade selections when disabling time filter\n _syncing_time_widgets = True\n state.selected_decades = set()\n for cb in decade_checkboxes:\n cb.value = False\n _syncing_time_widgets = False\n\n 
do_search()\n\n\ndef on_year_slider_change(change):\n \"\"\"Handle year slider changes.\"\"\"\n global _syncing_time_widgets\n if _syncing_time_widgets:\n return\n \n if time_filter_enabled.value:\n state.year_range = (year_slider.value[0], year_slider.value[1])\n \n # Update decade checkboxes to reflect slider range (if sync enabled)\n if sync_decades_to_slider.value:\n _syncing_time_widgets = True\n min_yr, max_yr = year_slider.value\n state.selected_decades = set()\n for cb in decade_checkboxes:\n decade = cb.decade_value\n # Check if decade overlaps with slider range\n decade_end = decade + 9\n overlaps = (decade <= max_yr) and (decade_end >= min_yr)\n cb.value = overlaps\n if overlaps:\n state.selected_decades.add(decade)\n _syncing_time_widgets = False\n \n do_search()\n\n\ndef on_clear_all_filters(b):\n \"\"\"Clear all facet filters.\"\"\"\n global _syncing_time_widgets\n _syncing_time_widgets = True\n \n # Clear source checkboxes\n for cb in source_checkboxes:\n cb.value = False\n state.source_filters = set()\n\n # Clear material selection\n material_select.value = []\n state.material_filters = set()\n\n # Reset rollup to default (on)\n rollup_toggle.value = True\n state.material_rollup = True\n\n # Clear time filter\n time_filter_enabled.value = False\n for cb in decade_checkboxes:\n cb.value = False\n state.selected_decades = set()\n year_slider.value = [year_stats['min_year'], year_stats['max_year']]\n state.year_range = (None, None)\n\n # Clear search\n search_input.value = ''\n \n _syncing_time_widgets = False\n\n # Reload data\n do_search()\n\n\n# Wire up facet event handlers\nfor cb in source_checkboxes:\n cb.observe(on_source_checkbox_change, names=['value'])\n\nmaterial_select.observe(on_material_select_change, names=['value'])\nrollup_toggle.observe(on_rollup_toggle_change, names=['value'])\nrefresh_counts_btn.on_click(on_refresh_counts_click)\n\n# Wire up decade checkboxes\nfor cb in decade_checkboxes:\n cb.observe(on_decade_checkbox_change, 
names=['value'])\n\ntime_filter_enabled.observe(on_time_filter_change, names=['value'])\nyear_slider.observe(on_year_slider_change, names=['value'])\nclear_filters_btn.on_click(on_clear_all_filters)\n\nprint(\"Facet widgets ready!\")"
- },
- {
- "cell_type": "markdown",
- "metadata": {},
"source": [
- "## Explorer Interface\n",
+ "# State management\n",
+ "class ExplorerState:\n",
+ " def __init__(self):\n",
+ " self.selected_index = None\n",
+ " self.selected_row = None\n",
+ " self.current_gdf = None\n",
+ " self.viewport_mode = False\n",
+ " self.h3_tier_mode = False # use H3 pre-computed tiers at low zoom\n",
+ " self.debounce_timer = None\n",
+ " self.loading = False\n",
+ " self.syncing_selection = False # Prevent infinite loops\n",
+ " self.current_search = \"\" # Current search term\n",
+ " # Facet filter state\n",
+ " self.source_filters = set() # Selected sources (empty = all)\n",
+ " self.material_filters = set() # Selected material URIs (full URIs for filtering)\n",
+ " self.year_range = (None, None) # (min_year, max_year) or None for no filter\n",
+ " self.selected_decades = set() # Selected decades for quick filter\n",
+ " self.material_rollup = True # Whether to include children when parent selected\n",
+ " # Facet counts cache\n",
+ " self.facet_counts_cache = {}\n",
+ " self.facet_cache_time = 0\n",
"\n",
- "Run this cell to launch the interactive explorer."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": "# Layout the interface\n\n# Search box with buttons\nsearch_box = widgets.HBox([\n search_input,\n search_btn,\n clear_search_btn\n], layout=widgets.Layout(margin='0 15px 0 0'))\n\n# Row 1: Search and viewport mode\ncontrols_row1 = widgets.HBox([\n search_box,\n viewport_toggle,\n], layout=widgets.Layout(margin='5px 0'))\n\n# Row 2: Sample count, refresh, status\ncontrols_row2 = widgets.HBox([\n sample_count,\n refresh_btn,\n loading_indicator,\n status_label\n], layout=widgets.Layout(margin='5px 0', flex_wrap='wrap'))\n\n# Row 3: Active filters display\ncontrols_row3 = widgets.HBox([\n active_filters_html\n], layout=widgets.Layout(margin='0'))\n\ncontrols = widgets.VBox([controls_row1, controls_row2, controls_row3])\n\n# Legend\nlegend_html = \"\"\"\n\n SESAR\n OpenContext\n GEOME\n Smithsonian\n
\n\"\"\"\nlegend = widgets.HTML(value=legend_html)\n\n# Facet panel header\nfacet_header = widgets.HTML(value=\"Filters
\")\n\n# Main layout with three columns: map | facets | details\nleft_panel = widgets.VBox([\n widgets.HTML(\"Map
\"),\n legend,\n sample_map\n], layout=widgets.Layout(flex='2', margin='0 10px 0 0'))\n\ncenter_panel = widgets.VBox([\n facet_header,\n facet_accordion,\n clear_filters_btn\n], layout=widgets.Layout(width='320px', min_width='280px', margin='0 10px 0 0'))\n\nright_panel = widgets.VBox([\n widgets.HTML(\"Selected Sample
\"),\n card_output,\n widgets.HTML(\"Sample List
\"),\n sample_table\n], layout=widgets.Layout(flex='1', min_width='350px'))\n\nmain_layout = widgets.HBox([left_panel, center_panel, right_panel])\n\n# Display\ndisplay(widgets.VBox([\n widgets.HTML(\"iSamples Explorer
\"),\n widgets.HTML(\"Interactive exploration of physical samples across scientific domains
\"),\n controls,\n main_layout\n]))"
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": "## Usage\n\n### Faceted Filters (Center Panel)\n\nThe **Filters** panel provides multi-select faceted filtering:\n\n**Sources** - Filter by data source (multi-select):\n- Check one or more sources to show only samples from those sources\n- Counts show total samples per source\n- Unchecking all shows all sources\n\n**Material Type** - Hierarchical filter with rollup:\n- Shows the iSamples 3-level material vocabulary as an indented tree\n- **Indentation shows hierarchy**: Top-level (Material), mid-level (Earth Material), leaf-level (Rock)\n- **Include subcategories** (default ON): Selecting \"Earth Material\" also includes Rock, Sediment, Soil, Mineral, Mixed\n- **Without rollup**: Only samples explicitly tagged at the selected level are shown\n- Use Ctrl/Cmd+click to select multiple materials\n\n**Time Period** - Filter by collection/sampling date:\n- **Filter by time**: Check to enable time filtering\n- **Decade quick-select**: Click decade checkboxes (e.g., \"2010s\") for fast filtering\n - Selecting multiple decades spans them (e.g., 2000s + 2010s = 2000-2019)\n - Auto-enables time filter when any decade is selected\n- **Year range slider**: Fine-tune the exact year range\n- **Sync decades to slider** (default ON): Keeps checkboxes and slider in sync\n - Checking \"2010s\" sets slider to 2010-2019\n - Moving slider updates which decade checkboxes are highlighted\n\n**Clear All Filters** - Reset all facet selections and search\n\n### Material Hierarchy Example\n\nThe material vocabulary has 3 levels. 
With \"Include subcategories\" ON:\n\n```\nSelecting \"Earth Material\" includes:\n → Rock (1M samples)\n → Sediment (66K)\n → Soil (32K)\n → Mineral (300K)\n → Mixed Soil/Sediment/Rock (838K)\n = Total ~2.3M samples\n```\n\nWithout rollup, selecting \"Earth Material\" only shows the ~2.2M samples tagged directly at that level (not the children).\n\n### Time Period Examples\n\n**Quick decade selection:**\n- Click \"2010s\" → Shows samples from 2010-2019\n- Click \"2000s\" AND \"2010s\" → Shows samples from 2000-2019\n- Click \"1990s\", \"2000s\", \"2010s\" → Shows samples from 1990-2019\n\n**Fine-tune with slider:**\n- After selecting decades, adjust slider to narrow further (e.g., 2015-2018)\n- Disable \"Sync decades to slider\" to use slider independently\n\n### Search\n\nSearch filters samples by matching text in **label**, **description**, and **place name** fields:\n\n- **Enter a term**: Type \"pottery\", \"basalt\", \"Cyprus\", etc. and press Enter\n- **Results are ranked**: Label matches (10 pts) > Description (5 pts) > Place name (3 pts)\n- **Score column**: When searching, a \"score\" column appears in the table showing match quality\n- **Combines with facets**: Search works together with facet filters (AND logic)\n- **Viewport aware**: With Viewport Mode ON, search is limited to the current map view\n\n### Selection Sync (Bidirectional)\n\nMap and table selections are synchronized:\n\n- **Click a dot on the map** → The corresponding row is highlighted in the table, and the sample card updates\n- **Click a row in the table** → The map recenters on that point (zoom level is preserved), and the sample card updates\n\nThis makes it easy to explore samples visually on the map and then find them in the table, or vice versa.\n\n### Viewport Mode (Dynamic Loading)\n\nEnable **Viewport Mode** to automatically reload data as you pan and zoom:\n\n- **Toggle ON**: Click the \"Viewport Mode\" button (turns green when active)\n- **Pan/zoom**: Data reloads automatically 
after you stop moving (500ms debounce)\n- **Loading indicator**: Spinner shows while data is being fetched\n- **Adaptive sampling**: \n - World view (zoom < 2): max 10K samples per source\n - Continent (zoom 2-5): max 25K per source\n - Country (zoom 5-8): max 50K per source\n - Region (zoom 8-12): max 100K per source\n - Local (zoom > 12): uses your slider value\n\n### Active Filters Display\n\nWhen filters are active, colored tags appear below the controls showing:\n- **Blue tag**: Active source filters\n- **Green tag**: Active material filters (shows \"+sub\" when rollup is enabled)\n- **Orange tag**: Active time range (shows selected decades if any)\n\n### Filter Combinations\n\nAll filters work together with AND logic:\n- Source + Material: Show pottery samples from OpenContext only\n- Material + Time: Show rock samples collected in the 2010s\n- Source + Time + Search: Find \"Cyprus\" in SESAR samples from 2000-2020\n\n### Color Legend\n- **Blue**: SESAR (geological samples, IGSNs)\n- **Red**: OpenContext (archaeological samples)\n- **Green**: GEOME (genomic/biological samples)\n- **Orange**: Smithsonian (museum collections)"
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Debug: Raw Data Access\n",
+ "state = ExplorerState()\n",
+ "state.current_gdf = samples_gdf\n",
"\n",
- "Use these cells to explore the underlying data."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Current selection\n",
- "if state.selected_row is not None:\n",
- " print(\"Selected sample:\")\n",
- " print(state.selected_row)\n",
- "else:\n",
- " print(\"No sample selected\")"
- ]
- },
- {
+ "\n",
+ "# =============================================================================\n",
+ "# Material Hierarchy Definition\n",
+ "# =============================================================================\n",
+ "\n",
+ "# iSamples material vocabulary hierarchy (3 levels)\n",
+ "# Structure: (display_name, uri_suffix, children_list)\n",
+ "# uri_suffix maps to actual URIs in the data\n",
+ "\n",
+ "MATERIAL_HIERARCHY = [\n",
+ " (\"Material\", \"material\", [\n",
+ " (\"Natural Solid Material\", \"naturalsolidmaterial\", [\n",
+ " (\"Earth Material\", \"earthmaterial\", [\n",
+ " (\"Rock\", \"rock\", []),\n",
+ " (\"Sediment\", \"sediment\", []),\n",
+ " (\"Soil\", \"soil\", []),\n",
+ " (\"Mineral\", \"mineral\", []),\n",
+ " (\"Mixed Soil/Sediment/Rock\", \"mixedsoilsedimentrock\", []),\n",
+ " ]),\n",
+ " (\"Biogenic Non-organic\", \"biogenicnonorganicmaterial\", []),\n",
+ " ]),\n",
+ " (\"Organic Material\", \"organicmaterial\", [\n",
+ " (\"Plant Material\", \"plantmaterial\", []),\n",
+ " (\"Animal Product\", \"organicanimalproduct\", []),\n",
+ " ]),\n",
+ " (\"Anthropogenic Material\", \"anyanthropogenicmaterial\", [\n",
+ " (\"Anthropogenic Metal\", \"anthropogenicmetal\", []),\n",
+ " (\"Ceramic Clay\", \"ceramicclay\", []),\n",
+ " ]),\n",
+ " (\"Fluid Material\", \"fluidmaterial\", [\n",
+ " (\"Liquid Water\", \"liquidwater\", []),\n",
+ " (\"Gas\", \"gas\", []),\n",
+ " (\"Non-aqueous Liquid\", \"nonaqueousliquid\", []),\n",
+ " ]),\n",
+ " (\"Dispersed Media\", \"dispersedmedia\", [\n",
+ " (\"Particulate\", \"particulate\", []),\n",
+ " ]),\n",
+ " (\"Any Ice\", \"anyice\", []),\n",
+ " ])\n",
+ "]\n",
+ "\n",
+ "\n",
+ "def build_hierarchy_mappings():\n",
+ " \"\"\"\n",
+ " Build mappings for the material hierarchy.\n",
+ "\n",
+ " Returns:\n",
+ " - suffix_to_children: dict mapping uri_suffix -> list of all descendant suffixes\n",
+ " - display_order: list of (display_label, uri_suffix, indent_level) in tree order\n",
+ " \"\"\"\n",
+ " suffix_to_children = {}\n",
+ " display_order = []\n",
+ "\n",
+ " def collect_descendants(nodes):\n",
+ " \"\"\"Get all descendant suffixes from a list of hierarchy nodes.\"\"\"\n",
+ " descendants = []\n",
+ " for name, suffix, children in nodes:\n",
+ " descendants.append(suffix)\n",
+ " descendants.extend(collect_descendants(children))\n",
+ " return descendants\n",
+ "\n",
+ " def walk(nodes, level=0):\n",
+ " \"\"\"Walk hierarchy building mappings.\"\"\"\n",
+ " for name, suffix, children in nodes:\n",
+ " # All descendants (not including self)\n",
+ " suffix_to_children[suffix] = collect_descendants(children)\n",
+ "\n",
+ " # Add to display order with indentation\n",
+ " display_order.append((name, suffix, level))\n",
+ "\n",
+ " # Recurse into children\n",
+ " walk(children, level + 1)\n",
+ "\n",
+ " walk(MATERIAL_HIERARCHY)\n",
+ " return suffix_to_children, display_order\n",
+ "\n",
+ "\n",
+ "# Build hierarchy mappings at module load\n",
+ "SUFFIX_TO_CHILDREN, HIERARCHY_DISPLAY_ORDER = build_hierarchy_mappings()\n",
+ "\n",
+ "\n",
+ "# =============================================================================\n",
+ "# Facet Query Functions\n",
+ "# =============================================================================\n",
+ "\n",
+ "import re\n",
+ "import time as time_module\n",
+ "\n",
+ "def uri_to_display_name(uri):\n",
+ " \"\"\"\n",
+ " Convert a vocabulary URI to a human-readable display name.\n",
+ "\n",
+ " Examples:\n",
+ " https://w3id.org/isample/vocabulary/material/1.0/rock -> Rock\n",
+ " https://w3id.org/isample/opencontext/material/0.1/ceramicclay -> Ceramic Clay\n",
+ " \"\"\"\n",
+ " if not uri or not isinstance(uri, str):\n",
+ " return str(uri)\n",
+ "\n",
+ " # Extract last path segment\n",
+ " name = uri.rstrip('/').split('/')[-1]\n",
+ "\n",
+ " # Insert spaces before uppercase letters (camelCase -> Camel Case)\n",
+ " name = re.sub(r'([a-z])([A-Z])', r'\\1 \\2', name)\n",
+ "\n",
+ " # Capitalize first letter of each word\n",
+ " name = name.title()\n",
+ "\n",
+ " return name\n",
+ "\n",
+ "\n",
+ "def uri_to_suffix(uri):\n",
+ " \"\"\"Extract the suffix (last path segment) from a URI, lowercased.\"\"\"\n",
+ " if not uri or not isinstance(uri, str):\n",
+ " return \"\"\n",
+ " return uri.rstrip('/').split('/')[-1].lower()\n",
+ "\n",
+ "\n",
+ "def get_source_counts(additional_filters=None):\n",
+ " \"\"\"\n",
+ " Get counts of samples by source.\n",
+ "\n",
+ " Args:\n",
+ " additional_filters: Dict with material_filters, year_range keys\n",
+ "\n",
+ " Returns:\n",
+ " Dict of {source_name: count}\n",
+ " \"\"\"\n",
+ " where_clause = \"WHERE otype = 'MaterialSampleRecord' AND latitude IS NOT NULL\"\n",
+ "\n",
+ " # Apply year filter if present (cast result_time to TIMESTAMP)\n",
+ " if additional_filters and additional_filters.get('year_range'):\n",
+ " yr = additional_filters['year_range']\n",
+ " if yr[0] is not None and yr[1] is not None:\n",
+ " where_clause += f\" AND EXTRACT(YEAR FROM TRY_CAST(result_time AS TIMESTAMP)) BETWEEN {yr[0]} AND {yr[1]}\"\n",
+ " elif yr[0] is not None:\n",
+ " where_clause += f\" AND EXTRACT(YEAR FROM TRY_CAST(result_time AS TIMESTAMP)) >= {yr[0]}\"\n",
+ " elif yr[1] is not None:\n",
+ " where_clause += f\" AND EXTRACT(YEAR FROM TRY_CAST(result_time AS TIMESTAMP)) <= {yr[1]}\"\n",
+ "\n",
+ " # Note: We don't filter by material here to show all source options\n",
+ " query = f\"\"\"\n",
+ " SELECT n as source, COUNT(*) as count\n",
+ " FROM read_parquet('{PARQUET_PATH}')\n",
+ " {where_clause}\n",
+ " GROUP BY n ORDER BY count DESC\n",
+ " \"\"\"\n",
+ "\n",
+ " result = con.sql(query).df()\n",
+ " return dict(zip(result['source'], result['count']))\n",
+ "\n",
+ "\n",
+ "def get_all_material_counts():\n",
+ " \"\"\"\n",
+ " Get counts for ALL materials in the hierarchy (not just top N).\n",
+ " Used for building the hierarchical display.\n",
+ "\n",
+ " Returns:\n",
+ " Dict of {uri_suffix: {'uri': full_uri, 'count': count}}\n",
+ " \"\"\"\n",
+ " query = f\"\"\"\n",
+ " WITH samples AS (\n",
+ " SELECT msr.row_id, UNNEST(msr.p__has_material_category) as material_id\n",
+ " FROM read_parquet('{PARQUET_PATH}') msr\n",
+ " WHERE msr.otype = 'MaterialSampleRecord' AND msr.latitude IS NOT NULL\n",
+ " )\n",
+ " SELECT ic.label as material_uri, COUNT(*) as count\n",
+ " FROM samples s\n",
+ " JOIN read_parquet('{PARQUET_PATH}') ic ON ic.row_id = s.material_id\n",
+ " WHERE ic.label IS NOT NULL\n",
+ " GROUP BY ic.label\n",
+ " ORDER BY count DESC\n",
+ " \"\"\"\n",
+ "\n",
+ " try:\n",
+ " result = con.sql(query).df()\n",
+ " # Build dict keyed by suffix, but also maintain a scheme-aware\n",
+ " # lookup for rollup expansion. Each suffix entry stores:\n",
+ " # - 'uri': primary URI (highest count)\n",
+ " # - 'count': count for that URI\n",
+ " # - 'all_uris': dict of {scheme_prefix: uri} for all vocabularies\n",
+ " materials = {}\n",
+ " for _, row in result.iterrows():\n",
+ " uri = row['material_uri']\n",
+ " suffix = uri_to_suffix(uri)\n",
+ " if not suffix:\n",
+ " continue\n",
+ " # Extract scheme prefix for disambiguation\n",
+ " parts = uri.rstrip('/').split('/')\n",
+ " scheme = '/'.join(parts[:-2]) if len(parts) > 2 else ''\n",
+ "\n",
+ " if suffix not in materials:\n",
+ " materials[suffix] = {\n",
+ " 'uri': uri,\n",
+ " 'count': row['count'],\n",
+ " 'all_uris': {scheme: uri}\n",
+ " }\n",
+ " else:\n",
+ " materials[suffix]['all_uris'][scheme] = uri\n",
+ " # Keep highest-count URI as primary\n",
+ " if row['count'] > materials[suffix]['count']:\n",
+ " materials[suffix]['uri'] = uri\n",
+ " materials[suffix]['count'] = row['count']\n",
+ " return materials\n",
+ " except Exception as e:\n",
+ " print(f\"Material count error: {e}\")\n",
+ " return {}\n",
+ "\n",
+ "\n",
+ "# =============================================================================\n",
+ "# Accurate Rollup Counts (with caching)\n",
+ "# =============================================================================\n",
+ "\n",
+ "# Cache for accurate rollup counts\n",
+ "_rollup_cache = {\n",
+ " 'direct_counts': {}, # suffix -> direct count\n",
+ " 'rollup_counts': {}, # suffix -> accurate distinct rollup count\n",
+ " 'computed_at': None, # timestamp\n",
+ " 'existing_suffixes': set() # suffixes that exist in data\n",
+ "}\n",
+ "\n",
+ "\n",
+ "def compute_accurate_rollup_counts(force_refresh=False):\n",
+ " \"\"\"\n",
+ " Compute accurate rollup counts using DISTINCT sample counting.\n",
+ " \n",
+ " This avoids double-counting samples tagged at multiple hierarchy levels.\n",
+ " Results are cached for performance.\n",
+ " \n",
+ " Args:\n",
+ " force_refresh: If True, recompute even if cache exists\n",
+ " \n",
+ " Returns:\n",
+ " Tuple of (direct_counts, rollup_counts, elapsed_time)\n",
+ " - direct_counts: Dict of {suffix: count} for samples tagged exactly at that level\n",
+ " - rollup_counts: Dict of {suffix: count} for samples tagged at that level OR any descendant\n",
+ " \"\"\"\n",
+ " global _rollup_cache\n",
+ " \n",
+ " # Return cached results if available and not forcing refresh\n",
+ " if not force_refresh and _rollup_cache['computed_at'] is not None:\n",
+ " return (_rollup_cache['direct_counts'], \n",
+ " _rollup_cache['rollup_counts'], \n",
+ " 0.0)\n",
+ " \n",
+ " print(\"Computing accurate rollup counts (this takes ~3 seconds)...\")\n",
+ " t0 = time_module.time()\n",
+ " \n",
+ " # Build temp table with sample-material pairs\n",
+ " build_query = f\"\"\"\n",
+ " CREATE OR REPLACE TEMP TABLE sample_materials AS\n",
+ " SELECT DISTINCT \n",
+ " msr.row_id as sample_id,\n",
+ " LOWER(SPLIT_PART(ic.label, '/', -1)) as suffix,\n",
+ " ic.label as uri\n",
+ " FROM read_parquet('{PARQUET_PATH}') msr\n",
+ " CROSS JOIN UNNEST(msr.p__has_material_category) AS t(mat_id)\n",
+ " JOIN read_parquet('{PARQUET_PATH}') ic ON ic.row_id = mat_id\n",
+ " WHERE msr.otype = 'MaterialSampleRecord' \n",
+ " AND msr.latitude IS NOT NULL\n",
+ " AND ic.label IS NOT NULL\n",
+ " \"\"\"\n",
+ " con.sql(build_query)\n",
+ " \n",
+ " # Get existing suffixes\n",
+ " suffix_df = con.sql(\"SELECT DISTINCT suffix FROM sample_materials\").df()\n",
+ " existing_suffixes = set(suffix_df['suffix'].tolist())\n",
+ " \n",
+ " # Get direct counts\n",
+ " direct_df = con.sql(\"\"\"\n",
+ " SELECT suffix, COUNT(*) as direct_count\n",
+ " FROM sample_materials\n",
+ " GROUP BY suffix\n",
+ " \"\"\").df()\n",
+ " direct_counts = dict(zip(direct_df['suffix'], direct_df['direct_count']))\n",
+ " \n",
+ " # Compute accurate rollup counts in a single query instead of N+1.\n",
+ " # Build a mapping of parent -> all descendant suffixes, then query once.\n",
+ " rollup_counts = {}\n",
+ " \n",
+ " # First, handle unknown suffixes (not in hierarchy) \u2014 use direct counts\n",
+ " for suffix in existing_suffixes:\n",
+ " if suffix not in SUFFIX_TO_CHILDREN:\n",
+ " rollup_counts[suffix] = direct_counts.get(suffix, 0)\n",
+ " \n",
+ " # For known hierarchy nodes, build all suffix groups and query in batch\n",
+ " hierarchy_suffixes = [s for s in existing_suffixes if s in SUFFIX_TO_CHILDREN]\n",
+ " if hierarchy_suffixes:\n",
+ " # Build a union of all rollup groups\n",
+ " union_parts = []\n",
+ " for suffix in hierarchy_suffixes:\n",
+ " all_suffixes = [suffix] + [s for s in SUFFIX_TO_CHILDREN[suffix] if s in existing_suffixes]\n",
+ " suffixes_sql = \", \".join(f\"'{s}'\" for s in all_suffixes)\n",
+ " union_parts.append(\n",
+ " f\"SELECT \\'{suffix}\\' as parent_suffix, COUNT(DISTINCT sample_id) as cnt \"\n",
+ " f\"FROM sample_materials WHERE suffix IN ({suffixes_sql})\"\n",
+ " )\n",
+ " \n",
+ " batch_query = \" UNION ALL \".join(union_parts)\n",
+ " batch_result = con.sql(batch_query).fetchall()\n",
+ " for parent_suffix, cnt in batch_result:\n",
+ " rollup_counts[parent_suffix] = cnt\n",
+ " \n",
+ " elapsed = time_module.time() - t0\n",
+ " \n",
+ " # Update cache\n",
+ " _rollup_cache['direct_counts'] = direct_counts\n",
+ " _rollup_cache['rollup_counts'] = rollup_counts\n",
+ " _rollup_cache['computed_at'] = time_module.time()\n",
+ " _rollup_cache['existing_suffixes'] = existing_suffixes\n",
+ " \n",
+ " print(f\" Computed {len(rollup_counts)} rollup counts in {elapsed:.2f}s\")\n",
+ " \n",
+ " return direct_counts, rollup_counts, elapsed\n",
+ "\n",
+ "\n",
+ "def get_cached_rollup_counts():\n",
+ " \"\"\"Get cached rollup counts, computing if necessary.\"\"\"\n",
+ " if _rollup_cache['computed_at'] is None:\n",
+ " compute_accurate_rollup_counts()\n",
+ " return _rollup_cache['direct_counts'], _rollup_cache['rollup_counts']\n",
+ "\n",
+ "\n",
+ "def get_material_counts(additional_filters=None, limit=50):\n",
+ " \"\"\"\n",
+ " Get counts of samples by material category (requires join for labels).\n",
+ "\n",
+ " Args:\n",
+ " additional_filters: Dict with source_filters, year_range keys\n",
+ " limit: Max number of materials to return\n",
+ "\n",
+ " Returns:\n",
+ " Dict of {display_name: {'uri': full_uri, 'count': count}}\n",
+ " \"\"\"\n",
+ " where_clause = \"WHERE msr.otype = 'MaterialSampleRecord' AND msr.latitude IS NOT NULL\"\n",
+ "\n",
+ " # Apply source filter if present\n",
+ " if additional_filters and additional_filters.get('source_filters'):\n",
+ " sources_sql = \", \".join(f\"'{s}'\" for s in additional_filters['source_filters'])\n",
+ " where_clause += f\" AND msr.n IN ({sources_sql})\"\n",
+ "\n",
+ " # Apply year filter if present (cast result_time to TIMESTAMP)\n",
+ " if additional_filters and additional_filters.get('year_range'):\n",
+ " yr = additional_filters['year_range']\n",
+ " if yr[0] is not None and yr[1] is not None:\n",
+ " where_clause += f\" AND EXTRACT(YEAR FROM TRY_CAST(msr.result_time AS TIMESTAMP)) BETWEEN {yr[0]} AND {yr[1]}\"\n",
+ " elif yr[0] is not None:\n",
+ " where_clause += f\" AND EXTRACT(YEAR FROM TRY_CAST(msr.result_time AS TIMESTAMP)) >= {yr[0]}\"\n",
+ " elif yr[1] is not None:\n",
+ " where_clause += f\" AND EXTRACT(YEAR FROM TRY_CAST(msr.result_time AS TIMESTAMP)) <= {yr[1]}\"\n",
+ "\n",
+ " query = f\"\"\"\n",
+ " WITH samples AS (\n",
+ " SELECT msr.row_id, UNNEST(msr.p__has_material_category) as material_id\n",
+ " FROM read_parquet('{PARQUET_PATH}') msr\n",
+ " {where_clause}\n",
+ " )\n",
+ " SELECT ic.label as material_uri, COUNT(*) as count\n",
+ " FROM samples s\n",
+ " JOIN read_parquet('{PARQUET_PATH}') ic ON ic.row_id = s.material_id\n",
+ " WHERE ic.label IS NOT NULL\n",
+ " GROUP BY ic.label\n",
+ " ORDER BY count DESC\n",
+ " LIMIT {limit}\n",
+ " \"\"\"\n",
+ "\n",
+ " try:\n",
+ " result = con.sql(query).df()\n",
+ " # Return dict with display name as key, containing uri and count\n",
+ " materials = {}\n",
+ " for _, row in result.iterrows():\n",
+ " uri = row['material_uri']\n",
+ " display_name = uri_to_display_name(uri)\n",
+ " materials[display_name] = {\n",
+ " 'uri': uri,\n",
+ " 'count': row['count']\n",
+ " }\n",
+ " return materials\n",
+ " except Exception as e:\n",
+ " print(f\"Material count error: {e}\")\n",
+ " return {}\n",
+ "\n",
+ "\n",
+ "def get_year_range_stats():\n",
+ " \"\"\"\n",
+ " Get min/max years and decade counts for time facet.\n",
+ "\n",
+ " Returns:\n",
+ " Dict with 'min_year', 'max_year', 'decades' (dict of decade: count)\n",
+ " \"\"\"\n",
+ " # Cast result_time to TIMESTAMP before extracting year\n",
+ " query = f\"\"\"\n",
+ " SELECT\n",
+ " MIN(EXTRACT(YEAR FROM TRY_CAST(result_time AS TIMESTAMP)))::INT as min_year,\n",
+ " MAX(EXTRACT(YEAR FROM TRY_CAST(result_time AS TIMESTAMP)))::INT as max_year\n",
+ " FROM read_parquet('{PARQUET_PATH}')\n",
+ " WHERE otype = 'MaterialSampleRecord'\n",
+ " AND latitude IS NOT NULL\n",
+ " AND result_time IS NOT NULL\n",
+ " AND TRY_CAST(result_time AS TIMESTAMP) IS NOT NULL\n",
+ " \"\"\"\n",
+ "\n",
+ " stats = con.sql(query).df().iloc[0]\n",
+ "\n",
+ " # Get decade counts\n",
+ " decade_query = f\"\"\"\n",
+ " SELECT\n",
+ " (EXTRACT(YEAR FROM TRY_CAST(result_time AS TIMESTAMP))::INT / 10) * 10 as decade,\n",
+ " COUNT(*) as count\n",
+ " FROM read_parquet('{PARQUET_PATH}')\n",
+ " WHERE otype = 'MaterialSampleRecord'\n",
+ " AND latitude IS NOT NULL\n",
+ " AND result_time IS NOT NULL\n",
+ " AND TRY_CAST(result_time AS TIMESTAMP) IS NOT NULL\n",
+ " GROUP BY decade\n",
+ " ORDER BY decade\n",
+ " \"\"\"\n",
+ "\n",
+ " decades_df = con.sql(decade_query).df()\n",
+ " decades = dict(zip(decades_df['decade'].astype(int), decades_df['count']))\n",
+ "\n",
+ " return {\n",
+ " 'min_year': int(stats['min_year']) if pd.notna(stats['min_year']) else 1900,\n",
+ " 'max_year': int(stats['max_year']) if pd.notna(stats['max_year']) else 2025,\n",
+ " 'decades': decades\n",
+ " }\n",
+ "\n",
+ "\n",
+ "def expand_material_filters_with_rollup(selected_uris, all_material_counts):\n",
+ " \"\"\"\n",
+ " Expand selected URIs to include all descendant URIs (rollup).\n",
+ " Uses scheme-aware matching to avoid cross-vocabulary collisions.\n",
+ "\n",
+ " Args:\n",
+ " selected_uris: Set of user-selected material URIs\n",
+ " all_material_counts: Dict from get_all_material_counts()\n",
+ "\n",
+ " Returns:\n",
+ " Set of URIs including selected + all descendants\n",
+ " \"\"\"\n",
+ " expanded = set(selected_uris)\n",
+ "\n",
+ " for uri in selected_uris:\n",
+ " suffix = uri_to_suffix(uri)\n",
+ " # Extract scheme prefix from the selected URI for matching\n",
+ " parts = uri.rstrip('/').split('/')\n",
+ " scheme_prefix = '/'.join(parts[:-2]) if len(parts) > 2 else ''\n",
+ "\n",
+ " if suffix in SUFFIX_TO_CHILDREN:\n",
+ " # Add descendant URIs, preferring same vocabulary scheme\n",
+ " for child_suffix in SUFFIX_TO_CHILDREN[suffix]:\n",
+ " if child_suffix in all_material_counts:\n",
+ " child_data = all_material_counts[child_suffix]\n",
+ " # Try to find child in the same scheme first\n",
+ " if scheme_prefix and 'all_uris' in child_data:\n",
+ " if scheme_prefix in child_data['all_uris']:\n",
+ " expanded.add(child_data['all_uris'][scheme_prefix])\n",
+ " continue\n",
+ " # Fall back to primary URI (highest count)\n",
+ " expanded.add(child_data['uri'])\n",
+ "\n",
+ " return expanded\n",
+ "\n",
+ "\n",
+ "# =============================================================================\n",
+ "# Facet Widgets\n",
+ "# =============================================================================\n",
+ "\n",
+ "# =============================================================================\n",
+ "# Summary-Based Facet Loading (instant startup from 2KB summary file)\n",
+ "# =============================================================================\n",
+ "\n",
+ "def get_facet_from_summaries(facet_type):\n",
+ " \"\"\"Get facet values and counts from the pre-loaded summaries dataframe.\"\"\"\n",
+ " subset = FACET_SUMMARIES_DF[FACET_SUMMARIES_DF['facet_type'] == facet_type]\n",
+ " return dict(zip(subset['facet_value'], subset['count']))\n",
+ "\n",
+ "# Load all facets from summaries (instant - no full parquet scan)\n",
+ "print(\"Loading facets from pre-computed summaries...\")\n",
+ "_t0 = _time.time()\n",
+ "initial_source_counts = get_facet_from_summaries('source')\n",
+ "summary_material_counts = get_facet_from_summaries('material')\n",
+ "summary_context_counts = get_facet_from_summaries('context')\n",
+ "summary_object_type_counts = get_facet_from_summaries('object_type')\n",
+ "_facet_ms = (_time.time() - _t0) * 1000\n",
+ "print(f\"All facets populated in {_facet_ms:.0f} ms\")\n",
+ "print(f\"Sources: {list(initial_source_counts.keys())}\")\n",
+ "print(f\"Materials: {len(summary_material_counts)} types\")\n",
+ "print(f\"Contexts: {len(summary_context_counts)} types\")\n",
+ "print(f\"Object types: {len(summary_object_type_counts)} types\")\n",
+ "\n",
+ "# Defer heavy hierarchy/year queries \u2014 compute on first use, not at startup.\n",
+ "# The summary parquet above gives us instant facet counts; these are only\n",
+ "# needed when the user opens the Material tree or Time accordion.\n",
+ "_lazy = {'materials': None, 'year_stats': None}\n",
+ "\n",
+ "def _ensure_material_hierarchy():\n",
+ " if _lazy['materials'] is None:\n",
+ " print(\"Computing material hierarchy (first use)...\")\n",
+ " _lazy['materials'] = get_all_material_counts()\n",
+ " return _lazy['materials']\n",
+ "\n",
+ "def _ensure_year_stats():\n",
+ " if _lazy['year_stats'] is None:\n",
+ " print(\"Computing year stats (first use)...\")\n",
+ " _lazy['year_stats'] = get_year_range_stats()\n",
+ " return _lazy['year_stats']\n",
+ "\n",
+ "# NOTE: These are computed lazily on first access via _ensure_*.\n",
+ "# We do NOT call them here at startup \u2014 they run when the user\n",
+ "# opens the Material or Time accordion for the first time.\n",
+ "all_material_counts = {}\n",
+ "direct_counts, rollup_counts = {}, {}\n",
+ "year_stats = {'min_year': 1900, 'max_year': 2025, 'decades': {}}\n",
+ "\n",
+ "print(\"Heavy queries deferred until first use (Material/Time accordion open)\")\n",
+ "\n",
+ "\n",
+ "def build_hierarchical_material_options(material_counts, direct_counts_dict, rollup_counts_dict, use_rollup=True):\n",
+ " \"\"\"\n",
+ " Build SelectMultiple options showing material hierarchy with indentation.\n",
+ "\n",
+ " Args:\n",
+ " material_counts: Dict from get_all_material_counts() keyed by suffix (for URIs)\n",
+ " direct_counts_dict: Dict of {suffix: direct_count}\n",
+ " rollup_counts_dict: Dict of {suffix: accurate_rollup_count}\n",
+ " use_rollup: If True, show rollup totals; if False, show direct counts\n",
+ "\n",
+ " Returns:\n",
+ " - options: List of option labels (with indentation and counts)\n",
+ " - uri_map: Dict mapping option label -> full URI\n",
+ " \"\"\"\n",
+ " options = []\n",
+ " uri_map = {}\n",
+ "\n",
+ " # Indentation characters\n",
+ " INDENT = \" \" # 4 spaces per level\n",
+ "\n",
+ " for display_name, suffix, level in HIERARCHY_DISPLAY_ORDER:\n",
+ " if suffix in material_counts:\n",
+ " data = material_counts[suffix]\n",
+ " uri = data['uri']\n",
+ "\n",
+ " # Choose count based on rollup setting\n",
+ " if use_rollup:\n",
+ " count = rollup_counts_dict.get(suffix, direct_counts_dict.get(suffix, 0))\n",
+ " else:\n",
+ " count = direct_counts_dict.get(suffix, 0)\n",
+ "\n",
+ " # Create indented label\n",
+ " indent = INDENT * level\n",
+ " label = f\"{indent}{display_name} ({count:,})\"\n",
+ "\n",
+ " options.append(label)\n",
+ " uri_map[label] = uri\n",
+ "\n",
+ " return options, uri_map\n",
+ "\n",
+ "\n",
+ "# Build initial hierarchical material options (with rollup ON by default)\n",
+ "material_options, material_uri_map = build_hierarchical_material_options(\n",
+ " all_material_counts, direct_counts, rollup_counts, use_rollup=True\n",
+ ")\n",
+ "print(f\"Hierarchy has {len(material_options)} materials with data\")\n",
+ "\n",
+ "\n",
+ "def create_source_checkboxes(counts):\n",
+ " \"\"\"Create checkboxes for source facet.\"\"\"\n",
+ " checkboxes = []\n",
+ " for source, count in counts.items():\n",
+ " cb = widgets.Checkbox(\n",
+ " value=False,\n",
+ " description=f\"{source} ({count:,})\",\n",
+ " indent=False,\n",
+ " layout=widgets.Layout(width='100%', margin='2px 0')\n",
+ " )\n",
+ " cb.source_name = source # Store source name for easy access\n",
+ " checkboxes.append(cb)\n",
+ " return checkboxes\n",
+ "\n",
+ "\n",
+ "# Create source checkboxes\n",
+ "source_checkboxes = create_source_checkboxes(initial_source_counts)\n",
+ "source_facet_box = widgets.VBox(\n",
+ " source_checkboxes,\n",
+ " layout=widgets.Layout(max_height='180px', overflow_y='auto', padding='5px')\n",
+ ")\n",
+ "\n",
+ "# Use SelectMultiple for materials - hierarchical display with indentation\n",
+ "material_select = widgets.SelectMultiple(\n",
+ " options=material_options,\n",
+ " value=[],\n",
+ " rows=12, # More rows to see hierarchy\n",
+ " description='',\n",
+ " layout=widgets.Layout(width='100%', height='280px'),\n",
+ " style={'description_width': '0px'}\n",
+ ")\n",
+ "\n",
+ "# Rollup toggle\n",
+ "rollup_toggle = widgets.Checkbox(\n",
+ " value=True,\n",
+ " description='Include subcategories',\n",
+ " indent=False,\n",
+ " layout=widgets.Layout(margin='5px 0'),\n",
+ " style={'description_width': 'initial'}\n",
+ ")\n",
+ "\n",
+ "# Refresh counts button\n",
+ "refresh_counts_btn = widgets.Button(\n",
+ " description='',\n",
+ " button_style='',\n",
+ " icon='refresh',\n",
+ " tooltip='Recalculate material counts',\n",
+ " layout=widgets.Layout(width='32px', height='26px')\n",
+ ")\n",
+ "\n",
+ "material_header = widgets.HBox([\n",
+ " widgets.HTML(\"Ctrl/Cmd+click to multi-select\"),\n",
+ " refresh_counts_btn\n",
+ "], layout=widgets.Layout(justify_content='space-between', align_items='center'))\n",
+ "\n",
+ "material_facet_box = widgets.VBox([\n",
+ " material_header,\n",
+ " rollup_toggle,\n",
+ " material_select\n",
+ "], layout=widgets.Layout(padding='5px'))\n",
+ "\n",
+ "\n",
+ "# =============================================================================\n",
+ "# Time Facet Widgets (with decade checkboxes)\n",
+ "# =============================================================================\n",
+ "\n",
+ "def create_decade_checkboxes(decade_counts):\n",
+ " \"\"\"Create checkboxes for decade quick-select.\"\"\"\n",
+ " checkboxes = []\n",
+ " # Sort decades and filter to reasonable range (1800-2030)\n",
+ " sorted_decades = sorted([d for d in decade_counts.keys() if 1800 <= d <= 2030])\n",
+ "\n",
+ " for decade in sorted_decades:\n",
+ " count = decade_counts.get(decade, 0)\n",
+ " if count > 0: # Only show decades with data\n",
+ " cb = widgets.Checkbox(\n",
+ " value=False,\n",
+ " description=f\"{decade}s ({count:,})\",\n",
+ " indent=False,\n",
+ " layout=widgets.Layout(width='100%', margin='1px 0')\n",
+ " )\n",
+ " cb.decade_value = decade # Store decade value for easy access\n",
+ " checkboxes.append(cb)\n",
+ " return checkboxes\n",
+ "\n",
+ "\n",
+ "decade_checkboxes = create_decade_checkboxes(year_stats['decades'])\n",
+ "\n",
+ "# Container for decade checkboxes (scrollable if many decades)\n",
+ "decade_box = widgets.VBox(\n",
+ " decade_checkboxes,\n",
+ " layout=widgets.Layout(max_height='140px', overflow_y='auto', padding='2px')\n",
+ ")\n",
+ "\n",
+ "decade_label = widgets.HTML(\"Quick select decades:\")\n",
+ "\n",
+ "year_slider = widgets.IntRangeSlider(\n",
+ " value=[year_stats['min_year'], year_stats['max_year']],\n",
+ " min=year_stats['min_year'],\n",
+ " max=year_stats['max_year'],\n",
+ " step=1,\n",
+ " description='Range:',\n",
+ " continuous_update=False,\n",
+ " layout=widgets.Layout(width='100%'),\n",
+ " style={'description_width': '50px'}\n",
+ ")\n",
+ "\n",
+ "# Enable/disable time filter\n",
+ "time_filter_enabled = widgets.Checkbox(\n",
+ " value=False,\n",
+ " description='Filter by time',\n",
+ " indent=False,\n",
+ " layout=widgets.Layout(margin='5px 0')\n",
+ ")\n",
+ "\n",
+ "# Sync mode: when True, decade checkboxes auto-update the slider\n",
+ "sync_decades_to_slider = widgets.Checkbox(\n",
+ " value=True,\n",
+ " description='Sync decades to slider',\n",
+ " indent=False,\n",
+ " layout=widgets.Layout(margin='2px 0'),\n",
+ " style={'description_width': 'initial'}\n",
+ ")\n",
+ "\n",
+ "time_facet_box = widgets.VBox([\n",
+ " time_filter_enabled,\n",
+ " decade_label,\n",
+ " decade_box,\n",
+ " widgets.HTML(\"Fine-tune range:\"),\n",
+ " year_slider,\n",
+ " sync_decades_to_slider\n",
+ "], layout=widgets.Layout(padding='5px'))\n",
+ "\n",
+ "\n",
+ "# Context dropdown (populated from summaries)\n",
+ "context_options = [('All Contexts', '')] + [\n",
+ " (f\"{val} ({count:,})\", val) for val, count in summary_context_counts.items()\n",
+ "]\n",
+ "context_dropdown = widgets.Dropdown(\n",
+ " options=context_options,\n",
+ " value='',\n",
+ " description='',\n",
+ " layout=widgets.Layout(width='100%')\n",
+ ")\n",
+ "context_facet_box = widgets.VBox([\n",
+ " widgets.HTML(\"Filter by specimen context/setting:\"),\n",
+ " context_dropdown\n",
+ "], layout=widgets.Layout(padding='5px'))\n",
+ "\n",
+ "# Object type dropdown (populated from summaries)\n",
+ "object_type_options = [('All Object Types', '')] + [\n",
+ " (f\"{val} ({count:,})\", val) for val, count in summary_object_type_counts.items()\n",
+ "]\n",
+ "object_type_dropdown = widgets.Dropdown(\n",
+ " options=object_type_options,\n",
+ " value='',\n",
+ " description='',\n",
+ " layout=widgets.Layout(width='100%')\n",
+ ")\n",
+ "object_type_facet_box = widgets.VBox([\n",
+ " widgets.HTML(\"Filter by sample object type:\"),\n",
+ " object_type_dropdown\n",
+ "], layout=widgets.Layout(padding='5px'))\n",
+ "\n",
+ "# Create accordion with all facets\n",
+ "facet_accordion = widgets.Accordion(\n",
+ " children=[source_facet_box, material_facet_box, context_facet_box, object_type_facet_box, time_facet_box],\n",
+ " titles=['Sources', 'Material Type', 'Context', 'Object Type', 'Time Period'],\n",
+ " layout=widgets.Layout(width='100%')\n",
+ ")\n",
+ "facet_accordion.selected_index = None # Start collapsed\n",
+ "\n",
+ "def on_accordion_open(change):\n",
+ " \"\"\"Lazy-load heavy queries when user opens Material or Time accordion.\"\"\"\n",
+ " global all_material_counts, direct_counts, rollup_counts, year_stats\n",
+ " idx = change.get('new')\n",
+ " if idx == 1 and not all_material_counts:\n",
+ " # Material accordion opened for first time\n",
+ " all_material_counts = _ensure_material_hierarchy()\n",
+ " direct_counts, rollup_counts = get_cached_rollup_counts()\n",
+ " update_material_widget_options()\n",
+ " print(f\"Material hierarchy loaded: {len(all_material_counts)} types\")\n",
+ " elif idx == 4 and not year_stats.get('decades'):\n",
+ " # Time accordion opened for first time\n",
+ " year_stats = _ensure_year_stats()\n",
+ " # Rebuild decade checkboxes and slider with real data\n",
+ " new_decade_cbs = create_decade_checkboxes(year_stats['decades'])\n",
+ " for cb in new_decade_cbs:\n",
+ " cb.observe(on_decade_checkbox_change, names=['value'])\n",
+ " decade_box.children = new_decade_cbs\n",
+ " decade_checkboxes.clear()\n",
+ " decade_checkboxes.extend(new_decade_cbs)\n",
+ " year_slider.min = year_stats['min_year']\n",
+ " year_slider.max = year_stats['max_year']\n",
+ " year_slider.value = [year_stats['min_year'], year_stats['max_year']]\n",
+ " print(f\"Year stats loaded: {year_stats['min_year']} - {year_stats['max_year']}, {len(new_decade_cbs)} decades\")\n",
+ "\n",
+ "facet_accordion.observe(on_accordion_open, names=['selected_index'])\n",
+ "\n",
+ "# Clear all filters button\n",
+ "clear_filters_btn = widgets.Button(\n",
+ " description='Clear All Filters',\n",
+ " button_style='warning',\n",
+ " icon='times-circle',\n",
+ " layout=widgets.Layout(width='100%', margin='10px 0')\n",
+ ")\n",
+ "\n",
+ "\n",
+ "# =============================================================================\n",
+ "# Original Widgets (preserved)\n",
+ "# =============================================================================\n",
+ "\n",
+ "source_filter = widgets.Dropdown(\n",
+ " options=['All Sources', 'SESAR', 'OPENCONTEXT', 'GEOME', 'SMITHSONIAN'],\n",
+ " value='All Sources',\n",
+ " description='Source:',\n",
+ " style={'description_width': '60px'},\n",
+ " layout=widgets.Layout(display='none') # Hidden - replaced by facet\n",
+ ")\n",
+ "\n",
+ "search_input = widgets.Text(\n",
+ " value='',\n",
+ " placeholder='Search label, description, place...',\n",
+ " description='Search:',\n",
+ " style={'description_width': '60px'},\n",
+ " layout=widgets.Layout(width='280px')\n",
+ ")\n",
+ "\n",
+ "search_btn = widgets.Button(\n",
+ " description='',\n",
+ " button_style='',\n",
+ " icon='search',\n",
+ " tooltip='Search (or press Enter)',\n",
+ " layout=widgets.Layout(width='40px')\n",
+ ")\n",
+ "\n",
+ "clear_search_btn = widgets.Button(\n",
+ " description='',\n",
+ " button_style='',\n",
+ " icon='times',\n",
+ " tooltip='Clear search',\n",
+ " layout=widgets.Layout(width='40px')\n",
+ ")\n",
+ "\n",
+ "sample_count = widgets.IntSlider(\n",
+ " value=12500,\n",
+ " min=1000,\n",
+ " max=500000, # 500K per source - plenty for 128GB RAM\n",
+ " step=5000,\n",
+ " description='Per source:',\n",
+ " style={'description_width': '80px'}\n",
+ ")\n",
+ "\n",
+ "viewport_toggle = widgets.ToggleButton(\n",
+ " value=False,\n",
+ " description='Viewport Mode',\n",
+ " tooltip='When enabled, automatically loads data for current map view',\n",
+ " icon='map',\n",
+ " button_style='' # 'success' when active\n",
+ ")\n",
+ "\n",
+ "h3_tier_toggle = widgets.ToggleButton(\n",
+ " value=False,\n",
+ " description='H3 Tier Mode',\n",
+ " tooltip='Use pre-computed H3 summaries at low zoom (fast; source facet only)',\n",
+ " button_style='',\n",
+ " icon='layer-group',\n",
+ ")\n",
+ "\n",
+ "def on_h3_tier_toggle(change):\n",
+ " \"\"\"Toggle H3 tier mode. On change, re-run viewport load if active.\"\"\"\n",
+ " state.h3_tier_mode = change['new']\n",
+ " if change['new']:\n",
+ " h3_tier_toggle.button_style = 'info'\n",
+ " h3_tier_toggle.description = 'H3 Tier Mode ON'\n",
+ " else:\n",
+ " h3_tier_toggle.button_style = ''\n",
+ " h3_tier_toggle.description = 'H3 Tier Mode'\n",
+ " # If viewport mode is also active, reload immediately so the user\n",
+ " # sees the effect of flipping the toggle without panning.\n",
+ " if state.viewport_mode:\n",
+ " load_viewport_data()\n",
+ "\n",
+ "h3_tier_toggle.observe(on_h3_tier_toggle, names=['value'])\n",
+ "\n",
+ "\n",
+ "refresh_btn = widgets.Button(\n",
+ " description='Refresh Data',\n",
+ " button_style='primary',\n",
+ " icon='refresh'\n",
+ ")\n",
+ "\n",
+ "# Loading indicator with spinner\n",
+ "loading_indicator = widgets.HTML(value=\"\")\n",
+ "\n",
+ "status_label = widgets.HTML(value=f\"Loaded: {len(samples_gdf):,} samples\")\n",
+ "\n",
+ "card_output = widgets.HTML(value=render_sample_card(None))\n",
+ "\n",
+ "# Active filters display\n",
+ "active_filters_html = widgets.HTML(value=\"\")\n",
+ "\n",
+ "\n",
+ "def update_active_filters_display():\n",
+ " \"\"\"Update the display of currently active filters.\"\"\"\n",
+ " filters = []\n",
+ "\n",
+ " if state.source_filters:\n",
+ " filters.append(f\"Sources: {', '.join(state.source_filters)}\")\n",
+ "\n",
+ " if state.material_filters:\n",
+ " # Get display names from selected options\n",
+ " mat_names = [uri_to_display_name(uri) for uri in state.material_filters]\n",
+ " mat_display = mat_names[:2]\n",
+ " if len(mat_names) > 2:\n",
+ " mat_display.append(f\"+{len(mat_names)-2} more\")\n",
+ " rollup_indicator = \" (+sub)\" if state.material_rollup else \"\"\n",
+ " filters.append(f\"Materials: {', '.join(mat_display)}{rollup_indicator}\")\n",
+ "\n",
+ " if state.year_range[0] is not None or state.year_range[1] is not None:\n",
+ " yr_str = f\"{state.year_range[0] or 'any'} - {state.year_range[1] or 'any'}\"\n",
+ " # Show selected decades if any\n",
+ " if state.selected_decades:\n",
+ " decades_str = ', '.join(f\"{d}s\" for d in sorted(state.selected_decades))\n",
+ " filters.append(f\"Time: {decades_str} ({yr_str})\")\n",
+ " else:\n",
+ " filters.append(f\"Years: {yr_str}\")\n",
+ "\n",
+ " if filters:\n",
+ "        active_filters_html.value = f\"Active: {' | '.join(filters)}\"\n",
+ " else:\n",
+ " active_filters_html.value = \"\"\n",
+ "\n",
+ "\n",
+ "def show_loading(message=\"Loading...\"):\n",
+ " \"\"\"Show loading indicator.\"\"\"\n",
+ " state.loading = True\n",
+ "    loading_indicator.value = f\"\"\"\n",
+ "    <div style='display:flex; align-items:center; color:#555;'>\n",
+ "        <i class='fa fa-spinner fa-spin' style='margin-right:6px;'></i>\n",
+ "        {message}\n",
+ "    </div>\n",
+ "    \"\"\"\n",
+ "\n",
+ "\n",
+ "def hide_loading():\n",
+ " \"\"\"Hide loading indicator.\"\"\"\n",
+ " state.loading = False\n",
+ " loading_indicator.value = \"\"\n",
+ "\n",
+ "\n",
+ "def select_sample(idx, source='map'):\n",
+ " \"\"\"\n",
+ " Select a sample by index and sync map/table/card.\n",
+ "\n",
+ " Args:\n",
+ " idx: Row index in current_gdf\n",
+ " source: 'map' or 'table' - which triggered the selection\n",
+ " \"\"\"\n",
+ " if idx is None or idx >= len(state.current_gdf):\n",
+ " return\n",
+ "\n",
+ " state.selected_index = idx\n",
+ " state.selected_row = state.current_gdf.iloc[idx]\n",
+ "\n",
+ " # Update sample card\n",
+ " card_output.value = render_sample_card(state.selected_row)\n",
+ "\n",
+ " if source == 'map':\n",
+ " # Map click -> highlight table row\n",
+ " # Column count depends on whether we're showing search_score\n",
+ " col_count = 4 if state.current_search else 3\n",
+ " sample_table.selections = [{'r1': idx, 'c1': 0, 'r2': idx, 'c2': col_count}]\n",
+ "\n",
+ " elif source == 'table':\n",
+ " # Table click -> recenter map (keep current zoom)\n",
+ " lat = state.selected_row['latitude']\n",
+ " lon = state.selected_row['longitude']\n",
+ " if not pd.isna(lat) and not pd.isna(lon):\n",
+ " sample_map.set_view_state(latitude=float(lat), longitude=float(lon))\n",
+ "\n",
+ "\n",
+ "def on_map_point_click(change):\n",
+ " \"\"\"Handle click on a map point - highlight corresponding table row.\"\"\"\n",
+ " if state.syncing_selection:\n",
+ " return\n",
+ "\n",
+ " idx = change.get('new')\n",
+ " if idx is None:\n",
+ " return\n",
+ "\n",
+ " state.syncing_selection = True\n",
+ " try:\n",
+ " select_sample(idx, source='map')\n",
+ " finally:\n",
+ " state.syncing_selection = False\n",
+ "\n",
+ "\n",
+ "def setup_layer_observer(layer):\n",
+ " \"\"\"Setup the selected_index observer on a layer.\"\"\"\n",
+ " layer.observe(on_map_point_click, names=['selected_index'])\n",
+ "\n",
+ "\n",
+ "def update_map_and_table(new_gdf, search_active=False):\n",
+ " \"\"\"Update map and table with new data.\"\"\"\n",
+ " state.current_gdf = new_gdf\n",
+ " state.current_search = search_input.value.strip() if search_active else \"\"\n",
+ "\n",
+ " # Update map with new layer\n",
+ " new_layer = create_map_layer(new_gdf)\n",
+ "\n",
+ " # Setup observer on new layer BEFORE adding to map\n",
+ " setup_layer_observer(new_layer)\n",
+ "\n",
+ " sample_map.layers = [new_layer]\n",
+ "\n",
+ " # Update table - include score column if searching\n",
+ " if search_active and 'search_score' in new_gdf.columns:\n",
+ " display_cols = ['search_score', 'source', 'label', 'latitude', 'longitude']\n",
+ " df_display = new_gdf[display_cols].copy()\n",
+ " df_display = df_display.rename(columns={'search_score': 'score'})\n",
+ " else:\n",
+ " display_cols = ['source', 'label', 'latitude', 'longitude']\n",
+ " df_display = new_gdf[display_cols].copy()\n",
+ "\n",
+ " df_display['latitude'] = df_display['latitude'].round(4)\n",
+ " df_display['longitude'] = df_display['longitude'].round(4)\n",
+ " sample_table.data = df_display\n",
+ "\n",
+ " # Update status\n",
+ " if search_active:\n",
+ " status_label.value = f\"Found: {len(new_gdf):,} matches for '{state.current_search}'\"\n",
+ " else:\n",
+ " status_label.value = f\"Loaded: {len(new_gdf):,} samples\"\n",
+ "\n",
+ " # Update active filters display\n",
+ " update_active_filters_display()\n",
+ "\n",
+ "\n",
+ "def get_effective_material_filters():\n",
+ " \"\"\"\n",
+ " Get the effective set of material URIs to filter by,\n",
+ " applying rollup expansion if enabled.\n",
+ " \"\"\"\n",
+ " if not state.material_filters:\n",
+ " return None\n",
+ "\n",
+ " if state.material_rollup:\n",
+ " # Expand to include descendants\n",
+ " return expand_material_filters_with_rollup(state.material_filters, all_material_counts)\n",
+ " else:\n",
+ " # Exact match only\n",
+ " return state.material_filters\n",
+ "\n",
+ "\n",
+ "def update_material_widget_options():\n",
+ " \"\"\"\n",
+ " Update material SelectMultiple options based on rollup toggle.\n",
+ " Preserves current selection when possible.\n",
+ " \"\"\"\n",
+ " global material_uri_map\n",
+ "\n",
+ " # Remember current selection (by URI)\n",
+ " current_uris = set()\n",
+ " for option_label in material_select.value:\n",
+ " if option_label in material_uri_map:\n",
+ " current_uris.add(material_uri_map[option_label])\n",
+ "\n",
+ " # Get cached counts\n",
+ " direct_cts, rollup_cts = get_cached_rollup_counts()\n",
+ "\n",
+ " # Rebuild options with new rollup setting\n",
+ " new_options, new_uri_map = build_hierarchical_material_options(\n",
+ " all_material_counts, direct_cts, rollup_cts, use_rollup=rollup_toggle.value\n",
+ " )\n",
+ "\n",
+ " # Update the global uri_map\n",
+ " material_uri_map = new_uri_map\n",
+ "\n",
+ " # Update widget options\n",
+ " material_select.options = new_options\n",
+ "\n",
+ " # Restore selection (find labels for remembered URIs)\n",
+ " new_selection = []\n",
+ " for label, uri in new_uri_map.items():\n",
+ " if uri in current_uris:\n",
+ " new_selection.append(label)\n",
+ "\n",
+ " material_select.value = tuple(new_selection)\n",
+ "\n",
+ "\n",
+ "def do_search():\n",
+ " \"\"\"Execute search with current parameters.\"\"\"\n",
+ " show_loading(\"Searching...\")\n",
+ "\n",
+ " try:\n",
+ " # Use facet filters instead of single source dropdown\n",
+ " source_filters_set = state.source_filters if state.source_filters else None\n",
+ " material_filters_set = get_effective_material_filters() # Apply rollup if enabled\n",
+ " year_range = state.year_range if (state.year_range[0] is not None or state.year_range[1] is not None) else None\n",
+ "\n",
+ " search_term = search_input.value.strip()\n",
+ "\n",
+ " if state.viewport_mode:\n",
+ " # Search within current viewport\n",
+ " view_state = sample_map.view_state\n",
+ " zoom = view_state.zoom if hasattr(view_state, 'zoom') else 1\n",
+ " bbox = view_state_to_bbox(view_state)\n",
+ "\n",
+ " # When searching, use slider value directly (no adaptive reduction)\n",
+ " # When browsing, use adaptive sampling based on zoom\n",
+ " if search_term:\n",
+ " max_samples = sample_count.value\n",
+ " else:\n",
+ " max_samples = adaptive_sample_size(zoom, base_size=sample_count.value)\n",
+ "\n",
+ " new_gdf = load_samples(\n",
+ " max_per_source=max_samples,\n",
+ " bbox=bbox,\n",
+ " search_term=search_term if search_term else None,\n",
+ " source_filters=source_filters_set,\n",
+ " material_filters=material_filters_set,\n",
+ " year_range=year_range\n",
+ " )\n",
+ "\n",
+ " zoom_info = f\" (zoom {zoom:.1f})\"\n",
+ " else:\n",
+ " # Search globally\n",
+ " new_gdf = load_samples(\n",
+ " max_per_source=sample_count.value,\n",
+ " search_term=search_term if search_term else None,\n",
+ " source_filters=source_filters_set,\n",
+ " material_filters=material_filters_set,\n",
+ " year_range=year_range\n",
+ " )\n",
+ " zoom_info = \"\"\n",
+ "\n",
+ " update_map_and_table(new_gdf, search_active=bool(search_term))\n",
+ "\n",
+ " if search_term:\n",
+ " status_label.value = f\"Found: {len(new_gdf):,} matches for '{search_term}'{zoom_info}\"\n",
+ " else:\n",
+ " status_label.value = f\"Loaded: {len(new_gdf):,} samples{zoom_info}\"\n",
+ "\n",
+ " except Exception as e:\n",
+ " status_label.value = f\"Error: {str(e)[:50]}\"\n",
+ " import traceback\n",
+ " traceback.print_exc()\n",
+ " finally:\n",
+ " hide_loading()\n",
+ "\n",
+ "\n",
+ "def on_search_click(b):\n",
+ " \"\"\"Handle search button click.\"\"\"\n",
+ " do_search()\n",
+ "\n",
+ "\n",
+ "def on_search_submit(change):\n",
+ " \"\"\"Handle Enter key in search box.\"\"\"\n",
+ " do_search()\n",
+ "\n",
+ "\n",
+ "def on_clear_search(b):\n",
+ " \"\"\"Clear search and reload data.\"\"\"\n",
+ " search_input.value = ''\n",
+ " do_search()\n",
+ "\n",
+ "\n",
+ "search_btn.on_click(on_search_click)\n",
+ "search_input.on_submit(on_search_submit)\n",
+ "clear_search_btn.on_click(on_clear_search)\n",
+ "\n",
+ "\n",
+ "def _tier_compatible_with_filters(material_filters, year_range, search_term):\n",
+ " \"\"\"H3 tier files only carry source_count + dominant_source per cell.\n",
+ " When material/year/search filters are active, the crude sampler is the\n",
+ " only accurate path \u2014 fall back to it.\"\"\"\n",
+ " if material_filters:\n",
+ " return False\n",
+ " if year_range and (year_range[0] is not None or year_range[1] is not None):\n",
+ " return False\n",
+ " if search_term:\n",
+ " return False\n",
+ " return True\n",
+ "\n",
+ "\n",
+ "def _make_tier_table_df(tier_df):\n",
+ " \"\"\"Compact DataFrame for the ipydatagrid when showing H3 aggregates.\n",
+ " Columns match the crude-sampler table (source/label/lat/lng) so the\n",
+ " grid doesn't reshape on tier crossings. Returns empty DataFrame with\n",
+ " the same columns if tier_df has no rows (e.g. bbox outside all cells).\n",
+ " \"\"\"\n",
+ " empty_cols = ['source', 'label', 'latitude', 'longitude']\n",
+ " if len(tier_df) == 0:\n",
+ " return pd.DataFrame(columns=empty_cols)\n",
+ " resolution = int(tier_df['resolution'].iloc[0])\n",
+ " out = tier_df[['dominant_source', 'sample_count',\n",
+ " 'center_lat', 'center_lng']].copy()\n",
+ " out = out.rename(columns={\n",
+ " 'dominant_source': 'source',\n",
+ " 'sample_count': 'label', # label slot shows \"N samples\" as a string\n",
+ " 'center_lat': 'latitude',\n",
+ " 'center_lng': 'longitude',\n",
+ " })\n",
+ " out['label'] = out['label'].apply(lambda n: f\"{n:,} samples (H3 res {resolution})\")\n",
+ " out['latitude'] = out['latitude'].round(4)\n",
+ " out['longitude'] = out['longitude'].round(4)\n",
+ " return out[empty_cols]\n",
+ "\n",
+ "\n",
+ "def _update_map_and_table_tier(tier_df, zoom, resolution):\n",
+ " \"\"\"Mirror of update_map_and_table() for aggregate H3 rows.\n",
+ " Handles empty tier_df (e.g. bbox outside all cells, source filter\n",
+ " matching no dominant_source) by clearing the map + table and\n",
+ " showing a 0-cell status instead of raising.\"\"\"\n",
+ " if len(tier_df) == 0:\n",
+ " sample_map.layers = []\n",
+ " sample_table.data = _make_tier_table_df(tier_df) # empty DataFrame\n",
+ " status_label.value = (\n",
+ " f\"H3 tier res {resolution}: 0 cells in viewport \"\n",
+ " f\"(zoom {zoom:.1f}). Pan/zoom or broaden source filter.\"\n",
+ " )\n",
+ " update_active_filters_display()\n",
+ " state.current_gdf = None\n",
+ " return\n",
+ "\n",
+ " new_layer = make_h3_tier_layer(tier_df)\n",
+ " # Setup click observer on the tier layer so selections still fire (the\n",
+ " # click handler tolerates the aggregate table shape via label lookups).\n",
+ " setup_layer_observer(new_layer)\n",
+ " sample_map.layers = [new_layer]\n",
+ " sample_table.data = _make_tier_table_df(tier_df)\n",
+ " # Flag imprecision in status if a source filter is active (tier cells\n",
+ " # are dominant-source; see load_h3_tier() accuracy caveats).\n",
+ " source_warning = \"\"\n",
+ " if state.source_filters:\n",
+ " source_warning = \" \u00b7 \u26a0\ufe0f source filter is dominant-source only\"\n",
+ " status_label.value = (\n",
+ " f\"H3 tier res {resolution}: {len(tier_df):,} cells \u00b7 \"\n",
+ " f\"{int(tier_df['sample_count'].sum()):,} samples \"\n",
+ " f\"(zoom {zoom:.1f}){source_warning}\"\n",
+ " )\n",
+ " update_active_filters_display()\n",
+ " # Mark current_gdf as tier-mode so the table-selection handler can\n",
+ " # refuse to dive into a per-sample card (no per-sample data here).\n",
+ " state.current_gdf = tier_df.assign(_is_tier=True)\n",
+ "\n",
+ "\n",
+ "def load_viewport_data():\n",
+ " \"\"\"Load data for current viewport with adaptive sampling.\"\"\"\n",
+ " if state.loading:\n",
+ " return\n",
+ "\n",
+ " show_loading(\"Loading viewport data...\")\n",
+ "\n",
+ " try:\n",
+ " # Get current view state\n",
+ " view_state = sample_map.view_state\n",
+ " zoom = view_state.zoom if hasattr(view_state, 'zoom') else 1\n",
+ "\n",
+ " # Calculate bounding box\n",
+ " bbox = view_state_to_bbox(view_state)\n",
+ "\n",
+ " # H3-tier branch: if tier mode is on, zoom is in a tier range, and\n",
+ " # no tier-incompatible filter is active, load the aggregate tier\n",
+ " # instead of the crude sampler.\n",
+ " _source_filters_for_tier = state.source_filters if state.source_filters else None\n",
+ " _material_filters_for_tier = get_effective_material_filters()\n",
+ " _year_range_for_tier = state.year_range if (state.year_range[0] is not None or state.year_range[1] is not None) else None\n",
+ " _search_term_for_tier = search_input.value.strip() if search_input.value.strip() else None\n",
+ " _tier_resolution = zoom_to_h3_resolution(zoom)\n",
+ " if (state.h3_tier_mode\n",
+ " and _tier_resolution is not None\n",
+ " and _tier_compatible_with_filters(\n",
+ " _material_filters_for_tier, _year_range_for_tier, _search_term_for_tier)):\n",
+ " tier_df = load_h3_tier(zoom, source_filters=_source_filters_for_tier, bbox=bbox)\n",
+ " _update_map_and_table_tier(tier_df, zoom, _tier_resolution)\n",
+ " return\n",
+ "\n",
+ " # Lite-parquet branch: tier mode is on, zoom is >= 10 (no H3 tier),\n",
+ " # but filters are tier-compatible \u2014 use the 60 MB lite parquet\n",
+ " # instead of the 282 MB wide parquet. Much faster HTTP range scan.\n",
+ " if (state.h3_tier_mode\n",
+ " and _tier_resolution is None\n",
+ " and _tier_compatible_with_filters(\n",
+ " _material_filters_for_tier, _year_range_for_tier, _search_term_for_tier)):\n",
+ " lite_gdf = load_samples_from_lite(\n",
+ " bbox=bbox,\n",
+ " source_filters=_source_filters_for_tier,\n",
+ " max_samples=sample_count.value * 4, # 4 sources \u00d7 per-source cap\n",
+ " )\n",
+ " update_map_and_table(lite_gdf, search_active=False)\n",
+ " status_label.value = (\n",
+ " f\"Lite: {len(lite_gdf):,} samples \"\n",
+ " f\"(zoom {zoom:.1f}, 60 MB lite parquet, no description/material filter)\"\n",
+ " )\n",
+ " return\n",
+ "\n",
+ " # Get facet filters and search term\n",
+ " source_filters_set = state.source_filters if state.source_filters else None\n",
+ " material_filters_set = get_effective_material_filters() # Apply rollup if enabled\n",
+ " year_range = state.year_range if (state.year_range[0] is not None or state.year_range[1] is not None) else None\n",
+ " search_term = search_input.value.strip() if search_input.value.strip() else None\n",
+ "\n",
+ " # When searching, use slider value directly (no adaptive reduction)\n",
+ " # When browsing, use adaptive sampling based on zoom\n",
+ " if search_term:\n",
+ " max_samples = sample_count.value\n",
+ " else:\n",
+ " max_samples = adaptive_sample_size(zoom, base_size=sample_count.value)\n",
+ "\n",
+ " # Load data\n",
+ " new_gdf = load_samples(\n",
+ " max_per_source=max_samples,\n",
+ " bbox=bbox,\n",
+ " search_term=search_term,\n",
+ " source_filters=source_filters_set,\n",
+ " material_filters=material_filters_set,\n",
+ " year_range=year_range\n",
+ " )\n",
+ "\n",
+ " update_map_and_table(new_gdf, search_active=bool(search_term))\n",
+ "\n",
+ " # Show zoom info in status\n",
+ " if search_term:\n",
+ " status_label.value = f\"Found: {len(new_gdf):,} matches for '{search_term}' (zoom {zoom:.1f})\"\n",
+ " else:\n",
+ " status_label.value = f\"Loaded: {len(new_gdf):,} samples (zoom {zoom:.1f}, {max_samples:,}/source max)\"\n",
+ "\n",
+ " except Exception as e:\n",
+ " status_label.value = f\"Error: {str(e)[:50]}\"\n",
+ " finally:\n",
+ " hide_loading()\n",
+ "\n",
+ "\n",
+ "def debounced_viewport_load():\n",
+ " \"\"\"Debounced viewport loading - waits for user to stop panning/zooming.\"\"\"\n",
+ " # Cancel any existing timer\n",
+ " if state.debounce_timer is not None:\n",
+ " state.debounce_timer.cancel()\n",
+ "\n",
+ " # Set new timer (500ms delay)\n",
+ " state.debounce_timer = threading.Timer(0.5, load_viewport_data)\n",
+ " state.debounce_timer.start()\n",
+ "\n",
+ "\n",
+ "def on_view_state_change(change):\n",
+ " \"\"\"Handle map pan/zoom changes.\"\"\"\n",
+ " if state.viewport_mode and not state.loading:\n",
+ " debounced_viewport_load()\n",
+ "\n",
+ "\n",
+ "def on_viewport_toggle(change):\n",
+ " \"\"\"Handle viewport mode toggle.\"\"\"\n",
+ " state.viewport_mode = change['new']\n",
+ " if change['new']:\n",
+ " viewport_toggle.button_style = 'success'\n",
+ " viewport_toggle.description = 'Viewport Mode ON'\n",
+ " # Immediately load viewport data\n",
+ " load_viewport_data()\n",
+ " else:\n",
+ " viewport_toggle.button_style = ''\n",
+ " viewport_toggle.description = 'Viewport Mode'\n",
+ "\n",
+ "\n",
+ "viewport_toggle.observe(on_viewport_toggle, names=['value'])\n",
+ "\n",
+ "\n",
+ "# Event handlers\n",
+ "def on_refresh_click(b):\n",
+ " do_search() # Refresh now uses same logic as search\n",
+ "\n",
+ "refresh_btn.on_click(on_refresh_click)\n",
+ "\n",
+ "\n",
+ "def on_table_selection(change):\n",
+ " \"\"\"Handle table row selection - recenter map on selected point.\"\"\"\n",
+ " if state.syncing_selection:\n",
+ " return\n",
+ "\n",
+ " # selections is a LIST of selection dicts\n",
+ " selections = change.get('new', [])\n",
+ " if selections and len(selections) > 0:\n",
+ " # Get the first selection\n",
+ " sel = selections[0]\n",
+ " row_idx = sel.get('r1')\n",
+ " if row_idx is not None and row_idx < len(state.current_gdf):\n",
+ " state.syncing_selection = True\n",
+ " try:\n",
+ " select_sample(row_idx, source='table')\n",
+ " finally:\n",
+ " state.syncing_selection = False\n",
+ "\n",
+ "sample_table.observe(on_table_selection, names=['selections'])\n",
+ "\n",
+ "# Register view_state observer on the map\n",
+ "sample_map.observe(on_view_state_change, names=['view_state'])\n",
+ "\n",
+ "# Setup observer on initial layer\n",
+ "setup_layer_observer(sample_map.layers[0])\n",
+ "\n",
+ "\n",
+ "# =============================================================================\n",
+ "# Facet Event Handlers\n",
+ "# =============================================================================\n",
+ "\n",
+ "def on_source_checkbox_change(change):\n",
+ " \"\"\"Handle source checkbox changes.\"\"\"\n",
+ " # Rebuild source_filters from all checkboxes\n",
+ " state.source_filters = set()\n",
+ " for cb in source_checkboxes:\n",
+ " if cb.value:\n",
+ " state.source_filters.add(cb.source_name)\n",
+ "\n",
+ " # Trigger data reload\n",
+ " do_search()\n",
+ "\n",
+ "\n",
+ "def on_material_select_change(change):\n",
+ " \"\"\"Handle material selection changes.\"\"\"\n",
+ " # Convert selected option labels to URIs\n",
+ " state.material_filters = set()\n",
+ " for option_label in material_select.value:\n",
+ " if option_label in material_uri_map:\n",
+ " state.material_filters.add(material_uri_map[option_label])\n",
+ "\n",
+ " # Trigger data reload\n",
+ " do_search()\n",
+ "\n",
+ "\n",
+ "def on_rollup_toggle_change(change):\n",
+ " \"\"\"Handle rollup toggle changes.\"\"\"\n",
+ " state.material_rollup = rollup_toggle.value\n",
+ "\n",
+ " # Update the widget options to show correct counts\n",
+ " update_material_widget_options()\n",
+ "\n",
+ " # If materials are selected, reload with new rollup setting\n",
+ " if state.material_filters:\n",
+ " do_search()\n",
+ "\n",
+ "\n",
+ "def on_refresh_counts_click(b):\n",
+ " \"\"\"Handle refresh counts button click - recalculate material counts.\"\"\"\n",
+ " global direct_counts, rollup_counts\n",
+ " \n",
+ " show_loading(\"Recalculating counts...\")\n",
+ " try:\n",
+ " # Force refresh of counts\n",
+ " direct_counts, rollup_counts, elapsed = compute_accurate_rollup_counts(force_refresh=True)\n",
+ " \n",
+ " # Update widget display\n",
+ " update_material_widget_options()\n",
+ " \n",
+ " status_label.value = f\"Counts refreshed in {elapsed:.1f}s\"\n",
+ " except Exception as e:\n",
+ " status_label.value = f\"Error: {str(e)[:50]}\"\n",
+ " finally:\n",
+ " hide_loading()\n",
+ "\n",
+ "\n",
+ "# Flag to prevent infinite loops when syncing decades/slider\n",
+ "_syncing_time_widgets = False\n",
+ "\n",
+ "\n",
+ "def sync_slider_from_decades():\n",
+ " \"\"\"Update year slider based on selected decades.\"\"\"\n",
+ " global _syncing_time_widgets\n",
+ " if _syncing_time_widgets:\n",
+ " return\n",
+ " \n",
+ " _syncing_time_widgets = True\n",
+ " try:\n",
+ " if state.selected_decades:\n",
+ " # Calculate min/max from selected decades\n",
+ " min_year = min(state.selected_decades)\n",
+ " max_year = max(state.selected_decades) + 9 # End of decade (e.g., 2010 -> 2019)\n",
+ " year_slider.value = [min_year, max_year]\n",
+ " finally:\n",
+ " _syncing_time_widgets = False\n",
+ "\n",
+ "\n",
+ "def on_decade_checkbox_change(change):\n",
+ " \"\"\"Handle decade checkbox changes.\"\"\"\n",
+ " global _syncing_time_widgets\n",
+ " if _syncing_time_widgets:\n",
+ " return\n",
+ " \n",
+ " # Rebuild selected_decades from all checkboxes\n",
+ " state.selected_decades = set()\n",
+ " for cb in decade_checkboxes:\n",
+ " if cb.value:\n",
+ " state.selected_decades.add(cb.decade_value)\n",
+ "\n",
+ " # Auto-enable time filter if any decade selected\n",
+ " if state.selected_decades and not time_filter_enabled.value:\n",
+ " _syncing_time_widgets = True\n",
+ " time_filter_enabled.value = True\n",
+ " _syncing_time_widgets = False\n",
+ "\n",
+ " # Sync slider if enabled\n",
+ " if sync_decades_to_slider.value:\n",
+ " sync_slider_from_decades()\n",
+ "\n",
+ " # Update state year_range from slider (which may have been synced)\n",
+ " if time_filter_enabled.value:\n",
+ " state.year_range = (year_slider.value[0], year_slider.value[1])\n",
+ " else:\n",
+ " state.year_range = (None, None)\n",
+ "\n",
+ " # Trigger data reload\n",
+ " do_search()\n",
+ "\n",
+ "\n",
+ "def on_time_filter_change(change):\n",
+ " \"\"\"Handle time filter enable/disable.\"\"\"\n",
+ " global _syncing_time_widgets\n",
+ " if _syncing_time_widgets:\n",
+ " return\n",
+ " \n",
+ " if time_filter_enabled.value:\n",
+ " state.year_range = (year_slider.value[0], year_slider.value[1])\n",
+ " else:\n",
+ " state.year_range = (None, None)\n",
+ " # Also clear decade selections when disabling time filter\n",
+ " _syncing_time_widgets = True\n",
+ " state.selected_decades = set()\n",
+ " for cb in decade_checkboxes:\n",
+ " cb.value = False\n",
+ " _syncing_time_widgets = False\n",
+ "\n",
+ " do_search()\n",
+ "\n",
+ "\n",
+ "def on_year_slider_change(change):\n",
+ " \"\"\"Handle year slider changes.\"\"\"\n",
+ " global _syncing_time_widgets\n",
+ " if _syncing_time_widgets:\n",
+ " return\n",
+ " \n",
+ " if time_filter_enabled.value:\n",
+ " state.year_range = (year_slider.value[0], year_slider.value[1])\n",
+ " \n",
+ " # Update decade checkboxes to reflect slider range (if sync enabled)\n",
+ " if sync_decades_to_slider.value:\n",
+ " _syncing_time_widgets = True\n",
+ " min_yr, max_yr = year_slider.value\n",
+ " state.selected_decades = set()\n",
+ " for cb in decade_checkboxes:\n",
+ " decade = cb.decade_value\n",
+ " # Check if decade overlaps with slider range\n",
+ " decade_end = decade + 9\n",
+ " overlaps = (decade <= max_yr) and (decade_end >= min_yr)\n",
+ " cb.value = overlaps\n",
+ " if overlaps:\n",
+ " state.selected_decades.add(decade)\n",
+ " _syncing_time_widgets = False\n",
+ " \n",
+ " do_search()\n",
+ "\n",
+ "\n",
+ "def on_clear_all_filters(b):\n",
+ " \"\"\"Clear all facet filters.\"\"\"\n",
+ " global _syncing_time_widgets\n",
+ " _syncing_time_widgets = True\n",
+ " \n",
+ " # Clear source checkboxes\n",
+ " for cb in source_checkboxes:\n",
+ " cb.value = False\n",
+ " state.source_filters = set()\n",
+ "\n",
+ " # Clear material selection\n",
+ " material_select.value = []\n",
+ " state.material_filters = set()\n",
+ "\n",
+ " # Reset rollup to default (on)\n",
+ " rollup_toggle.value = True\n",
+ " state.material_rollup = True\n",
+ "\n",
+ " # Clear time filter\n",
+ " time_filter_enabled.value = False\n",
+ " for cb in decade_checkboxes:\n",
+ " cb.value = False\n",
+ " state.selected_decades = set()\n",
+ " year_slider.value = [year_stats['min_year'], year_stats['max_year']]\n",
+ " state.year_range = (None, None)\n",
+ "\n",
+ " # Clear search\n",
+ " search_input.value = ''\n",
+ " \n",
+ " _syncing_time_widgets = False\n",
+ "\n",
+ " # Reload data\n",
+ " do_search()\n",
+ "\n",
+ "\n",
+ "# Wire up facet event handlers\n",
+ "for cb in source_checkboxes:\n",
+ " cb.observe(on_source_checkbox_change, names=['value'])\n",
+ "\n",
+ "material_select.observe(on_material_select_change, names=['value'])\n",
+ "rollup_toggle.observe(on_rollup_toggle_change, names=['value'])\n",
+ "refresh_counts_btn.on_click(on_refresh_counts_click)\n",
+ "\n",
+ "# Wire up decade checkboxes\n",
+ "for cb in decade_checkboxes:\n",
+ " cb.observe(on_decade_checkbox_change, names=['value'])\n",
+ "\n",
+ "time_filter_enabled.observe(on_time_filter_change, names=['value'])\n",
+ "year_slider.observe(on_year_slider_change, names=['value'])\n",
+ "clear_filters_btn.on_click(on_clear_all_filters)\n",
+ "\n",
+ "print(\"Facet widgets ready!\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Explorer Interface\n",
+ "\n",
+ "Run this cell to launch the interactive explorer."
+ ]
+ },
+ {
"cell_type": "code",
"execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-04-24T15:12:53.942836Z",
+ "iopub.status.busy": "2026-04-24T15:12:53.942743Z",
+ "iopub.status.idle": "2026-04-24T15:12:53.951318Z",
+ "shell.execute_reply": "2026-04-24T15:12:53.950918Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Layout the interface\n",
+ "\n",
+ "# Search box with buttons\n",
+ "search_box = widgets.HBox([\n",
+ " search_input,\n",
+ " search_btn,\n",
+ " clear_search_btn\n",
+ "], layout=widgets.Layout(margin='0 15px 0 0'))\n",
+ "\n",
+ "# Row 1: Search and viewport mode\n",
+ "controls_row1 = widgets.HBox([\n",
+ " search_box,\n",
+ " viewport_toggle,\n",
+ " h3_tier_toggle,\n",
+ "], layout=widgets.Layout(margin='5px 0'))\n",
+ "\n",
+ "# Row 2: Sample count, refresh, status\n",
+ "controls_row2 = widgets.HBox([\n",
+ " sample_count,\n",
+ " refresh_btn,\n",
+ " loading_indicator,\n",
+ " status_label\n",
+ "], layout=widgets.Layout(margin='5px 0', flex_wrap='wrap'))\n",
+ "\n",
+ "# Row 3: Active filters display\n",
+ "controls_row3 = widgets.HBox([\n",
+ " active_filters_html\n",
+ "], layout=widgets.Layout(margin='0'))\n",
+ "\n",
+ "controls = widgets.VBox([controls_row1, controls_row2, controls_row3])\n",
+ "\n",
+ "# Legend\n",
+ "legend_html = \"\"\"\n",
+ "\n",
+ " SESAR\n",
+ " OpenContext\n",
+ " GEOME\n",
+ " Smithsonian\n",
+ "
\n",
+ "\"\"\"\n",
+ "legend = widgets.HTML(value=legend_html)\n",
+ "\n",
+ "# Facet panel header\n",
+ "facet_header = widgets.HTML(value=\"Filters
\")\n",
+ "\n",
+ "# Main layout with three columns: map | facets | details\n",
+ "left_panel = widgets.VBox([\n",
+ " widgets.HTML(\"Map
\"),\n",
+ " legend,\n",
+ " sample_map\n",
+ "], layout=widgets.Layout(flex='2', margin='0 10px 0 0'))\n",
+ "\n",
+ "center_panel = widgets.VBox([\n",
+ " facet_header,\n",
+ " facet_accordion,\n",
+ " clear_filters_btn\n",
+ "], layout=widgets.Layout(width='320px', min_width='280px', margin='0 10px 0 0'))\n",
+ "\n",
+ "right_panel = widgets.VBox([\n",
+ " widgets.HTML(\"Selected Sample
\"),\n",
+ " card_output,\n",
+ " widgets.HTML(\"Sample List
\"),\n",
+ " sample_table\n",
+ "], layout=widgets.Layout(flex='1', min_width='350px'))\n",
+ "\n",
+ "main_layout = widgets.HBox([left_panel, center_panel, right_panel])\n",
+ "\n",
+ "# Display\n",
+ "display(widgets.VBox([\n",
+ " widgets.HTML(\"iSamples Explorer
\"),\n",
+ " widgets.HTML(\"Interactive exploration of physical samples across scientific domains
\"),\n",
+ " controls,\n",
+ " main_layout\n",
+ "]))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
"metadata": {},
+ "source": [
+ "## Usage\n",
+ "\n",
+ "### Faceted Filters (Center Panel)\n",
+ "\n",
+ "The **Filters** panel provides multi-select faceted filtering:\n",
+ "\n",
+ "**Sources** - Filter by data source (multi-select):\n",
+ "- Check one or more sources to show only samples from those sources\n",
+ "- Counts show total samples per source\n",
+ "- Unchecking all shows all sources\n",
+ "\n",
+ "**Material Type** - Hierarchical filter with rollup:\n",
+ "- Shows the iSamples 3-level material vocabulary as an indented tree\n",
+ "- **Indentation shows hierarchy**: Top-level (Material), mid-level (Earth Material), leaf-level (Rock)\n",
+ "- **Include subcategories** (default ON): Selecting \"Earth Material\" also includes Rock, Sediment, Soil, Mineral, Mixed\n",
+ "- **Without rollup**: Only samples explicitly tagged at the selected level are shown\n",
+ "- Use Ctrl/Cmd+click to select multiple materials\n",
+ "\n",
+ "**Time Period** - Filter by collection/sampling date:\n",
+ "- **Filter by time**: Check to enable time filtering\n",
+ "- **Decade quick-select**: Click decade checkboxes (e.g., \"2010s\") for fast filtering\n",
+ " - Selecting multiple decades spans them (e.g., 2000s + 2010s = 2000-2019)\n",
+ " - Auto-enables time filter when any decade is selected\n",
+ "- **Year range slider**: Fine-tune the exact year range\n",
+ "- **Sync decades to slider** (default ON): Keeps checkboxes and slider in sync\n",
+ " - Checking \"2010s\" sets slider to 2010-2019\n",
+ " - Moving slider updates which decade checkboxes are highlighted\n",
+ "\n",
+ "**Clear All Filters** - Reset all facet selections and search\n",
+ "\n",
+ "### Material Hierarchy Example\n",
+ "\n",
+ "The material vocabulary has 3 levels. With \"Include subcategories\" ON:\n",
+ "\n",
+ "```\n",
+ "Selecting \"Earth Material\" includes:\n",
+ " \u2192 Rock (1M samples)\n",
+ " \u2192 Sediment (66K)\n",
+ " \u2192 Soil (32K)\n",
+ " \u2192 Mineral (300K)\n",
+ " \u2192 Mixed Soil/Sediment/Rock (838K)\n",
+ " = Total ~2.3M samples\n",
+ "```\n",
+ "\n",
+ "Without rollup, selecting \"Earth Material\" only shows the ~2.2M samples tagged directly at that level (not the children).\n",
+ "\n",
+ "### Time Period Examples\n",
+ "\n",
+ "**Quick decade selection:**\n",
+ "- Click \"2010s\" \u2192 Shows samples from 2010-2019\n",
+ "- Click \"2000s\" AND \"2010s\" \u2192 Shows samples from 2000-2019\n",
+ "- Click \"1990s\", \"2000s\", \"2010s\" \u2192 Shows samples from 1990-2019\n",
+ "\n",
+ "**Fine-tune with slider:**\n",
+ "- After selecting decades, adjust slider to narrow further (e.g., 2015-2018)\n",
+ "- Disable \"Sync decades to slider\" to use slider independently\n",
+ "\n",
+ "### Search\n",
+ "\n",
+ "Search filters samples by matching text in **label**, **description**, and **place name** fields:\n",
+ "\n",
+ "- **Enter a term**: Type \"pottery\", \"basalt\", \"Cyprus\", etc. and press Enter\n",
+ "- **Results are ranked**: Label matches (10 pts) > Description (5 pts) > Place name (3 pts)\n",
+ "- **Score column**: When searching, a \"score\" column appears in the table showing match quality\n",
+ "- **Combines with facets**: Search works together with facet filters (AND logic)\n",
+ "- **Viewport aware**: With Viewport Mode ON, search is limited to the current map view\n",
+ "\n",
+ "### Selection Sync (Bidirectional)\n",
+ "\n",
+ "Map and table selections are synchronized:\n",
+ "\n",
+ "- **Click a dot on the map** \u2192 The corresponding row is highlighted in the table, and the sample card updates\n",
+ "- **Click a row in the table** \u2192 The map recenters on that point (zoom level is preserved), and the sample card updates\n",
+ "\n",
+ "This makes it easy to explore samples visually on the map and then find them in the table, or vice versa.\n",
+ "\n",
+ "### Viewport Mode (Dynamic Loading)\n",
+ "\n",
+ "Enable **Viewport Mode** to automatically reload data as you pan and zoom:\n",
+ "\n",
+ "- **Toggle ON**: Click the \"Viewport Mode\" button (turns green when active)\n",
+ "- **Pan/zoom**: Data reloads automatically after you stop moving (500ms debounce)\n",
+ "- **Loading indicator**: Spinner shows while data is being fetched\n",
+ "- **Adaptive sampling**: \n",
+ " - World view (zoom < 2): max 10K samples per source\n",
+ " - Continent (zoom 2-5): max 25K per source\n",
+ " - Country (zoom 5-8): max 50K per source\n",
+ " - Region (zoom 8-12): max 100K per source\n",
+ " - Local (zoom > 12): uses your slider value\n",
+ "\n",
+ "### Active Filters Display\n",
+ "\n",
+ "When filters are active, colored tags appear below the controls showing:\n",
+ "- **Blue tag**: Active source filters\n",
+ "- **Green tag**: Active material filters (shows \"+sub\" when rollup is enabled)\n",
+ "- **Orange tag**: Active time range (shows selected decades if any)\n",
+ "\n",
+ "### Filter Combinations\n",
+ "\n",
+ "All filters work together with AND logic:\n",
+ "- Source + Material: Show pottery samples from OpenContext only\n",
+ "- Material + Time: Show rock samples collected in the 2010s\n",
+ "- Source + Time + Search: Find \"Cyprus\" in SESAR samples from 2000-2020\n",
+ "\n",
+ "### Color Legend\n",
+ "- **Blue**: SESAR (geological samples, IGSNs)\n",
+ "- **Red**: OpenContext (archaeological samples)\n",
+ "- **Green**: GEOME (genomic/biological samples)\n",
+ "- **Orange**: Smithsonian (museum collections)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Debug: Raw Data Access\n",
+ "\n",
+ "Use these cells to explore the underlying data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-04-24T15:12:53.958129Z",
+ "iopub.status.busy": "2026-04-24T15:12:53.958050Z",
+ "iopub.status.idle": "2026-04-24T15:12:53.959689Z",
+ "shell.execute_reply": "2026-04-24T15:12:53.959396Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Current selection\n",
+ "if state.selected_row is not None:\n",
+ " print(\"Selected sample:\")\n",
+ " print(state.selected_row)\n",
+ "else:\n",
+ " print(\"No sample selected\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-04-24T15:12:53.960476Z",
+ "iopub.status.busy": "2026-04-24T15:12:53.960427Z",
+ "iopub.status.idle": "2026-04-24T15:12:53.979427Z",
+ "shell.execute_reply": "2026-04-24T15:12:53.979092Z"
+ }
+ },
"outputs": [],
"source": [
"# Query the full dataset\n",
@@ -346,20 +2930,209 @@
},
{
"cell_type": "markdown",
- "source": "## Material Type Hierarchy Analysis\n\nThe iSamples material vocabulary is a SKOS hierarchy with **3 levels**. Samples are tagged inconsistently at different levels, and rollup does not happen automatically in the raw data.\n\n### Implementation Status\n\n**Hierarchical Material Facet**: Now implemented in the explorer interface above:\n- **Tree display**: Materials shown with indentation reflecting hierarchy\n- **Rollup toggle**: \"Include subcategories\" checkbox (default ON)\n- **Rollup behavior**: Selecting a parent material includes all descendants in the filter\n\n### Key Findings (from data analysis)\n\n1. **No automatic rollup in data** - \"Earth Material\" (2.2M samples) does NOT include Rock (1M), Sediment (66K), etc. They're tagged separately at different hierarchy levels.\n\n2. **Inconsistent tagging depth** - Some samples tagged at root \"Material\" (664K), some at mid-level \"Earth Material\" (2.2M), some at leaf \"Rock\" (1M).\n\n3. **Intermediate nodes often empty** - \"Natural Solid Material\", \"Fluid Material\", \"Dispersed Media\" have 0 direct tags.\n\n### Pre-computed Hierarchy (January 2026)\n\n```\n- Material: 664,199\n - Natural Solid Material: 0 (not used directly)\n - Earth Material: 2,251,086\n - Rock: 1,052,183\n - Sediment: 66,648\n - Soil: 32,157\n - Mineral: 300,179\n - Mixed Soil/Sediment/Rock: 838,726\n - Biogenic Non-organic: 1,090,222\n - Organic Material: 862,220\n - Plant Material: 1\n - Animal Product: 266\n - Anthropogenic Material: 44,399\n - Anthropogenic Metal: 269,981\n - Ceramic Clay: 100,501\n - Fluid Material: 0 (not used directly)\n - Liquid Water: 24,080\n - Gas: 1,154\n - Non-aqueous Liquid: 44\n - Dispersed Media: 0 (not used directly)\n - Particulate: 122\n - Any Ice: 4\n```\n\nTotal samples with coordinates: ~6M",
- "metadata": {}
+ "metadata": {},
+ "source": [
+ "## Material Type Hierarchy Analysis\n",
+ "\n",
+ "The iSamples material vocabulary is a SKOS hierarchy with **3 levels**. Samples are tagged inconsistently at different levels, and rollup does not happen automatically in the raw data.\n",
+ "\n",
+ "### Implementation Status\n",
+ "\n",
+ "**Hierarchical Material Facet**: Now implemented in the explorer interface above:\n",
+ "- **Tree display**: Materials shown with indentation reflecting hierarchy\n",
+ "- **Rollup toggle**: \"Include subcategories\" checkbox (default ON)\n",
+ "- **Rollup behavior**: Selecting a parent material includes all descendants in the filter\n",
+ "\n",
+ "### Key Findings (from data analysis)\n",
+ "\n",
+ "1. **No automatic rollup in data** - \"Earth Material\" (2.2M samples) does NOT include Rock (1M), Sediment (66K), etc. They're tagged separately at different hierarchy levels.\n",
+ "\n",
+ "2. **Inconsistent tagging depth** - Some samples tagged at root \"Material\" (664K), some at mid-level \"Earth Material\" (2.2M), some at leaf \"Rock\" (1M).\n",
+ "\n",
+ "3. **Intermediate nodes often empty** - \"Natural Solid Material\", \"Fluid Material\", \"Dispersed Media\" have 0 direct tags.\n",
+ "\n",
+ "### Pre-computed Hierarchy (January 2026)\n",
+ "\n",
+ "```\n",
+ "- Material: 664,199\n",
+ " - Natural Solid Material: 0 (not used directly)\n",
+ " - Earth Material: 2,251,086\n",
+ " - Rock: 1,052,183\n",
+ " - Sediment: 66,648\n",
+ " - Soil: 32,157\n",
+ " - Mineral: 300,179\n",
+ " - Mixed Soil/Sediment/Rock: 838,726\n",
+ " - Biogenic Non-organic: 1,090,222\n",
+ " - Organic Material: 862,220\n",
+ " - Plant Material: 1\n",
+ " - Animal Product: 266\n",
+ " - Anthropogenic Material: 44,399\n",
+ " - Anthropogenic Metal: 269,981\n",
+ " - Ceramic Clay: 100,501\n",
+ " - Fluid Material: 0 (not used directly)\n",
+ " - Liquid Water: 24,080\n",
+ " - Gas: 1,154\n",
+ " - Non-aqueous Liquid: 44\n",
+ " - Dispersed Media: 0 (not used directly)\n",
+ " - Particulate: 122\n",
+ " - Any Ice: 4\n",
+ "```\n",
+ "\n",
+ "Total samples with coordinates: ~6M"
+ ]
},
{
"cell_type": "code",
- "source": "# Compute Material Type Hierarchy with Sample Counts\n# \n# This analysis shows how samples are tagged at different levels of the \n# iSamples material vocabulary hierarchy.\n\ndef compute_material_hierarchy():\n \"\"\"\n Compute material type counts and display as a hierarchy tree.\n \n The iSamples material vocabulary is a 3-level SKOS hierarchy.\n This function queries the data to show counts at each level.\n \"\"\"\n # Get all material counts from the data\n query = \"\"\"\n WITH samples AS (\n SELECT UNNEST(p__has_material_category) as material_id\n FROM read_parquet(?)\n WHERE otype = 'MaterialSampleRecord' AND latitude IS NOT NULL\n )\n SELECT ic.label as uri, COUNT(*) as cnt\n FROM samples s\n JOIN read_parquet(?) ic ON ic.row_id = s.material_id\n WHERE ic.label IS NOT NULL\n GROUP BY ic.label\n ORDER BY cnt DESC\n \"\"\"\n df = con.execute(query, [PARQUET_PATH, PARQUET_PATH]).df()\n \n # Build counts dict from URI last segment\n counts = {}\n for _, row in df.iterrows():\n key = row['uri'].rstrip('/').split('/')[-1].lower()\n counts[key] = row['cnt']\n \n # Define the hierarchy structure (based on iSamples vocabulary)\n # https://isamplesorg.github.io/metadata/vocabularies/material.html\n hierarchy = [\n (\"Material\", \"material\", 0, [\n (\"Natural Solid Material\", \"naturalsolidmaterial\", 1, [\n (\"Earth Material\", \"earthmaterial\", 2, [\n (\"Rock\", \"rock\", 3, []),\n (\"Sediment\", \"sediment\", 3, []),\n (\"Soil\", \"soil\", 3, []),\n (\"Mineral\", \"mineral\", 3, []),\n (\"Mixed Soil/Sediment/Rock\", \"mixedsoilsedimentrock\", 3, []),\n ]),\n (\"Biogenic Non-organic\", \"biogenicnonorganicmaterial\", 2, []),\n ]),\n (\"Organic Material\", \"organicmaterial\", 1, [\n (\"Plant Material\", \"plantmaterial\", 2, []),\n (\"Animal Product\", \"organicanimalproduct\", 2, []),\n ]),\n (\"Anthropogenic Material\", \"anyanthropogenicmaterial\", 1, [\n (\"Anthropogenic Metal\", \"anthropogenicmetal\", 2, []),\n (\"Ceramic Clay\", 
\"ceramicclay\", 2, []),\n ]),\n (\"Fluid Material\", \"fluidmaterial\", 1, [\n (\"Liquid Water\", \"liquidwater\", 2, []),\n (\"Gas\", \"gas\", 2, []),\n (\"Non-aqueous Liquid\", \"nonaqueousliquid\", 2, []),\n ]),\n (\"Dispersed Media\", \"dispersedmedia\", 1, [\n (\"Particulate\", \"particulate\", 2, []),\n ]),\n (\"Any Ice\", \"anyice\", 1, []),\n ])\n ]\n \n def print_node(nodes, indent=0):\n \"\"\"Recursively print hierarchy with counts.\"\"\"\n for name, key, level, children in nodes:\n cnt = counts.get(key, 0)\n prefix = \" \" * indent\n marker = \"- \" if indent == 0 else \"└─ \"\n \n # Calculate rollup (what count WOULD be with proper rollup)\n def calc_rollup(node_list):\n total = 0\n for n, k, l, c in node_list:\n total += counts.get(k, 0) + calc_rollup(c)\n return total\n \n rollup = cnt + calc_rollup(children)\n \n if children and rollup != cnt:\n print(f\"{prefix}{marker}**{name}**: {cnt:,} (rollup would be {rollup:,})\")\n else:\n print(f\"{prefix}{marker}**{name}**: {cnt:,}\")\n \n if children:\n print_node(children, indent + 1)\n \n print(\"Material Type Hierarchy with Sample Counts\")\n print(\"=\" * 50)\n print()\n print_node(hierarchy)\n print()\n \n # Summary statistics\n total_tags = sum(counts.values())\n total_samples = con.execute(f\"\"\"\n SELECT COUNT(*) FROM read_parquet('{PARQUET_PATH}')\n WHERE otype = 'MaterialSampleRecord' AND latitude IS NOT NULL\n \"\"\").fetchone()[0]\n \n # Materials per sample distribution\n dist_query = f\"\"\"\n SELECT \n LEN(p__has_material_category) as num_materials,\n COUNT(*) as num_samples\n FROM read_parquet('{PARQUET_PATH}')\n WHERE otype = 'MaterialSampleRecord' \n AND latitude IS NOT NULL\n AND p__has_material_category IS NOT NULL\n GROUP BY LEN(p__has_material_category)\n ORDER BY num_materials\n \"\"\"\n dist_df = con.execute(dist_query).df()\n \n print(\"Summary Statistics\")\n print(\"-\" * 30)\n print(f\"Total samples with coordinates: {total_samples:,}\")\n print(f\"Total material tags: 
{total_tags:,}\")\n print(f\"Unique material types: {len(counts)}\")\n print()\n print(\"Materials per sample:\")\n for _, row in dist_df.iterrows():\n print(f\" {int(row['num_materials'])} material(s): {int(row['num_samples']):,} samples\")\n \n return counts, hierarchy\n\n# Run the analysis\nmaterial_counts, material_hierarchy = compute_material_hierarchy()",
- "metadata": {},
"execution_count": null,
- "outputs": []
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-04-24T15:12:53.980418Z",
+ "iopub.status.busy": "2026-04-24T15:12:53.980367Z",
+ "iopub.status.idle": "2026-04-24T15:12:54.154691Z",
+ "shell.execute_reply": "2026-04-24T15:12:54.154187Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Compute Material Type Hierarchy with Sample Counts\n",
+ "# \n",
+ "# This analysis shows how samples are tagged at different levels of the \n",
+ "# iSamples material vocabulary hierarchy.\n",
+ "\n",
+ "def compute_material_hierarchy():\n",
+ " \"\"\"\n",
+ " Compute material type counts and display as a hierarchy tree.\n",
+ " \n",
+ " The iSamples material vocabulary is a 3-level SKOS hierarchy.\n",
+ " This function queries the data to show counts at each level.\n",
+ " \"\"\"\n",
+ " # Get all material counts from the data\n",
+ " query = \"\"\"\n",
+ " WITH samples AS (\n",
+ " SELECT UNNEST(p__has_material_category) as material_id\n",
+ " FROM read_parquet(?)\n",
+ " WHERE otype = 'MaterialSampleRecord' AND latitude IS NOT NULL\n",
+ " )\n",
+ " SELECT ic.label as uri, COUNT(*) as cnt\n",
+ " FROM samples s\n",
+ " JOIN read_parquet(?) ic ON ic.row_id = s.material_id\n",
+ " WHERE ic.label IS NOT NULL\n",
+ " GROUP BY ic.label\n",
+ " ORDER BY cnt DESC\n",
+ " \"\"\"\n",
+ " df = con.execute(query, [PARQUET_PATH, PARQUET_PATH]).df()\n",
+ " \n",
+ " # Build counts dict from URI last segment\n",
+ " counts = {}\n",
+ " for _, row in df.iterrows():\n",
+ " key = row['uri'].rstrip('/').split('/')[-1].lower()\n",
+ " counts[key] = row['cnt']\n",
+ " \n",
+ " # Define the hierarchy structure (based on iSamples vocabulary)\n",
+ " # https://isamplesorg.github.io/metadata/vocabularies/material.html\n",
+ " hierarchy = [\n",
+ " (\"Material\", \"material\", 0, [\n",
+ " (\"Natural Solid Material\", \"naturalsolidmaterial\", 1, [\n",
+ " (\"Earth Material\", \"earthmaterial\", 2, [\n",
+ " (\"Rock\", \"rock\", 3, []),\n",
+ " (\"Sediment\", \"sediment\", 3, []),\n",
+ " (\"Soil\", \"soil\", 3, []),\n",
+ " (\"Mineral\", \"mineral\", 3, []),\n",
+ " (\"Mixed Soil/Sediment/Rock\", \"mixedsoilsedimentrock\", 3, []),\n",
+ " ]),\n",
+ " (\"Biogenic Non-organic\", \"biogenicnonorganicmaterial\", 2, []),\n",
+ " ]),\n",
+ " (\"Organic Material\", \"organicmaterial\", 1, [\n",
+ " (\"Plant Material\", \"plantmaterial\", 2, []),\n",
+ " (\"Animal Product\", \"organicanimalproduct\", 2, []),\n",
+ " ]),\n",
+ " (\"Anthropogenic Material\", \"anyanthropogenicmaterial\", 1, [\n",
+ " (\"Anthropogenic Metal\", \"anthropogenicmetal\", 2, []),\n",
+ " (\"Ceramic Clay\", \"ceramicclay\", 2, []),\n",
+ " ]),\n",
+ " (\"Fluid Material\", \"fluidmaterial\", 1, [\n",
+ " (\"Liquid Water\", \"liquidwater\", 2, []),\n",
+ " (\"Gas\", \"gas\", 2, []),\n",
+ " (\"Non-aqueous Liquid\", \"nonaqueousliquid\", 2, []),\n",
+ " ]),\n",
+ " (\"Dispersed Media\", \"dispersedmedia\", 1, [\n",
+ " (\"Particulate\", \"particulate\", 2, []),\n",
+ " ]),\n",
+ " (\"Any Ice\", \"anyice\", 1, []),\n",
+ " ])\n",
+ " ]\n",
+ " \n",
+ " def print_node(nodes, indent=0):\n",
+ " \"\"\"Recursively print hierarchy with counts.\"\"\"\n",
+ " for name, key, level, children in nodes:\n",
+ " cnt = counts.get(key, 0)\n",
+ " prefix = \" \" * indent\n",
+ " marker = \"- \" if indent == 0 else \"\u2514\u2500 \"\n",
+ " \n",
+ " # Calculate rollup (what count WOULD be with proper rollup)\n",
+ " def calc_rollup(node_list):\n",
+ " total = 0\n",
+ " for n, k, l, c in node_list:\n",
+ " total += counts.get(k, 0) + calc_rollup(c)\n",
+ " return total\n",
+ " \n",
+ " rollup = cnt + calc_rollup(children)\n",
+ " \n",
+ " if children and rollup != cnt:\n",
+ " print(f\"{prefix}{marker}**{name}**: {cnt:,} (rollup would be {rollup:,})\")\n",
+ " else:\n",
+ " print(f\"{prefix}{marker}**{name}**: {cnt:,}\")\n",
+ " \n",
+ " if children:\n",
+ " print_node(children, indent + 1)\n",
+ " \n",
+ " print(\"Material Type Hierarchy with Sample Counts\")\n",
+ " print(\"=\" * 50)\n",
+ " print()\n",
+ " print_node(hierarchy)\n",
+ " print()\n",
+ " \n",
+ " # Summary statistics\n",
+ " total_tags = sum(counts.values())\n",
+ " total_samples = con.execute(f\"\"\"\n",
+ " SELECT COUNT(*) FROM read_parquet('{PARQUET_PATH}')\n",
+ " WHERE otype = 'MaterialSampleRecord' AND latitude IS NOT NULL\n",
+ " \"\"\").fetchone()[0]\n",
+ " \n",
+ " # Materials per sample distribution\n",
+ " dist_query = f\"\"\"\n",
+ " SELECT \n",
+ " LEN(p__has_material_category) as num_materials,\n",
+ " COUNT(*) as num_samples\n",
+ " FROM read_parquet('{PARQUET_PATH}')\n",
+ " WHERE otype = 'MaterialSampleRecord' \n",
+ " AND latitude IS NOT NULL\n",
+ " AND p__has_material_category IS NOT NULL\n",
+ " GROUP BY LEN(p__has_material_category)\n",
+ " ORDER BY num_materials\n",
+ " \"\"\"\n",
+ " dist_df = con.execute(dist_query).df()\n",
+ " \n",
+ " print(\"Summary Statistics\")\n",
+ " print(\"-\" * 30)\n",
+ " print(f\"Total samples with coordinates: {total_samples:,}\")\n",
+ " print(f\"Total material tags: {total_tags:,}\")\n",
+ " print(f\"Unique material types: {len(counts)}\")\n",
+ " print()\n",
+ " print(\"Materials per sample:\")\n",
+ " for _, row in dist_df.iterrows():\n",
+ " print(f\" {int(row['num_materials'])} material(s): {int(row['num_samples']):,} samples\")\n",
+ " \n",
+ " return counts, hierarchy\n",
+ "\n",
+ "# Run the analysis\n",
+ "material_counts, material_hierarchy = compute_material_hierarchy()"
+ ]
}
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "isamples-python-3.12.9",
"language": "python",
"name": "python3"
},
@@ -373,9 +3146,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.12.9"
+ "version": "3.13.11"
}
},
"nbformat": 4,
"nbformat_minor": 4
-}
\ No newline at end of file
+}
diff --git a/examples/basic/pqg_demo.ipynb b/examples/basic/pqg_demo.ipynb
index 449011f..2d0effe 100644
--- a/examples/basic/pqg_demo.ipynb
+++ b/examples/basic/pqg_demo.ipynb
@@ -547,6 +547,33 @@
" print(f\"{material:<50} {count:>10,}\")"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Same distribution, with human-readable labels\n",
+ "\n",
+ "The `material.label` field above currently holds the concept URI (this is\n",
+ "the [#148](https://github.com/isamplesorg/isamplesorg.github.io/issues/148)\n",
+ "data-quality issue). We can join to the canonical `vocab_labels.parquet`\n",
+ "lookup to render prefLabels client-side.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from vocab_labels import load_vocab_labels, pretty_label\n",
+ "\n",
+ "labels = load_vocab_labels()\n",
+ "\n",
+ "pretty_counts = {pretty_label(uri, labels): n for uri, n in material_counts.items()}\n",
+ "for label, n in sorted(pretty_counts.items(), key=lambda kv: -kv[1])[:10]:\n",
+ " print(f' {n:>6,d} {label}')\n"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -831,6 +858,230 @@
"- **Example script:** `pqg/examples/typed_edges_demo.py`\n",
"- **Tests:** `pqg/tests/test_typed_edges.py`"
]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "wide-shortcut-header",
+ "metadata": {},
+ "source": [
+ "## Wide Format Shortcut: H3 Spatial Queries\n",
+ "\n",
+ "The narrow (graph) format requires multi-hop traversals to answer spatial questions:\n",
+ "\n",
+ "```\n",
+ "Sample --produced_by--> SamplingEvent --sample_location--> GeospatialCoordLocation\n",
+ "```\n",
+ "\n",
+ "The **wide format with H3 columns** flattens these joins into a single row per sample,\n",
+ "with `latitude`, `longitude`, and pre-computed `h3_res4/6/8` columns. This enables\n",
+ "spatial queries in a single scan \u2014 no graph traversal needed.\n",
+ "\n",
+ "Below we compare both approaches for the same query: **materials in the San Francisco Bay Area**."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "wide-shortcut-setup",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import duckdb\n",
+ "import time\n",
+ "\n",
+ "WIDE_H3_URL = \"https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide_h3.parquet\"\n",
+ "\n",
+ "con_wide = duckdb.connect()\n",
+ "con_wide.execute(\"INSTALL h3 FROM community; LOAD h3;\")\n",
+ "\n",
+ "# Bay Area bounding box\n",
+ "BBOX = dict(min_lat=37.2, max_lat=37.9, min_lon=-122.6, max_lon=-121.8)\n",
+ "\n",
+ "print(\"Wide format connection ready with H3 extension.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "wide-graph-approach",
+ "metadata": {},
+ "source": [
+ "### Approach 1: Graph Traversal (Narrow Format)\n",
+ "\n",
+ "Walk the edge chain: `MaterialSampleRecord -> produced_by -> SamplingEvent -> sample_location\n",
+ "-> GeospatialCoordLocation`, then filter by lat/lon bounds."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "wide-graph-code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Multi-hop graph traversal for Bay Area samples\n",
+ "# This uses the PQG narrow-format approach through typed edges\n",
+ "t0 = time.time()\n",
+ "\n",
+ "try:\n",
+ " graph_samples = []\n",
+ " for subject_pid, pred, objects, ng, et in typed_queries.get_edges_by_type(\n",
+ " ISamplesEdgeType.MSR_PRODUCED_BY, limit=50000\n",
+ " ):\n",
+ " # For each sample -> event, find event -> location\n",
+ " for obj_pid in objects:\n",
+ " for s2, p2, objs2, ng2, et2 in typed_queries.get_edges_by_type(\n",
+ " ISamplesEdgeType.EVENT_SAMPLE_LOCATION, limit=5\n",
+ " ):\n",
+                    "                # NOTE: a full implementation would match obj_pid to s2\n",
+                    "                # and extract coordinates from the location node; this\n",
+                    "                # loop is pseudocode - prefer the wide-format shortcut below.\n",
+                    "                pass  # required: a loop body of only comments is a SyntaxError\n",
+ " graph_ms = (time.time() - t0) * 1000\n",
+ " print(f\"Graph traversal (50K sample limit): {graph_ms:.0f} ms\")\n",
+ "except Exception as e:\n",
+ " graph_ms = (time.time() - t0) * 1000\n",
+ " print(f\"Graph traversal attempted: {graph_ms:.0f} ms\")\n",
+ " print(f\"(PQG typed edges require narrow-format parquet loaded above)\")\n",
+ "\n",
+ "print(\"\\nGraph traversal requires:\")\n",
+ "print(\" 1. Find Sample -> SamplingEvent edges (produced_by)\")\n",
+ "print(\" 2. For each event, find Event -> Location edges (sample_location)\")\n",
+ "print(\" 3. Extract lat/lon from Location nodes\")\n",
+ "print(\" 4. Filter by bounding box\")\n",
+ "print(\" = Multiple hops through millions of edges\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "wide-h3-approach",
+ "metadata": {},
+ "source": [
+ "### Approach 2: H3 Spatial Shortcut (Wide Format)\n",
+ "\n",
+ "Direct query on the wide format file \u2014 latitude, longitude, and H3 cells are\n",
+ "pre-joined columns. No traversal needed."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "wide-h3-code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import h3\n",
+ "\n",
+ "# H3-accelerated spatial query on wide format\n",
+ "\n",
+ "# Step 1: Compute covering res4 cells mathematically (no data scan).\n",
+ "# This uses the h3 Python library \u2014 pure geometry, O(1) relative to data size.\n",
+ "t0 = time.time()\n",
+ "bbox_polygon = h3.LatLngPoly([\n",
+ " (BBOX['min_lat'], BBOX['min_lon']),\n",
+ " (BBOX['min_lat'], BBOX['max_lon']),\n",
+ " (BBOX['max_lat'], BBOX['max_lon']),\n",
+ " (BBOX['max_lat'], BBOX['min_lon']),\n",
+ "])\n",
+ "covering = h3.geo_to_cells(bbox_polygon, res=4)\n",
+ "# Convert to signed int64 to match DuckDB BIGINT storage\n",
+ "def h3_to_signed(cell_hex):\n",
+ " val = h3.str_to_int(cell_hex)\n",
+ " return val if val < 2**63 else val - 2**64\n",
+ "\n",
+ "cell_list = ', '.join(str(h3_to_signed(c)) for c in covering)\n",
+ "print(f'Bbox covered by {len(covering)} res4 cells')\n",
+ "\n",
+ "if not covering:\n",
+ " print('No H3 cells cover this bbox.')\n",
+ " result = None\n",
+ " h3_ms = 0\n",
+ "else:\n",
+ " # Step 2: Query with H3 pre-filter + exact bbox\n",
+ " result = con_wide.sql(f\"\"\"\n",
+ " SELECT\n",
+ " n AS source,\n",
+ " COUNT(*) AS sample_count,\n",
+ " COUNT(DISTINCT label) AS distinct_labels\n",
+ " FROM read_parquet('{WIDE_H3_URL}')\n",
+ " WHERE otype = 'MaterialSampleRecord'\n",
+ " AND h3_res4 IN ({cell_list})\n",
+ " AND latitude BETWEEN {BBOX['min_lat']} AND {BBOX['max_lat']}\n",
+ " AND longitude BETWEEN {BBOX['min_lon']} AND {BBOX['max_lon']}\n",
+ " GROUP BY n\n",
+ " ORDER BY sample_count DESC\n",
+ " \"\"\").df()\n",
+ " h3_ms = (time.time() - t0) * 1000\n",
+ "\n",
+ " print(f'H3 spatial shortcut: {h3_ms:.0f} ms')\n",
+ " print(f'\\nSamples in the Bay Area ({BBOX}):')\n",
+ " print(result.to_string(index=False))\n",
+ " print(f'\\nTotal samples: {result[\"sample_count\"].sum():,}')\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "wide-comparison-table",
+ "metadata": {},
+ "source": [
+ "### When to Use Each Approach\n",
+ "\n",
+ "| Query Type | Narrow (Graph) | Wide + H3 |\n",
+ "|-----------|---------------|----------|\n",
+ "| **Provenance chains** (who collected what, when) | Best \u2014 follow typed edges | Not available |\n",
+ "| **Spatial filtering** (samples in a bbox) | Slow \u2014 multi-hop traversal | Fast \u2014 single scan with H3 pre-filter |\n",
+ "| **Regional aggregation** (count by area) | Very slow \u2014 join 3 tables | Fast \u2014 GROUP BY h3_res4/6/8 |\n",
+ "| **Schema exploration** (edge types, patterns) | Best \u2014 typed edge API | N/A |\n",
+ "| **Full-text search + location** | Requires custom joins | Single WHERE clause |\n",
+ "| **Material/context at a location** | Multi-hop + filter | Single query with all columns |\n",
+ "\n",
+ "**Rule of thumb**: Use the narrow graph format when you need to understand *relationships*\n",
+ "(provenance, attribution, classification chains). Use the wide + H3 format when you need\n",
+ "to *filter and aggregate spatially*."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "wide-timing-comparison",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Side-by-side timing: lat/lon scan vs H3 pre-filter on wide format\n",
+ "\n",
+ "# Baseline: raw lat/lon scan (same filters as H3 query)\n",
+ "t0 = time.time()\n",
+ "baseline = con_wide.sql(f\"\"\"\n",
+ " SELECT COUNT(*) AS n\n",
+ " FROM read_parquet('{WIDE_H3_URL}')\n",
+ " WHERE otype = 'MaterialSampleRecord'\n",
+ " AND latitude BETWEEN {BBOX['min_lat']} AND {BBOX['max_lat']}\n",
+ " AND longitude BETWEEN {BBOX['min_lon']} AND {BBOX['max_lon']}\n",
+ "\"\"\").fetchone()[0]\n",
+ "baseline_ms = (time.time() - t0) * 1000\n",
+ "\n",
+ "# H3 pre-filter + exact bbox (reuse covering cells from above)\n",
+ "if not covering:\n",
+ " print('No covering cells \u2014 skipping H3 benchmark')\n",
+ "else:\n",
+ " t0 = time.time()\n",
+ " h3_count = con_wide.sql(f\"\"\"\n",
+ " SELECT COUNT(*) AS n\n",
+ " FROM read_parquet('{WIDE_H3_URL}')\n",
+ " WHERE otype = 'MaterialSampleRecord'\n",
+ " AND h3_res4 IN ({cell_list})\n",
+ " AND latitude BETWEEN {BBOX['min_lat']} AND {BBOX['max_lat']}\n",
+ " AND longitude BETWEEN {BBOX['min_lon']} AND {BBOX['max_lon']}\n",
+ " \"\"\").fetchone()[0]\n",
+ " h3_fast_ms = (time.time() - t0) * 1000\n",
+ "\n",
+ " print('Timing Comparison: Bay Area Bounding Box')\n",
+ " print('=' * 50)\n",
+ " print(f'Baseline (lat/lon scan): {baseline_ms:>8.0f} ms | {baseline:,} rows')\n",
+ " print(f'H3 res4 pre-filter: {h3_fast_ms:>8.0f} ms | {h3_count:,} rows')\n",
+ " speedup = baseline_ms / h3_fast_ms if h3_fast_ms > 0 else float('inf')\n",
+ " print(f'Speedup: {speedup:>7.1f}x')\n",
+ " print(f'\\nRow counts match: {baseline == h3_count}')\n"
+ ]
}
],
"metadata": {
diff --git a/examples/basic/vocab_labels.py b/examples/basic/vocab_labels.py
new file mode 100644
index 0000000..b60d5e7
--- /dev/null
+++ b/examples/basic/vocab_labels.py
@@ -0,0 +1,43 @@
+"""Helpers for joining iSamples vocabulary URIs to human-readable labels.
+
+The wide and narrow parquets currently store SKOS concept references as URIs
+in `IdentifiedConcept.label` (e.g. `https://w3id.org/isample/vocabulary/
+material/1.0/earthmaterial`). This module wraps the canonical
+`vocab_labels.parquet` lookup published at `data.isamples.org` so notebooks
+can render `Natural Solid Material` instead of the raw URI.
+
+See https://github.com/isamplesorg/isamplesorg.github.io/issues/148 for
+background and the build script that produces the artifact.
+"""
+
+from __future__ import annotations
+
+import duckdb
+
+VOCAB_LABELS_URL = "https://data.isamples.org/vocab_labels.parquet"
+
+
+def load_vocab_labels(url: str = VOCAB_LABELS_URL, lang: str = "en") -> dict[str, str]:
+ """Return a {uri: pref_label} dict for the requested language.
+
+ The artifact is ~60KB; one HTTP fetch is fine for any notebook session.
+ """
+ con = duckdb.connect()
+ rows = con.sql(
+ f"SELECT uri, pref_label FROM read_parquet('{url}') WHERE lang = ?",
+ params=[lang],
+ ).fetchall()
+ return {uri: label for uri, label in rows}
+
+
+def pretty_label(uri: str | None, labels: dict[str, str]) -> str:
+ """Return the SKOS prefLabel for `uri`, falling back to the URI tail."""
+ if uri is None:
+ return ""
+ if uri in labels:
+ return labels[uri]
+ s = str(uri)
+ if s.startswith(("http://", "https://")):
+ tail = s.rstrip("/").rsplit("/", 1)[-1]
+ return tail or s
+ return s