From 97d3c1339e03442db475e7426dfeb1d64760ebbf Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Wed, 8 Apr 2026 17:19:45 -0700 Subject: [PATCH 1/2] Improve Explorer search: multi-term AND, relevance ranking, FTS spike MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Search improvements (immediate): - Multi-term search: "pottery Cyprus" requires BOTH words to match - Relevance ranking: label matches weighted 3x, place 2x, description 1x - Results sorted by relevance score when searching (random for browsing) FTS spike (future path, documented): - Added tools/build_fts_index.py to build DuckDB FTS index offline - Tested: 358 MB full index, 211 MB lite — too large for auto-download - BM25 scoring works correctly (Porter stemming, stopwords) - Next step: explore smaller index strategies or on-demand loading Closes #84 (spike complete — findings documented in PR) Co-Authored-By: Claude Opus 4.6 --- .gitignore | 1 + tools/build_fts_index.py | 97 +++++++++++++++++++++++++++++++++ tutorials/isamples_explorer.qmd | 45 ++++++++++++--- 3 files changed, 134 insertions(+), 9 deletions(-) create mode 100644 tools/build_fts_index.py diff --git a/.gitignore b/.gitignore index 40f2d5c..e3b2633 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ models/generated/extensions/*.md # Large data files *.parquet +*.duckdb # Node / Playwright node_modules/ diff --git a/tools/build_fts_index.py b/tools/build_fts_index.py new file mode 100644 index 0000000..b0ddd09 --- /dev/null +++ b/tools/build_fts_index.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +""" +Build a DuckDB full-text search index for the iSamples Explorer. + +Creates a .duckdb file containing the FTS index (BM25-scored) that can +be ATTACHed in DuckDB-WASM for ranked text search over 6.7M samples. + +Usage: + python tools/build_fts_index.py + +Output: + tools/isamples_fts_index.duckdb (upload to data.isamples.org) + +Requirements: + pip install duckdb +""" + +import duckdb +import os +import sys +from pathlib import Path + +PARQUET_URL = "https://data.isamples.org/isamples_202601_wide.parquet" +OUTPUT_DB = Path(__file__).parent / "isamples_fts_index.duckdb" + +# Local fallback for faster builds +LOCAL_PARQUET = Path.home() / "Data/iSample/pqg_refining/zenodo_wide_2026-01-09.parquet" + + +def build_fts_index(): + # Use local file if available, otherwise remote + source = str(LOCAL_PARQUET) if LOCAL_PARQUET.exists() else PARQUET_URL + print(f"Source: {source}") + + # Remove existing index file + if OUTPUT_DB.exists(): + OUTPUT_DB.unlink() + + con = duckdb.connect(str(OUTPUT_DB)) + + print("Creating samples table from parquet...") + con.execute(f""" + CREATE TABLE samples AS + SELECT + pid, + label, + COALESCE(description, '') AS description, + COALESCE(CAST(place_name AS VARCHAR), '') AS place_name + FROM read_parquet('{source}') + WHERE otype = 'MaterialSampleRecord' + """) + + row_count = con.execute("SELECT COUNT(*) FROM samples").fetchone()[0] + print(f"Loaded {row_count:,} rows") + + print("Installing and loading FTS extension...") + con.execute("INSTALL fts") + con.execute("LOAD fts") + + print("Building FTS index (this may take a few minutes)...") + con.execute(""" + PRAGMA create_fts_index( + 'samples', 'pid', + 'label', 'description', 'place_name', + stemmer = 'porter', + stopwords = 'english', + overwrite = 1 + ) + """) + + # Verify the index works + test_result = con.execute(""" + SELECT pid, fts_main_samples.match_bm25(pid, 'pottery') AS score + FROM samples + WHERE score IS NOT NULL + ORDER BY score DESC + LIMIT 5 + """).fetchall() + print(f"Test query 'pottery': {len(test_result)} results") + for pid, score in test_result: + print(f" {pid[:60]} score={score:.4f}") + + # Keep samples table — FTS macros reference it internally. + # The table has only pid + text columns (not the full schema), + # so it's much smaller than the full parquet. + + con.close() + + size_mb = OUTPUT_DB.stat().st_size / (1024 * 1024) + print(f"\nIndex file: {OUTPUT_DB}") + print(f"Size: {size_mb:.1f} MB") + print(f"\nUpload to data.isamples.org and ATTACH in DuckDB-WASM:") + print(f" ATTACH 'https://data.isamples.org/isamples_fts_index.duckdb' AS fts_db;") + + +if __name__ == "__main__": + build_fts_index() diff --git a/tutorials/isamples_explorer.qmd b/tutorials/isamples_explorer.qmd index 402b8a4..b7b5597 100644 --- a/tutorials/isamples_explorer.qmd +++ b/tutorials/isamples_explorer.qmd @@ -71,7 +71,7 @@ initialParams = { //| code-fold: false // Search input viewof searchInput = Inputs.text({ - placeholder: "Search samples (e.g., pottery, basalt, Cyprus...)", + placeholder: "Search samples — multiple words narrow results (e.g., pottery Cyprus)", value: initialParams.q, submit: "Search" }) @@ -392,14 +392,17 @@ whereClause = { "latitude IS NOT NULL" ]; - // Text search + // Multi-term text search: each word must match at least one text field if (searchInput?.trim()) { - const term = searchInput.trim().replace(/'/g, "''"); - conditions.push(`( - label ILIKE '%${term}%' - OR description ILIKE '%${term}%' - OR CAST(place_name AS VARCHAR) ILIKE '%${term}%' - )`); + const terms = searchInput.trim().split(/\s+/).filter(t => t.length > 0); + for (const raw of terms) { + const term = raw.replace(/'/g, "''"); + conditions.push(`( + label ILIKE '%${term}%' + OR description ILIKE '%${term}%' + OR CAST(place_name AS VARCHAR) ILIKE '%${term}%' + )`); + } } // Source filter @@ -466,6 +469,30 @@ sampleData = { } try { + // When searching, rank results by relevance (fields matched); + // otherwise random sample for exploration + const hasSearch = searchInput?.trim()?.length > 0; + const terms = hasSearch + ? searchInput.trim().split(/\s+/).filter(t => t.length > 0) + : []; + + // Build a relevance score: +3 for label match, +2 for place, +1 for description + // per term — higher scores float to top + let scoreExpr = "0"; + if (terms.length > 0) { + const termScores = terms.map(raw => { + const t = raw.replace(/'/g, "''"); + return `(CASE WHEN label ILIKE '%${t}%' THEN 3 ELSE 0 END + + CASE WHEN CAST(place_name AS VARCHAR) ILIKE '%${t}%' THEN 2 ELSE 0 END + + CASE WHEN description ILIKE '%${t}%' THEN 1 ELSE 0 END)`; + }); + scoreExpr = termScores.join(" + "); + } + + const orderClause = hasSearch + ? `ORDER BY (${scoreExpr}) DESC, label` + : "ORDER BY RANDOM()"; + const query = ` SELECT row_id, @@ -478,7 +505,7 @@ sampleData = { place_name FROM samples WHERE ${whereClause} - ORDER BY RANDOM() + ${orderClause} LIMIT ${maxSamples} `; From 134aca2c256490f558db5fe8578e6087f0bb8754 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Tue, 28 Apr 2026 13:45:59 -0700 Subject: [PATCH 2/2] Escape ILIKE wildcards in search; clarify FTS spike script status Search input was passed into ILIKE patterns with only single-quote escaping, so a literal "%" or "_" in the query (e.g. "100%", "co_op") silently turned into wildcards. Escape % _ \ and add ESCAPE '\' in both whereClause and the relevance-score expression. Also reframe tools/build_fts_index.py as a spike artifact: the docstring told readers to upload the index to data.isamples.org, but per PR #95 findings the 200-358 MB result is too large to ship. Mark the script NOT in production pipeline and drop the misleading upload instructions. Smoke-tested locally with /tmp/explorer_smoke_test.py (multi-term "pottery cyprus" + wildcard "100%"): 0 JS exceptions, 0 console errors, 0 failed requests. Co-Authored-By: Claude Opus 4.7 (1M context) --- tools/build_fts_index.py | 20 +++++++++++++++----- tutorials/isamples_explorer.qmd | 22 ++++++++++++---------- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/tools/build_fts_index.py b/tools/build_fts_index.py index b0ddd09..9ff34f6 100644 --- a/tools/build_fts_index.py +++ b/tools/build_fts_index.py @@ -1,15 +1,25 @@ #!/usr/bin/env python3 """ +STATUS: spike artifact — NOT in production pipeline. + Build a DuckDB full-text search index for the iSamples Explorer. -Creates a .duckdb file containing the FTS index (BM25-scored) that can -be ATTACHed in DuckDB-WASM for ranked text search over 6.7M samples. +This script was used to evaluate whether DuckDB FTS could replace the +ILIKE-based search in the Explorer. Findings (PR #95): + - Full index (label + description + place_name): 358 MB + - Lite index (label + place_name only): 211 MB + - ATTACH-over-HTTP works in DuckDB-WASM, but the download is too + large for an interactive page. +The Explorer continues to use ILIKE; this script is preserved so we +can revisit FTS once we have a smaller index strategy (e.g. +pre-tokenized inverted index as parquet, or on-demand loading behind +an "Enhanced Search" toggle). Usage: python tools/build_fts_index.py Output: - tools/isamples_fts_index.duckdb (upload to data.isamples.org) + tools/isamples_fts_index.duckdb (NOT currently uploaded anywhere) Requirements: pip install duckdb @@ -89,8 +99,8 @@ def build_fts_index(): size_mb = OUTPUT_DB.stat().st_size / (1024 * 1024) print(f"\nIndex file: {OUTPUT_DB}") print(f"Size: {size_mb:.1f} MB") - print(f"\nUpload to data.isamples.org and ATTACH in DuckDB-WASM:") - print(f" ATTACH 'https://data.isamples.org/isamples_fts_index.duckdb' AS fts_db;") + print(f"\nNOTE: Index is too large to ship to the browser as-is.") + print(f" See module docstring for the spike findings.") if __name__ == "__main__": diff --git a/tutorials/isamples_explorer.qmd b/tutorials/isamples_explorer.qmd index b7b5597..6d8893a 100644 --- a/tutorials/isamples_explorer.qmd +++ b/tutorials/isamples_explorer.qmd @@ -392,15 +392,17 @@ whereClause = { "latitude IS NOT NULL" ]; - // Multi-term text search: each word must match at least one text field + // Multi-term text search: each word must match at least one text field. + // Escape ILIKE wildcards (% _) and the escape char (\) so user input like + // "100%" or "co_op" is treated literally rather than as a wildcard. if (searchInput?.trim()) { const terms = searchInput.trim().split(/\s+/).filter(t => t.length > 0); for (const raw of terms) { - const term = raw.replace(/'/g, "''"); + const term = raw.replace(/[\\%_]/g, "\\$&").replace(/'/g, "''"); conditions.push(`( - label ILIKE '%${term}%' - OR description ILIKE '%${term}%' - OR CAST(place_name AS VARCHAR) ILIKE '%${term}%' + label ILIKE '%${term}%' ESCAPE '\\' + OR description ILIKE '%${term}%' ESCAPE '\\' + OR CAST(place_name AS VARCHAR) ILIKE '%${term}%' ESCAPE '\\' )`); } } @@ -477,14 +479,14 @@ sampleData = { : []; // Build a relevance score: +3 for label match, +2 for place, +1 for description - // per term — higher scores float to top + // per term — higher scores float to top. Same wildcard escaping as whereClause. let scoreExpr = "0"; if (terms.length > 0) { const termScores = terms.map(raw => { - const t = raw.replace(/'/g, "''"); - return `(CASE WHEN label ILIKE '%${t}%' THEN 3 ELSE 0 END - + CASE WHEN CAST(place_name AS VARCHAR) ILIKE '%${t}%' THEN 2 ELSE 0 END - + CASE WHEN description ILIKE '%${t}%' THEN 1 ELSE 0 END)`; + const t = raw.replace(/[\\%_]/g, "\\$&").replace(/'/g, "''"); + return `(CASE WHEN label ILIKE '%${t}%' ESCAPE '\\' THEN 3 ELSE 0 END + + CASE WHEN CAST(place_name AS VARCHAR) ILIKE '%${t}%' ESCAPE '\\' THEN 2 ELSE 0 END + + CASE WHEN description ILIKE '%${t}%' ESCAPE '\\' THEN 1 ELSE 0 END)`; }); scoreExpr = termScores.join(" + "); }