From fbefa53e60c47104bbf885a3e1939230e79fd5d5 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Tue, 28 Apr 2026 16:19:43 -0700 Subject: [PATCH 1/2] =?UTF-8?q?Add=20scripts/build=5Fvocab=5Flabels.py=20?= =?UTF-8?q?=E2=80=94=20canonical=20vocab=E2=86=92label=20artifact=20builde?= =?UTF-8?q?r?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Builds vocab_labels.parquet from the SKOS TTL vocabularies that generate_vocab_docs.sh already pulls (10 TTLs across 4 vocabulary repos: core, Earth Science, Archaeology/OpenContext, Biology). Produces 537 rows / 535 unique URIs / 10 schemes / en + de. Output columns: uri, uri_form, pref_label, lang, scheme, definition, alt_labels, source_ttl. The artifact is intended to be consumed by: - the Explorer (Quarto/OJS) — JOIN facet URIs onto pref_label - isamples-python notebooks — enrich queries on IdentifiedConcept - pqg facet-summaries — bake labels into facet summaries at build time - any future React UI — small JSON dump from the same source First step of #148. This commit only adds the builder; publishing the parquet to data.isamples.org and wiring consumers is follow-up work. Two real-world wrinkles handled in the builder, both flagged on the issue: 1. /1.0/ URI mismatch. TTLs declare concepts at e.g. .../materialsampleobjecttype/wholeorganism, but iSamples export records (and every downstream parquet) carry a /1.0/ version segment in the URI: .../materialsampleobjecttype/1.0/wholeorganism. The convention is in the export itself, not added by pqg. The builder emits dual-form rows tagged with a uri_form column ("vocab" vs "data_v1") so consumers can JOIN on either form. Also handles per-prefix oddities: OpenContext uses /0.1/ instead of /1.0/, and biology data has inconsistent slug casing (Animalia/Fungi/Plantae but bacteria/protozoa) — both casings emitted as aliases. 2. Cross-vocab redeclarations. 17 material URIs are declared in 2-3 different TTLs with slightly different labels (whitespace, casing). The builder dedupes (uri, lang) keys, preferring the TTL whose URL matches the concept's expected canonical owner, and preserves the losers' labels in alt_labels so no information is lost. Coverage against the 55 distinct IdentifiedConcept URIs in the current wide parquet: 51/55 (93%) by URI, >99.99% sample-weighted on every facet (material, context, object_type). The 4 residuals are upstream data-quality issues — concept URIs in source data that no TTL declares (opencontext/material/0.1/organicanimalproduct, .../plantmaterial, vocabulary/specimentype/1.0/othersolidobject, .../physicalspecimen) — not something this artifact can fix. Run: python scripts/build_vocab_labels.py [-o out.parquet] [--also-csv] Deps: rdflib, pandas, pyarrow (myenv: rdflib==6.3.2). Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/build_vocab_labels.py | 299 ++++++++++++++++++++++++++++++++++ 1 file changed, 299 insertions(+) create mode 100644 scripts/build_vocab_labels.py diff --git a/scripts/build_vocab_labels.py b/scripts/build_vocab_labels.py new file mode 100644 index 0000000..a668154 --- /dev/null +++ b/scripts/build_vocab_labels.py @@ -0,0 +1,299 @@ +#!/usr/bin/env python3 +""" +Build vocab_labels.parquet from the SKOS TTL vocabularies that iSamples uses. + +The Explorer and Python notebooks need a stable lookup from vocabulary URIs +(e.g. https://w3id.org/isample/vocabulary/sampledfeature/1.0/pasthumanoccupationsite) +to human-readable labels (e.g. "Past human occupation site"). 
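Consumers can then LEFT JOIN facet URIs onto pref_label. A minimal sketch, +assuming pandas ("samples" and "material_uri" are illustrative names for an +export-derived DataFrame and its facet column, not part of this script): + + labels = pd.read_parquet("vocab_labels.parquet") + en = labels[labels["lang"] == "en"][["uri", "pref_label"]] + # names above are illustrative; the JOIN works on either uri_form + samples = samples.merge(en, left_on="material_uri", right_on="uri", how="left") + 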
This script +parses every SKOS TTL listed in scripts/generate_vocab_docs.sh, emits one row +per (concept URI, language) pair, and writes a single parquet file. + +Output columns: + uri str Concept URI (vocab-form OR data-form — see uri_form) + uri_form str "vocab" = URI as declared in the TTL + "data_v1" = synthesized URI with "/1.0/" version + segment after the scheme root (the + convention used in iSamples export + records and downstream parquet files). + pref_label str skos:prefLabel (or rdfs:label fallback) + lang str BCP47 language tag, default "en" + scheme str skos:inScheme URI (or derived) + definition str? skos:definition (best-available language) + alt_labels list skos:altLabel values plus prefLabels from any + cross-vocab redeclarations of the same URI. + source_ttl str URL of the TTL the canonical row came from. + +The dual-form (vocab + data_v1) emission is a workaround for a known +mismatch: the vocabulary TTLs declare concepts without a version segment, +but iSamples export records carry URIs with a "/1.0/" segment. See +issue #148 for the full background. + +Issue: https://github.com/isamplesorg/isamplesorg.github.io/issues/148 + +Usage: + python scripts/build_vocab_labels.py # writes ./vocab_labels.parquet + python scripts/build_vocab_labels.py -o /tmp/v.parquet +""" +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +import pandas as pd +import rdflib +from rdflib.namespace import RDF, RDFS, SKOS + +# Keep this list in sync with scripts/generate_vocab_docs.sh. +# When a new vocabulary is added there, add it here too. +VOCAB_TTLS: list[str] = [ + # Core iSamples vocabularies + "https://raw.githubusercontent.com/isamplesorg/vocabularies/main/vocabulary/material_type.ttl", + "https://raw.githubusercontent.com/isamplesorg/vocabularies/main/vocabulary/sampled_feature_type.ttl", + "https://raw.githubusercontent.com/isamplesorg/vocabularies/main/vocabulary/material_sample_object_type.ttl", + # Earth Science extension + "https://raw.githubusercontent.com/isamplesorg/metadata_profile_earth_science/main/vocabulary/earthenv_material_extension_mineral_group.ttl", + "https://raw.githubusercontent.com/isamplesorg/metadata_profile_earth_science/main/vocabulary/earthenv_material_extension_rock_sediment.ttl", + "https://raw.githubusercontent.com/isamplesorg/metadata_profile_earth_science/main/vocabulary/earthenv_sampled_feature_role.ttl", + "https://raw.githubusercontent.com/isamplesorg/metadata_profile_earth_science/main/vocabulary/earthenv_materialsampleobject_type.ttl", + # Archaeology / OpenContext extension + "https://raw.githubusercontent.com/isamplesorg/metadata_profile_archaeology/main/vocabulary/opencontext_material_extension.ttl", + "https://raw.githubusercontent.com/isamplesorg/metadata_profile_archaeology/main/vocabulary/opencontext_materialsampleobjecttype.ttl", + # Biology extension + "https://raw.githubusercontent.com/isamplesorg/metadata_profile_biology/main/vocabulary/biology_sampledfeature_extension.ttl", +] + +PREFERRED_LANG = "en" + +# When a concept URI is declared in more than one TTL, prefer the row whose +# source TTL's URL contains one of these path fragments. The fragments are +# matched against the concept URI: a URI containing "vocabulary/material/" +# prefers a row from a TTL URL containing "vocabulary/material" (i.e. the +# core material_type.ttl) over OpenContext or Earth Sci redeclarations. +CANONICAL_TTL_HINTS: tuple[tuple[str, str], ...] 
= ( + ("vocabulary/material/", "vocabularies/main/vocabulary/material_type"), + ("vocabulary/sampledfeature/", "vocabularies/main/vocabulary/sampled_feature_type"), + ("vocabulary/materialsampleobjecttype/", "vocabularies/main/vocabulary/material_sample_object_type"), + ("vocabulary/specimentype/", "vocabularies/main/vocabulary/material_sample_object_type"), +) + + +def _data_form_uris(vocab_uri: str) -> list[str]: + """Synthesize the URI form(s) used in iSamples export records. + + Each iSamples scheme uses its own version segment and slug-casing + convention (yes, really — see issue #148). Returns possibly-multiple + aliases when the data layer uses inconsistent casing. + """ + # Biology data is inconsistent: most slugs are Title-cased (Animalia, + # Fungi, Plantae) but some are lowercase (bacteria, protozoa). Emit + # both forms so JOINs hit either variant. + def _bio_variants(s: str) -> list[str]: + if not s: + return [] + title = s[:1].upper() + s[1:] + lower = s.lower() + return list(dict.fromkeys([title, lower])) + + # (scheme_root, version_segment, slug_variants_fn_or_None) + KNOWN_ROOTS: tuple[tuple[str, str, callable | None], ...] = ( + ("https://w3id.org/isample/vocabulary/material/", "1.0", None), + ("https://w3id.org/isample/vocabulary/sampledfeature/", "1.0", None), + ("https://w3id.org/isample/vocabulary/materialsampleobjecttype/", "1.0", None), + ("https://w3id.org/isample/vocabulary/specimentype/", "1.0", None), + # OpenContext extension uses /0.1/ rather than /1.0/. + ("https://w3id.org/isample/opencontext/material/", "0.1", None), + ("https://w3id.org/isample/opencontext/materialsampleobjecttype/","0.1", None), + # Biology extension: /1.0/ + inconsistent slug casing in the data. + ("https://w3id.org/isample/biology/biosampledfeature/", "1.0", _bio_variants), + ) + for root, version, variants in KNOWN_ROOTS: + if vocab_uri.startswith(root): + slug = vocab_uri[len(root):] + # Don't re-version a URI that already has a version segment. + if slug.split("/", 1)[0].replace(".", "").isdigit(): + return [] + slugs = variants(slug) if variants is not None else [slug] + return [f"{root}{version}/{s}" for s in slugs] + return [] + + +def _prefers(ttl_url: str, concept_uri: str) -> int: + """Return a sort key — lower is more canonical for tie-breaking. + A TTL whose URL matches the concept URI's expected canonical TTL gets 0; + everything else gets 1. + """ + for uri_fragment, ttl_fragment in CANONICAL_TTL_HINTS: + if uri_fragment in concept_uri and ttl_fragment in ttl_url: + return 0 + return 1 + + +def _pick_definition(g: rdflib.Graph, c: rdflib.term.Node) -> str | None: + """Return one definition string, preferring English when present.""" + defs = list(g.objects(c, SKOS.definition)) + if not defs: + return None + for d in defs: + if getattr(d, "language", None) == PREFERRED_LANG: + return str(d) + return str(defs[0]) + + +def _pick_scheme(g: rdflib.Graph, c: rdflib.term.Node) -> str | None: + """Return the skos:inScheme URI for a concept, if declared.""" + for s in g.objects(c, SKOS.inScheme): + return str(s) + return None + + +def extract_rows(ttl_url: str) -> list[dict]: + g = rdflib.Graph() + g.parse(ttl_url, format="turtle") + + rows: list[dict] = [] + for c in g.subjects(RDF.type, SKOS.Concept): + uri = str(c) + scheme = _pick_scheme(g, c) + definition = _pick_definition(g, c) + alt_labels = sorted({str(a) for a in g.objects(c, SKOS.altLabel)}) + + # One row per language of skos:prefLabel; fall back to rdfs:label. 
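+        # (The iSamples TTLs currently publish en and de prefLabels, so a + # typical concept yields two rows here, one per language.) 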
+        pref_labels = list(g.objects(c, SKOS.prefLabel)) + if not pref_labels: + pref_labels = list(g.objects(c, RDFS.label)) + + if not pref_labels: + # Concept with no label at all — emit a row with NULL label so + # downstream JOINs at least know the URI exists. + rows.append({ + "uri": uri, + "uri_form": "vocab", + "pref_label": None, + "lang": None, + "scheme": scheme, + "definition": definition, + "alt_labels": alt_labels, + "source_ttl": ttl_url, + }) + continue + + for lit in pref_labels: + rows.append({ + "uri": uri, + "uri_form": "vocab", + "pref_label": str(lit), + "lang": getattr(lit, "language", None) or PREFERRED_LANG, + "scheme": scheme, + "definition": definition, + "alt_labels": alt_labels, + "source_ttl": ttl_url, + }) + return rows + + +def _dedupe(rows: list[dict]) -> list[dict]: + """Collapse cross-vocab duplicate (uri, lang) rows. + + Strategy: + - For each (uri, lang), pick the row whose source TTL is the canonical + owner of that URI's scheme (see CANONICAL_TTL_HINTS). + - Move any losing rows' pref_labels into the survivor's alt_labels list + so we don't lose information. + """ + from collections import defaultdict + groups: dict[tuple[str, str | None], list[dict]] = defaultdict(list) + for r in rows: + groups[(r["uri"], r["lang"])].append(r) + + out: list[dict] = [] + for (uri, lang), candidates in groups.items(): + if len(candidates) == 1: + out.append(candidates[0]) + continue + candidates.sort(key=lambda r: (_prefers(r["source_ttl"], r["uri"]), r["source_ttl"])) + keep = dict(candidates[0]) + extra = [] + for loser in candidates[1:]: + if loser["pref_label"] and loser["pref_label"] != keep["pref_label"]: + extra.append(loser["pref_label"]) + if extra: + keep["alt_labels"] = sorted(set((keep.get("alt_labels") or []) + extra)) + out.append(keep) + return out + + +def _emit_data_form_aliases(rows: list[dict]) -> list[dict]: + """For each vocab-form row, emit an alias row at the /1.0/ data-form URI + so JOINs against iSamples export-derived URIs work without normalization. 
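+ + Example (the mismatch described in issue #148): the vocab-form row for + https://w3id.org/isample/vocabulary/materialsampleobjecttype/wholeorganism + gets a data_v1 clone at + https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism 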
+ """ + aliases: list[dict] = [] + for r in rows: + for data_uri in _data_form_uris(r["uri"]): + clone = dict(r) + clone["uri"] = data_uri + clone["uri_form"] = "data_v1" + aliases.append(clone) + return aliases + + +def main(argv: list[str] | None = None) -> int: + ap = argparse.ArgumentParser(description=__doc__.splitlines()[1]) + ap.add_argument( + "-o", "--output", + default="vocab_labels.parquet", + type=Path, + help="Output parquet path (default: ./vocab_labels.parquet)", + ) + ap.add_argument( + "--also-csv", + action="store_true", + help="Also emit a sibling .csv for diff-friendly review.", + ) + args = ap.parse_args(argv) + + all_rows: list[dict] = [] + for url in VOCAB_TTLS: + try: + n_before = len(all_rows) + all_rows.extend(extract_rows(url)) + print(f" {len(all_rows) - n_before:>4} rows {url}") + except Exception as e: + print(f"WARN: failed to parse {url}: {e}", file=sys.stderr) + + if not all_rows: + print("ERROR: no rows extracted; aborting.", file=sys.stderr) + return 2 + + raw_count = len(all_rows) + all_rows = _dedupe(all_rows) + deduped_collapsed = raw_count - len(all_rows) + print(f"\nDedupe: collapsed {deduped_collapsed} cross-vocab duplicate rows.") + + aliases = _emit_data_form_aliases(all_rows) + print(f"Aliases: emitted {len(aliases)} data-form (/1.0/) rows.") + all_rows.extend(aliases) + + df = pd.DataFrame(all_rows) + # Final sanity check + dupes = df.duplicated(subset=["uri", "lang"], keep=False).sum() + if dupes: + print(f"WARN: {dupes} duplicate (uri, lang) rows survived dedupe", file=sys.stderr) + + args.output.parent.mkdir(parents=True, exist_ok=True) + df.to_parquet(args.output, index=False) + print(f"\nWrote {len(df):,} rows → {args.output}") + print(f" by uri_form: {df['uri_form'].value_counts().to_dict()}") + print(f" unique URIs: {df['uri'].nunique():,}") + print(f" languages: {sorted(df['lang'].dropna().unique().tolist())}") + print(f" schemes: {df['scheme'].nunique()} distinct skos:inScheme values") + + if args.also_csv: + csv_path = args.output.with_suffix(".csv") + df.to_csv(csv_path, index=False) + print(f"Also wrote {csv_path}") + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 30b3c814ec3841c49121ce7af4f6c0eb9c4d9e9f Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Tue, 28 Apr 2026 16:50:52 -0700 Subject: [PATCH 2/2] Address Codex review (#149): fail-loud on TTL fetch errors + pin deps - Default behavior is now to fail loud (exit 3) if any TTL source fails to fetch or parse. Add --allow-partial to override. Since this artifact is intended for publishing, a partial parquet should not be silently produced. - Add scripts/requirements.txt pinning the script's runtime deps (rdflib, pandas, pyarrow). Kept separate from the site-build ../requirements.txt because these scripts are not run in CI; this file is just so a fresh checkout can run them. Verified: injecting a bogus TTL URL yields exit 3 by default, exit 0 with --allow-partial. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/build_vocab_labels.py | 22 ++++++++++++++++++++++ scripts/requirements.txt | 8 ++++++++ 2 files changed, 30 insertions(+) create mode 100644 scripts/requirements.txt diff --git a/scripts/build_vocab_labels.py b/scripts/build_vocab_labels.py index a668154..4bd7c9f 100644 --- a/scripts/build_vocab_labels.py +++ b/scripts/build_vocab_labels.py @@ -31,6 +31,7 @@ Issue: https://github.com/isamplesorg/isamplesorg.github.io/issues/148 Usage: + pip install -r scripts/requirements.txt python scripts/build_vocab_labels.py # writes ./vocab_labels.parquet python scripts/build_vocab_labels.py -o /tmp/v.parquet """ @@ -249,9 +250,19 @@ def main(argv: list[str] | None = None) -> int: action="store_true", help="Also emit a sibling .csv for diff-friendly review.", ) + ap.add_argument( + "--allow-partial", + action="store_true", + help=( + "Continue and emit an artifact even if one or more TTL sources " + "fail to fetch/parse. Default is to fail-loud, since this " + "artifact is intended for publishing." + ), + ) args = ap.parse_args(argv) all_rows: list[dict] = [] + failures: list[tuple[str, str]] = [] for url in VOCAB_TTLS: try: n_before = len(all_rows) @@ -259,6 +270,17 @@ def main(argv: list[str] | None = None) -> int: print(f" {len(all_rows) - n_before:>4} rows {url}") except Exception as e: print(f"WARN: failed to parse {url}: {e}", file=sys.stderr) + failures.append((url, str(e))) + + if failures and not args.allow_partial: + print( + f"\nERROR: {len(failures)} TTL source(s) failed; refusing to " + f"emit a partial artifact. Pass --allow-partial to override.", + file=sys.stderr, + ) + for url, err in failures: + print(f" - {url}: {err}", file=sys.stderr) + return 3 if not all_rows: print("ERROR: no rows extracted; aborting.", file=sys.stderr) diff --git a/scripts/requirements.txt b/scripts/requirements.txt new file mode 100644 index 0000000..67746e4 --- /dev/null +++ b/scripts/requirements.txt @@ -0,0 +1,8 @@ +# Dependencies for one-shot scripts in this directory (NOT the Quarto site +# build — that uses ../requirements.txt). Install with: +# pip install -r scripts/requirements.txt + +# build_vocab_labels.py +rdflib>=6.3 +pandas>=2.0 +pyarrow>=14