diff --git a/.github/workflows/zeeschuimer_map_item_sync.yml b/.github/workflows/zeeschuimer_map_item_sync.yml
index 54e3d8221..755884e52 100644
--- a/.github/workflows/zeeschuimer_map_item_sync.yml
+++ b/.github/workflows/zeeschuimer_map_item_sync.yml
@@ -1,14 +1,252 @@
-# Bootstrap the Zeeschuimer map_item sync workflow
-# This is necessary to test workflow in PR (so far as I can tell)
+# Auto-translate Zeeschuimer datasource `map_item` functions from Python to JS
+# and open a draft PR per module against digitalmethodsinitiative/zeeschuimer.
+#
+# Triggers on pushes to master that touch any Zeeschuimer datasource (or this
+# workflow file itself). Also exposes a `workflow_dispatch` trigger with a
+# `bootstrap` input for the initial run that translates every Zeeschuimer
+# datasource at once (single PR). Datasources without a matching Zeeschuimer
+# module — e.g. facebook — are skipped automatically.
+#
+# Architecture: a `detect` job groups changed files by module and emits a
+# matrix; a `sync` job fans out one parallel run per module, each opening
+# (or updating) its own PR on a stable per-module branch. The planning
+# and PR-body logic are in `helper-scripts/map_item_ci.py` (unit-tested
+# in tests/test_map_item_sync.py); the LLM translation and JS
+# splicing live in `helper-scripts/map_item_converter.py`.
+#
+# Required secrets (configured in repo Settings -> Secrets and variables -> Actions):
+#   ZEESCHUIMER_APP_ID           - numeric App ID of the GitHub App installed on
+#                                  digitalmethodsinitiative/zeeschuimer with permissions
+#                                  contents:write + pull-requests:write (and nothing else)
+#   ZEESCHUIMER_APP_PRIVATE_KEY  - full PEM private key for that App (including BEGIN/END lines)
+#   DMI_OLLAMA_KEY               - API key for https://ollama.digitalmethods.net (legacy fallback)
+#
+# Optional overrides — set in repo Settings -> Secrets and variables -> Actions to change
+# the provider used by automatic (push-triggered) runs without editing this file.
+# Resolution order for each setting (first non-empty wins):
+#   API key:         workflow_dispatch input  ->  LLM_PROVIDER_API_KEY secret  ->  DMI_OLLAMA_KEY secret
+#   provider/url/model/output_mode:  workflow_dispatch input  ->  repo variable  ->  hardcoded default below
+#
+#   LLM_PROVIDER_API_KEY  (secret)   - generic key for the active provider; swap when switching providers
+#   LLM_PROVIDER          (variable) - provider type for LLMAdapter         (default: ollama)
+#   LLM_BASE_URL          (variable) - provider base URL                    (default: https://ollama.digitalmethods.net)
+#   LLM_MODEL             (variable) - model name                           (default: qwen2.5-coder:14b)
+#   LLM_OUTPUT_MODE       (variable) - structured or prompt                 (default: structured)
 
 name: Sync Zeeschuimer map_item from 4CAT
 
 on:
+  push:
+    branches: [master]
+    paths:
+      # Only datasource changes produce a translation: plan-matrix diffs just
+      # `datasources/*/search_*.py`, so a workflow-only push plans nothing.
+      - 'datasources/**/search_*.py'
+      - '.github/workflows/zeeschuimer_map_item_sync.yml'
   workflow_dispatch:
+    # NOTE: string defaults are '' so blank inputs fall through to repo variables/secrets (see "Optional overrides" above).
+    inputs:
+      bootstrap:
+        description: 'Translate every Zeeschuimer datasource (initial sync, single PR). Ignored if "files" is set.'
+        type: boolean
+        default: false
+      files:
+        description: 'Space-separated list of datasource files to translate (e.g. "datasources/tiktok/search_tiktok.py"). Overrides bootstrap. One PR per module.'
+        type: string
+        default: ''
+      llm_provider:
+        description: 'LLM provider type for LLMAdapter. Leave blank to use LLM_PROVIDER variable or default (ollama).'
+        type: string
+        default: ''
+      llm_base_url:
+        description: 'LLM provider base URL. Leave blank to use LLM_BASE_URL variable or default (https://ollama.digitalmethods.net).'
+        type: string
+        default: ''
+      llm_api_key:
+        description: 'LLM API key. WARNING: dispatch inputs are NOT masked like secrets and may appear in run logs; prefer the LLM_PROVIDER_API_KEY secret. Leave blank to use LLM_PROVIDER_API_KEY secret or DMI_OLLAMA_KEY secret.'
+        type: string
+        default: ''
+      model:
+        description: 'LLM model name. Leave blank to use LLM_MODEL variable or default (qwen2.5-coder:14b). Examples: qwen2.5-coder:7b, deepseek-coder-v2:16b, gemma3:27b'
+        type: string
+        default: ''
+      output_mode:
+        description: 'LLM output mode (structured or prompt). Leave blank to use LLM_OUTPUT_MODE variable or default (structured). Use prompt for models that do not support structured output (e.g. gpt-oss-120b).'
+        type: string
+        default: ''
+
+# Least privilege: GITHUB_TOKEN only reads the 4CAT repo; Zeeschuimer writes use the App token.
+permissions:
+  contents: read
 
 jobs:
-  sync-map-item:
+  detect:
+    name: Detect modules to translate
     runs-on: ubuntu-latest
+    outputs:
+      mode: ${{ steps.plan.outputs.mode }}
+      matrix: ${{ steps.plan.outputs.matrix }}
     steps:
-      - name: Placeholder
-        run: echo "Workflow scaffold is valid."
\ No newline at end of file
+      - name: Checkout 4CAT
+        uses: actions/checkout@v4
+        with:
+          # Full history: the push-event plan diffs `github.event.before`
+          # against `github.sha`. A shallow clone may not contain `before` for
+          # a multi-commit push, in which case the diff resolves to nothing and
+          # the change is silently skipped.
+          fetch-depth: 0
+
+      - name: Set up Python
+        # `detect` runs map_item_ci.py (stdlib only — no LLM deps installed
+        # here), but still needs a `python` on PATH; don't rely on the runner
+        # image happening to provide one.
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Plan translation matrix
+        id: plan
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          INPUTS_FILES: ${{ inputs.files }}
+          INPUTS_BOOTSTRAP: ${{ inputs.bootstrap }}
+          BEFORE_SHA: ${{ github.event.before }}
+          AFTER_SHA: ${{ github.sha }}
+        # Validates paths against a strict datasource shape (dropping anything
+        # else) and writes `mode` + `matrix` to $GITHUB_OUTPUT. See
+        # helper-scripts/map_item_ci.py.
+        run: python helper-scripts/map_item_ci.py plan-matrix
+
+  sync:
+    name: Sync ${{ matrix.target.module }}
+    needs: detect
+    if: needs.detect.outputs.mode != 'none'
+    runs-on: ubuntu-latest
+    # Per-module concurrency: a newer push to master supersedes any in-flight
+    # sync for the same module (LLM run gets cancelled, latest run wins).
+    # Each matrix instance gets its own group, so different modules don't block.
+    concurrency:
+      group: zeeschuimer-sync-${{ matrix.target.module }}
+      cancel-in-progress: true
+    strategy:
+      fail-fast: false
+      matrix:
+        target: ${{ fromJson(needs.detect.outputs.matrix) }}
+    steps:
+      - name: Checkout 4CAT
+        uses: actions/checkout@v4
+        with:
+          # Full history so the PR-body builder can `git diff before..after`
+          # for the changed Python file (see map_item_ci.py build-pr-body).
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install LLM dependencies
+        run: |
+          # LLMAdapter (common/lib/llm.py) imports every provider's langchain
+          # package at module load, so all of them are required even though a
+          # run uses one provider (Ollama by default). Derive the exact list from
+          # setup.py (single source of truth) so it can't drift from what 4CAT
+          # declares; we install only this LLM subset, not all of 4CAT, to keep
+          # the job light. Write the specs to a requirements file (one per line)
+          # and install with `-r`, rather than an unquoted `pip install $VAR`:
+          # that way a version specifier that contains a shell metacharacter
+          # (e.g. a future `langchain_core>=0.3` pin — `>` is redirection) can't
+          # be misparsed by the shell.
+          python helper-scripts/map_item_ci.py llm-requirements > llm-requirements.txt
+          echo "Installing from setup.py:"
+          cat llm-requirements.txt
+          pip install -r llm-requirements.txt
+
+      - name: Mint Zeeschuimer App token
+        id: app_token
+        uses: actions/create-github-app-token@v1
+        with:
+          app-id: ${{ secrets.ZEESCHUIMER_APP_ID }}
+          private-key: ${{ secrets.ZEESCHUIMER_APP_PRIVATE_KEY }}
+          owner: digitalmethodsinitiative
+          repositories: zeeschuimer
+
+      - name: Checkout Zeeschuimer
+        uses: actions/checkout@v4
+        with:
+          repository: digitalmethodsinitiative/zeeschuimer
+          path: zeeschuimer-checkout
+          token: ${{ steps.app_token.outputs.token }}
+
+      - name: Run translation
+        env:
+          PROVIDER_API_KEY: ${{ inputs.llm_api_key || secrets.LLM_PROVIDER_API_KEY || secrets.DMI_OLLAMA_KEY }}
+          LLM_PROVIDER: ${{ inputs.llm_provider || vars.LLM_PROVIDER || 'ollama' }}
+          LLM_BASE_URL: ${{ inputs.llm_base_url || vars.LLM_BASE_URL || 'https://ollama.digitalmethods.net' }}
+          LLM_MODEL: ${{ inputs.model || vars.LLM_MODEL || 'qwen2.5-coder:14b' }}
+          LLM_OUTPUT_MODE: ${{ inputs.output_mode || vars.LLM_OUTPUT_MODE || 'structured' }}
+          # Pass matrix values through env rather than interpolating them.
+          # IS_BOOTSTRAP is always the literal true/false the detect job emitted.
+          # MODULE_FILES is a space-separated list of paths the detect job already
+          # validated against the `datasources/<module>/search_<name>.py` shape (no
+          # shell metacharacters), so the unquoted `$MODULE_FILES` expansion is safe
+          # and still word-splits into multiple --files arguments.
+          IS_BOOTSTRAP: ${{ matrix.target.bootstrap }}
+          MODULE_FILES: ${{ matrix.target.files }}
+        run: |
+          if [ "$IS_BOOTSTRAP" = "true" ]; then
+            # Bootstrap translates every datasource in one run; --no-fail-fast
+            # so one datasource failing doesn't abort the whole initial sync.
+            python helper-scripts/map_item_converter.py \
+              --bootstrap \
+              --no-fail-fast \
+              --zeeschuimer-checkout ./zeeschuimer-checkout \
+              --output-manifest ./manifest.json
+          else
+            python helper-scripts/map_item_converter.py \
+              --files $MODULE_FILES \
+              --zeeschuimer-checkout ./zeeschuimer-checkout \
+              --output-manifest ./manifest.json
+          fi
+
+      - name: Build PR body
+        id: pr_body
+        env:
+          MODULE: ${{ matrix.target.module }}
+          BOOTSTRAP: ${{ matrix.target.bootstrap }}
+          BEFORE_SHA: ${{ github.event.before }}
+          AFTER_SHA: ${{ github.sha }}
+          RUN_ID: ${{ github.run_id }}
+          EVENT_NAME: ${{ github.event_name }}
+          REPO: ${{ github.repository }}
+        # Reads manifest.json, writes pr_body.md, and writes `title` to
+        # $GITHUB_OUTPUT (delimiter form, injection-safe). See map_item_ci.py.
+        run: python helper-scripts/map_item_ci.py build-pr-body --manifest manifest.json --out pr_body.md
+
+      - name: Check there are JS changes to PR
+        id: have_changes
+        working-directory: zeeschuimer-checkout
+        run: |
+          if [ -z "$(git status --porcelain)" ]; then
+            echo "has_changes=false" >> "$GITHUB_OUTPUT"
+            echo "No JS changes produced by translation; not opening a PR."
+          else
+            echo "has_changes=true" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Open or update Zeeschuimer PR
+        if: steps.have_changes.outputs.has_changes == 'true'
+        # Third-party action that operates with a write token to the Zeeschuimer
+        # repo — pinned to a full commit SHA (the v6 release) rather than the
+        # mutable `@v6` tag, so a tag move can't silently change what runs here.
+        uses: peter-evans/create-pull-request@c5a7806660adbe173f04e3e038b0ccdcd758773c # v6
+        with:
+          path: zeeschuimer-checkout
+          token: ${{ steps.app_token.outputs.token }}
+          # Stable per-module branch: a fresh push that retranslates the same
+          # module updates the same PR. Different modules never share a branch.
+          branch: auto/4cat-map-item-sync-${{ matrix.target.module }}
+          title: ${{ steps.pr_body.outputs.title }}
+          commit-message: "chore: sync map_item for ${{ matrix.target.module }} from 4CAT ${{ github.sha }}"
+          body-path: pr_body.md
+          draft: true
diff --git a/.gitignore b/.gitignore
index 8850a7bcc..5aa6eae56 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,6 +46,9 @@ webtool/venv/
 *.ipynb
 venv/
 __pycache__/
+.claude/
+# ignore symlink at the repo root -> config/extensions.
+/extensions
 
 # do not ignore interface images
 !webtool/static/img/*.png
diff --git a/common/lib/dataset.py b/common/lib/dataset.py
index 37aaed1ba..53766b59a 100644
--- a/common/lib/dataset.py
+++ b/common/lib/dataset.py
@@ -495,14 +495,18 @@ def iterate_items(
         :param max_unmappable:  Skip at most this many unmappable items; if
         more are encountered, stop iterating. `None` to never stop.
         :param map_missing: Indicates what to do with mapped items for which
-        some fields could not be mapped. Defaults to 'empty_str'. Must be one of:
+        some fields could not be mapped. Defaults to 'default'. Must be one of:
         - 'default': fill missing fields with the default passed by map_item
+        - 'keep': leave the MissingMappedField sentinel in place so the caller
+          can tell which fields were missing (useful for JSON serialisation
+          via MissingMappedFieldEncoder)
         - 'abort': raise a MappedItemIncompleteException if a field is missing
         - a callback: replace missing field with the return value of the
           callback. The MappedItem object is passed to the callback as the
           first argument and the name of the missing field as the second.
         - a dictionary with a key for each possible missing field: replace missing
-          field with a strategy for that field ('default', 'abort', or a callback)
+          field with a strategy for that field ('default', 'keep', 'abort', or
+          a callback)
         :param get_annotations: Whether to also fetch annotations from the database.
           This can be disabled to help speed up iteration.
         :param offset: After how many rows we should yield items.
@@ -587,6 +591,10 @@ def iterate_items(
                             mapped_item.data[missing_field] = strategy(
                                 mapped_item.data, missing_field
                             )
+                        elif strategy == "keep":
+                            # leave the MissingMappedField in place so the
+                            # caller can distinguish missing from present
+                            continue
                         elif strategy == "abort":
                             # raise an exception to be handled at the processor level
                             raise MappedItemIncompleteException(
@@ -599,7 +607,7 @@ def iterate_items(
                             ].value
                         else:
                             raise ValueError(
-                                "map_missing must be 'abort', 'default', or a callback."
+                                "map_missing must be 'abort', 'default', 'keep', or a callback."
                             )
             else:
                 mapped_item = original_item
diff --git a/common/lib/llm.py b/common/lib/llm.py
index 0901194d1..e696d2918 100644
--- a/common/lib/llm.py
+++ b/common/lib/llm.py
@@ -121,6 +121,22 @@ def _load_llm(self) -> BaseChatModel:
             )
             self.model = llm.model_name
             return llm
+        elif self.provider  == "litellm":
+            url = f"{self.base_url}/" if not self.base_url.endswith("/") else self.base_url
+            url += "v1/" if not url.endswith("v1/") else ""
+
+            llm = ChatOpenAI(
+                model=self.model,
+                temperature=self.temperature,
+                api_key=SecretStr(self.api_key),
+                base_url=url,
+                max_tokens=self.max_tokens,
+                default_headers={
+                        "x-litellm-api-key": f"Bearer {self.api_key}"
+                    }
+            )
+            self.model = llm.model_name
+            return llm
         else:
             raise ValueError(f"Unsupported LLM provider: {self.provider}")
 
@@ -287,7 +303,25 @@ def _format_media_block(
                     }}
                 return {"type": "image_url", "image_url": {"url": data_uri}}
 
-    def set_structure(self, json_schema):
+    def set_structure(self, json_schema, method=None, include_raw=False, strict=None):
+        """
+        Bind a JSON schema so the model returns schema-validated structured output.
+
+        :param json_schema: JSON schema dict (or JSON string) describing the output.
+        :param method: How structured output is enforced. None uses LangChain's
+            per-provider default (usually "function_calling", which binds a tool).
+            For reasoning models served over an OpenAI-compatible proxy, pass
+            "json_schema" — constrained decoding forces the answer channel itself
+            to match the schema, rather than relying on a clean tool call that the
+            model may emit in the wrong channel (yielding empty, unparseable output).
+        :param include_raw: When True, structured-output calls return a
+            {"raw", "parsed", "parsing_error"} dict instead of raising on a parse
+            failure, so callers can inspect the raw AIMessage (finish_reason,
+            reasoning channel, token usage) to diagnose what went wrong.
+        :param strict: Passed through to with_structured_output when not None.
+            Use strict=False for schemas that don't satisfy OpenAI strict-mode
+            requirements but are fine for a guided-decoding backend (e.g. vLLM).
+        """
         if not json_schema:
             raise ValueError("json_schema is None")
 
@@ -301,7 +335,12 @@ def set_structure(self, json_schema):
             json_schema = {"type": "json_schema", "json_schema": {"schema": json_schema}}
             self.llm = self.llm.bind(response_format=json_schema)
         else:
-            self.llm = self.llm.with_structured_output(json_schema)
+            kwargs = {"include_raw": include_raw}
+            if method:
+                kwargs["method"] = method
+            if strict is not None:
+                kwargs["strict"] = strict
+            self.llm = self.llm.with_structured_output(json_schema, **kwargs)
         self.structured_output = True
 
     @staticmethod
diff --git a/helper-scripts/map_item_ci.py b/helper-scripts/map_item_ci.py
new file mode 100644
index 000000000..78b742a50
--- /dev/null
+++ b/helper-scripts/map_item_ci.py
@@ -0,0 +1,483 @@
+"""
+CI glue for the Zeeschuimer `map_item` sync workflow: translation-matrix
+planning (`plan-matrix`) and PR-body construction (`build-pr-body`).
+
+This logic used to live as Python heredocs embedded in
+`.github/workflows/zeeschuimer_map_item_sync.yml`. It was moved here so it can
+be unit-tested and linted like the rest of the codebase.
+
+IMPORTANT: this module is intentionally pure-stdlib and MUST NOT import
+`map_item_converter` (or anything under `common/`). The `detect` job runs
+`plan-matrix` WITHOUT installing the LLM dependencies (langchain etc.), so any
+heavy import here would break it.
+
+Usage (from the 4CAT repo root):
+    python helper-scripts/map_item_ci.py plan-matrix
+    python helper-scripts/map_item_ci.py build-pr-body --manifest manifest.json --out pr_body.md
+    python helper-scripts/map_item_ci.py llm-requirements
+
+`llm-requirements` prints the langchain/pydantic/requests pip specs read from
+setup.py, so the workflow installs the same LLM stack 4CAT declares instead of
+a hand-maintained list that can drift.
+
+`plan-matrix` reads EVENT_NAME / INPUTS_FILES / INPUTS_BOOTSTRAP / BEFORE_SHA /
+AFTER_SHA from the environment and writes `mode` and `matrix` to $GITHUB_OUTPUT.
+`build-pr-body` reads MODULE / BOOTSTRAP / BEFORE_SHA / AFTER_SHA / RUN_ID /
+EVENT_NAME / REPO, writes the PR body to `--out`, and writes `title` to
+$GITHUB_OUTPUT.
+"""
+from __future__ import annotations
+
+import argparse
+import ast
+import json
+import os
+import re
+import subprocess
+import sys
+from typing import Callable, Optional
+
+
+# Strict shape for a datasource path: `datasources/<module>/search_<name>.py`.
+# Anchored and restricted to a safe charset (no shell metacharacters) so that a
+# path coming from `git diff` or, especially, a `workflow_dispatch` `files`
+# input can be interpolated into the sync job's `--files` shell argument
+# without any risk of command injection. Anything not matching is dropped and
+# logged (never silently passed through).
+DATASOURCE_PATH_RE = re.compile(r"^datasources/[A-Za-z0-9_-]+/search_[A-Za-z0-9_]+\.py$")
+
+# Pathspec used to limit the push-event diff to datasource search files.
+_DATASOURCE_PATHSPEC = "datasources/*/search_*.py"
+
+
+def _dist_name(spec: str) -> str:
+    """Return the bare distribution name of a requirement spec, i.e. the text
+    before any version specifier, extras bracket, or environment marker:
+    `requests~=2.27` -> `requests`, `Flask_Limiter[memcached]` -> `Flask_Limiter`."""
+    return re.compile(r"[<>=!~;\[ ]").split(spec, maxsplit=1)[0].strip()
+
+
+def extract_llm_requirements(setup_py_source: str) -> list[str]:
+    """
+    Pull the LLM dependency specs (langchain*, pydantic, requests) straight out
+    of setup.py's package sets. The sync job installs only this subset (not all
+    of 4CAT) to stay light, but deriving it from setup.py means the list can't
+    silently drift from what the app actually declares — and it picks up new
+    langchain providers automatically.
+
+    Returns sorted, de-duplicated requirement strings with whatever version
+    specifiers setup.py uses.
+    """
+    specs = set()
+    for node in ast.walk(ast.parse(setup_py_source)):
+        if not isinstance(node, ast.Assign):
+            continue
+        if not any(
+            isinstance(t, ast.Name) and t.id in ("core_packages", "processor_packages")
+            for t in node.targets
+        ):
+            continue
+        for elt in getattr(node.value, "elts", []):
+            if isinstance(elt, ast.Constant) and isinstance(elt.value, str):
+                specs.add(elt.value)
+
+    return sorted(
+        spec
+        for spec in specs
+        if _dist_name(spec).startswith("langchain") or _dist_name(spec) in ("pydantic", "requests")
+    )
+
+
+def _git_diff_names(before: str, after: str) -> list[str]:
+    """
+    Names of datasource search files added or modified between two commits
+    (deleted files are excluded: there is nothing left to translate). Returns
+    an empty list (rather than raising) if git can't resolve the range — e.g.
+    a shallow clone without `before`; the caller treats that as "no changes".
+    """
+    try:
+        out = subprocess.check_output(
+            ["git", "diff", "--name-only", "--diff-filter=d", before, after, "--", _DATASOURCE_PATHSPEC],
+            text=True,
+        )
+    except (subprocess.CalledProcessError, OSError):
+        # CalledProcessError: range can't resolve (shallow clone). OSError /
+        # FileNotFoundError: git not on PATH. Either way, "can't tell" == "nothing".
+        return []
+    return [line for line in out.splitlines() if line.strip()]
+
+
+def plan_matrix(
+    event_name: str,
+    inputs_files: str,
+    inputs_bootstrap: bool,
+    before: str,
+    after: str,
+    git_diff: Optional[Callable[[str, str], list[str]]] = None,
+) -> tuple[str, list[dict], list[str]]:
+    """
+    Decide what to translate. Returns `(mode, matrix, rejected)`:
+
+    - mode: "bootstrap" | "files" | "none"
+    - matrix: list of `{"module", "files", "bootstrap"}` entries for the
+      `sync` job's matrix; a repeated candidate path is only listed once.
+    - rejected: candidate paths dropped because they don't match
+      `DATASOURCE_PATH_RE` (logged by the caller for transparency).
+
+    `git_diff` is injectable for testing; it defaults to a real `git diff`.
+    """
+    inputs_files = (inputs_files or "").strip()
+
+    # Bootstrap is special: a single PR covering every datasource. An explicit
+    # `files` input overrides bootstrap; honor that.
+    if event_name == "workflow_dispatch" and inputs_bootstrap and not inputs_files:
+        return "bootstrap", [{"module": "bootstrap", "files": "", "bootstrap": True}], []
+
+    # Resolve the candidate file list.
+    if event_name == "workflow_dispatch" and inputs_files:
+        candidates = inputs_files.split()
+    else:
+        if git_diff is None:
+            git_diff = _git_diff_names
+        candidates = git_diff(before, after)
+
+    # Validate before anything reaches a shell. Drop (and report) anything that
+    # isn't a plain `datasources/<module>/search_<name>.py` path; skip repeats.
+    files: list[str] = []
+    rejected: list[str] = []
+    for path in candidates:
+        path = path.strip()
+        if not path or path in files:
+            continue
+        if DATASOURCE_PATH_RE.match(path):
+            files.append(path)
+        else:
+            rejected.append(path)
+
+    # Group by module: datasources/<module>/search_*.py
+    modules: dict[str, list[str]] = {}
+    for path in files:
+        parts = path.split("/")
+        modules.setdefault(parts[1], []).append(path)
+
+    if not modules:
+        return "none", [], rejected
+
+    matrix = [
+        {"module": mod, "files": " ".join(sorted(paths)), "bootstrap": False}
+        for mod, paths in sorted(modules.items())
+    ]
+    return "files", matrix, rejected
+
+
+def _git_python_diff(before: str, after: str, python_file: str) -> str:
+    """Run `git diff before..after -- <python_file>` and return its text, or ""
+    when the range can't be resolved or git isn't available."""
+    revision_range = "{}..{}".format(before, after)
+    try:
+        return subprocess.check_output(
+            ["git", "diff", revision_range, "--", python_file], text=True
+        )
+    except (subprocess.CalledProcessError, OSError):
+        return ""
+
+
+def _code_fence(content: str, lang: str = "") -> tuple[str, str]:
+    """
+    Build `(open, close)` markdown code-fence markers that `content` cannot
+    terminate early. The fenced text (a diff, or LLM output) may itself hold a
+    ``` run or an HTML tag such as `</details>`, which GitHub only renders
+    literally while the fence is intact — so the fence is one backtick longer
+    than the longest backtick run in `content`, and never shorter than three.
+    """
+    runs = [len(m.group()) for m in re.finditer(r"`+", content)]
+    fence = "`" * max(3, max(runs, default=0) + 1)
+    return fence + lang, fence
+
+
+def build_pr_body(
+    manifest: dict,
+    module: str,
+    is_bootstrap: bool,
+    before: str,
+    after: str,
+    run_id: str,
+    event_name: str,
+    repo: str,
+    python_diff: Optional[Callable[[str, str, str], str]] = None,
+) -> tuple[str, str]:
+    """
+    Build the draft-PR `(title, body)` from a translation manifest. `python_diff`
+    is injectable for testing; it defaults to a real `git diff` and is only
+    invoked for `push` events.
+    """
+    if python_diff is None:
+        python_diff = _git_python_diff
+
+    model = manifest.get("model", "(unknown)")
+    provider = manifest.get("provider", "ollama")
+    total_duration = manifest.get("total_duration_seconds")
+    entries = manifest.get("entries", [])
+
+    short_sha = after[:7]
+    lines: list[str] = []
+    lines.append(
+        "> :robot: This PR was auto-generated by the [4CAT map_item sync "
+        "workflow](https://github.com/{}/actions/runs/{}). The JavaScript was "
+        "produced by an LLM and **requires human review** before merging — "
+        "including manual fixes for any lint warnings flagged below.".format(repo, run_id)
+    )
+    lines.append("")
+    lines.append("## Generation parameters")
+    lines.append("- **Model:** `{}` (provider: `{}`)".format(model, provider))
+    if total_duration is not None:
+        lines.append("- **Total LLM time:** {}s".format(total_duration))
+    if is_bootstrap:
+        lines.append(
+            "- **Trigger:** manual `workflow_dispatch` with `bootstrap=true` "
+            "(initial sync of all Zeeschuimer datasources)."
+        )
+    elif event_name == "workflow_dispatch":
+        lines.append("- **Trigger:** manual `workflow_dispatch` for `{}`.".format(module))
+    else:
+        lines.append(
+            "- **Trigger:** push of [`{}`](https://github.com/{}/commit/{}) to 4CAT "
+            "master (module: `{}`).".format(short_sha, repo, after, module)
+        )
+    lines.append("")
+
+    ok = [e for e in entries if e["status"] == "ok"]
+    ok_with_warnings = [e for e in ok if e.get("lint_warnings")]
+    failed = [e for e in entries if e["status"] == "failed"]
+    skipped = [e for e in entries if e["status"] == "skipped"]
+
+    lines.append("## Summary")
+    lines.append("- :white_check_mark: {} translated".format(len(ok)))
+    if ok_with_warnings:
+        lines.append(
+            "- :warning: {} translated with lint warnings (require manual fix)".format(
+                len(ok_with_warnings)
+            )
+        )
+    lines.append("- :x: {} failed".format(len(failed)))
+    lines.append("- :grey_question: {} skipped".format(len(skipped)))
+    lines.append("")
+
+    if ok:
+        lines.append("| Datasource | Module | Time | Warnings |")
+        lines.append("|---|---|---:|---:|")
+        for entry in ok:
+            dur = entry.get("duration_seconds")
+            dur_cell = "{}s".format(dur) if dur is not None else "—"
+            warn_count = len(entry.get("lint_warnings") or [])
+            warn_cell = ":warning: {}".format(warn_count) if warn_count else "—"
+            lines.append(
+                "| `{}` | `{}` | {} | {} |".format(
+                    entry["python_file"], entry["js_file"], dur_cell, warn_cell
+                )
+            )
+        lines.append("")
+
+    if ok_with_warnings:
+        lines.append("## :warning: Lint warnings — fix before merging")
+        lines.append("")
+        lines.append(
+            "The following datasources translated successfully but the static lint "
+            "flagged issues that need human fixes. The auto-generated code was "
+            "spliced into the JS module as-is; please patch the file directly in "
+            "this PR."
+        )
+        lines.append("")
+        for entry in ok_with_warnings:
+            lines.append("**`{}` -> `{}`**".format(entry["python_file"], entry["js_file"]))
+            for w in entry["lint_warnings"]:
+                lines.append("- {}".format(w))
+            lines.append("")
+
+    for entry in ok:
+        dur = entry.get("duration_seconds")
+        header_dur = " ({}s)".format(dur) if dur is not None else ""
+        warn_marker = " :warning:" if entry.get("lint_warnings") else ""
+        lines.append(
+            "## `{}` -> `{}`{}{}".format(
+                entry["python_file"], entry["js_file"], header_dur, warn_marker
+            )
+        )
+        if entry.get("commentary"):
+            lines.append("**LLM commentary:**")
+            lines.append("")
+            lines.append("> " + entry["commentary"].replace("\n", "\n> "))
+            lines.append("")
+        if event_name == "push":
+            diff = python_diff(before, after, entry["python_file"])
+        else:
+            diff = ""
+        if diff.strip():
+            fence_open, fence_close = _code_fence(diff, "diff")
+            lines.append("<details><summary>Python diff</summary>")
+            lines.append("")
+            lines.append(fence_open)
+            lines.append(diff.rstrip())
+            lines.append(fence_close)
+            lines.append("</details>")
+            lines.append("")
+
+    if failed:
+        lines.append("## Failures")
+        for entry in failed:
+            dur = entry.get("duration_seconds")
+            dur_str = " (after {}s)".format(dur) if dur is not None else ""
+            lines.append(
+                "- `{}`{}: {}".format(
+                    entry["python_file"], dur_str, entry.get("error", "(no error message)")
+                )
+            )
+        lines.append("")
+
+    if skipped:
+        lines.append("## Skipped")
+        for entry in skipped:
+            lines.append("- `{}`: {}".format(entry["python_file"], entry.get("error", "")))
+        lines.append("")
+
+    body = "\n".join(lines)
+
+    # Title is single-module in the matrix path; bootstrap is its own
+    # special-case (one PR covering every datasource).
+    ok_modules: list[str] = []
+    for entry in ok:
+        parts = entry["python_file"].split("/")
+        if len(parts) >= 2 and parts[0] == "datasources":
+            mod = parts[1]
+            if mod not in ok_modules:
+                ok_modules.append(mod)
+
+    if is_bootstrap:
+        title = "Auto-translated map_item updates from 4CAT (bootstrap, {} datasources)".format(
+            len(ok_modules)
+        )
+    elif not ok_modules:
+        title = "Auto-translated map_item updates from 4CAT: {}".format(module)
+    else:
+        title = "Auto-translated map_item updates from 4CAT: {}".format(", ".join(ok_modules))
+
+    return title, body
+
+
+def set_output(name: str, value: str) -> None:
+    """
+    Append a step output to $GITHUB_OUTPUT using the heredoc delimiter form
+    (`name<<DELIM`, the value, then `DELIM` on its own line), which is safe
+    for values containing `\\n` or `=` (a plain `name=value` line can be
+    abused to inject extra outputs). No-op when $GITHUB_OUTPUT is unset
+    (e.g. running locally).
+
+    `name` is the output name and is also embedded in the delimiter; `value`
+    is written verbatim except that every occurrence of the delimiter is
+    removed from it.
+    """
+    out_path = os.environ.get("GITHUB_OUTPUT")
+    if not out_path:
+        return
+    # A delimiter that should not appear in our values. If it somehow does,
+    # strip it rather than emit a malformed/forgeable block.
+    delim = "ghadelim_{}_b3f9c1".format(name)
+    # Strip to a fixpoint: a single `replace` pass can itself re-form the
+    # delimiter from the text on either side of a removed occurrence
+    # (e.g. "ghadelim_" + <delim> + "<name>_b3f9c1"). Each pass shortens the
+    # string, so the loop always terminates.
+    safe_value = value
+    while delim in safe_value:
+        safe_value = safe_value.replace(delim, "")
+    with open(out_path, "a", encoding="utf-8") as f:
+        f.write("{name}<<{delim}\n{value}\n{delim}\n".format(name=name, delim=delim, value=safe_value))
+
+
+def _cmd_plan_matrix() -> int:
+    """
+    `plan-matrix` subcommand: read the workflow inputs from the environment,
+    ask `plan_matrix` for the plan, publish `mode` and the JSON-encoded
+    `matrix` as step outputs, and print a human-readable summary (including
+    any rejected paths) to stdout. Always returns 0.
+    """
+    env = os.environ.get
+    bootstrap_requested = env("INPUTS_BOOTSTRAP", "").lower() == "true"
+    mode, matrix, rejected = plan_matrix(
+        event_name=env("EVENT_NAME", ""),
+        inputs_files=env("INPUTS_FILES", ""),
+        inputs_bootstrap=bootstrap_requested,
+        before=env("BEFORE_SHA", ""),
+        after=env("AFTER_SHA", ""),
+    )
+
+    # Report dropped paths first so they show up above the plan summary.
+    if rejected:
+        dropped_header = "Plan: dropped {} path(s) not matching `datasources/<module>/search_<name>.py`:"
+        print(dropped_header.format(len(rejected)))
+        for dropped in rejected:
+            print("  - {!r}".format(dropped))
+
+    set_output("mode", mode)
+    set_output("matrix", json.dumps(matrix))
+
+    if mode == "bootstrap":
+        print("Plan: bootstrap (single PR)")
+        return 0
+    if mode == "none":
+        print("Plan: nothing to translate")
+        return 0
+
+    # Any other mode: one summary line per matrix entry.
+    print("Plan: {} module(s)".format(len(matrix)))
+    for job in matrix:
+        print("  - {}: {}".format(job["module"], job["files"]))
+    return 0
+
+
+def _cmd_build_pr_body(manifest_path: str, out_path: str) -> int:
+    """
+    `build-pr-body` subcommand: load the translation manifest JSON from
+    `manifest_path`, build the PR title and body with `build_pr_body`, write
+    the body to `out_path`, and publish the title as the `title` step output.
+
+    Context comes from the environment. `MODULE` is required (a missing value
+    raises KeyError); `BOOTSTRAP`, `BEFORE_SHA`, `AFTER_SHA`, `RUN_ID`,
+    `EVENT_NAME` and `REPO` default to the empty string. Returns 0.
+    """
+    with open(manifest_path, encoding="utf-8") as manifest_file:
+        manifest = json.load(manifest_file)
+
+    # Required variable is looked up before the optional ones.
+    module_name = os.environ["MODULE"]
+    bootstrap_flag = os.environ.get("BOOTSTRAP", "").lower() == "true"
+    optional_vars = {
+        "before": "BEFORE_SHA",
+        "after": "AFTER_SHA",
+        "run_id": "RUN_ID",
+        "event_name": "EVENT_NAME",
+        "repo": "REPO",
+    }
+    context = {kwarg: os.environ.get(var, "") for kwarg, var in optional_vars.items()}
+
+    pr_title, pr_body = build_pr_body(
+        manifest,
+        module=module_name,
+        is_bootstrap=bootstrap_flag,
+        **context,
+    )
+
+    with open(out_path, "w", encoding="utf-8") as body_file:
+        body_file.write(pr_body)
+    print("Wrote {} ({} chars)".format(out_path, len(pr_body)))
+
+    set_output("title", pr_title)
+    print("PR title: {}".format(pr_title))
+    return 0
+
+
+def _cmd_llm_requirements() -> int:
+    """
+    `llm-requirements` subcommand: read `setup.py` from the directory above
+    this script, extract the LLM requirement specifiers with
+    `extract_llm_requirements`, and print them one per line.
+
+    Returns 0 on success, or 1 (with a message on stderr) when no specifiers
+    were found.
+    """
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    with open(os.path.join(script_dir, "..", "setup.py"), encoding="utf-8") as handle:
+        setup_source = handle.read()
+
+    specs = extract_llm_requirements(setup_source)
+    if specs:
+        # One spec per line (valid requirements.txt): the workflow redirects
+        # this to a file and runs `pip install -r`, so a specifier containing
+        # a shell metacharacter (`>=`, `<`) is never word-split or treated as
+        # a redirection.
+        print("\n".join(specs))
+        return 0
+
+    print("error: no LLM requirements found in setup.py core_packages", file=sys.stderr)
+    return 1
+
+
+def main(argv: Optional[list[str]] = None) -> int:
+    """
+    CLI entry point. Parses `argv` (argparse falls back to the process
+    arguments when it is None), dispatches to the handler for the chosen
+    subcommand, and returns that handler's exit status.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    subcommands = parser.add_subparsers(dest="command", required=True)
+
+    subcommands.add_parser("plan-matrix", help="Emit the translation matrix to $GITHUB_OUTPUT.")
+
+    pr_body_parser = subcommands.add_parser(
+        "build-pr-body", help="Build the draft-PR body from a manifest."
+    )
+    pr_body_parser.add_argument(
+        "--manifest", required=True, help="Path to the translation manifest JSON."
+    )
+    pr_body_parser.add_argument(
+        "--out", required=True, help="Where to write the PR body markdown."
+    )
+
+    subcommands.add_parser(
+        "llm-requirements",
+        help="Print the LLM pip requirements (langchain*/pydantic/requests) from setup.py.",
+    )
+
+    args = parser.parse_args(argv)
+
+    # Subcommand name -> zero-argument callable returning the exit status.
+    handlers = {
+        "plan-matrix": lambda: _cmd_plan_matrix(),
+        "build-pr-body": lambda: _cmd_build_pr_body(args.manifest, args.out),
+        "llm-requirements": lambda: _cmd_llm_requirements(),
+    }
+    handler = handlers.get(args.command)
+    if handler is None:
+        parser.error("unknown command")
+        return 2
+    return handler()
+
+
+# Run the CLI only when executed as a script; main()'s return value becomes
+# the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/helper-scripts/map_item_converter.py b/helper-scripts/map_item_converter.py
new file mode 100644
index 000000000..8720920fb
--- /dev/null
+++ b/helper-scripts/map_item_converter.py
@@ -0,0 +1,1162 @@
+"""
+Translate 4CAT Zeeschuimer-import datasource `map_item` functions from Python
+to JavaScript and splice them into the corresponding Zeeschuimer
+`modules/<platform>.js` file.
+
+Designed to be invoked by a GitHub Action whenever a Zeeschuimer datasource's
+Python file changes on master. Can also be run locally for testing or via
+`workflow_dispatch` with `--bootstrap` to translate every datasource at once.
+
+The LLM produces only the new `map_item` function (plus any imports/helpers it
+needs and free-text commentary). This script does the file integration: it
+locates a marker block in the existing JS module and replaces its contents,
+preserving every hand-written line outside the markers.
+
+Usage:
+    PROVIDER_API_KEY=... python helper-scripts/map_item_converter.py \\
+        --files datasources/tiktok/search_tiktok.py \\
+        --zeeschuimer-checkout ../zeeschuimer \\
+        --output-manifest /tmp/manifest.json
+"""
+from __future__ import annotations
+
+import argparse
+import ast
+import json
+import os
+import re
+import sys
+import time
+import traceback
+from pathlib import Path
+from typing import Optional, TYPE_CHECKING
+
+sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), ".."))
+
+# NOTE: `common.lib.llm.LLMAdapter` (and the langchain stack it pulls in) is
+# imported lazily inside main(), not at module load. That keeps the pure
+# helpers here — path derivation, splicing, linting — importable for unit
+# tests without the heavy LLM dependencies installed. `from __future__ import
+# annotations` makes the `LLMAdapter` type hints below strings, so they don't
+# need the import at definition time either.
+if TYPE_CHECKING:
+    # Resolved only by type checkers / linters (never at runtime), so the
+    # `llm: LLMAdapter` annotations below have a defined name without forcing
+    # the langchain import.
+    from common.lib.llm import LLMAdapter
+
+# Sibling module — lives next to this script in helper-scripts/. Python adds the
+# script's directory to sys.path automatically when the file is run directly.
+from map_item_rules import RULES, get_regex_lint_rules
+
+
+# 4CAT datasource path -> Zeeschuimer module path is derived by convention:
+# the Python file is `datasources/<dir>/search_<name>.py`; the JS module is
+# `modules/<name-with-hyphens>.js`. The convention only depends on the Python
+# *filename*, not the directory, so cases where they differ still work
+# (e.g. `xiaohongshu/search_rednote.py` -> `modules/rednote.js`,
+#  `twitter-import/search_twitter.py` -> `modules/twitter.js`).
+#
+# Datasources without a matching Zeeschuimer module (today: facebook) are
+# skipped automatically — the JS file existence check in `translate_one`
+# handles them without any explicit allow-list. New Zeeschuimer datasources
+# added to 4CAT are picked up automatically as long as Zeeschuimer ships the
+# matching `modules/<name>.js` file.
+def python_to_js_module(python_rel: str) -> Optional[str]:
+    """
+    Map a 4CAT datasource path to its Zeeschuimer module path.
+
+    `python_rel` must have exactly three `/`-separated segments of the form
+    `datasources/<dir>/search_<name>.py` with a non-empty `<name>`. The result
+    is `modules/<name>.js` with underscores in `<name>` turned into hyphens;
+    `<dir>` is ignored. Any other shape yields None.
+    """
+    segments = python_rel.split("/")
+    if len(segments) != 3:
+        return None
+    top_dir, _datasource_dir, filename = segments
+    if top_dir != "datasources":
+        return None
+
+    prefix, suffix = "search_", ".py"
+    if not (filename.startswith(prefix) and filename.endswith(suffix)):
+        return None
+
+    # Whatever sits between `search_` and `.py` names the JS module.
+    stem = filename[len(prefix):-len(suffix)]
+    if not stem:
+        return None
+    return "modules/{}.js".format(stem.replace("_", "-"))
+
+# LLM backend defaults. NOTE(review): presumably the fallbacks used when no
+# provider / base URL / model is supplied — the code that reads them is
+# outside this section, so confirm against the argument parsing.
+DEFAULT_LLM_PROVIDER = "ollama"
+DEFAULT_BASE_URL = "https://ollama.digitalmethods.net"
+DEFAULT_MODEL = "qwen2.5-coder:14b"
+
+# Marker comment lines delimiting the two auto-generated regions of a JS
+# module. `splice_into_module` locates these exact strings and replaces
+# everything between each start/end pair, so changing them orphans blocks
+# already written with the old text.
+IMPORTS_MARKER_START = "// === auto-generated imports for map_item — BLOCK REPLACED AUTOMATICALLY ==="
+IMPORTS_MARKER_END = "// === end auto-generated imports ==="
+BLOCK_MARKER_START = "// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY ==="
+BLOCK_MARKER_END = "// === end auto-generated ==="
+
+# JSON-schema-style description of the structured response expected from the
+# LLM. All four properties are required: the translated function, extra ES
+# imports, helper function sources, and free-text reviewer commentary. The
+# `description` strings are instructions aimed at the model, so they are
+# effectively part of the prompt.
+LLM_SCHEMA = {
+    "title": "MapItemTranslation",
+    "type": "object",
+    "required": ["map_item_function", "imports_to_add", "helpers_to_add", "commentary"],
+    "properties": {
+        "map_item_function": {
+            "type": "string",
+            "description": (
+                "Full JavaScript source of the new map_item function. Must include "
+                "the function declaration (e.g. 'export function map_item(item) {...}'). "
+                "Do not include surrounding code from the module."
+            ),
+        },
+        "imports_to_add": {
+            "type": "array",
+            "items": {"type": "string"},
+            "description": (
+                "Complete ES-module import statements that map_item needs. "
+                "Normally empty: Zeeschuimer's `js/lib.js` (which provides MappedItem, "
+                "MissingMappedField, normalize_url_encoding, strip_tags) is loaded as a "
+                "plain script, NOT an ES module — its declarations are already global, "
+                "so do not write `import { X } from '../js/lib.js'`. Only populate this "
+                "if you genuinely need to import from another ES module."
+            ),
+        },
+        "helpers_to_add": {
+            "type": "array",
+            "items": {"type": "string"},
+            "description": (
+                "Full source of any helper functions map_item depends on (e.g. a JS "
+                "port of normalize_url_encoding). Empty array if none needed."
+            ),
+        },
+        "commentary": {
+            "type": "string",
+            "description": (
+                "Notes for the human reviewer: assumptions made, fields you were "
+                "unsure about, Python idioms that don't translate cleanly. Plain text."
+            ),
+        },
+    },
+}
+
+# System-level instruction text for the translation model: scope of the
+# answer (function, imports, helpers, commentary only), field-name fidelity,
+# and a ban on markdown code fences. `strip_code_fences` exists because
+# models wrap their output in fences despite being told not to.
+SYSTEM_PROMPT = (
+    "You translate 4CAT Python `map_item` functions into JavaScript for the "
+    "Zeeschuimer browser extension. You return ONLY the new map_item function, "
+    "any imports it needs, any helper functions it depends on, and commentary "
+    "for the human reviewer. You NEVER return the surrounding module file. "
+    "You preserve the field names produced by the Python function exactly. "
+    "You do not invent fields not present in the Python output. "
+    "You output raw JavaScript source — never wrap it in markdown code fences "
+    "(```js, ```javascript, etc.). The fields in your structured response are "
+    "already typed as code; fences make them invalid."
+)
+
+
+# Whitelist of helpers that Zeeschuimer makes available as globals at runtime.
+# `js/lib.js` is loaded as a plain <script>, not as an ES module, so its top-
+# level declarations are global — they must NOT be imported. Anything not on
+# this list must be inlined or added to `helpers_to_add`. Update this list
+# whenever new helpers are added to `js/lib.js`.
+# Each entry has four string keys — `name`, `kind`, `usage`, `note` — all of
+# which `_format_available_helpers` reads to render one prompt bullet, so a
+# new entry must supply every key.
+AVAILABLE_JS_HELPERS = [
+    {
+        "name": "MappedItem",
+        "kind": "class",
+        "usage": "new MappedItem({field: value, ...})",
+        "note": "Wraps the return value of map_item. Always instantiate with `new`.",
+    },
+    {
+        "name": "MissingMappedField",
+        "kind": "class",
+        "usage": "new MissingMappedField(value, label)",
+        "note": "Represents a field that may legitimately be missing. Always instantiate with `new`.",
+    },
+    {
+        "name": "normalize_url_encoding",
+        "kind": "function",
+        "usage": "normalize_url_encoding(url)",
+        "note": "Direct port of the Python helper of the same name.",
+    },
+    {
+        "name": "strip_tags",
+        "kind": "function",
+        "usage": "strip_tags(html, convertNewlines = true)",
+        "note": "Direct port of the Python helper of the same name.",
+    },
+    {
+        "name": "formatUtcTimestamp",
+        "kind": "function",
+        "usage": "formatUtcTimestamp(timestamp)",
+        "note": "Formats a UTC timestamp as a readable string.",
+    },
+    {
+        "name": "MapItemException",
+        "kind": "class",
+        "usage": "throw new MapItemException(message)",
+        "note": (
+            "Mirror of 4CAT's `MapItemException`. Throw from `map_item` to "
+            "signal a known mapping failure (e.g. unrecognized item shape); "
+            "callers catch it, skip the item, and warn that the platform's "
+            "format may have shifted. Always instantiate with `new`."
+        ),
+    },
+]
+
+
+def _format_available_helpers() -> str:
+    """
+    Render `AVAILABLE_JS_HELPERS` as a markdown bullet list for the prompt:
+    one line per helper giving its name, kind, note and usage snippet.
+    """
+    bullet = "- `{name}` ({kind}, global) — {note} Usage: `{usage}`."
+    return "\n".join(bullet.format(**helper) for helper in AVAILABLE_JS_HELPERS)
+
+
+def _format_past_errors(rules) -> str:
+    """
+    Build the bulleted "things to get right" section of the prompt.
+
+    Every rule contributes a bullet made of its `id` and `prompt_rule`. A
+    non-empty `bad` / `good` attribute adds a `Wrong:` / `Right:` example
+    underneath: single-line snippets are shown inline in backticks, while
+    multi-line snippets go on their own further-indented lines.
+    """
+    rendered = []
+    for rule in rules:
+        rendered.append("- **{}** — {}".format(rule.id, rule.prompt_rule))
+        examples = {"Wrong": rule.bad, "Right": rule.good}
+        for label, snippet in examples.items():
+            if not snippet:
+                continue
+            if "\n" not in snippet:
+                rendered.append("    {}: `{}`".format(label, snippet))
+                continue
+            rendered.append("    {}:".format(label))
+            for code_line in snippet.split("\n"):
+                rendered.append("        " + code_line)
+    return "\n".join(rendered)
+
+
+def _format_verification_checklist(rules) -> str:
+    """
+    Render the non-empty `verify` attributes of `rules` as a numbered list
+    (`1. ...`, `2. ...`), one item per line. Rules without a `verify` text
+    are skipped and do not consume a number.
+    """
+    numbered = []
+    for rule in rules:
+        if rule.verify:
+            numbered.append("{}. {}".format(len(numbered) + 1, rule.verify))
+    return "\n".join(numbered)
+
+
+def is_zeeschuimer_datasource(python_path: Path) -> bool:
+    """
+    Returns True if the given Python file contains a class whose body assigns
+    the literal `True` to `is_from_zeeschuimer`, either as a plain assignment
+    (`is_from_zeeschuimer = True`) or an annotated one
+    (`is_from_zeeschuimer: bool = True`). The class's base classes are not
+    inspected.
+
+    Returns False when the file cannot be read, is not valid UTF-8, or does
+    not parse as Python.
+    """
+    try:
+        tree = ast.parse(python_path.read_text(encoding="utf-8"))
+    except (SyntaxError, OSError, ValueError):
+        # ValueError covers UnicodeDecodeError from `read_text` on a file that
+        # is not valid UTF-8, which is neither a SyntaxError nor an OSError.
+        return False
+
+    for node in ast.walk(tree):
+        if not isinstance(node, ast.ClassDef):
+            continue
+        for stmt in node.body:
+            if isinstance(stmt, ast.Assign):
+                targets = stmt.targets
+            elif isinstance(stmt, ast.AnnAssign) and stmt.value is not None:
+                # Annotated form; a bare annotation without a value is skipped.
+                targets = [stmt.target]
+            else:
+                continue
+            value = stmt.value
+            if not (isinstance(value, ast.Constant) and value.value is True):
+                continue
+            if any(
+                isinstance(target, ast.Name) and target.id == "is_from_zeeschuimer"
+                for target in targets
+            ):
+                return True
+    return False
+
+
+def discover_bootstrap_files(repo_root: Path, zeeschuimer_root: Path) -> list[Path]:
+    """
+    Collect the datasource files a bootstrap run should translate.
+
+    Walks `datasources/*/search_*.py` under `repo_root` in sorted order and
+    keeps a file only when all three hold: it is a Zeeschuimer datasource
+    (`is_zeeschuimer_datasource`), its path maps to a JS module
+    (`python_to_js_module`), and that module exists under `zeeschuimer_root`.
+    Datasources with no Zeeschuimer counterpart (e.g. facebook) therefore
+    drop out without any explicit list.
+    """
+    datasource_dir = repo_root / "datasources"
+    matches = []
+    for candidate in sorted(datasource_dir.glob("*/search_*.py")):
+        if is_zeeschuimer_datasource(candidate):
+            module_rel = python_to_js_module(candidate.relative_to(repo_root).as_posix())
+            if module_rel and (zeeschuimer_root / module_rel).exists():
+                matches.append(candidate)
+    return matches
+
+
+def build_user_prompt(python_source: str, existing_module_source: str, python_rel: str) -> str:
+    """
+    Assemble the user prompt for one translation request.
+
+    `python_source` is the full text of the changed datasource file,
+    `existing_module_source` the current text of the matching Zeeschuimer JS
+    module, and `python_rel` the datasource's repo-relative path (shown in the
+    first heading). The prompt also embeds the global-helper list and the
+    rule-derived "past errors" and verification sections.
+    """
+    helpers_block = _format_available_helpers()
+    past_errors_block = _format_past_errors(RULES)
+    verification_block = _format_verification_checklist(RULES)
+    # Paths handled elsewhere in this script already start with `datasources/`
+    # (see `python_to_js_module`), so prefixing unconditionally produced
+    # `datasources/datasources/...` in the heading. Add the prefix only when
+    # the caller passed a path relative to the datasources directory.
+    if python_rel.startswith("datasources/"):
+        source_label = python_rel
+    else:
+        source_label = f"datasources/{python_rel}"
+    return (
+        f"# Source Python file ({source_label})\n"
+        "This is the file that just changed in 4CAT. The `map_item` function on the "
+        "class is the source of truth — your JavaScript translation must produce an "
+        "object with the same field names and equivalent values.\n\n"
+        f"```python\n{python_source}\n```\n\n"
+        "# Existing Zeeschuimer module\n"
+        "This module's `capture()` function returns the raw items that will be "
+        "passed to `map_item(item)` as `item`. Use it to understand the input shape "
+        "and to match the existing code style (ES modules, `export` keyword, etc.).\n\n"
+        f"```javascript\n{existing_module_source}\n```\n\n"
+        "# Available Zeeschuimer JS helpers (globals)\n"
+        "Zeeschuimer loads `js/lib.js` as a plain `<script>`, NOT as an ES "
+        "module. Its top-level declarations are global — available everywhere "
+        "without any `import`. Use them by name only. The following are the "
+        "helpers you may use; everything else must be implemented as JavaScript "
+        "(inline in `map_item_function` or as separate snippets in `helpers_to_add`).\n\n"
+        f"{helpers_block}\n\n"
+        "# Imports — almost always none\n"
+        "Do NOT write `import { MappedItem } from '../js/lib.js'` or any similar "
+        "statement for the helpers above — `lib.js` is a script, not a module, "
+        "and the import will fail at runtime. The `imports_to_add` field should "
+        "normally be EMPTY; only include an import if you genuinely need to pull "
+        "from another ES module (rare for `map_item`).\n\n"
+        "Also forbidden, because they don't exist in JavaScript:\n"
+        "- Anything from `common.lib.helpers` not listed above (e.g. `convert_to_int`, `timify`)\n"
+        "- Anything from `common.lib.exceptions`, `common.lib.user_input`, `backend.lib.*`\n"
+        "- Python stdlib modules (`datetime`, `urllib.parse`, `re`, `json`, `hashlib`, etc.) — use the JavaScript native equivalents instead.\n\n"
+        "# Things to get right — past errors from this generator\n"
+        "Each item below has been observed in previous LLM output. The script lints "
+        "for many of them and surfaces matches as warnings on the PR. Translate accordingly.\n\n"
+        f"{past_errors_block}\n\n"
+        "# Before submitting, verify your output\n"
+        f"{verification_block}\n\n"
+        "# Output format\n"
+        "Use `export function map_item(item) { ... }` to match this module's ES-module style. "
+        "Return raw JavaScript source — do NOT wrap fields in markdown code fences. "
+        "The `imports_to_add` field is normally an empty array (the helpers above are global, not imported). "
+        "The `helpers_to_add` field should contain full helper-function source (each entry one complete function)."
+    )
+
+
+# Markdown code fences keep appearing in the LLM output even when the prompt
+# explicitly forbids them, so they are stripped defensively.
+#
+# The opening fence may carry a language tag. The known JS/TS tags are removed
+# wherever they directly follow the fence (only as whole words, so `jsx` is
+# not cut down to `x`). Any other tag (`jsx`, `mjs`, `json`, ...) is removed
+# only when it is the sole content of the fence line, so code that starts
+# right after the backticks on the same line is never mistaken for a tag.
+_FENCE_OPEN = re.compile(
+    r"^```(?:(?:js|javascript|typescript|ts)\b|[\w+#.-]+(?=[ \t]*(?:\n|$)))?\s*\n?",
+    re.IGNORECASE,
+)
+_FENCE_CLOSE = re.compile(r"\n?```\s*$")
+
+
+def strip_code_fences(s: str) -> str:
+    """
+    Strip a leading and/or trailing markdown code fence (```js, ```jsx,
+    ```javascript, bare ``` etc.) and surrounding whitespace from `s`.
+
+    Idempotent. Falsy input (empty string, None) is returned unchanged, and
+    text without fences is returned with only outer whitespace trimmed.
+    Defensive post-processing because LLMs frequently wrap their answers in
+    fences even when explicitly told not to.
+    """
+    if not s:
+        return s
+    s = s.strip()
+    s = _FENCE_OPEN.sub("", s)
+    s = _FENCE_CLOSE.sub("", s)
+    return s.strip()
+
+
+# Pre-processing helper for the lint checks: strips JS comments so commentary
+# text cannot trigger a pattern match.
+def _strip_js_comments(s: str) -> str:
+    """
+    Remove `//` line comments and `/* */` block comments so they don't trip the
+    pattern checks, while preserving the contents of string and template
+    literals. A naive `re.sub(r"//[^\n]*", "", s)` would eat the `//` in a URL
+    like `"https://example.com"` and mangle the string, producing bogus lint
+    warnings (e.g. spurious "literal newline in string").
+
+    A small scanner tracks string state. It deliberately does NOT try to
+    distinguish a regex literal from division — `/` is only acted on when it
+    begins `//` or `/*`, neither of which is a valid start to a regex literal
+    in practice (an empty regex is written `/(?:)/`).
+
+    Known limitations, all consequences of not tracking regex literals or
+    template substitutions:
+    - a `//` or `/*` occurring INSIDE a regex literal (e.g. the tail of
+      `/https?:\\/\\//`) is treated as the start of a comment;
+    - a quote character inside a regex literal (e.g. `/['"]/`) opens string
+      state until the next matching quote;
+    - a template literal is treated as one flat string, so nested quotes or
+      backticks inside `${...}` are not understood.
+
+    The newline ending a line comment is kept; a block comment is removed
+    entirely, with nothing inserted in its place. An unterminated string or
+    block comment simply runs to the end of the input.
+    """
+    out = []
+    i, n = 0, len(s)
+    quote = None  # active string delimiter: ' " or `
+    while i < n:
+        c = s[i]
+        if quote is not None:
+            # Inside a string/template literal: copy everything through.
+            out.append(c)
+            if c == "\\" and i + 1 < n:  # escape: copy the next char verbatim
+                out.append(s[i + 1])
+                i += 2
+                continue
+            if c == quote:
+                quote = None
+            i += 1
+            continue
+        if c in ("'", '"', "`"):
+            # Opening delimiter: enter string state and keep the character.
+            quote = c
+            out.append(c)
+            i += 1
+            continue
+        if c == "/" and i + 1 < n and s[i + 1] == "/":
+            # Line comment: skip up to, but not including, the newline.
+            i += 2
+            while i < n and s[i] != "\n":
+                i += 1
+            continue
+        if c == "/" and i + 1 < n and s[i + 1] == "*":
+            # Block comment: skip to the closing `*/`, or to the end of input
+            # if it is never closed.
+            i += 2
+            while i + 1 < n and not (s[i] == "*" and s[i + 1] == "/"):
+                i += 1
+            i += 2  # skip the closing */
+            continue
+        out.append(c)
+        i += 1
+    return "".join(out)
+
+
+# Regex checks for known anti-patterns. Sourced from the rule registry so
+# that prompt guidance and lint stay in sync. Bespoke checks below (class
+# instantiation, literal newlines, regex use) are not in the registry's
+# regex list — they're tied to records by `id`.
+LINT_PATTERNS = get_regex_lint_rules()
+
+
+# Tokenizer for JS string literals. Each alternative matches one complete
+# literal — double-quoted (`dq`), single-quoted (`sq`) or template (`tl`) —
+# and consumes backslash escapes (`\\.`) as a unit so an embedded `\"` or `\'`
+# does not end the literal early. DOTALL lets a match run across lines, which
+# is what makes a raw newline inside a quoted string detectable.
+_JS_STRING_LITERAL = re.compile(
+    r'''
+        (?P<dq>"(?:[^"\\]|\\.)*")
+      | (?P<sq>'(?:[^'\\]|\\.)*')
+      | (?P<tl>`(?:[^`\\]|\\.)*`)
+    ''',
+    re.DOTALL | re.VERBOSE,
+)
+
+
+def _has_literal_newline_in_string(source: str) -> bool:
+    """
+    Report whether any single- or double-quoted literal in `source` contains
+    a raw newline character (a JS syntax error). Template literals may
+    legally span lines and are ignored.
+    """
+    return any(
+        literal.group("tl") is None and "\n" in literal.group(0)
+        for literal in _JS_STRING_LITERAL.finditer(source)
+    )
+
+
+# Heuristic openers for regex use: a regex literal passed to a string/regex
+# method (`.match(/`, `.replace(/`, ...) or any `RegExp(` construction, with
+# or without `new`, whatever its argument. Literals are scoped to method-call
+# contexts because regex `/` is hard to disambiguate from division otherwise,
+# so false negatives there are accepted. `RegExp(` is matched regardless of
+# argument because a pattern built from a STRING (`new RegExp("\\d+")`) is
+# where dropped escapes are most likely, and it previously went unflagged.
+_REGEX_LITERAL_OPENER = re.compile(
+    r"\.(?:match|replace|replaceAll|split|search|matchAll|test|exec)\s*\(\s*/"
+    r"|\bRegExp\s*\("
+)
+
+
+def _uses_regex(source: str) -> bool:
+    """
+    Detect whether the source uses a regex in a recognized context
+    (`.match(/.../)`, `new RegExp(...)`, etc.). Current models translate
+    regex unreliably — this is a blanket "needs human review" flag, not a
+    bug detector. Reviewer must verify the regex behavior end-to-end.
+
+    Expects comment-stripped source; returns True on the first recognized
+    opener, otherwise False.
+    """
+    return bool(_REGEX_LITERAL_OPENER.search(source))
+
+
+def lint_translation(translation: dict) -> list:
+    """
+    Scan the generated JS for known bugs. Returns a list of warning strings
+    (empty if clean), each prefixed with the field it came from
+    (`[map_item_function]` or `[helpers_to_add[i]]`).
+
+    Runs on `map_item_function` and each non-empty string entry of
+    `helpers_to_add`; missing or null fields are treated as empty. Comments
+    are stripped before scanning to avoid false positives on commentary text.
+
+    Four families of checks run per source: the registry regexes in
+    `LINT_PATTERNS`, helper classes instantiated without `new`, raw newlines
+    inside quoted strings, and a blanket "regex present" flag.
+    """
+    issues = []
+    sources = []
+    fn = translation.get("map_item_function") or ""
+    if fn:
+        sources.append(("map_item_function", fn))
+    for i, h in enumerate(translation.get("helpers_to_add") or []):
+        if isinstance(h, str) and h:
+            sources.append((f"helpers_to_add[{i}]", h))
+
+    for label, source in sources:
+        clean = _strip_js_comments(source)
+
+        # Registry rules: report each distinct message once per source.
+        seen = set()
+        for regex, message in LINT_PATTERNS:
+            if regex.search(clean) and message not in seen:
+                issues.append(f"[{label}] {message}")
+                seen.add(message)
+
+        # Class instantiation without `new`. A lookbehind can't express
+        # "`new` plus any amount of whitespace", so the text before each
+        # match is checked manually. The WHOLE prefix is examined rather than
+        # a fixed-size window: a short window cut `new` off whenever it was
+        # separated from the class name by a line break and indentation,
+        # producing a false warning. Only the first offender per class is
+        # reported.
+        for cls in ("MappedItem", "MissingMappedField", "MapItemException"):
+            pattern = re.compile(rf"\b{cls}\s*\(")
+            for m in pattern.finditer(clean):
+                if re.search(r"\bnew\s+$", clean[: m.start()]):
+                    continue
+                issues.append(
+                    f"[{label}] `{cls}` instantiated without `new` keyword "
+                    f"(at offset {m.start()}). All class instantiations need `new`."
+                )
+                break
+
+        # Literal newlines inside single- or double-quoted strings (a JS
+        # syntax error). The LLM sometimes emits e.g. `.join('\n')` as
+        # `.join('<actual newline>')` which doesn't parse.
+        if _has_literal_newline_in_string(clean):
+            issues.append(
+                f"[{label}] Literal newline inside a string literal — JS strings "
+                f"can't span lines without escape (`\"\\n\"`) or template literals "
+                f"(`` `\\n` ``)."
+            )
+
+        # Regex translation is unreliable on the current model — flag any
+        # regex use for manual reviewer verification rather than trying to
+        # detect specific failure modes (literal newlines, dropped escapes,
+        # flag-syntax differences, character-class drift). The reviewer is
+        # the source of truth here until we can upgrade the model.
+        if _uses_regex(clean):
+            issues.append(
+                f"[{label}] Regex detected. The current LLM translates regex "
+                f"unreliably (escapes, character classes, flags) — please verify "
+                f"the regex behavior against the Python original by hand."
+            )
+
+    return issues
+
+
+def validate_translation(translation: dict) -> Optional[str]:
+    """
+    Basic sanity check on the LLM's structured answer.
+
+    Returns None when `map_item_function` is non-empty, mentions `map_item`
+    as a whole word, and declares it (`function map_item`, `map_item =` or
+    `map_item:`). Otherwise returns a message naming the first failed check.
+    """
+    # Coalesce with `or ""` instead of a `.get` default: structured output can
+    # carry an explicit null for a required field, and calling `.strip()` on
+    # None would raise rather than be reported.
+    source = (translation.get("map_item_function") or "").strip()
+    if source == "":
+        return "LLM returned empty map_item_function"
+
+    required = (
+        (r"\bmap_item\b", "LLM output does not contain `map_item` identifier"),
+        (
+            r"function\s+map_item|map_item\s*=|map_item\s*:",
+            "LLM output does not declare `map_item` as a function",
+        ),
+    )
+    for pattern, complaint in required:
+        if re.search(pattern, source) is None:
+            return complaint
+    return None
+
+
+# A pre-existing top-level `map_item` declaration. Used on the first sync (no
+# markers yet) to refuse appending a SECOND declaration, which would be a JS
+# redeclaration error. Conservative: matches only real declaration forms
+# (`function map_item`, `const/let/var map_item =`), not incidental mentions;
+# comments are stripped before this is applied.
+_PREEXISTING_MAP_ITEM_RE = re.compile(
+    # Function declaration: `function map_item`, optionally preceded by
+    # `export` and/or `async`.
+    r"(?:export\s+)?(?:async\s+)?function\s+map_item\b"
+    # Variable binding: `const|let|var map_item =`, optionally exported.
+    r"|(?:export\s+)?(?:const|let|var)\s+map_item\s*="
+)
+
+
+def splice_into_module(existing: str, translation: dict, python_rel: str) -> str:
+    """
+    Insert / replace the auto-generated marker blocks in the JS
+    module text.
+
+    Raises ValueError if exactly one of (start, end) markers is present —
+    that means the file is corrupted or partially hand-edited and we should
+    refuse to touch it. Also raises ValueError on the first sync (no markers
+    yet) if the module already defines its own `map_item`, since appending a
+    second declaration would not parse.
+    """
+    main_block_body = []
+    for helper in translation.get("helpers_to_add", []):
+        helper = helper.strip()
+        if helper:
+            main_block_body.append(helper)
+    fn = translation["map_item_function"].strip()
+    main_block_body.append(fn)
+    main_block = (
+        f"{BLOCK_MARKER_START}\n"
+        f"// (regenerated from {python_rel})\n"
+        + "\n\n".join(main_block_body)
+        + f"\n{BLOCK_MARKER_END}\n"
+    )
+
+    imports = [imp.strip() for imp in translation.get("imports_to_add", []) if imp.strip()]
+    # Drop imports that already appear verbatim outside the marker block.
+    existing_outside_block = re.sub(
+        re.escape(BLOCK_MARKER_START) + r".*?" + re.escape(BLOCK_MARKER_END) + r"\n?",
+        "",
+        existing,
+        flags=re.DOTALL,
+    )
+    existing_outside_imports_block = re.sub(
+        re.escape(IMPORTS_MARKER_START) + r".*?" + re.escape(IMPORTS_MARKER_END) + r"\n?",
+        "",
+        existing_outside_block,
+        flags=re.DOTALL,
+    )
+    imports = [imp for imp in imports if imp not in existing_outside_imports_block]
+
+    imports_block = ""
+    if imports:
+        imports_block = (
+            f"{IMPORTS_MARKER_START}\n"
+            + "\n".join(imports)
+            + f"\n{IMPORTS_MARKER_END}\n"
+        )
+
+    updated = existing
+
+    # Replace or insert imports block.
+    has_imports_start = IMPORTS_MARKER_START in updated
+    has_imports_end = IMPORTS_MARKER_END in updated
+    if has_imports_start ^ has_imports_end:
+        raise ValueError(
+            "Auto-generated imports markers are partially missing in the existing "
+            "module — refusing to overwrite. Restore both markers or remove both."
+        )
+    if has_imports_start and has_imports_end:
+        # Replacement is a callable, NOT a string: a string replacement runs
+        # through re's escape processing, so backslashes in the generated JS
+        # (regex literals like `/\w+/`, `"\n"` escapes, `\g<...>`) would be
+        # corrupted or raise `re.error`. A callable's return value is inserted
+        # verbatim.
+        updated = re.sub(
+            re.escape(IMPORTS_MARKER_START) + r".*?" + re.escape(IMPORTS_MARKER_END) + r"\n?",
+            lambda _match: imports_block,
+            updated,
+            count=1,
+            flags=re.DOTALL,
+        )
+    elif imports_block:
+        # Prepend at top of file.
+        if updated and not updated.startswith("\n"):
+            updated = imports_block + "\n" + updated
+        else:
+            updated = imports_block + updated
+
+    # Replace or append main block.
+    has_main_start = BLOCK_MARKER_START in updated
+    has_main_end = BLOCK_MARKER_END in updated
+    if has_main_start ^ has_main_end:
+        raise ValueError(
+            "Auto-generated map_item markers are partially missing in the existing "
+            "module — refusing to overwrite. Restore both markers or remove both."
+        )
+    if has_main_start and has_main_end:
+        # Callable replacement (see note on the imports block above): the
+        # generated JS routinely contains `\w`/`\d` regex escapes and `"\n"`
+        # string escapes, which a string replacement would either mangle into
+        # a raw newline or reject with `re.error: bad escape`.
+        updated = re.sub(
+            re.escape(BLOCK_MARKER_START) + r".*?" + re.escape(BLOCK_MARKER_END) + r"\n?",
+            lambda _match: main_block,
+            updated,
+            count=1,
+            flags=re.DOTALL,
+        )
+    else:
+        # First sync for this module (no markers yet): appending is only safe if
+        # the module doesn't already declare its own `map_item`. Two declarations
+        # would be a JS redeclaration error. Strip comments first so a commented-
+        # out or merely-referenced `map_item` doesn't trip the check.
+        if _PREEXISTING_MAP_ITEM_RE.search(_strip_js_comments(updated)):
+            raise ValueError(
+                "Module already defines `map_item` outside the auto-generated "
+                f"markers. Wrap that function in the `{BLOCK_MARKER_START}` / "
+                f"`{BLOCK_MARKER_END}` markers (or remove it) so the sync can "
+                "replace it in place — refusing to append a second declaration."
+            )
+        if not updated.endswith("\n"):
+            updated += "\n"
+        updated += "\n" + main_block
+
+    return updated
+
+
+def extract_raw_from_exception(exc: BaseException) -> Optional[str]:
+    """
+    Recover raw LLM output that was attached to an exception, if any.
+
+    The attributes `llm_output`, `observation` and `output` are probed in that
+    order (the name used varies by LangChain version) and the first one that
+    holds a string wins — an empty string included.
+
+    :param exc: Exception raised by the LLM call.
+    :return: The attached string, or None when no probed attribute is a string.
+    """
+    # Generator, so attributes are only looked up until the first hit.
+    attached = (
+        getattr(exc, name, None)
+        for name in ("llm_output", "observation", "output")
+    )
+    return next((value for value in attached if isinstance(value, str)), None)
+
+
+def describe_raw(raw) -> Optional[dict]:
+    """
+    Condense the raw message of an `include_raw=True` structured-output call
+    into a small dict, so a failed or odd translation can be diagnosed from the
+    CI log even if the JSON manifest artifact is gone.
+
+    `finish_reason` is the field to look at first: "length" means the output
+    was cut off (raise --max-tokens); "stop" with empty `content` but a filled
+    reasoning channel means a reasoning model answered somewhere the
+    structured-output parser does not look (a structured-output *method*
+    problem, not a token-budget one).
+
+    :param raw: Message object (or None). Read via `response_metadata`,
+        `additional_kwargs` and `content`; missing attributes are treated as empty.
+    :return: None when `raw` is None, otherwise a dict with `finish_reason`,
+        `content_chars`, `reasoning_chars`, `token_usage`, `content_preview`
+        and `reasoning_preview` (previews capped at 500 characters).
+    """
+    if raw is None:
+        return None
+
+    def as_text(value) -> str:
+        # Non-string payloads (e.g. a list of content parts) are stringified.
+        return value if isinstance(value, str) else str(value)
+
+    metadata = getattr(raw, "response_metadata", {}) or {}
+    extras = getattr(raw, "additional_kwargs", {}) or {}
+    body = as_text(getattr(raw, "content", "") or "")
+
+    # The reasoning channel shows up under different keys depending on the
+    # provider/proxy; take the first non-empty one, in this priority order.
+    thinking = ""
+    for holder, key in (
+        (extras, "reasoning_content"),
+        (extras, "reasoning"),
+        (metadata, "reasoning_content"),
+    ):
+        found = holder.get(key)
+        if found:
+            thinking = found
+            break
+    thinking = as_text(thinking)
+
+    usage = metadata.get("token_usage") or metadata.get("usage")
+    return {
+        "finish_reason": metadata.get("finish_reason"),
+        "content_chars": len(body),
+        "reasoning_chars": len(thinking),
+        "token_usage": usage,
+        "content_preview": body[:500],
+        "reasoning_preview": thinking[:500],
+    }
+
+
+def format_raw_summary(d: dict) -> str:
+    """
+    Render a describe_raw() summary as a single CI-log-friendly line.
+
+    :param d: Summary dict; absent keys are printed as None.
+    :return: Space-separated `key=value` pairs. Only `finish_reason` is shown
+        with repr(); the previews are deliberately left out.
+    """
+    pieces = [f"finish_reason={d.get('finish_reason')!r}"]
+    for key in ("content_chars", "reasoning_chars", "token_usage"):
+        pieces.append(f"{key}={d.get(key)}")
+    return " ".join(pieces)
+
+
+def build_json_format_instructions(schema: dict) -> str:
+    """
+    Turn a JSON schema into plain-text prompt instructions, for `prompt` output
+    mode where no response_format is sent. Keys, types and descriptions are all
+    read from the schema so it stays the single source of truth.
+
+    :param schema: JSON-schema-like dict. `properties` maps key -> spec;
+        `required` lists the keys to mention (defaults to every property).
+    :return: Instruction text: a bullet per key, a skeleton of the expected
+        object, and a reminder about JSON string/array encoding.
+    """
+    properties = schema.get("properties", {})
+    wanted_keys = schema.get("required", list(properties))
+
+    bullets = []
+    skeleton = []
+    for name in wanted_keys:
+        definition = properties.get(name, {})
+        kind = definition.get("type", "string")
+        # Collapse all whitespace runs so multi-line descriptions fit one bullet.
+        summary = " ".join(definition.get("description", "").split())
+        # rstrip() drops the trailing space left behind by an empty description.
+        bullets.append(f'- "{name}" ({kind}): {summary}'.rstrip())
+        skeleton.append(f'  "{name}": <{kind}>')
+
+    preamble = (
+        "Respond with ONLY a single JSON object — no markdown, no ``` code "
+        "fences, no prose before or after it. The object must have exactly "
+        "these keys:\n"
+    )
+    encoding_note = (
+        "String values must be valid JSON strings (escape newlines as \\n, "
+        "quotes as \\\"). Array values must be JSON arrays of strings (use [] "
+        "when empty)."
+    )
+    return "".join([
+        preamble,
+        "\n".join(bullets),
+        "\n\nShape:\n{\n",
+        ",\n".join(skeleton),
+        "\n}\n\n",
+        encoding_note,
+    ])
+
+
+def parse_json_object(text: str) -> Optional[dict]:
+    """
+    Extract a JSON object from a model reply in `prompt` output mode.
+
+    Two attempts are made, in order: the whole reply with code fences removed,
+    then the first balanced {...} span inside it. An attempt only counts when
+    it decodes to a dict; any other JSON value is ignored.
+
+    :param text: Reply text from the model (may be empty or None).
+    :return: The decoded dict, or None when neither attempt yields one.
+    """
+    if not text:
+        return None
+    cleaned = strip_code_fences(text).strip()
+
+    def decode(fragment) -> Optional[dict]:
+        # None / "" means there was nothing to try for this attempt.
+        if not fragment:
+            return None
+        try:
+            value = json.loads(fragment)
+        except (ValueError, TypeError):
+            return None
+        return value if isinstance(value, dict) else None
+
+    whole = decode(cleaned)
+    if whole is not None:
+        return whole
+    return decode(_first_brace_span(cleaned))
+
+
+def _first_brace_span(text: str) -> Optional[str]:
+    """
+    Return the first balanced {...} substring of `text`, or None.
+
+    Scanning starts at the first `{`. Braces inside double-quoted JSON string
+    literals are not counted, and a backslash inside a string shields the
+    character after it (so `\\"` does not close the string).
+
+    :param text: Text to scan.
+    :return: The span from the first `{` through its matching `}`, or None
+        when there is no `{` or the braces never balance.
+    """
+    opening = text.find("{")
+    if opening < 0:
+        return None
+
+    nesting = 0
+    inside_quotes = False
+    skip_next = False  # only ever set while inside a string literal
+    for offset, char in enumerate(text[opening:]):
+        if skip_next:
+            # Character following a backslash: consume it without inspection.
+            skip_next = False
+        elif inside_quotes:
+            if char == "\\":
+                skip_next = True
+            elif char == '"':
+                inside_quotes = False
+        elif char == '"':
+            inside_quotes = True
+        elif char == "{":
+            nesting += 1
+        elif char == "}":
+            nesting -= 1
+            if nesting == 0:
+                return text[opening:opening + offset + 1]
+    return None
+
+
+def translate_one(
+    llm: LLMAdapter,
+    python_path: Path,
+    repo_root: Path,
+    zeeschuimer_root: Path,
+    strict_lint: bool,
+    structured: bool = True,
+) -> dict:
+    """
+    Translate one Python file. Returns a manifest entry dict.
+
+    The Zeeschuimer JS module is only written when the translation passes
+    validation, (in strict mode) linting, and splicing. Every other outcome
+    leaves the checkout untouched and is reported through the returned entry.
+
+    :param llm: Adapter used for the single `generate_text` call.
+    :param python_path: Datasource file to translate; must be inside `repo_root`.
+    :param repo_root: Repository root, used to derive the relative path that
+        identifies the file in the manifest and in the prompt.
+    :param zeeschuimer_root: Root of the Zeeschuimer checkout that holds the
+        JS module to update.
+    :param strict_lint: When True, lint findings fail the file instead of being
+        recorded as warnings.
+    :param structured: When True (default), the schema was bound via
+        `set_structure` and the model returns a parsed dict. When False
+        (prompt mode), we ask for JSON in the prompt and parse the reply text
+        ourselves — for models/proxies whose response_format is broken.
+    :return: Manifest entry. `status` is "ok", "skipped" or "failed"; `error`
+        carries the reason for the latter two. `diagnostics` is only present
+        when a raw model message was available.
+    :raises ValueError: If `python_path` is not located under `repo_root`.
+    """
+    rel = python_path.relative_to(repo_root).as_posix()
+    entry = {
+        "python_file": rel,
+        "js_file": None,
+        "status": "failed",
+        "commentary": "",
+        "duration_seconds": None,
+        "raw_response": None,
+        "lint_warnings": [],
+        "translation": None,
+        "error": None,
+    }
+
+    if not is_zeeschuimer_datasource(python_path):
+        entry["status"] = "skipped"
+        entry["error"] = "not a Zeeschuimer datasource (is_from_zeeschuimer != True)"
+        return entry
+
+    js_rel = python_to_js_module(rel)
+    if not js_rel:
+        entry["status"] = "skipped"
+        entry["error"] = f"could not derive Zeeschuimer module path from {rel} (expected `datasources/<dir>/search_*.py` form)"
+        return entry
+    entry["js_file"] = js_rel
+
+    js_path = zeeschuimer_root / js_rel
+    if not js_path.exists():
+        entry["status"] = "skipped"
+        entry["error"] = f"Zeeschuimer module {js_rel} does not exist in checkout"
+        return entry
+
+    python_source = python_path.read_text(encoding="utf-8")
+    existing_module = js_path.read_text(encoding="utf-8")
+    user_prompt = build_user_prompt(python_source, existing_module, rel)
+    # Prompt mode: no response_format is sent, so spell out the JSON contract in
+    # the prompt itself and parse the reply text ourselves below.
+    if not structured:
+        user_prompt = user_prompt + "\n\n" + build_json_format_instructions(LLM_SCHEMA)
+
+    started = time.monotonic()
+    raw_response: Optional[str] = None
+    translation: Optional[dict] = None
+    llm_error: Optional[str] = None
+    diagnostics: Optional[dict] = None
+
+    try:
+        response = llm.generate_text(user_prompt, system_prompt=SYSTEM_PROMPT)
+        if structured:
+            # Structured output bound with include_raw=True (see set_structure in
+            # main()): `generate_text` returns a {"raw", "parsed",
+            # "parsing_error"} wrapper instead of raising on a parse failure, so
+            # we can record *why* a translation failed (see describe_raw) rather
+            # than getting an opaque "Invalid json output:" with an empty payload.
+            if isinstance(response, dict) and "raw" in response and "parsed" in response:
+                diagnostics = describe_raw(response.get("raw"))
+                parsed = response.get("parsed")
+                parse_err = response.get("parsing_error")
+                if parsed is not None:
+                    translation = parsed
+                else:
+                    detail = f": {parse_err}" if parse_err else ""
+                    llm_error = f"structured output not parseable{detail}"
+            elif isinstance(response, dict):
+                # No include_raw wrapper: the dict is the parsed payload itself.
+                translation = response
+            else:
+                llm_error = (
+                    f"Expected dict from structured output, got "
+                    f"{type(response).__name__}"
+                )
+        else:
+            # Prompt mode: `response` is a plain AIMessage. Capture diagnostics
+            # off it, then parse JSON out of its text content ourselves.
+            diagnostics = describe_raw(response)
+            text = response.content if isinstance(response.content, str) else str(response.content)
+            raw_response = text
+            translation = parse_json_object(text)
+            if translation is None:
+                llm_error = "could not parse a JSON object from the model reply"
+    except Exception as e:
+        raw_response = extract_raw_from_exception(e)
+        llm_error = f"LLM call failed: {e}"
+
+    entry["duration_seconds"] = round(time.monotonic() - started, 2)
+    entry["raw_response"] = raw_response
+    if diagnostics is not None:
+        entry["diagnostics"] = diagnostics
+        print(f"  -> {format_raw_summary(diagnostics)}", flush=True)
+
+    if llm_error:
+        entry["error"] = llm_error
+        return entry
+    if translation is None:
+        entry["error"] = "no translation produced (no error raised, no dict returned)"
+        return entry
+
+    # Defensive: strip stray markdown code fences from the function source and
+    # each helper. Models wrap things in ```js even when instructed not to.
+    if isinstance(translation.get("map_item_function"), str):
+        translation["map_item_function"] = strip_code_fences(translation["map_item_function"])
+    helpers = translation.get("helpers_to_add")
+    if isinstance(helpers, list):
+        translation["helpers_to_add"] = [
+            strip_code_fences(h) if isinstance(h, str) else h for h in helpers
+        ]
+
+    bad = validate_translation(translation)
+    if bad:
+        entry["translation"] = translation
+        entry["error"] = bad
+        return entry
+
+    lint_issues = lint_translation(translation)
+    if lint_issues:
+        if strict_lint:
+            entry["translation"] = translation
+            entry["lint_warnings"] = lint_issues
+            entry["error"] = "Lint issues (--strict-lint):\n  - " + "\n  - ".join(lint_issues)
+            return entry
+        # Non-strict (default): record as warnings and let the file ship. The
+        # reviewer sees the issues in the PR body and fixes them by hand.
+        entry["lint_warnings"] = lint_issues
+
+    try:
+        spliced = splice_into_module(existing_module, translation, rel)
+    except ValueError as e:
+        entry["translation"] = translation
+        entry["error"] = str(e)
+        return entry
+
+    # Normalise the commentary BEFORE touching the checkout. A `.get(key, "")`
+    # default only covers a missing key: an explicit JSON null (or any other
+    # non-string) would make `.strip()` raise after the module had already been
+    # rewritten, leaving a modified file behind a "failed" manifest entry.
+    commentary = translation.get("commentary")
+    if commentary is None:
+        commentary = ""
+    elif not isinstance(commentary, str):
+        commentary = str(commentary)
+    commentary = commentary.strip()
+
+    # Writing is the last step that can fail; nothing after it may raise.
+    js_path.write_text(spliced, encoding="utf-8")
+    entry["status"] = "ok"
+    entry["commentary"] = commentary
+    # Keep parsed translation on warning entries so the PR / reviewer can see
+    # exactly what was emitted alongside the warnings.
+    if entry["lint_warnings"]:
+        entry["translation"] = translation
+    return entry
+
+
+def main():
+    """
+    Command-line entry point: translate the requested datasource files, write
+    a JSON manifest of the per-file results, and print a summary.
+
+    Provider, base URL, model, max tokens and output mode can each be given on
+    the command line or through an `LLM_*` environment variable. An
+    environment variable that is set but empty is treated as unset.
+
+    Exits with status 1 only when at least one file failed and none succeeded.
+    Requires `PROVIDER_API_KEY` in the environment.
+    """
+    cli = argparse.ArgumentParser(description=__doc__)
+    group = cli.add_mutually_exclusive_group(required=True)
+    group.add_argument("--files", nargs="+", help="Specific datasource files to translate.")
+    group.add_argument(
+        "--bootstrap",
+        action="store_true",
+        help="Translate every Zeeschuimer datasource in the repo.",
+    )
+    cli.add_argument(
+        "--zeeschuimer-checkout",
+        required=True,
+        type=Path,
+        help="Path to a local clone of the Zeeschuimer repo.",
+    )
+    cli.add_argument(
+        "--output-manifest",
+        required=True,
+        type=Path,
+        help="Where to write the JSON manifest of results.",
+    )
+    # Environment fallbacks below use `os.environ.get(NAME) or DEFAULT` instead
+    # of `os.environ.get(NAME, DEFAULT)`: GitHub Actions expands an unset
+    # `vars.*` / `secrets.*` reference to an empty string, which would
+    # otherwise override the default with "".
+    cli.add_argument(
+        "--llm_provider",
+        default=os.environ.get("LLM_PROVIDER") or DEFAULT_LLM_PROVIDER,
+        help=f"LLM provider to use (default: {DEFAULT_LLM_PROVIDER} or $LLM_PROVIDER).",
+    )
+    cli.add_argument(
+        "--base_url",
+        default=os.environ.get("LLM_BASE_URL") or DEFAULT_BASE_URL,
+        help=f"Base URL for the LLM API (default: {DEFAULT_BASE_URL}, or $LLM_BASE_URL).",
+    )
+    cli.add_argument(
+        "--model",
+        default=os.environ.get("LLM_MODEL") or DEFAULT_MODEL,
+        help=f"Ollama model to use (default: {DEFAULT_MODEL}, or $LLM_MODEL).",
+    )
+    cli.add_argument(
+        "--max-tokens",
+        type=int,
+        # Kept as a string on purpose: argparse runs string defaults through
+        # `type`, so a malformed $LLM_MAX_TOKENS becomes a regular usage error
+        # instead of a traceback while the parser is being built.
+        default=os.environ.get("LLM_MAX_TOKENS") or "50000",
+        help=(
+            "Max output tokens (a ceiling, not a target). Reasoning models like "
+            "gpt-oss spend tokens in their analysis channel before emitting the "
+            "answer; too low a ceiling truncates mid-reasoning and yields empty "
+            "output. Default: 50000, or $LLM_MAX_TOKENS."
+        ),
+    )
+    output_modes = ["structured", "prompt"]
+    cli.add_argument(
+        "--output-mode",
+        choices=output_modes,
+        default=os.environ.get("LLM_OUTPUT_MODE") or "structured",
+        help=(
+            "How JSON is requested. 'structured' (default) binds the schema via "
+            "response_format / function calling — reliable on backends that "
+            "support it. 'prompt' asks for JSON in the prompt and parses the "
+            "reply text, for models/proxies whose response_format is broken "
+            "(e.g. gpt-oss-120b on llmproxy.uva.nl, which returns null content "
+            "with any response_format). Default: structured, or $LLM_OUTPUT_MODE."
+        ),
+    )
+    cli.add_argument(
+        "--no-fail-fast",
+        action="store_true",
+        help=(
+            "Continue translating remaining files even after one fails. By "
+            "default the script aborts on the first failure, since failures here "
+            "are typically configuration- or model-correlated and continuing "
+            "wastes LLM time."
+        ),
+    )
+    cli.add_argument(
+        "--strict-lint",
+        action="store_true",
+        help=(
+            "Treat lint findings (Python `.get()`, missing `new`, literal newlines "
+            "in strings, etc.) as failures rather than warnings. Default is "
+            "warnings — the file still ships and the PR body surfaces the "
+            "issues so the reviewer can fix them by hand."
+        ),
+    )
+    args = cli.parse_args()
+
+    # argparse validates `choices` for command-line values only, not for
+    # defaults. Without this check a mistyped $LLM_OUTPUT_MODE would silently
+    # select prompt mode (anything that is not exactly "structured").
+    if args.output_mode not in output_modes:
+        cli.error(
+            f"invalid output mode {args.output_mode!r} "
+            f"(choose from {', '.join(output_modes)}; check $LLM_OUTPUT_MODE)"
+        )
+
+    provider_api_key = os.environ.get("PROVIDER_API_KEY")
+    if not provider_api_key:
+        sys.exit("Error: PROVIDER_API_KEY environment variable not set.")
+
+    repo_root = Path(__file__).resolve().parent.parent
+    zeeschuimer_root = args.zeeschuimer_checkout.resolve()
+
+    if args.bootstrap:
+        files = discover_bootstrap_files(repo_root, zeeschuimer_root)
+        if not files:
+            sys.exit("No Zeeschuimer datasources found to bootstrap.")
+    else:
+        files = [Path(f).resolve() for f in args.files]
+
+    # Imported here (not at module top) so the pure helpers above stay
+    # importable for tests without the langchain stack. See note near the
+    # imports at the top of this file.
+    from common.lib.llm import LLMAdapter
+    provider = args.llm_provider.lower()
+    base_url = args.base_url
+
+    llm = LLMAdapter(
+        provider=provider,
+        model=args.model,
+        base_url=base_url,
+        api_key=provider_api_key,
+        temperature=0.2,
+        max_tokens=args.max_tokens,
+        client_kwargs={"headers": {"X-API-KEY": provider_api_key}},
+    )
+
+    # Structured output (default): bind the schema so the model returns a
+    # schema-validated dict — what makes splicing reliable. include_raw=True
+    # gives us the raw AIMessage on failure for diagnosis. Fail fast if the
+    # model/endpoint can't bind it rather than producing unparseable output.
+    #
+    # `prompt` mode skips this entirely: no response_format is sent, the JSON
+    # contract is spelled out in the prompt, and we parse the reply text
+    # ourselves. Use it for models/proxies whose response_format is broken —
+    # e.g. gpt-oss-120b on llmproxy.uva.nl returns null content with any
+    # response_format, but answers cleanly when just asked for JSON.
+    structured = args.output_mode == "structured"
+    if structured:
+        try:
+            llm.set_structure(LLM_SCHEMA, include_raw=True)
+        except Exception as e:
+            sys.exit(f"Error: could not enable structured output: {e}")
+
+    fail_fast = not args.no_fail_fast
+    print(
+        f"Using model: {args.model} "
+        f"(provider: {args.llm_provider}, output_mode: {args.output_mode}, "
+        f"fail_fast: {fail_fast}, strict_lint: {args.strict_lint})"
+    )
+
+    entries = []
+    overall_started = time.monotonic()
+    for python_path in files:
+        # A path outside the repo has no relative form. Log it as given and let
+        # translate_one() report the problem, so one bad --files argument
+        # becomes a failed manifest entry instead of aborting the whole run
+        # before the manifest is written.
+        try:
+            rel_for_log = python_path.relative_to(repo_root).as_posix()
+        except ValueError:
+            rel_for_log = str(python_path)
+        print(f"Translating {rel_for_log}...", flush=True)
+        per_file_started = time.monotonic()
+        try:
+            entry = translate_one(
+                llm,
+                python_path,
+                repo_root,
+                zeeschuimer_root,
+                args.strict_lint,
+                structured=structured,
+            )
+        except Exception as e:
+            # Same keys (and the same repo-relative `python_file`) as the
+            # entries built by translate_one(), so manifest consumers see one
+            # consistent shape.
+            entry = {
+                "python_file": rel_for_log,
+                "js_file": None,
+                "status": "failed",
+                "commentary": "",
+                "duration_seconds": round(time.monotonic() - per_file_started, 2),
+                "raw_response": None,
+                "lint_warnings": [],
+                "translation": None,
+                "error": f"unexpected exception: {e}\n{traceback.format_exc()}",
+            }
+        entry["model"] = args.model
+        entries.append(entry)
+        dur = entry.get("duration_seconds")
+        dur_str = f" in {dur}s" if dur is not None else ""
+        err_str = f" ({entry['error']})" if entry.get("error") else ""
+        warn_str = (
+            f" with {len(entry['lint_warnings'])} lint warning(s)"
+            if entry.get("lint_warnings")
+            else ""
+        )
+        print(f"  -> {entry['status']}{warn_str}{dur_str}{err_str}", flush=True)
+
+        if entry["status"] == "failed" and fail_fast:
+            remaining = len(files) - len(entries)
+            if remaining > 0:
+                print(
+                    f"\nFail-fast: aborting after first failure; skipping "
+                    f"{remaining} remaining file(s). Pass --no-fail-fast to continue past failures.",
+                    flush=True,
+                )
+            break
+
+    overall_duration = round(time.monotonic() - overall_started, 2)
+    manifest = {
+        "model": args.model,
+        "provider": args.llm_provider,
+        "output_mode": args.output_mode,
+        "fail_fast": fail_fast,
+        "strict_lint": args.strict_lint,
+        "total_duration_seconds": overall_duration,
+        "entries": entries,
+    }
+    args.output_manifest.parent.mkdir(parents=True, exist_ok=True)
+    args.output_manifest.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
+
+    n_ok = sum(1 for e in entries if e["status"] == "ok")
+    n_with_warnings = sum(1 for e in entries if e["status"] == "ok" and e.get("lint_warnings"))
+    n_failed = sum(1 for e in entries if e["status"] == "failed")
+    n_skipped = sum(1 for e in entries if e["status"] == "skipped")
+    print(
+        f"\nDone with model `{args.model}` in {overall_duration}s: "
+        f"{n_ok} ok ({n_with_warnings} with warnings), "
+        f"{n_failed} failed, {n_skipped} skipped."
+    )
+    print(f"Manifest written to {args.output_manifest}")
+
+    # Partial success still exits 0: only a run with failures and no
+    # successful translation at all is reported as an error.
+    if n_ok == 0 and n_failed > 0:
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/helper-scripts/map_item_rules.py b/helper-scripts/map_item_rules.py
new file mode 100644
index 000000000..59de0ac81
--- /dev/null
+++ b/helper-scripts/map_item_rules.py
@@ -0,0 +1,340 @@
+"""
+Registry of known Python → JavaScript translation pitfalls for the
+Zeeschuimer auto-generator.
+
+Each `TranslationError` record drives three things in
+`map_item_converter.py`:
+
+- The "things to get right" section of the LLM prompt.
+- The "before submitting" verification checklist.
+- The regex-based lint pass over LLM output.
+
+Cross-repo workflow:
+
+- `translation-errors.md` (in the Zeeschuimer repo) is the freeform
+  observation log. Reviewers add entries there as new bugs surface.
+- This file is the structured input for the prompt and linter. When an
+  observation in the md is worth teaching the generator about, mirror it
+  here using the same `id` as the md heading slug. Not every md entry
+  needs a record — this is a curated subset.
+
+Three lint checks are too complex for a single regex and live as bespoke
+code in `map_item_converter.lint_translation`:
+
+- `class_needs_new` — variable-width lookbehind for `new `.
+- `literal_newline_in_string` — JS string lexer.
+- `regex_in_use` — heuristic regex-use detection.
+
+Those records have `lint_pattern=None`; the bespoke check is the lint.
+"""
+import re
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass(frozen=True)
+class TranslationError:
+    """
+    One known Python → JavaScript translation pitfall.
+
+    Immutable record (frozen dataclass). Only `id` and `prompt_rule` are
+    required; every other field defaults to None.
+    """
+    # Stable identifier; matches the heading slug of the corresponding entry
+    # in Zeeschuimer's `translation-errors.md`.
+    id: str
+    # Rule text describing the pitfall and the correct JavaScript form.
+    prompt_rule: str
+    # Example of the incorrect translation, if one is given.
+    bad: Optional[str] = None
+    # Corrected counterpart of `bad`, if one is given.
+    good: Optional[str] = None
+    # One-line check for this pitfall — presumably the source of the
+    # "before submitting" checklist item; confirm in map_item_converter.py.
+    verify: Optional[str] = None
+    # Compiled regex for the lint pass. None when the check is bespoke code in
+    # `map_item_converter.lint_translation` or when there is no static check.
+    lint_pattern: Optional[re.Pattern] = None
+    # Message surfaced on the PR when `lint_pattern` matches. Defaults to
+    # `prompt_rule`; set it separately when the regex is heuristic and can
+    # false-positive, so the reviewer-facing warning can carry that caveat
+    # without bloating the LLM prompt.
+    lint_message: Optional[str] = None
+
+
+RULES: list[TranslationError] = [
+
+    # ---- Python syntax that does not exist in JavaScript ----
+
+    TranslationError(
+        id="python_keywords",
+        prompt_rule=(
+            "Python keywords don't exist in JavaScript: `None` → `null`, "
+            "`True` / `False` → `true` / `false` (lowercase), `def name(...)` → `function name(...)`."
+        ),
+        bad="return None if not item.is_admin else True",
+        good="return item.is_admin ? true : null",
+        verify="No Python keywords (`None`, `True`, `False`, `def`) appear.",
+        lint_pattern=re.compile(r"\b(?:None|True|False)\b|\bdef\s+\w+\s*\("),
+    ),
+    TranslationError(
+        id="python_fstring",
+        prompt_rule=(
+            "Python f-strings (`f\"...\"` / `f'...'`) don't exist in JavaScript. Use "
+            "template literals with backticks and `${...}` instead."
+        ),
+        bad='throw new Error(f"item {item.id} not found")',
+        good="throw new Error(`item ${item.id} not found`)",
+        lint_pattern=re.compile(r"\bf\"|\bf'"),
+    ),
+    TranslationError(
+        id="unquoted_interpolation",
+        prompt_rule=(
+            "Even without an `f` prefix, `\"text {var}\"` / `'text {var}'` are literal "
+            "text in JavaScript — no interpolation happens. Whenever the original Python "
+            "used an f-string, the JS must use a template literal (backticks)."
+        ),
+        bad="throw new MapItemException('different user {user.id} and owner {owner.id}')",
+        good="throw new MapItemException(`different user ${user.id} and owner ${owner.id}`)",
+        verify="No `{var}` patterns remain inside single- or double-quoted strings.",
+        lint_pattern=re.compile(r"""['"][^'"\n]*\{[a-zA-Z_$][\w$.]*\}[^'"\n]*['"]"""),
+    ),
+    TranslationError(
+        id="python_from_import",
+        prompt_rule=(
+            "Python `from X import Y` doesn't exist in JavaScript. JavaScript uses "
+            "`import { Y } from 'X'` — and only when really needed; Zeeschuimer helpers "
+            "are globals, so `imports_to_add` is usually empty."
+        ),
+        bad="from common.lib.helpers import strip_tags",
+        good="// (no import — strip_tags is a global from js/lib.js)",
+        lint_pattern=re.compile(r"^\s*from\s+\S+\s+import\b", re.MULTILINE),
+    ),
+
+    # ---- dict.get is not a thing in JS ----
+
+    TranslationError(
+        id="dict_get",
+        prompt_rule=(
+            "Python `dict.get(k)` / `dict.get(k, default)` does not exist in JavaScript. "
+            "Replace every `.get(k)` with `[k]` and every `.get(k, default)` with `[k] ?? default`."
+        ),
+        bad="user.get('name', 'anonymous')",
+        good="user['name'] ?? 'anonymous'",
+        verify="The function contains zero `.get(` calls.",
+        lint_pattern=re.compile(r"\.get\("),
+        lint_message=(
+            "`.get(` call found. Python `dict.get(k[, default])` does not exist "
+            "in JavaScript — use `[k]` / `[k] ?? default`. NOTE: this check is a "
+            "plain substring match, so it also flags legitimate JS `.get()` on "
+            "`Map`, `URLSearchParams`, `Headers`, etc. — ignore the warning if "
+            "the receiver is one of those."
+        ),
+    ),
+
+    # ---- `in` operator: substring check vs key existence ----
+
+    TranslationError(
+        id="in_operator_on_strings",
+        prompt_rule=(
+            "Python `'x' in some_string` is a substring check. JavaScript's `in` operator "
+            "only works on objects (checking property names) — on a string it throws "
+            "TypeError. Use `someString.includes('x')` instead."
+        ),
+        bad="if ('polaris' in item.__typename.toLowerCase()) { ... }",
+        good="if (item.__typename.toLowerCase().includes('polaris')) { ... }",
+        verify="No `'literal' in someStringExpression` — use `.includes(...)`.",
+        # Conservative: only flag when the RHS ends in a known string method, since
+        # `'key' in someObj` is legitimate JS for property checks.
+        lint_pattern=re.compile(
+            r"""['"][^'"]*['"]\s+in\s+[\w.\[\]]+\.(?:"""
+            r"""toLowerCase|toUpperCase|toString|trim|trimStart|trimEnd|"""
+            r"""slice|substring|substr|concat|charAt|normalize|repeat|"""
+            r"""padStart|padEnd|replace|replaceAll)\s*\("""
+        ),
+    ),
+    TranslationError(
+        id="key_existence_vs_value_truthy",
+        prompt_rule=(
+            "Python `if node.get('X'):` is a *truthy check on the value* (false if the key "
+            "is missing OR if the value is `None`/empty). The naive translation "
+            "`if ('X' in node)` is a *key existence check* — true even when `node.X` is "
+            "`null`. Subsequent property accesses then throw. Use `if (node.X)` or "
+            "`if (node.X != null)`."
+        ),
+        bad="const usertags = 'usertags' in node ? node.usertags.in.map(...) : '';",
+        good="const usertags = node.usertags ? node.usertags.in.map(...) : '';",
+        lint_pattern=re.compile(r"'[^']+'\s+in\s+[a-zA-Z_$][\w$]*\s*\?"),
+    ),
+
+    # ---- Empty container is truthy in JS ----
+
+    TranslationError(
+        id="empty_container_truthy",
+        prompt_rule=(
+            "Empty `{}` and `[]` are TRUTHY in JavaScript but FALSY in Python. After "
+            "`const user = node.user ?? {}`, `if (user)` is always true. Either guard on "
+            "the original nullable BEFORE defaulting, or check `Object.keys(user).length` "
+            "/ `arr.length`."
+        ),
+        bad="const user = node.user ?? {};\nif (user) { /* always true */ }",
+        good="const user = node.user;\nif (user) { /* meaningful */ }",
+        verify="No `if (x)` guards where `x` was defaulted to `{}` or `[]` (always true in JS).",
+    ),
+
+    # ---- Object identity ----
+
+    TranslationError(
+        id="class_needs_new",
+        prompt_rule=(
+            "`MappedItem`, `MissingMappedField`, and `MapItemException` are CLASSES — "
+            "always `new MappedItem({...})`, `new MissingMappedField(...)`, "
+            "`throw new MapItemException(...)`. Calling them bare returns `undefined` "
+            "and silently breaks downstream."
+        ),
+        bad="return MappedItem({author: 'foo'})",
+        good="return new MappedItem({author: 'foo'})",
+        verify="Every `MappedItem(`, `MissingMappedField(`, and `MapItemException(` is preceded by `new`.",
+        # Bespoke check in `lint_translation` (variable-width lookbehind).
+        lint_pattern=None,
+    ),
+    TranslationError(
+        id="object_reference_equality",
+        prompt_rule=(
+            "`!==` / `===` on objects compares references, not values. "
+            "`caption !== new MissingMappedField('')` is always true because `new` "
+            "creates a fresh object each call. Use `instanceof MissingMappedField` for "
+            "type checks, or truthy-check the value directly."
+        ),
+        bad="caption !== new MissingMappedField('') ? caption.match(...) : ''",
+        good="caption instanceof MissingMappedField ? '' : caption.match(...)",
+        lint_pattern=re.compile(r"(?:!==|===)\s+new\s+[A-Z]"),
+    ),
+
+    # ---- Method calls on possibly-null receivers ----
+
+    TranslationError(
+        id="method_chain_on_nullable",
+        prompt_rule=(
+            "Calling a method on `null` / `undefined` throws TypeError. In Python the "
+            "equivalent AttributeError is sometimes caught by 4CAT — but the JS "
+            "`map_item` doesn't catch. Use optional chaining (`?.`) whenever the "
+            "receiver could be null/undefined."
+        ),
+        bad="caption.match(/#(\\w+)/g).join(',')",
+        good="caption?.match(/#(\\w+)/g)?.join(',') ?? ''",
+        # No reliable static check — leave to reviewer.
+        lint_pattern=None,
+    ),
+
+    # ---- Datetime: use the global helper ----
+
+    TranslationError(
+        id="datetime_helper_preferred",
+        prompt_rule=(
+            "For Python `datetime.utcfromtimestamp(t).strftime('%Y-%m-%d %H:%M:%S')`, "
+            "use the global `formatUtcTimestamp(t)` helper from `js/lib.js` — NOT "
+            "`new Date(t * 1000).toISOString()`. `.toISOString()` produces "
+            "`2026-05-13T21:27:31.000Z` (T separator, milliseconds, Z), which doesn't "
+            "match the Python output `2026-05-13 21:27:31`."
+        ),
+        bad="collected_at: new Date(node.taken_at * 1000).toISOString()",
+        good="collected_at: formatUtcTimestamp(node.taken_at)",
+        lint_pattern=re.compile(r"new\s+Date\([^)]+\)\.toISOString\(\)"),
+    ),
+
+    # ---- Regex translation traps ----
+
+    TranslationError(
+        id="regex_findall_capture_groups",
+        prompt_rule=(
+            "Python `re.findall(r'#(\\w+)', s)` returns CAPTURE GROUP contents "
+            "(`['lotr']`). JavaScript `s.match(/#(\\w+)/g)` returns FULL MATCHES "
+            "(`['#lotr']`) — capture groups are ignored with `/g`. For capture-group "
+            "behavior use `[...s.matchAll(/.../g)].map(m => m[1])`, or post-process the "
+            "full matches to strip the literal prefix."
+        ),
+        bad="caption.match(/#(\\w+)/g)?.join(',')",
+        good="[...caption.matchAll(/#(\\w+)/g)].map(m => m[1]).join(',')",
+        lint_pattern=re.compile(r"\.match\(\s*/[^/]*\([^/]*\)[^/]*/g\s*\)"),
+    ),
+    TranslationError(
+        id="regex_in_use",
+        prompt_rule=(
+            "Regex translation between Python and JavaScript is fragile: flag syntax "
+            "differs (`re.IGNORECASE` → `/.../i`), Python `re.compile(p).search(s)` "
+            "becomes JS `s.match(p)` or `new RegExp(p).exec(s)`, and regex literals "
+            "cannot span lines — encode any literal newline as `\\n`. Translate "
+            "carefully and verify behavior end-to-end."
+        ),
+        # Bespoke check in `lint_translation` flags any regex use for human review.
+        lint_pattern=None,
+    ),
+
+    # ---- String/regex literal syntax ----
+
+    TranslationError(
+        id="literal_newline_in_string",
+        prompt_rule=(
+            "JavaScript single- or double-quoted strings cannot contain a literal "
+            "newline — syntax error. Python `\"\\n\".join(xs)` becomes JS "
+            "`xs.join(\"\\n\")` — keep `\\n` as an escape sequence; never put a real "
+            "newline inside the quotes. Template literals (backticks) may span lines."
+        ),
+        bad='lines.join("\n")  // raw newline = syntax error',
+        good='lines.join("\\n")',
+        verify="No string or regex literal contains a raw newline character — use `\\n`.",
+        # Bespoke check in `lint_translation` (JS string lexer).
+        lint_pattern=None,
+    ),
+
+    # ---- Imports: don't, unless you really must ----
+
+    TranslationError(
+        id="lib_js_import",
+        prompt_rule=(
+            "`js/lib.js` is loaded as a plain `<script>`, NOT an ES module. Its "
+            "declarations (`MappedItem`, `MissingMappedField`, `MapItemException`, "
+            "`strip_tags`, `normalize_url_encoding`, `formatUtcTimestamp`) are GLOBALS. "
+            "Never write `import { ... } from '../js/lib.js'` — that import fails at "
+            "runtime."
+        ),
+        bad="import { MappedItem } from '../js/lib.js';",
+        good="// (no import — MappedItem is global)",
+        verify="`imports_to_add` is empty unless you really need an ES-module import (NOT for `MappedItem` etc.).",
+        lint_pattern=re.compile(
+            r"""import\s*(?:\{[^}]*\}|\*\s+as\s+\w+|\w+)\s+from\s+['"]\.\.?/js/lib\.js['"]"""
+        ),
+    ),
+    TranslationError(
+        id="bare_relative_path_import",
+        prompt_rule=(
+            "Every entry in `imports_to_add` must be a complete `import { ... } from '...'` "
+            "statement. Never emit a bare relative path (like `'../js/lib.js'`) as an "
+            "entry — JavaScript parses that as `..` `.` `/js/lib.js` and rejects the file."
+        ),
+        bad="imports_to_add: ['../js/lib.js']",
+        good="imports_to_add: []  // helpers are globals; no import needed",
+        # Surfaces in `imports_to_add`, not in the function body — not lint-able by
+        # the regex pass over `map_item_function`.
+        lint_pattern=None,
+    ),
+
+    # ---- JSON serialization difference ----
+
+    TranslationError(
+        id="undefined_dropped_from_json",
+        prompt_rule=(
+            "`JSON.stringify` omits keys whose value is `undefined`. Python's "
+            "`json.dumps` serializes `None` as `null`, keeping the key. When the Python "
+            "`map_item` explicitly returns `None` (or `\"\"`) for a missing field, the "
+            "JS must explicitly assign `null` (or `\"\"`) — typically with `value ?? null` "
+            "or `value ?? \"\"`, matching whichever Python uses for that field."
+        ),
+        bad="location_city: node.location.city  // undefined → key disappears from output",
+        good="location_city: node.location.city ?? null  // matches Python's `None`",
+        # Hard to lint statically (depends on per-field Python behavior).
+        lint_pattern=None,
+    ),
+]
+
+
+def get_regex_lint_rules() -> list[tuple[re.Pattern, str]]:
+    """
+    Collect the (pattern, message) pairs consumed by the regex-based lint pass.
+    A rule's `lint_message` is used when set; `prompt_rule` is the fallback.
+
+    Rules whose `lint_pattern` is None are left out. The bespoke lint checks
+    (class instantiation, literal newlines, regex use) live in
+    `map_item_converter.lint_translation`, tied to records by `id` in comments.
+    """
+    pairs = []
+    for rule in RULES:
+        if rule.lint_pattern is not None:
+            pairs.append((rule.lint_pattern, rule.lint_message or rule.prompt_rule))
+    return pairs
diff --git a/tests/test_map_item_sync.py b/tests/test_map_item_sync.py
new file mode 100644
index 000000000..52055eb6b
--- /dev/null
+++ b/tests/test_map_item_sync.py
@@ -0,0 +1,476 @@
+"""
+Tests for the Zeeschuimer map_item sync helper scripts:
+
+- `helper-scripts/map_item_converter.py` — path derivation, JS-marker splicing,
+  the static lint, and the comment stripper.
+- `helper-scripts/map_item_ci.py` — translation-matrix planning and PR-body
+  construction (the CI glue).
+
+These import the helper scripts directly (they have no heavy dependencies once
+`LLMAdapter` is imported lazily), so they run on host Python as well as in the
+`4cat_backend` container via `docker exec 4cat_backend pytest`.
+"""
+import sys
+from pathlib import Path
+
+import pytest
+
+# helper-scripts is not a package; put it on sys.path so the modules (and their
+# sibling `from map_item_rules import ...`) import cleanly.
+HELPER_DIR = Path(__file__).resolve().parent.parent / "helper-scripts"
+if str(HELPER_DIR) not in sys.path:
+    sys.path.insert(0, str(HELPER_DIR))
+
+import map_item_ci  # noqa: E402
+import map_item_converter as mic  # noqa: E402
+import map_item_rules  # noqa: E402
+
+
+# --------------------------------------------------------------------------- #
+# python_to_js_module — convention-based path derivation
+# --------------------------------------------------------------------------- #
+
+def test_python_to_js_module_conventions():
+    f = mic.python_to_js_module
+    assert f("datasources/tiktok/search_tiktok.py") == "modules/tiktok.js"
+    # filename, not directory, drives the mapping (documented edge cases):
+    assert f("datasources/xiaohongshu/search_rednote.py") == "modules/rednote.js"
+    assert f("datasources/twitter-import/search_twitter.py") == "modules/twitter.js"
+    # underscores in the base become hyphens
+    assert f("datasources/x/search_a_b.py") == "modules/a-b.js"
+
+
+def test_python_to_js_module_rejects_non_conforming():
+    f = mic.python_to_js_module
+    assert f("datasources/x/notsearch.py") is None
+    assert f("datasources/x.py") is None
+    assert f("notdatasources/x/search_x.py") is None
+    assert f("datasources/x/search_.py") is None  # empty base
+
+
+# --------------------------------------------------------------------------- #
+# splice_into_module — the C1 regression surface
+# --------------------------------------------------------------------------- #
+
+def _translation(fn, helpers=None, imports=None):
+    return {
+        "map_item_function": fn,
+        "helpers_to_add": helpers or [],
+        "imports_to_add": imports or [],
+        "commentary": "",
+    }
+
+
+def test_splice_appends_when_no_markers():
+    out = mic.splice_into_module(
+        "const a = 1;\n",
+        _translation("export function map_item(item){ return new MappedItem({}); }"),
+        "datasources/x/search_x.py",
+    )
+    assert "const a = 1;" in out
+    assert mic.BLOCK_MARKER_START in out and mic.BLOCK_MARKER_END in out
+    assert "map_item" in out
+
+
+def test_splice_replace_preserves_regex_escapes():
+    """C1: a regex literal containing `\\w` must NOT raise `re.error` on the
+    re-sync (replace) path, and must survive verbatim."""
+    start, end = mic.BLOCK_MARKER_START, mic.BLOCK_MARKER_END
+    existing = f"const a = 1;\n{start}\n// old\nexport function map_item(){{}}\n{end}\nconst b = 2;\n"
+    out = mic.splice_into_module(
+        existing,
+        _translation(r"export function map_item(item){ return item.text.match(/#(\w+)/g) ?? []; }"),
+        "datasources/x/search_x.py",
+    )
+    assert r"/#(\w+)/g" in out                 # escape preserved verbatim
+    assert "const a = 1;" in out and "const b = 2;" in out  # surrounding code intact
+    assert "// old" not in out                 # old block replaced, not duplicated
+    assert out.count(start) == 1
+
+
+def test_splice_replace_preserves_newline_escape():
+    """C1: `"\\n"` must stay a two-char escape, not be turned into a raw newline
+    inside the string literal (which would be a JS syntax error)."""
+    start, end = mic.BLOCK_MARKER_START, mic.BLOCK_MARKER_END
+    existing = f"{start}\nold\n{end}\n"
+    out = mic.splice_into_module(
+        existing,
+        _translation(r'export function map_item(item){ return item.tags.join("\n"); }'),
+        "datasources/x/search_x.py",
+    )
+    assert r'join("\n")' in out          # backslash-n preserved
+    assert 'join("\n")' not in out       # NOT a raw newline (this literal has a real \n)
+
+
+def test_splice_idempotent_append_then_replace():
+    """Full re-sync lifecycle: first splice appends, second replaces. Both must
+    succeed (no re.error) and the block must not be duplicated."""
+    t = _translation(r"export function map_item(i){ return i.t.match(/#(\w+)/g); }")
+    once = mic.splice_into_module("base\n", t, "datasources/x/search_x.py")
+    twice = mic.splice_into_module(once, t, "datasources/x/search_x.py")
+    assert twice.count(mic.BLOCK_MARKER_START) == 1
+    assert twice.count(mic.BLOCK_MARKER_END) == 1
+    assert r"/#(\w+)/g" in twice
+
+
+def test_splice_includes_helpers():
+    out = mic.splice_into_module(
+        "base\n",
+        _translation("export function map_item(i){ return helperFn(i); }",
+                     helpers=["function helperFn(x){ return x; }"]),
+        "datasources/x/search_x.py",
+    )
+    assert "function helperFn(x)" in out
+    assert "export function map_item" in out
+
+
+def test_splice_partial_markers_refuses():
+    start = mic.BLOCK_MARKER_START
+    existing = f"x\n{start}\nonly start, no end\n"
+    with pytest.raises(ValueError):
+        mic.splice_into_module(
+            existing, _translation("function map_item(){}"), "datasources/x/search_x.py"
+        )
+
+
+def test_splice_dedups_existing_imports():
+    imp = "import { foo } from './other.js';"
+    out = mic.splice_into_module(
+        f"{imp}\nconst a = 1;\n",
+        _translation("export function map_item(i){ return foo(i); }", imports=[imp]),
+        "datasources/x/search_x.py",
+    )
+    # the import already exists outside any marker block, so it must not be re-added
+    assert out.count(imp) == 1
+
+
+# --------------------------------------------------------------------------- #
+# _strip_js_comments (C3) and lint_translation
+# --------------------------------------------------------------------------- #
+
+def test_strip_js_comments_preserves_url_in_string():
+    """C3: the `//` in a URL string literal must survive (the old naive regex
+    truncated `"https://x"` to `"https:` and produced bogus lint warnings)."""
+    out = mic._strip_js_comments('const u = "https://example.com/p"; // real comment')
+    assert "https://example.com/p" in out
+    assert "real comment" not in out
+
+
+def test_strip_js_comments_removes_block_and_line_comments():
+    assert mic._strip_js_comments("a /* x */ b") == "a  b"
+    assert mic._strip_js_comments("keep // drop").rstrip() == "keep"
+
+
+def test_lint_no_false_newline_warning_on_url():
+    """C3 follow-through: a URL string must not be flagged as a literal newline."""
+    issues = mic.lint_translation(
+        _translation('function map_item(i){ return "https://x/y"; }')
+    )
+    assert not any("newline" in i.lower() for i in issues)
+
+
+def test_lint_flags_dict_get_with_caveat():
+    """C5: `.get(` is flagged, and the message carries the JS-Map false-positive
+    caveat (decoupled from the prompt rule)."""
+    issues = mic.lint_translation(
+        _translation("function map_item(i){ return i.get('a'); }")
+    )
+    assert any(".get(" in i for i in issues)
+    assert any("Map" in i for i in issues)  # caveat present
+
+
+def test_lint_flags_missing_new():
+    issues = mic.lint_translation(
+        _translation("function map_item(i){ return MappedItem({a: 1}); }")
+    )
+    assert any("without" in i and "new" in i for i in issues)
+
+
+def test_lint_flags_literal_newline_in_string():
+    issues = mic.lint_translation(
+        _translation('function map_item(i){ return i.x.join("\n"); }')  # real newline
+    )
+    assert any("newline" in i.lower() for i in issues)
+
+
+def test_lint_flags_regex_use():
+    issues = mic.lint_translation(
+        _translation(r"function map_item(i){ return i.text.match(/\w+/); }")
+    )
+    assert any("regex" in i.lower() for i in issues)
+
+
+def test_lint_clean_translation_has_no_issues():
+    issues = mic.lint_translation(
+        _translation("export function map_item(i){ return new MappedItem({id: i['id'] ?? null}); }")
+    )
+    assert issues == []
+
+
+# --------------------------------------------------------------------------- #
+# map_item_rules registry wiring
+# --------------------------------------------------------------------------- #
+
+def test_regex_lint_rules_use_lint_message_when_set():
+    # dict_get carries a separate lint_message (mentions Map), distinct from its
+    # prompt_rule, and that is what the lint pass surfaces.
+    get_msg = next(
+        msg for pat, msg in map_item_rules.get_regex_lint_rules()
+        if pat.pattern == r"\.get\("
+    )
+    assert "Map" in get_msg
+    dict_get_rule = next(r for r in map_item_rules.RULES if r.id == "dict_get")
+    assert get_msg == dict_get_rule.lint_message
+    assert get_msg != dict_get_rule.prompt_rule
+
+
+# --------------------------------------------------------------------------- #
+# plan_matrix — including the S1 injection-rejection guarantee
+# --------------------------------------------------------------------------- #
+
+def test_plan_matrix_bootstrap():
+    mode, matrix, rejected = map_item_ci.plan_matrix("workflow_dispatch", "", True, "", "")
+    assert mode == "bootstrap"
+    assert matrix == [{"module": "bootstrap", "files": "", "bootstrap": True}]
+    assert rejected == []
+
+
+def test_plan_matrix_explicit_files_override_bootstrap():
+    mode, matrix, _ = map_item_ci.plan_matrix(
+        "workflow_dispatch", "datasources/tiktok/search_tiktok.py", True, "", ""
+    )
+    assert mode == "files"  # files win over bootstrap
+    assert [m["module"] for m in matrix] == ["tiktok"]
+
+
+def test_plan_matrix_groups_by_module_sorted():
+    mode, matrix, rejected = map_item_ci.plan_matrix(
+        "workflow_dispatch",
+        "datasources/tiktok/search_tiktok.py datasources/gab/search_gab.py",
+        False, "", "",
+    )
+    assert mode == "files"
+    assert [m["module"] for m in matrix] == ["gab", "tiktok"]
+    assert rejected == []
+
+
+def test_plan_matrix_push_uses_injected_git_diff():
+    changed = ["datasources/tiktok/search_tiktok.py", "datasources/tiktok/search_other.py"]
+    mode, matrix, _ = map_item_ci.plan_matrix(
+        "push", "", False, "aaaaaaa", "bbbbbbb", git_diff=lambda b, a: changed
+    )
+    assert mode == "files"
+    assert len(matrix) == 1 and matrix[0]["module"] == "tiktok"
+    assert "search_tiktok.py" in matrix[0]["files"]
+    assert "search_other.py" in matrix[0]["files"]
+
+
+def test_plan_matrix_none_when_no_changes():
+    mode, matrix, rejected = map_item_ci.plan_matrix(
+        "push", "", False, "a", "b", git_diff=lambda b, a: []
+    )
+    assert mode == "none" and matrix == [] and rejected == []
+
+
+def test_plan_matrix_rejects_shell_injection_paths():
+    """S1: paths with shell metacharacters (or otherwise not matching the strict
+    datasource shape) are dropped and reported, never placed in the matrix that
+    gets interpolated into the sync job's shell command."""
+    candidates = (
+        "datasources/x/search_$(id).py "        # command substitution
+        "datasources/x/search_x.py;whoami "      # command separator
+        "../../etc/passwd "                       # traversal
+        "datasources/ok/search_ok.py"            # the only valid one
+    )
+    mode, matrix, rejected = map_item_ci.plan_matrix(
+        "workflow_dispatch", candidates, False, "", ""
+    )
+    all_files = " ".join(m["files"] for m in matrix)
+    assert "datasources/ok/search_ok.py" in all_files
+    assert "$(id)" not in all_files
+    assert ";" not in all_files
+    assert ".." not in all_files
+    assert len(rejected) == 3
+    assert [m["module"] for m in matrix] == ["ok"]
+
+
+# --------------------------------------------------------------------------- #
+# build_pr_body
+# --------------------------------------------------------------------------- #
+
+def test_build_pr_body_single_module_title_and_warnings():
+    manifest = {
+        "model": "qwen2.5-coder:14b", "provider": "ollama",
+        "total_duration_seconds": 12.3,
+        "entries": [{
+            "python_file": "datasources/tiktok/search_tiktok.py",
+            "js_file": "modules/tiktok.js", "status": "ok", "duration_seconds": 5.0,
+            "commentary": "a note", "lint_warnings": ["[map_item_function] .get( found"],
+        }],
+    }
+    title, body = map_item_ci.build_pr_body(
+        manifest, module="tiktok", is_bootstrap=False, before="a" * 7, after="b" * 7,
+        run_id="42", event_name="workflow_dispatch", repo="org/4cat",
+    )
+    assert title == "Auto-translated map_item updates from 4CAT: tiktok"
+    assert "Lint warnings — fix before merging" in body
+    assert "modules/tiktok.js" in body
+    assert "qwen2.5-coder:14b" in body
+
+
+def test_build_pr_body_bootstrap_title_counts_modules():
+    manifest = {"entries": [
+        {"python_file": "datasources/a/search_a.py", "js_file": "modules/a.js", "status": "ok"},
+        {"python_file": "datasources/b/search_b.py", "js_file": "modules/b.js", "status": "ok"},
+    ]}
+    title, _ = map_item_ci.build_pr_body(
+        manifest, module="bootstrap", is_bootstrap=True, before="", after="",
+        run_id="1", event_name="workflow_dispatch", repo="org/4cat",
+    )
+    assert title == "Auto-translated map_item updates from 4CAT (bootstrap, 2 datasources)"
+
+
+def test_build_pr_body_push_invokes_injected_diff():
+    manifest = {"entries": [{
+        "python_file": "datasources/a/search_a.py", "js_file": "modules/a.js", "status": "ok",
+    }]}
+    calls = []
+
+    def fake_diff(before, after, path):
+        calls.append((before, after, path))
+        return "diff --git a/x b/x\n+added"
+
+    _, body = map_item_ci.build_pr_body(
+        manifest, module="a", is_bootstrap=False, before="X", after="Y",
+        run_id="1", event_name="push", repo="org/4cat", python_diff=fake_diff,
+    )
+    assert calls == [("X", "Y", "datasources/a/search_a.py")]
+    assert "<details><summary>Python diff</summary>" in body
+    assert "+added" in body
+
+
+# --------------------------------------------------------------------------- #
+# set_output — S2 (GITHUB_OUTPUT injection-safe)
+# --------------------------------------------------------------------------- #
+
+def test_set_output_uses_delimiter_form(tmp_path, monkeypatch):
+    """S2: a multi-line value must be written in `name<<DELIM` heredoc form so it cannot forge outputs."""
+    out_file = tmp_path / "gh_output"
+    out_file.write_text("", encoding="utf-8")
+    monkeypatch.setenv("GITHUB_OUTPUT", str(out_file))
+    # newline + an "=" line: would forge an extra output in the naive `name=value` form
+    value = "Real Title\nmalicious=pwned"
+    map_item_ci.set_output("title", value)
+
+    header, _, rest = out_file.read_text(encoding="utf-8").partition("\n")
+    assert header.startswith("title<<")              # delimiter form, not `title=`
+    delimiter = header[len("title<<"):]
+    assert delimiter and delimiter not in value      # value cannot terminate the heredoc early
+    assert rest.rstrip("\n") == f"{value}\n{delimiter}"  # verbatim value, then the closing delimiter
+
+
+def test_set_output_noop_without_env(monkeypatch):
+    monkeypatch.delenv("GITHUB_OUTPUT", raising=False)
+    # must not raise when not running under Actions
+    map_item_ci.set_output("title", "anything")
+
+
+# --------------------------------------------------------------------------- #
+# splice — refuse to duplicate a pre-existing map_item on first sync (review #2)
+# --------------------------------------------------------------------------- #
+
+def test_splice_refuses_preexisting_map_item_without_markers():
+    """First sync (no markers) must NOT append a second `map_item` when the
+    module already declares one — that would be a JS redeclaration error."""
+    existing = "export function map_item(item) { return item; }\n"
+    with pytest.raises(ValueError):
+        mic.splice_into_module(
+            existing,
+            _translation("export function map_item(i){ return new MappedItem({}); }"),
+            "datasources/x/search_x.py",
+        )
+
+
+def test_splice_refuses_preexisting_const_map_item():
+    existing = "const map_item = (item) => item;\n"
+    with pytest.raises(ValueError):
+        mic.splice_into_module(
+            existing,
+            _translation("export function map_item(i){ return new MappedItem({}); }"),
+            "datasources/x/search_x.py",
+        )
+
+
+def test_splice_allows_commented_map_item_without_markers():
+    """A `map_item` declaration that exists only inside a comment must not trip
+    the guard (comments are stripped before the check)."""
+    existing = "// old: export function map_item(item) {}  (removed)\nconst a = 1;\n"
+    out = mic.splice_into_module(
+        existing,
+        _translation("export function map_item(i){ return new MappedItem({}); }"),
+        "datasources/x/search_x.py",
+    )
+    assert mic.BLOCK_MARKER_START in out
+
+
+# --------------------------------------------------------------------------- #
+# _code_fence — PR-body diff can't be closed early by its own backticks (#5)
+# --------------------------------------------------------------------------- #
+
+def test_code_fence_default_three_backticks():
+    assert map_item_ci._code_fence("no backticks here", "diff") == ("```diff", "```")
+
+
+def test_code_fence_grows_past_inner_backticks():
+    # longest run inside is 4 backticks -> fence must be 5
+    open_f, close_f = map_item_ci._code_fence("a ``` b ```` c", "diff")
+    assert open_f == "`````diff"
+    assert close_f == "`````"
+
+
+def test_build_pr_body_diff_fence_survives_backticks():
+    manifest = {"entries": [{
+        "python_file": "datasources/a/search_a.py", "js_file": "modules/a.js", "status": "ok",
+    }]}
+
+    def fake_diff(before, after, path):
+        # a Python diff whose body itself contains a ``` fence
+        return "diff --git a/x b/x\n+doc = '''\n+```\n+'''"
+
+    _, body = map_item_ci.build_pr_body(
+        manifest, module="a", is_bootstrap=False, before="X", after="Y",
+        run_id="1", event_name="push", repo="org/4cat", python_diff=fake_diff,
+    )
+    # outer fence is longer than the inner ``` so the block isn't closed early
+    assert "````diff" in body
+
+
+# --------------------------------------------------------------------------- #
+# extract_llm_requirements — single source of truth from setup.py (review #3)
+# --------------------------------------------------------------------------- #
+
+def test_extract_llm_requirements_filters_and_preserves_specifiers():
+    setup_py = '''
+core_packages = {
+    "Flask~=3.0",
+    "langchain_core",
+    "langchain_ollama",
+    "pydantic",
+    "requests~=2.27",
+    "requests_futures",
+    "ruff",
+}
+processor_packages = {
+    "numpy",
+    "beautifulsoup4",
+}
+'''
+    reqs = map_item_ci.extract_llm_requirements(setup_py)
+    assert "langchain_core" in reqs
+    assert "langchain_ollama" in reqs
+    assert "pydantic" in reqs
+    assert "requests~=2.27" in reqs          # version specifier preserved verbatim
+    assert "requests_futures" not in reqs    # name-equality, not substring match
+    assert "Flask~=3.0" not in reqs
+    assert "ruff" not in reqs
+    assert reqs == sorted(reqs)              # output is sorted
diff --git a/webtool/__init__.py b/webtool/__init__.py
index a15729f74..e23eab8f1 100644
--- a/webtool/__init__.py
+++ b/webtool/__init__.py
@@ -177,7 +177,6 @@ def time_this(func):
     import webtool.views.views_explorer  # noqa: E402
     import webtool.views.api_standalone  # noqa: E402
     import webtool.views.api_tool  # noqa: E402
-    import webtool.views.api_map_item  # noqa: E402
 
     app.register_blueprint(webtool.views.views_restart.component)
     app.register_blueprint(webtool.views.views_admin.component)
@@ -188,7 +187,6 @@ def time_this(func):
     app.register_blueprint(webtool.views.views_explorer.component)
     app.register_blueprint(webtool.views.api_standalone.component)
     app.register_blueprint(webtool.views.api_tool.component)
-    app.register_blueprint(webtool.views.api_map_item.component)
 
     @app.before_request
     def before_request():
diff --git a/webtool/views/api_map_item.py b/webtool/views/api_map_item.py
deleted file mode 100644
index 7e78ddf3e..000000000
--- a/webtool/views/api_map_item.py
+++ /dev/null
@@ -1,199 +0,0 @@
-"""
-Map-item API endpoint - allows running a datasource's map_item function
-against a single submitted item via HTTP.
-
-Used by external tools (like Zeeschuimer) to validate that auto-generated
-map_item translations produce the same output as the Python original.
-"""
-
-import json
-import traceback
-from pathlib import Path
-
-from flask import Blueprint, current_app, jsonify, request, g
-from flask_login import login_required
-
-from webtool.lib.helpers import error
-from common.lib.exceptions import MapItemException
-from common.lib.item_mapping import MissingMappedField, MappedItem
-from common.lib.helpers import format_import_item
-
-
-component = Blueprint("map_item", __name__)
-api_ratelimit = current_app.limiter.shared_limit("100 per minute", scope="api")
-
-
-def _get_search_class(modules, datasource_id):
-	"""
-	Look up the search/import class for a datasource.
-
-	Abstracts the ModuleCollector convention where worker keys append a suffix
-	to the datasource ID. Most use `-search`, some (e.g. twitter-import) use
-	`-import`. Returns None if no matching worker is found.
-
-	TODO: ModuleCollector should expose this directly.
-	"""
-	return (
-		modules.workers.get(f"{datasource_id}-search")
-		or modules.workers.get(f"{datasource_id}-import")
-	)
-
-
-@component.route("/api/datasources/")
-@api_ratelimit
-@current_app.openapi.endpoint("map_item")
-def list_datasources():
-	"""
-	List all available datasources with map_item support.
-
-	Returns all datasources that have a map_item function, including a flag
-	indicating if they're from Zeeschuimer. Caller can filter as needed.
-
-	:return: JSON object with array of datasource metadata
-
-	:return-schema: {
-		type=object,
-		properties={
-			datasources={
-				type=array,
-				items={
-					type=object,
-					properties={
-						id={type=string},
-						name={type=string},
-						has_map_item={type=boolean},
-						is_from_zeeschuimer={type=boolean}
-					}
-				}
-			}
-		}
-	}
-	"""
-	
-	available = []
-	for datasource_id, metadata in g.modules.datasources.items():
-		search_class = _get_search_class(g.modules, datasource_id)
-		if not search_class:
-			continue
-
-		available.append({
-			"id": datasource_id,
-			"name": metadata.get("name", datasource_id),
-			"is_from_zeeschuimer": getattr(search_class, "is_from_zeeschuimer", False),
-			"has_map_item": hasattr(search_class, "map_item") and callable(getattr(search_class, "map_item"))
-		})
-
-	return jsonify({
-		"datasources": sorted(available, key=lambda x: x["id"])
-	}), 200
-
-
-class MissingMappedFieldEncoder(json.JSONEncoder):
-	"""Custom JSON encoder to serialize MissingMappedField objects."""
-
-	def default(self, obj):
-		if isinstance(obj, MissingMappedField):
-			return {
-				"__missing": True,
-				"value": obj.value
-			}
-		return super().default(obj)
-
-
-@component.route("/api/map-item/<string:datasource_id>/", methods=["POST"])
-@api_ratelimit
-@login_required
-@current_app.openapi.endpoint("map_item")
-def map_item_endpoint(datasource_id):
-	"""
-	Run a datasource's map_item function against a single item.
-
-	Used by external tools (e.g. Zeeschuimer's test runner) to validate that
-	an auto-generated JS port of `map_item` produces the same output as the
-	Python original.
-
-	The submitted item is passed through `format_import_item` first, matching
-	the transformation applied during normal NDJSON imports, so the endpoint
-	exercises the same code path as production imports.
-
-	Distinguishes three outcomes:
-	- `mapped`: map_item returned successfully
-	- `skipped`: map_item raised MapItemException (intentional skip)
-	- `error`: map_item raised an unexpected exception (bug or bad data)
-
-	Authenticate via the `Authentication` header or `?access-token` query
-	parameter using a 4CAT access token.
-
-	:param datasource_id: The datasource identifier (e.g., "tiktok", "instagram")
-	:request-body item: Zeeschuimer-format item with a `data` field
-
-	:return: JSON response. One of:
-	- `{status: "mapped", item: {...}}`
-	- `{status: "skipped", reason: "..."}`
-	- `{status: "error", message: "..."}`
-
-	:return-schema: {
-		type=object,
-		properties={
-			status={
-				type=string,
-				enum=["mapped", "skipped", "error"]
-			}
-		},
-		required=["status"]
-	}
-	"""
-	# Validate request body
-	body = request.get_json(silent=True)
-	if body is None:
-		return error(400, error="Request body must be valid JSON")
-	if "item" not in body:
-		return error(400, error="Request body must contain an 'item' field")
-	zeeschuimer_item = body["item"]
-	if not isinstance(zeeschuimer_item, dict):
-		return error(400, error="'item' field must be a JSON object")
-
-	# Look up the datasource's search class
-	search_class = _get_search_class(g.modules, datasource_id)
-	if search_class is None:
-		return error(404, error=f"Unknown datasource: {datasource_id}")
-	if not (hasattr(search_class, "map_item") and callable(getattr(search_class, "map_item"))):
-		return error(404, error=f"Datasource '{datasource_id}' does not implement map_item")
-
-	# Wrap item (mirrors the NDJSON import path)
-	wrapped_item = format_import_item(zeeschuimer_item)
-
-	# Call map_item directly; going through get_mapped_item would wrap 
-	# KeyError/IndexError and accidental errors would be skiped.
-	try:
-		mapped_item = search_class.map_item(wrapped_item)
-	except MapItemException as e:
-		# Intentional skip (e.g. Instagram ad detection)
-		return jsonify({
-			"status": "skipped",
-			"reason": str(e)
-		}), 200
-	except Exception as e:
-		# Unexpected error — point at the deepest frame for debugging
-		tb_frames = traceback.extract_tb(e.__traceback__)
-		frame = tb_frames[-1] if tb_frames else None
-		location = f" at {Path(frame.filename).name}:{frame.lineno}" if frame else ""
-		g.log.warning(f"map_item error for {datasource_id}: {traceback.format_exc()}")
-		return jsonify({
-			"status": "error",
-			"message": f"{type(e).__name__}: {e}{location}"
-		}), 200
-
-	# Unwrap MappedItem if returned; otherwise treat as plain dict
-	if isinstance(mapped_item, MappedItem):
-		item_data = mapped_item.get_item_data(safe=False)
-	else:
-		item_data = mapped_item
-
-	# Use the custom encoder to preserve MissingMappedField as a tagged object
-	response_data = json.loads(json.dumps(item_data, cls=MissingMappedFieldEncoder))
-
-	return jsonify({
-		"status": "mapped",
-		"item": response_data
-	}), 200
diff --git a/webtool/views/api_tool.py b/webtool/views/api_tool.py
index fcb695206..5145d0bd3 100644
--- a/webtool/views/api_tool.py
+++ b/webtool/views/api_tool.py
@@ -10,7 +10,7 @@
 import os
 
 from flask import Blueprint, current_app, jsonify, request, render_template, render_template_string, redirect, url_for, flash, \
-	get_flashed_messages, send_from_directory, g
+	get_flashed_messages, send_from_directory, stream_with_context, g
 from flask_login import login_required, current_user
 
 from webtool.lib.helpers import error, setting_required, parse_markdown
@@ -23,6 +23,7 @@
 from common.lib.helpers import UserInput, call_api
 from common.lib.user import User
 from backend.lib.worker import BasicWorker
+from common.lib.item_mapping import MissingMappedField
 
 component = Blueprint("toolapi", __name__)
 api_ratelimit = current_app.limiter.shared_limit("3 per second", scope="api")
@@ -32,6 +33,21 @@
 
 csv.field_size_limit(1024 * 1024 * 1024)
 
+def _get_search_class(modules, datasource_id):
+	"""
+	Find the worker class that searches or imports a datasource's items.
+
+	Workers are registered in `modules.workers` under the datasource ID plus
+	a suffix. `<id>-search` is tried first, then `<id>-import` (used by e.g.
+	twitter-import). Returns None if neither key is registered.
+
+	TODO: ModuleCollector should expose this directly.
+	"""
+	search_worker = modules.workers.get(f"{datasource_id}-search")
+	if search_worker:
+		return search_worker
+	return modules.workers.get(f"{datasource_id}-import")
+
 @component.route("/api/")
 @api_ratelimit
 def openapi_overview():
@@ -1413,3 +1429,218 @@ def export_packed_dataset(key=None, component=None):
 
 	else:
 		return error(406, error="Dataset component unknown")
+
+@component.route("/api/datasources/")
+@api_ratelimit
+@current_app.openapi.endpoint("tool")
+def list_datasources():
+	"""
+	List every datasource that has a search or import worker.
+
+	Each entry flags whether that worker has a callable `map_item` and whether
+	it is marked as coming from Zeeschuimer. Caller can filter as needed.
+
+	:return: JSON object with array of datasource metadata
+
+	:return-schema: {
+		type=object,
+		properties={
+			datasources={
+				type=array,
+				items={
+					type=object,
+					properties={
+						id={type=string},
+						name={type=string},
+						has_map_item={type=boolean},
+						is_from_zeeschuimer={type=boolean}
+					}
+				}
+			}
+		}
+	}
+	"""
+	# one entry per datasource for which a worker class can be found
+	listing = []
+	for ds_id, ds_meta in g.modules.datasources.items():
+		worker = _get_search_class(g.modules, ds_id)
+		if worker:
+			map_item = getattr(worker, "map_item", None)
+			listing.append({
+				"id": ds_id,
+				"name": ds_meta.get("name", ds_id),
+				"is_from_zeeschuimer": getattr(worker, "is_from_zeeschuimer", False),
+				"has_map_item": callable(map_item),
+			})
+
+	# return the entries ordered by datasource ID
+	listing.sort(key=lambda entry: entry["id"])
+
+	return jsonify({"datasources": listing}), 200
+
+class MissingMappedFieldEncoder(json.JSONEncoder):
+	"""JSON encoder that emits a MissingMappedField as a tagged object."""
+
+	def default(self, obj):
+		"""Tag missing fields as `{"__missing": true, "value": ...}`."""
+		if not isinstance(obj, MissingMappedField):
+			# anything else is left to the stock encoder, which raises TypeError
+			return super().default(obj)
+
+		return {"__missing": True, "value": obj.value}
+
+@component.route("/api/dataset/<string:key>/items/", methods=["GET", "HEAD"])
+@api_ratelimit
+@login_required
+@current_app.openapi.endpoint("tool")
+def dataset_items(key):
+	"""
+	Get a dataset's mapped items as JSON.
+
+	Returns the same per-item content as the CSV download at
+	`/mapped-result/<key>/` (datasource `map_item` applied, annotations
+	merged in by default), but as JSON. Two response modes:
+
+	- Default (paginated): a JSON envelope `{key, offset, limit, total,
+	  returned, next_offset, items}`. `next_offset` is `null` on the last
+	  page. Default `limit` is 100, max 1000.
+	- Stream (`?stream=true`): the entire dataset as NDJSON (one JSON
+	  object per line). `offset` and `limit` are ignored.
+
+	Paginated mode re-reads the dataset file from the start on every
+	request (it skips `offset` rows before yielding), so use `?stream=true`
+	to enumerate the full dataset in one pass.
+
+	ZIP archive datasets are not supported and will return 400; download
+	the archive directly instead.
+
+	Responds to HEAD with the same status code and headers as GET but no
+	body — useful as a cheap metadata probe. Successful responses carry
+	`X-4CAT-Dataset-Key`, `X-4CAT-Dataset-Type`, `X-4CAT-Dataset-Num-Rows`
+	and `X-4CAT-Dataset-Is-Finished`, plus `X-4CAT-Dataset-Datasource` and
+	`X-4CAT-Dataset-Extension` when those values are known.
+
+	Authenticate via the `Authentication` header or `?access-token` query
+	parameter using a 4CAT access token.
+
+	:param str key: Dataset key.
+	:request-param int ?offset: Skip this many rows before returning items
+	                            (paginated mode only, default 0).
+	:request-param int ?limit: Return at most N items, 1-1000
+	                           (paginated mode only, default 100).
+	:request-param bool ?stream: If truthy, stream the full dataset as
+	                             NDJSON instead of paginating.
+	:request-param bool ?annotations: Include annotation columns
+	                                  (default true).
+	:request-param str ?missing_fields: How to represent fields that were
+	                                    missing in the source data. One of
+	                                    `default` (replace with the
+	                                    datasource's fallback value; the
+	                                    same behavior as the CSV export) or
+	                                    `keep` (preserve as
+	                                    `{"__missing": true, "value": ...}`
+	                                    in the JSON output so the caller
+	                                    can distinguish missing from
+	                                    present). Default `default`.
+	:request-param str ?access-token: Access token; only required if not
+	                                  logged in currently.
+
+	:return: JSON envelope (paginated) or NDJSON stream.
+	:return-error 404: If the dataset does not exist.
+	:return-error 403: If the dataset is private and the caller is not an
+	                   owner.
+	:return-error 400: If query parameters are invalid, or the dataset's
+	                   storage format is not iterable as items (e.g. ZIP).
+	"""
+	try:
+		dataset = DataSet(key=key, db=g.db, modules=g.modules)
+	except DataSetException:
+		return error(404, error="Dataset not found.")
+
+	if dataset.is_private and not (
+			g.config.get("privileges.can_view_private_datasets")
+			or dataset.is_accessible_by(current_user)):
+		return error(403, error="This dataset is private.")
+
+	extension = dataset.get_extension()
+	if extension == "zip":
+		return error(400, error="ZIP archive datasets cannot be served as JSON items; download the archive directly.")
+
+	# metadata headers, attached to every successful response (HEAD included)
+	headers = {
+		"X-4CAT-Dataset-Key": dataset.key,
+		"X-4CAT-Dataset-Type": dataset.type,
+		"X-4CAT-Dataset-Num-Rows": str(dataset.num_rows),
+		"X-4CAT-Dataset-Is-Finished": "true" if dataset.is_finished() else "false",
+	}
+	datasource = dataset.parameters.get("datasource")
+	if datasource:
+		headers["X-4CAT-Dataset-Datasource"] = datasource
+	if extension:
+		headers["X-4CAT-Dataset-Extension"] = extension
+
+	if request.method == "HEAD":
+		return current_app.response_class(status=200, headers=headers)
+
+	truthy = ("true", "1", "yes")
+	falsy = ("false", "0", "no")
+	stream = request.args.get("stream", "").lower() in truthy
+	include_annotations = request.args.get("annotations", "true").lower() not in falsy
+
+	missing_fields = request.args.get("missing_fields", "default").lower()
+	if missing_fields not in ("default", "keep"):
+		return error(400, error="`missing_fields` must be 'default' or 'keep'")
+
+	iter_kwargs = {
+		"warn_unmappable": False,
+		"get_annotations": include_annotations,
+		"map_missing": missing_fields,
+	}
+
+	if stream:
+		def ndjson_stream():
+			for item in dataset.iterate_items(**iter_kwargs):
+				yield json.dumps(item, cls=MissingMappedFieldEncoder) + "\n"
+
+		return current_app.response_class(
+			stream_with_context(ndjson_stream()),
+			mimetype="application/x-ndjson",
+			headers=headers,
+		)
+
+	# Paginated mode
+	try:
+		offset = int(request.args.get("offset", 0))
+	except ValueError:
+		return error(400, error="`offset` must be an integer")
+	if offset < 0:
+		return error(400, error="`offset` must be non-negative")
+
+	try:
+		limit = int(request.args.get("limit", 100))
+	except ValueError:
+		return error(400, error="`limit` must be an integer")
+	MAX_LIMIT = 1000
+	if limit < 1 or limit > MAX_LIMIT:
+		return error(400, error=f"`limit` must be between 1 and {MAX_LIMIT}; use ?stream=true for the full dataset")
+
+	items = list(itertools.islice(dataset.iterate_items(offset=offset, **iter_kwargs), limit))
+	total = dataset.num_rows
+	end = offset + len(items)
+	# a page shorter than `limit` means iteration ran dry, so it is the last page
+	next_offset = end if len(items) == limit and end < total else None
+
+	return current_app.response_class(
+		json.dumps({
+			"key": dataset.key,
+			"type": dataset.type,
+			"offset": offset,
+			"limit": limit,
+			"total": total,
+			"returned": len(items),
+			"next_offset": next_offset,
+			"items": items,
+		}, cls=MissingMappedFieldEncoder),
+		mimetype="application/json",
+		headers=headers,
+	)
\ No newline at end of file