diff --git a/.github/workflows/zeeschuimer_map_item_sync.yml b/.github/workflows/zeeschuimer_map_item_sync.yml index 54e3d8221..755884e52 100644 --- a/.github/workflows/zeeschuimer_map_item_sync.yml +++ b/.github/workflows/zeeschuimer_map_item_sync.yml @@ -1,14 +1,252 @@ -# Bootstrap the Zeeschuimer map_item sync workflow -# This is necessary to test workflow in PR (so far as I can tell) +# Auto-translate Zeeschuimer datasource `map_item` functions from Python to JS +# and open a draft PR per module against digitalmethodsinitiative/zeeschuimer. +# +# Triggers on pushes to master that touch any Zeeschuimer datasource (or this +# workflow file itself). Also exposes a `workflow_dispatch` trigger with a +# `bootstrap` input for the initial run that translates every Zeeschuimer +# datasource at once (single PR). Datasources without a matching Zeeschuimer +# module — e.g. facebook — are skipped automatically. +# +# Architecture: a `detect` job groups changed files by module and emits a +# matrix; a `sync` job fans out one parallel run per module, each opening +# (or updating) its own PR on a stable per-module branch. The planning +# and PR-body logic are in `helper-scripts/map_item_ci.py` (unit-tested +# in tests/test_map_item_sync.py); the LLM translation and JS +# splicing live in `helper-scripts/map_item_converter.py`. +# +# Required secrets (configured in repo Settings -> Secrets and variables -> Actions): +# ZEESCHUIMER_APP_ID - numeric App ID of the GitHub App installed on +# digitalmethodsinitiative/zeeschuimer with permissions +# contents:write + pull-requests:write (and nothing else) +# ZEESCHUIMER_APP_PRIVATE_KEY - full PEM private key for that App (including BEGIN/END lines) +# DMI_OLLAMA_KEY - API key for https://ollama.digitalmethods.net (legacy fallback) +# +# Optional overrides — set in repo Settings -> Secrets and variables -> Actions to change +# the provider used by automatic (push-triggered) runs without editing this file.
+# Resolution order for each setting (first non-empty wins): +# API key: workflow_dispatch input -> LLM_PROVIDER_API_KEY secret -> DMI_OLLAMA_KEY secret +# provider/url/model/output_mode: workflow_dispatch input -> repo variable -> hardcoded default below +# +# LLM_PROVIDER_API_KEY (secret) - generic key for the active provider; swap when switching providers +# LLM_PROVIDER (variable) - provider type for LLMAdapter (default: ollama) +# LLM_BASE_URL (variable) - provider base URL (default: https://ollama.digitalmethods.net) +# LLM_MODEL (variable) - model name (default: qwen2.5-coder:14b) +# LLM_OUTPUT_MODE (variable) - structured or prompt (default: structured) name: Sync Zeeschuimer map_item from 4CAT on: + push: + branches: [master] + paths: + # Only datasource changes drive a push-triggered translation: the detect + # job's plan-matrix diffs `datasources/**` and nothing else + - 'datasources/**/search_*.py' + - '.github/workflows/zeeschuimer_map_item_sync.yml' workflow_dispatch: + # NOTE: defaults are '' which falls through to Github settings (see Optional overrides above) + inputs: + bootstrap: + description: 'Translate every Zeeschuimer datasource (initial sync, single PR). Ignored if "files" is set.' + type: boolean + default: false + files: + description: 'Space-separated list of datasource files to translate (e.g. "datasources/tiktok/search_tiktok.py"). Overrides bootstrap. One PR per module.' + type: string + default: '' + llm_provider: + description: 'LLM provider type for LLMAdapter. Leave blank to use LLM_PROVIDER variable or default (ollama).' + type: string + default: '' + llm_base_url: + description: 'LLM provider base URL. Leave blank to use LLM_BASE_URL variable or default (https://ollama.digitalmethods.net).' + type: string + default: '' + llm_api_key: + description: 'LLM API key. Leave blank to use LLM_PROVIDER_API_KEY secret or DMI_OLLAMA_KEY secret.' + type: string + default: '' + model: + description: 'LLM model name. 
Leave blank to use LLM_MODEL variable or default (qwen2.5-coder:14b). Examples: qwen2.5-coder:7b, deepseek-coder-v2:16b, gemma3:27b' + type: string + default: '' + output_mode: + description: 'LLM output mode (structured or prompt). Leave blank to use LLM_OUTPUT_MODE variable or default (structured). Use prompt for models that do not support structured output (e.g. gpt-oss-120b).' + type: string + default: '' + +# Least privilege: this workflow's own GITHUB_TOKEN only needs to read the 4CAT repository; writes to Zeeschuimer use the App token minted in the sync job. +permissions: + contents: read jobs: - sync-map-item: + detect: + name: Detect modules to translate runs-on: ubuntu-latest + outputs: + mode: ${{ steps.plan.outputs.mode }} + matrix: ${{ steps.plan.outputs.matrix }} steps: - - name: Placeholder - run: echo "Workflow scaffold is valid." \ No newline at end of file + - name: Checkout 4CAT + uses: actions/checkout@v4 + with: + # Full history: the push-event plan diffs `github.event.before` + # against `github.sha`. A shallow clone may not contain `before` for + # a multi-commit push, in which case the diff resolves to nothing and + # the change is silently skipped. + fetch-depth: 0 + + - name: Set up Python + # `detect` runs map_item_ci.py (stdlib only — no LLM deps installed + # here), but still needs a `python` on PATH; don't rely on the runner + # image happening to provide one. + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Plan translation matrix + id: plan + env: + EVENT_NAME: ${{ github.event_name }} + INPUTS_FILES: ${{ inputs.files }} + INPUTS_BOOTSTRAP: ${{ inputs.bootstrap }} + BEFORE_SHA: ${{ github.event.before }} + AFTER_SHA: ${{ github.sha }} + # Validates paths against a strict datasource shape (dropping anything + # else) and writes `mode` + `matrix` to $GITHUB_OUTPUT. See + # helper-scripts/map_item_ci.py.
+ run: python helper-scripts/map_item_ci.py plan-matrix + + sync: + name: Sync ${{ matrix.target.module }} + needs: detect + if: needs.detect.outputs.mode != 'none' + runs-on: ubuntu-latest + # Per-module concurrency: a newer push to master supersedes any in-flight + # sync for the same module (LLM run gets cancelled, latest run wins). + # Each matrix instance gets its own group, so different modules don't block. + concurrency: + group: zeeschuimer-sync-${{ matrix.target.module }} + cancel-in-progress: true + strategy: + fail-fast: false + matrix: + target: ${{ fromJson(needs.detect.outputs.matrix) }} + steps: + - name: Checkout 4CAT + uses: actions/checkout@v4 + with: + # Full history so the PR-body builder can `git diff before..after` + # for the changed Python file (see map_item_ci.py build-pr-body). + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install LLM dependencies + run: | + # LLMAdapter (common/lib/llm.py) imports every provider's langchain + # package at module load, so all of them are required even though we + # only use the Ollama provider at runtime. Derive the exact list from + # setup.py (single source of truth) so it can't drift from what 4CAT + # declares; we install only this LLM subset, not all of 4CAT, to keep + # the job light. Write the specs to a requirements file (one per line) + # and install with `-r`, rather than an unquoted `pip install $VAR`: + # that way a version specifier that contains a shell metacharacter + # (e.g. a future `langchain_core>=0.3` pin — `>` is redirection) can't + # be misparsed by the shell. 
+ python helper-scripts/map_item_ci.py llm-requirements > llm-requirements.txt + echo "Installing from setup.py:" + cat llm-requirements.txt + pip install -r llm-requirements.txt + + - name: Mint Zeeschuimer App token + id: app_token + uses: actions/create-github-app-token@v1 + with: + app-id: ${{ secrets.ZEESCHUIMER_APP_ID }} + private-key: ${{ secrets.ZEESCHUIMER_APP_PRIVATE_KEY }} + owner: digitalmethodsinitiative + repositories: zeeschuimer + + - name: Checkout Zeeschuimer + uses: actions/checkout@v4 + with: + repository: digitalmethodsinitiative/zeeschuimer + path: zeeschuimer-checkout + token: ${{ steps.app_token.outputs.token }} + + - name: Run translation + env: + PROVIDER_API_KEY: ${{ inputs.llm_api_key || secrets.LLM_PROVIDER_API_KEY || secrets.DMI_OLLAMA_KEY }} + LLM_PROVIDER: ${{ inputs.llm_provider || vars.LLM_PROVIDER || 'ollama' }} + LLM_BASE_URL: ${{ inputs.llm_base_url || vars.LLM_BASE_URL || 'https://ollama.digitalmethods.net' }} + LLM_MODEL: ${{ inputs.model || vars.LLM_MODEL || 'qwen2.5-coder:14b' }} + LLM_OUTPUT_MODE: ${{ inputs.output_mode || vars.LLM_OUTPUT_MODE || 'structured' }} + # Pass matrix values through env rather than interpolating them into the script. + # IS_BOOTSTRAP is always the literal true/false the detect job emitted; + # MODULE_FILES is a list of paths the detect job already validated against + # the `datasources/<module>/search_<name>.py` shape (no shell + # metacharacters), so the unquoted `$MODULE_FILES` expansion is safe + # and still word-splits into multiple --files arguments. + IS_BOOTSTRAP: ${{ matrix.target.bootstrap }} + MODULE_FILES: ${{ matrix.target.files }} + run: | + if [ "$IS_BOOTSTRAP" = "true" ]; then + # Bootstrap translates every datasource in one run; --no-fail-fast + # so one datasource failing doesn't abort the whole initial sync.
+ python helper-scripts/map_item_converter.py \ + --bootstrap \ + --no-fail-fast \ + --zeeschuimer-checkout ./zeeschuimer-checkout \ + --output-manifest ./manifest.json + else + python helper-scripts/map_item_converter.py \ + --files $MODULE_FILES \ + --zeeschuimer-checkout ./zeeschuimer-checkout \ + --output-manifest ./manifest.json + fi + + - name: Build PR body + id: pr_body + env: + MODULE: ${{ matrix.target.module }} + BOOTSTRAP: ${{ matrix.target.bootstrap }} + BEFORE_SHA: ${{ github.event.before }} + AFTER_SHA: ${{ github.sha }} + RUN_ID: ${{ github.run_id }} + EVENT_NAME: ${{ github.event_name }} + REPO: ${{ github.repository }} + # Reads manifest.json, writes pr_body.md, and writes `title` to + # $GITHUB_OUTPUT (delimiter form, injection-safe). See map_item_ci.py. + run: python helper-scripts/map_item_ci.py build-pr-body --manifest manifest.json --out pr_body.md + + - name: Check there are JS changes to PR + id: have_changes + working-directory: zeeschuimer-checkout + run: | + if [ -z "$(git status --porcelain)" ]; then + echo "has_changes=false" >> "$GITHUB_OUTPUT" + echo "No JS changes produced by translation; not opening a PR." + else + echo "has_changes=true" >> "$GITHUB_OUTPUT" + fi + + - name: Open or update Zeeschuimer PR + if: steps.have_changes.outputs.has_changes == 'true' + # Third-party action that operates with a write token to the Zeeschuimer + # repo — pinned to a full commit SHA (the v6 release) rather than the + # mutable `@v6` tag, so a tag move can't silently change what runs here. + uses: peter-evans/create-pull-request@c5a7806660adbe173f04e3e038b0ccdcd758773c # v6 + with: + path: zeeschuimer-checkout + token: ${{ steps.app_token.outputs.token }} + # Stable per-module branch: a fresh push that retranslates the same + # module updates the same PR. Different modules never share a branch. 
+ branch: auto/4cat-map-item-sync-${{ matrix.target.module }} + title: ${{ steps.pr_body.outputs.title }} + commit-message: "chore: sync map_item for ${{ matrix.target.module }} from 4CAT ${{ github.sha }}" + body-path: pr_body.md + draft: true diff --git a/.gitignore b/.gitignore index 8850a7bcc..5aa6eae56 100644 --- a/.gitignore +++ b/.gitignore @@ -46,6 +46,9 @@ webtool/venv/ *.ipynb venv/ __pycache__/ +.claude/ +# ignore symlink at the repo root -> config/extensions. +/extensions # do not ignore interface images !webtool/static/img/*.png diff --git a/common/lib/dataset.py b/common/lib/dataset.py index 37aaed1ba..53766b59a 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -495,14 +495,18 @@ def iterate_items( :param max_unmappable: Skip at most this many unmappable items; if more are encountered, stop iterating. `None` to never stop. :param map_missing: Indicates what to do with mapped items for which - some fields could not be mapped. Defaults to 'empty_str'. Must be one of: + some fields could not be mapped. Defaults to 'default'. Must be one of: - 'default': fill missing fields with the default passed by map_item + - 'keep': leave the MissingMappedField sentinel in place so the caller + can tell which fields were missing (useful for JSON serialisation + via MissingMappedFieldEncoder) - 'abort': raise a MappedItemIncompleteException if a field is missing - a callback: replace missing field with the return value of the callback. The MappedItem object is passed to the callback as the first argument and the name of the missing field as the second. - a dictionary with a key for each possible missing field: replace missing - field with a strategy for that field ('default', 'abort', or a callback) + field with a strategy for that field ('default', 'keep', 'abort', or + a callback) :param get_annotations: Whether to also fetch annotations from the database. This can be disabled to help speed up iteration. 
:param offset: After how many rows we should yield items. @@ -587,6 +591,10 @@ def iterate_items( mapped_item.data[missing_field] = strategy( mapped_item.data, missing_field ) + elif strategy == "keep": + # leave the MissingMappedField in place so the + # caller can distinguish missing from present + continue elif strategy == "abort": # raise an exception to be handled at the processor level raise MappedItemIncompleteException( @@ -599,7 +607,7 @@ def iterate_items( ].value else: raise ValueError( - "map_missing must be 'abort', 'default', or a callback." + "map_missing must be 'abort', 'default', 'keep', or a callback." ) else: mapped_item = original_item diff --git a/common/lib/llm.py b/common/lib/llm.py index 0901194d1..e696d2918 100644 --- a/common/lib/llm.py +++ b/common/lib/llm.py @@ -121,6 +121,22 @@ def _load_llm(self) -> BaseChatModel: ) self.model = llm.model_name return llm + elif self.provider == "litellm": + url = f"{self.base_url}/" if not self.base_url.endswith("/") else self.base_url + url += "v1/" if not url.endswith("v1/") else "" + + llm = ChatOpenAI( + model=self.model, + temperature=self.temperature, + api_key=SecretStr(self.api_key), + base_url=url, + max_tokens=self.max_tokens, + default_headers={ + "x-litellm-api-key": f"Bearer {self.api_key}" + } + ) + self.model = llm.model_name + return llm else: raise ValueError(f"Unsupported LLM provider: {self.provider}") @@ -287,7 +303,25 @@ def _format_media_block( }} return {"type": "image_url", "image_url": {"url": data_uri}} - def set_structure(self, json_schema): + def set_structure(self, json_schema, method=None, include_raw=False, strict=None): + """ + Bind a JSON schema so the model returns schema-validated structured output. + + :param json_schema: JSON schema dict (or JSON string) describing the output. + :param method: How structured output is enforced. None uses LangChain's + per-provider default (usually "function_calling", which binds a tool). 
+ For reasoning models served over an OpenAI-compatible proxy, pass + "json_schema" — constrained decoding forces the answer channel itself + to match the schema, rather than relying on a clean tool call that the + model may emit in the wrong channel (yielding empty, unparseable output). + :param include_raw: When True, structured-output calls return a + {"raw", "parsed", "parsing_error"} dict instead of raising on a parse + failure, so callers can inspect the raw AIMessage (finish_reason, + reasoning channel, token usage) to diagnose what went wrong. + :param strict: Passed through to with_structured_output when not None. + Use strict=False for schemas that don't satisfy OpenAI strict-mode + requirements but are fine for a guided-decoding backend (e.g. vLLM). + """ if not json_schema: raise ValueError("json_schema is None") @@ -301,7 +335,12 @@ def set_structure(self, json_schema): json_schema = {"type": "json_schema", "json_schema": {"schema": json_schema}} self.llm = self.llm.bind(response_format=json_schema) else: - self.llm = self.llm.with_structured_output(json_schema) + kwargs = {"include_raw": include_raw} + if method: + kwargs["method"] = method + if strict is not None: + kwargs["strict"] = strict + self.llm = self.llm.with_structured_output(json_schema, **kwargs) self.structured_output = True @staticmethod diff --git a/helper-scripts/map_item_ci.py b/helper-scripts/map_item_ci.py new file mode 100644 index 000000000..78b742a50 --- /dev/null +++ b/helper-scripts/map_item_ci.py @@ -0,0 +1,483 @@ +""" +CI glue for the Zeeschuimer `map_item` sync workflow: translation-matrix +planning (`plan-matrix`) and PR-body construction (`build-pr-body`). + +This logic used to live as Python heredocs embedded in +`.github/workflows/zeeschuimer_map_item_sync.yml`. It was moved here so it can +be unit-tested and linted like the rest of the codebase. 
+ +IMPORTANT: this module is intentionally pure-stdlib and MUST NOT import +`map_item_converter` (or anything under `common/`). The `detect` job runs +`plan-matrix` WITHOUT installing the LLM dependencies (langchain etc.), so any +heavy import here would break it. + +Usage (from the 4CAT repo root): + python helper-scripts/map_item_ci.py plan-matrix + python helper-scripts/map_item_ci.py build-pr-body --manifest manifest.json --out pr_body.md + python helper-scripts/map_item_ci.py llm-requirements + +`llm-requirements` prints the langchain/pydantic/requests pip specs read from +setup.py, so the workflow installs the same LLM stack 4CAT declares instead of +a hand-maintained list that can drift. + +`plan-matrix` reads EVENT_NAME / INPUTS_FILES / INPUTS_BOOTSTRAP / BEFORE_SHA / +AFTER_SHA from the environment and writes `mode` and `matrix` to $GITHUB_OUTPUT. +`build-pr-body` reads MODULE / BOOTSTRAP / BEFORE_SHA / AFTER_SHA / RUN_ID / +EVENT_NAME / REPO, writes the PR body to `--out`, and writes `title` to +$GITHUB_OUTPUT. +""" +from __future__ import annotations + +import argparse +import ast +import json +import os +import re +import subprocess +import sys +from typing import Callable, Optional + + +# Strict shape for a datasource path: `datasources//search_.py`. +# Anchored and restricted to a safe charset (no shell metacharacters) so that a +# path coming from `git diff` or, especially, a `workflow_dispatch` `files` +# input can be interpolated into the sync job's `--files` shell argument +# without any risk of command injection. Anything not matching is dropped and +# logged (never silently passed through). +DATASOURCE_PATH_RE = re.compile(r"^datasources/[A-Za-z0-9_-]+/search_[A-Za-z0-9_]+\.py$") + +# Pathspec used to limit the push-event diff to datasource search files. +_DATASOURCE_PATHSPEC = "datasources/*/search_*.py" + + +def _dist_name(spec: str) -> str: + """Bare distribution name from a requirement spec: strip version, extras, + and environment markers. 
`requests~=2.27` -> `requests`, + `Flask_Limiter[memcached]` -> `Flask_Limiter`.""" + return re.split(r"[<>=!~;\[ ]", spec, maxsplit=1)[0].strip() + + +def extract_llm_requirements(setup_py_source: str) -> list[str]: + """ + Pull the LLM dependency specs (langchain*, pydantic, requests) straight out + of setup.py's package sets. The sync job installs only this subset (not all + of 4CAT) to stay light, but deriving it from setup.py means the list can't + silently drift from what the app actually declares — and it picks up new + langchain providers automatically. + + Returns sorted, de-duplicated requirement strings with whatever version + specifiers setup.py uses. + """ + specs = set() + for node in ast.walk(ast.parse(setup_py_source)): + if not isinstance(node, ast.Assign): + continue + if not any( + isinstance(t, ast.Name) and t.id in ("core_packages", "processor_packages") + for t in node.targets + ): + continue + for elt in getattr(node.value, "elts", []): + if isinstance(elt, ast.Constant) and isinstance(elt.value, str): + specs.add(elt.value) + + return sorted( + spec + for spec in specs + if _dist_name(spec).startswith("langchain") or _dist_name(spec) in ("pydantic", "requests") + ) + + +def _git_diff_names(before: str, after: str) -> list[str]: + """ + Names of datasource search files changed between two commits. Returns an + empty list (rather than raising) if git can't resolve the range — e.g. a + shallow clone that doesn't contain `before`. The caller treats "can't tell" + the same as "nothing changed". + """ + try: + out = subprocess.check_output( + ["git", "diff", "--name-only", before, after, "--", _DATASOURCE_PATHSPEC], + text=True, + ) + except (subprocess.CalledProcessError, OSError): + # CalledProcessError: range can't resolve (shallow clone). OSError / + # FileNotFoundError: git not on PATH. Either way, "can't tell" == "nothing". 
+ return [] + return [line for line in out.splitlines() if line.strip()] + + +def plan_matrix( + event_name: str, + inputs_files: str, + inputs_bootstrap: bool, + before: str, + after: str, + git_diff: Optional[Callable[[str, str], list[str]]] = None, +) -> tuple[str, list[dict], list[str]]: + """ + Decide what to translate. Returns `(mode, matrix, rejected)`: + + - mode: "bootstrap" | "files" | "none" + - matrix: list of `{"module", "files", "bootstrap"}` entries for the + `sync` job's matrix. + - rejected: candidate paths dropped because they don't match + `DATASOURCE_PATH_RE` (logged by the caller for transparency). + + `git_diff` is injectable for testing; it defaults to a real `git diff`. + """ + inputs_files = (inputs_files or "").strip() + + # Bootstrap is special: a single PR covering every datasource. An explicit + # `files` input overrides bootstrap; honor that. + if event_name == "workflow_dispatch" and inputs_bootstrap and not inputs_files: + return "bootstrap", [{"module": "bootstrap", "files": "", "bootstrap": True}], [] + + # Resolve the candidate file list. + if event_name == "workflow_dispatch" and inputs_files: + candidates = inputs_files.split() + else: + if git_diff is None: + git_diff = _git_diff_names + candidates = git_diff(before, after) + + # Validate before anything reaches a shell. Drop (and report) anything that + # isn't a plain `datasources//search_.py` path. 
+ files: list[str] = [] + rejected: list[str] = [] + for path in candidates: + path = path.strip() + if not path: + continue + if DATASOURCE_PATH_RE.match(path): + files.append(path) + else: + rejected.append(path) + + # Group by module: datasources//search_*.py + modules: dict[str, list[str]] = {} + for path in files: + parts = path.split("/") + modules.setdefault(parts[1], []).append(path) + + if not modules: + return "none", [], rejected + + matrix = [ + {"module": mod, "files": " ".join(sorted(paths)), "bootstrap": False} + for mod, paths in sorted(modules.items()) + ] + return "files", matrix, rejected + + +def _git_python_diff(before: str, after: str, python_file: str) -> str: + """`git diff before..after -- `; "" if the range can't resolve + or git isn't available.""" + try: + return subprocess.check_output( + ["git", "diff", "{}..{}".format(before, after), "--", python_file], + text=True, + ) + except (subprocess.CalledProcessError, OSError): + return "" + + +def _code_fence(content: str, lang: str = "") -> tuple[str, str]: + """ + Return `(open, close)` markdown code-fence markers long enough that nothing + inside `content` can close the block early. A diff (or LLM text) may itself + contain a ``` run or an HTML tag like ``; GitHub renders those + literally only while the fence stays intact, so we use one more backtick + than the longest run already present (minimum three). + """ + longest = max((len(run) for run in re.findall(r"`+", content)), default=0) + ticks = "`" * max(3, longest + 1) + return ticks + lang, ticks + + +def build_pr_body( + manifest: dict, + module: str, + is_bootstrap: bool, + before: str, + after: str, + run_id: str, + event_name: str, + repo: str, + python_diff: Optional[Callable[[str, str, str], str]] = None, +) -> tuple[str, str]: + """ + Build the draft-PR `(title, body)` from a translation manifest. `python_diff` + is injectable for testing; it defaults to a real `git diff` and is only + invoked for `push` events. 
+ """ + if python_diff is None: + python_diff = _git_python_diff + + model = manifest.get("model", "(unknown)") + provider = manifest.get("provider", "ollama") + total_duration = manifest.get("total_duration_seconds") + entries = manifest.get("entries", []) + + short_sha = after[:7] + lines: list[str] = [] + lines.append( + "> :robot: This PR was auto-generated by the [4CAT map_item sync " + "workflow](https://github.com/{}/actions/runs/{}). The JavaScript was " + "produced by an LLM and **requires human review** before merging — " + "including manual fixes for any lint warnings flagged below.".format(repo, run_id) + ) + lines.append("") + lines.append("## Generation parameters") + lines.append("- **Model:** `{}` (provider: `{}`)".format(model, provider)) + if total_duration is not None: + lines.append("- **Total LLM time:** {}s".format(total_duration)) + if is_bootstrap: + lines.append( + "- **Trigger:** manual `workflow_dispatch` with `bootstrap=true` " + "(initial sync of all Zeeschuimer datasources)." 
+ ) + elif event_name == "workflow_dispatch": + lines.append("- **Trigger:** manual `workflow_dispatch` for `{}`.".format(module)) + else: + lines.append( + "- **Trigger:** push of [`{}`](https://github.com/{}/commit/{}) to 4CAT " + "master (module: `{}`).".format(short_sha, repo, after, module) + ) + lines.append("") + + ok = [e for e in entries if e["status"] == "ok"] + ok_with_warnings = [e for e in ok if e.get("lint_warnings")] + failed = [e for e in entries if e["status"] == "failed"] + skipped = [e for e in entries if e["status"] == "skipped"] + + lines.append("## Summary") + lines.append("- :white_check_mark: {} translated".format(len(ok))) + if ok_with_warnings: + lines.append( + "- :warning: {} translated with lint warnings (require manual fix)".format( + len(ok_with_warnings) + ) + ) + lines.append("- :x: {} failed".format(len(failed))) + lines.append("- :grey_question: {} skipped".format(len(skipped))) + lines.append("") + + if ok: + lines.append("| Datasource | Module | Time | Warnings |") + lines.append("|---|---|---:|---:|") + for entry in ok: + dur = entry.get("duration_seconds") + dur_cell = "{}s".format(dur) if dur is not None else "—" + warn_count = len(entry.get("lint_warnings") or []) + warn_cell = ":warning: {}".format(warn_count) if warn_count else "—" + lines.append( + "| `{}` | `{}` | {} | {} |".format( + entry["python_file"], entry["js_file"], dur_cell, warn_cell + ) + ) + lines.append("") + + if ok_with_warnings: + lines.append("## :warning: Lint warnings — fix before merging") + lines.append("") + lines.append( + "The following datasources translated successfully but the static lint " + "flagged issues that need human fixes. The auto-generated code was " + "spliced into the JS module as-is; please patch the file directly in " + "this PR." 
+ ) + lines.append("") + for entry in ok_with_warnings: + lines.append("**`{}` -> `{}`**".format(entry["python_file"], entry["js_file"])) + for w in entry["lint_warnings"]: + lines.append("- {}".format(w)) + lines.append("") + + for entry in ok: + dur = entry.get("duration_seconds") + header_dur = " ({}s)".format(dur) if dur is not None else "" + warn_marker = " :warning:" if entry.get("lint_warnings") else "" + lines.append( + "## `{}` -> `{}`{}{}".format( + entry["python_file"], entry["js_file"], header_dur, warn_marker + ) + ) + if entry.get("commentary"): + lines.append("**LLM commentary:**") + lines.append("") + lines.append("> " + entry["commentary"].replace("\n", "\n> ")) + lines.append("") + if event_name == "push": + diff = python_diff(before, after, entry["python_file"]) + else: + diff = "" + if diff.strip(): + fence_open, fence_close = _code_fence(diff, "diff") + lines.append("
<details><summary>Python diff</summary>") + lines.append("") + lines.append(fence_open) + lines.append(diff.rstrip()) + lines.append(fence_close) + lines.append("</details>
") + lines.append("") + + if failed: + lines.append("## Failures") + for entry in failed: + dur = entry.get("duration_seconds") + dur_str = " (after {}s)".format(dur) if dur is not None else "" + lines.append( + "- `{}`{}: {}".format( + entry["python_file"], dur_str, entry.get("error", "(no error message)") + ) + ) + lines.append("") + + if skipped: + lines.append("## Skipped") + for entry in skipped: + lines.append("- `{}`: {}".format(entry["python_file"], entry.get("error", ""))) + lines.append("") + + body = "\n".join(lines) + + # Title is single-module in the matrix path; bootstrap is its own + # special-case (one PR covering every datasource). + ok_modules: list[str] = [] + for entry in ok: + parts = entry["python_file"].split("/") + if len(parts) >= 2 and parts[0] == "datasources": + mod = parts[1] + if mod not in ok_modules: + ok_modules.append(mod) + + if is_bootstrap: + title = "Auto-translated map_item updates from 4CAT (bootstrap, {} datasources)".format( + len(ok_modules) + ) + elif not ok_modules: + title = "Auto-translated map_item updates from 4CAT: {}".format(module) + else: + title = "Auto-translated map_item updates from 4CAT: {}".format(", ".join(ok_modules)) + + return title, body + + +def set_output(name: str, value: str) -> None: + """ + Append a `name=value` step output to $GITHUB_OUTPUT using the heredoc + delimiter form, which is safe for values containing `\\n` or `=` (a plain + `name=value` line can be abused to inject extra outputs). No-op when + $GITHUB_OUTPUT is unset (e.g. running locally). + """ + out_path = os.environ.get("GITHUB_OUTPUT") + if not out_path: + return + # A delimiter that cannot appear in our values. If it somehow does, strip it + # rather than emit a malformed/forgeable block. 
+ delim = "ghadelim_{}_b3f9c1".format(name) + safe_value = value.replace(delim, "") + with open(out_path, "a", encoding="utf-8") as f: + f.write("{name}<<{delim}\n{value}\n{delim}\n".format(name=name, delim=delim, value=safe_value)) + + +def _cmd_plan_matrix() -> int: + mode, matrix, rejected = plan_matrix( + event_name=os.environ.get("EVENT_NAME", ""), + inputs_files=os.environ.get("INPUTS_FILES", ""), + inputs_bootstrap=os.environ.get("INPUTS_BOOTSTRAP", "").lower() == "true", + before=os.environ.get("BEFORE_SHA", ""), + after=os.environ.get("AFTER_SHA", ""), + ) + + if rejected: + print( + "Plan: dropped {} path(s) not matching `datasources//search_.py`:".format( + len(rejected) + ) + ) + for path in rejected: + print(" - {!r}".format(path)) + + set_output("mode", mode) + set_output("matrix", json.dumps(matrix)) + + if mode == "bootstrap": + print("Plan: bootstrap (single PR)") + elif mode == "none": + print("Plan: nothing to translate") + else: + print("Plan: {} module(s)".format(len(matrix))) + for entry in matrix: + print(" - {}: {}".format(entry["module"], entry["files"])) + return 0 + + +def _cmd_build_pr_body(manifest_path: str, out_path: str) -> int: + with open(manifest_path, encoding="utf-8") as f: + manifest = json.load(f) + + title, body = build_pr_body( + manifest, + module=os.environ["MODULE"], + is_bootstrap=os.environ.get("BOOTSTRAP", "").lower() == "true", + before=os.environ.get("BEFORE_SHA", ""), + after=os.environ.get("AFTER_SHA", ""), + run_id=os.environ.get("RUN_ID", ""), + event_name=os.environ.get("EVENT_NAME", ""), + repo=os.environ.get("REPO", ""), + ) + + with open(out_path, "w", encoding="utf-8") as f: + f.write(body) + print("Wrote {} ({} chars)".format(out_path, len(body))) + + set_output("title", title) + print("PR title: {}".format(title)) + return 0 + + +def _cmd_llm_requirements() -> int: + setup_py = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "setup.py") + with open(setup_py, encoding="utf-8") as f: + specs = 
extract_llm_requirements(f.read()) + if not specs: + print("error: no LLM requirements found in setup.py core_packages", file=sys.stderr) + return 1 + # One spec per line (valid requirements.txt): the workflow redirects this to + # a file and runs `pip install -r`, so a specifier containing a shell + # metacharacter (`>=`, `<`) is never word-split or treated as a redirection. + print("\n".join(specs)) + return 0 + + +def main(argv: Optional[list[str]] = None) -> int: + parser = argparse.ArgumentParser(description=__doc__) + sub = parser.add_subparsers(dest="command", required=True) + + sub.add_parser("plan-matrix", help="Emit the translation matrix to $GITHUB_OUTPUT.") + + body = sub.add_parser("build-pr-body", help="Build the draft-PR body from a manifest.") + body.add_argument("--manifest", required=True, help="Path to the translation manifest JSON.") + body.add_argument("--out", required=True, help="Where to write the PR body markdown.") + + sub.add_parser( + "llm-requirements", + help="Print the LLM pip requirements (langchain*/pydantic/requests) from setup.py.", + ) + + args = parser.parse_args(argv) + + if args.command == "plan-matrix": + return _cmd_plan_matrix() + if args.command == "build-pr-body": + return _cmd_build_pr_body(args.manifest, args.out) + if args.command == "llm-requirements": + return _cmd_llm_requirements() + parser.error("unknown command") + return 2 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/helper-scripts/map_item_converter.py b/helper-scripts/map_item_converter.py new file mode 100644 index 000000000..8720920fb --- /dev/null +++ b/helper-scripts/map_item_converter.py @@ -0,0 +1,1162 @@ +""" +Translate 4CAT Zeeschuimer-import datasource `map_item` functions from Python +to JavaScript and splice them into the corresponding Zeeschuimer +`modules/.js` file. + +Designed to be invoked by a GitHub Action whenever a Zeeschuimer datasource's +Python file changes on master. 
Can also be run locally for testing or via +`workflow_dispatch` with `--bootstrap` to translate every datasource at once. + +The LLM produces only the new `map_item` function (plus any imports/helpers it +needs and free-text commentary). This script does the file integration: it +locates a marker block in the existing JS module and replaces its contents, +preserving every hand-written line outside the markers. + +Usage: + PROVIDER_API_KEY=... python helper-scripts/map_item_converter.py \\ + --files datasources/tiktok/search_tiktok.py \\ + --zeeschuimer-checkout ../zeeschuimer \\ + --output-manifest /tmp/manifest.json +""" +from __future__ import annotations + +import argparse +import ast +import json +import os +import re +import sys +import time +import traceback +from pathlib import Path +from typing import Optional, TYPE_CHECKING + +sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), "..")) + +# NOTE: `common.lib.llm.LLMAdapter` (and the langchain stack it pulls in) is +# imported lazily inside main(), not at module load. That keeps the pure +# helpers here — path derivation, splicing, linting — importable for unit +# tests without the heavy LLM dependencies installed. `from __future__ import +# annotations` makes the `LLMAdapter` type hints below strings, so they don't +# need the import at definition time either. +if TYPE_CHECKING: + # Resolved only by type checkers / linters (never at runtime), so the + # `llm: LLMAdapter` annotations below have a defined name without forcing + # the langchain import. + from common.lib.llm import LLMAdapter + +# Sibling module — lives next to this script in helper-scripts/. Python adds the +# script's directory to sys.path automatically when the file is run directly. +from map_item_rules import RULES, get_regex_lint_rules + + +# 4CAT datasource path -> Zeeschuimer module path is derived by convention: +# the Python file is `datasources/<dir>/search_<name>.py`; the JS module is +# `modules/<name>.js`.
# The convention only depends on the Python *filename*, not the directory, so
# cases where they differ still work (e.g. `xiaohongshu/search_rednote.py` ->
# `modules/rednote.js`, `twitter-import/search_twitter.py` -> `modules/twitter.js`).
#
# Datasources without a matching Zeeschuimer module (today: facebook) are
# skipped automatically — the JS file existence check in `translate_one`
# handles them without any explicit allow-list. New Zeeschuimer datasources
# added to 4CAT are picked up automatically as long as Zeeschuimer ships the
# matching `modules/<name>.js` file.
def python_to_js_module(python_rel: str) -> Optional[str]:
    """
    Derive the Zeeschuimer module path for a 4CAT datasource Python file.

    The path must have exactly three `/`-separated segments:
    `datasources/<dir>/search_<name>.py`, with a non-empty `<dir>` and a
    non-empty `<name>`. Underscores in `<name>` become hyphens in the module
    filename.

    :param str python_rel:  Repository-relative path of the Python file,
                            using forward slashes.
    :return:  `modules/<name>.js`, or None if the path doesn't follow
              `datasources/<dir>/search_*.py`.
    """
    parts = python_rel.split("/")
    if len(parts) != 3:
        return None
    root, directory, filename = parts
    # An empty directory segment (`datasources//search_x.py`) is a malformed
    # path, not a datasource, so it is rejected like any other mismatch.
    if root != "datasources" or not directory:
        return None
    prefix, suffix = "search_", ".py"
    if not (filename.startswith(prefix) and filename.endswith(suffix)):
        return None
    base = filename[len(prefix):-len(suffix)]
    if not base:
        # bare `search_.py` carries no module name
        return None
    return f"modules/{base.replace('_', '-')}.js"


# Defaults for the LLM connection.
DEFAULT_LLM_PROVIDER = "ollama"
DEFAULT_BASE_URL = "https://ollama.digitalmethods.net"
DEFAULT_MODEL = "qwen2.5-coder:14b"

# Marker comment lines used in the Zeeschuimer JS modules: one start/end pair
# for the auto-generated imports block, one pair for the auto-generated
# map_item block.
IMPORTS_MARKER_START = "// === auto-generated imports for map_item — BLOCK REPLACED AUTOMATICALLY ==="
IMPORTS_MARKER_END = "// === end auto-generated imports ==="
BLOCK_MARKER_START = "// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY ==="
BLOCK_MARKER_END = "// === end auto-generated ==="

# JSON schema describing the structured LLM response: the translated function,
# optional imports and helpers, and free-text commentary. All four fields are
# required.
LLM_SCHEMA = {
    "title": "MapItemTranslation",
    "type": "object",
    "required": ["map_item_function", "imports_to_add", "helpers_to_add", "commentary"],
    "properties": {
        "map_item_function": {
            "type": "string",
            "description": (
                "Full JavaScript source of the new map_item function. Must include "
                "the function declaration (e.g. 'export function map_item(item) {...}'). "
                "Do not include surrounding code from the module."
            ),
        },
        "imports_to_add": {
            "type": "array",
            "items": {"type": "string"},
            "description": (
                "Complete ES-module import statements that map_item needs. "
                "Normally empty: Zeeschuimer's `js/lib.js` (which provides MappedItem, "
                "MissingMappedField, normalize_url_encoding, strip_tags) is loaded as a "
                "plain script, NOT an ES module — its declarations are already global, "
                "so do not write `import { X } from '../js/lib.js'`. Only populate this "
                "if you genuinely need to import from another ES module."
            ),
        },
        "helpers_to_add": {
            "type": "array",
            "items": {"type": "string"},
            "description": (
                "Full source of any helper functions map_item depends on (e.g. a JS "
                "port of normalize_url_encoding). Empty array if none needed."
            ),
        },
        "commentary": {
            "type": "string",
            "description": (
                "Notes for the human reviewer: assumptions made, fields you were "
                "unsure about, Python idioms that don't translate cleanly. Plain text."
            ),
        },
    },
}

# System prompt for the translation request: fixes the model's role and the
# output contract (function only, exact field names, no markdown fences).
SYSTEM_PROMPT = (
    "You translate 4CAT Python `map_item` functions into JavaScript for the "
    "Zeeschuimer browser extension. You return ONLY the new map_item function, "
    "any imports it needs, any helper functions it depends on, and commentary "
    "for the human reviewer. You NEVER return the surrounding module file. "
    "You preserve the field names produced by the Python function exactly. "
    "You do not invent fields not present in the Python output. "
    "You output raw JavaScript source — never wrap it in markdown code fences "
    "(```js, ```javascript, etc.). The fields in your structured response are "
    "already typed as code; fences make them invalid."
)