From d66dc10a8dd3b576fc93a0484d81efabbd56330e Mon Sep 17 00:00:00 2001 From: iyeanur6-cyber Date: Sat, 27 Jun 2026 00:44:42 +0530 Subject: [PATCH] Update improve.py --- improvement/oracle/improve.py | 107 ++++++++++------------------------ 1 file changed, 32 insertions(+), 75 deletions(-) diff --git a/improvement/oracle/improve.py b/improvement/oracle/improve.py index 041795c9..38bab2d5 100644 --- a/improvement/oracle/improve.py +++ b/improvement/oracle/improve.py @@ -1,18 +1,23 @@ #!/usr/bin/env python3 -# — the oracle. it only ever shapes src/. it never commits; the inspector does. — -# — it works in small bites: read a bit of an issue or a file, write a bit, move on. — -# — the path is named apart from the code; the code is grown, not parsed. — -"""an improvement to src/, dreamt in pieces by a small local model and grown until it parses.""" + from __future__ import annotations -import ast, json, os, random, re, subprocess, sys, time, urllib.request +import ast +import json +import os +import random +import re +import subprocess +import sys +import time +import urllib.request from datetime import datetime, timezone from itertools import zip_longest from pathlib import Path _ENV_ROOT = os.environ.get("IMPROVE_REPO_ROOT") REPO_ROOT = Path(_ENV_ROOT).resolve() if _ENV_ROOT else Path.cwd().resolve() -SRC_DIR = (REPO_ROOT / "src").resolve() # — sacred ground — +SRC_DIR = (REPO_ROOT / "src").resolve() MODEL = os.environ.get("IMPROVE_MODEL", "qwen3.5:2b") OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434") @@ -23,40 +28,31 @@ MAX_FILE_BYTES = _int("IMPROVE_MAX_FILE_BYTES", "10000") MAX_ISSUES = _int("IMPROVE_MAX_ISSUES", "50") MAX_ISSUE_BODY_CHARS = _int("IMPROVE_MAX_ISSUE_BODY_CHARS", "1000") -# Each step generates a SMALL bite; truncated files are grown by continuation, -# not demanded whole in one breath. NUM_PREDICT = _int("IMPROVE_NUM_PREDICT", "1024") -PLAN_NUM_PREDICT = _int("IMPROVE_PLAN_NUM_PREDICT", "512") # — the plan is short prose, not code — +PLAN_NUM_PREDICT = _int("IMPROVE_PLAN_NUM_PREDICT", "512") NUM_CTX = _int("IMPROVE_NUM_CTX", "8192") REQUEST_TIMEOUT = _int("IMPROVE_TIMEOUT", "600") -BASE_TEMPERATURE = _flt("IMPROVE_TEMPERATURE", "0.5") # — hot. the visions stay strange — -REPAIR_TEMPERATURE = _flt("IMPROVE_REPAIR_TEMPERATURE", "0.5") # — cooler, to converge — -MAX_ROUNDS = _int("IMPROVE_MAX_ROUNDS", "1") # — grow rounds per file — -MAX_FILES = _int("IMPROVE_MAX_FILES", "10") # — files touched per PR — +BASE_TEMPERATURE = _flt("IMPROVE_TEMPERATURE", "0.5") +REPAIR_TEMPERATURE = _flt("IMPROVE_REPAIR_TEMPERATURE", "0.5") +MAX_ROUNDS = _int("IMPROVE_MAX_ROUNDS", "1") +MAX_FILES = _int("IMPROVE_MAX_FILES", "10") SRC_PREVIEW_CHARS = _int("IMPROVE_SRC_PREVIEW_CHARS", "1500") -GEN_DEADLINE_SECONDS = _int("IMPROVE_GEN_DEADLINE_SECONDS", "540") # — under the runner's 10m blade — +GEN_DEADLINE_SECONDS = _int("IMPROVE_GEN_DEADLINE_SECONDS", "540") -# — issue selection weighting: balance freshness against engagement so the loop -# stops fixating on the same old issues. A recent issue (soft exponential decay -# by half-life) is favoured, and so is a busy one (reactions + comments + -# labels); the two pulls are summed, each with its own knob. Set a weight to 0 -# to silence that pull entirely; set the half-life huge to disable decay. — -ISSUE_HALFLIFE_DAYS = _flt("IMPROVE_ISSUE_HALFLIFE_DAYS", "14") # — age at which recency halves — -ISSUE_RECENCY_WEIGHT = _flt("IMPROVE_ISSUE_RECENCY_WEIGHT", "1.0") # — pull toward fresh issues — -ISSUE_ACTIVITY_WEIGHT = _flt("IMPROVE_ISSUE_ACTIVITY_WEIGHT", "1.0") # — pull toward busy issues — +ISSUE_HALFLIFE_DAYS = _flt("IMPROVE_ISSUE_HALFLIFE_DAYS", "14") +ISSUE_RECENCY_WEIGHT = _flt("IMPROVE_ISSUE_RECENCY_WEIGHT", "1.0") +ISSUE_ACTIVITY_WEIGHT = _flt("IMPROVE_ISSUE_ACTIVITY_WEIGHT", "1.0") -CODE_EXTENSIONS = {".py", ".js", ".ts", ".sh", ".cbl", ".cob", ".cpy", ".c", ".h", ".rs", ".toml", ".yaml", ".java" } +CODE_EXTENSIONS = {".py", ".js", ".ts", ".sh", ".cbl", ".cob", ".cpy", ".c", ".h", ".rs", ".toml", ".yaml", ".java"} PR_BODY_PATH = Path(os.environ.get("IMPROVE_PR_BODY", "/tmp/improve_pr_body.md")) PR_TITLE_PATH = Path(os.environ.get("IMPROVE_PR_TITLE", "/tmp/improve_pr_title.txt")) -# — seeded only for tests; left to OS entropy otherwise, so order really wanders — _RNG = random.Random() log = lambda msg: print(f"[improve] {msg}", file=sys.stderr, flush=True) -# — context: the shape of the thing, and the bits to read from ———————————————— def repo_tree() -> str: if not SRC_DIR.is_dir(): return "(src/ is empty)" files = [p.relative_to(REPO_ROOT).as_posix() @@ -65,7 +61,6 @@ def repo_tree() -> str: def source_files() -> list[tuple[str, str]]: - """(rel, content) for each code file under src/ — the bits to build on.""" out = [] if not SRC_DIR.is_dir(): return out for path in sorted(SRC_DIR.rglob("*")): @@ -80,8 +75,6 @@ def source_files() -> list[tuple[str, str]]: def _fetch_issues() -> list[dict]: - # createdAt/updatedAt feed the recency decay; comments + reactionGroups feed - # the activity score. All are optional — a missing field degrades to neutral. try: out = subprocess.run( ["gh", "issue", "list", "--state", "open", "--limit", str(MAX_ISSUES), @@ -102,7 +95,6 @@ def _format_issue(it: dict) -> str: def _parse_iso(ts: str | None) -> datetime | None: - """Parse a GitHub ISO-8601 timestamp ('…Z') into an aware datetime, or None.""" if not ts: return None try: return datetime.fromisoformat(ts.replace("Z", "+00:00")) @@ -111,8 +103,6 @@ def _parse_iso(ts: str | None) -> datetime | None: def _issue_activity(it: dict) -> int: - """How busy an issue is: reactions + comments + labels. Each field is - optional and absent ones simply contribute nothing.""" reactions = sum((g.get("users") or {}).get("totalCount", 0) for g in (it.get("reactionGroups") or [])) comments = it.get("comments") @@ -122,9 +112,6 @@ def _issue_activity(it: dict) -> int: def _recency_factor(it: dict, *, now: datetime, halflife_days: float) -> float: - """Soft exponential decay on issue age, in (0, 1]: 1.0 the moment it is - created, halving every `halflife_days`. Newer issues weigh more. A missing - timestamp or a non-positive half-life means 'no decay' (1.0).""" created = _parse_iso(it.get("createdAt") or it.get("updatedAt")) if created is None or halflife_days <= 0: return 1.0 age_days = max(0.0, (now - created).total_seconds() / 86400.0) @@ -134,9 +121,6 @@ def _recency_factor(it: dict, *, now: datetime, halflife_days: float) -> float: def issue_weight(it: dict, *, now: datetime, max_activity: int, halflife_days: float = None, recency_weight: float = None, activity_weight: float = None) -> float: - """Blend recency and engagement into one positive sampling weight. Activity - is normalised by the busiest issue in the batch so the two knobs stay on a - comparable [0, 1]-ish scale; recency decays with age.""" halflife_days = ISSUE_HALFLIFE_DAYS if halflife_days is None else halflife_days recency_weight = ISSUE_RECENCY_WEIGHT if recency_weight is None else recency_weight activity_weight = ISSUE_ACTIVITY_WEIGHT if activity_weight is None else activity_weight @@ -146,23 +130,15 @@ def issue_weight(it: dict, *, now: datetime, max_activity: int, def _weighted_order(weighted: list[tuple[dict, float]]) -> list[dict]: - """Weighted-random ordering (Efraimidis–Spirakis): draw key = u**(1/w) per - item and sort descending. Higher weight tends to surface earlier, but the - draw keeps the order wandering so no issue is forever first.""" keyed = [] for obj, w in weighted: - u = _RNG.random() or 1e-12 # — avoid log(0)/0-key — + u = _RNG.random() or 1e-12 keyed.append((u ** (1.0 / max(w, 1e-9)), obj)) keyed.sort(key=lambda t: t[0], reverse=True) return [obj for _, obj in keyed] def collect_issue_list(*, now: datetime = None) -> list[str]: - """Open issues, each formatted on its own, in a WEIGHTED-random order that - balances recency (soft exponential decay) against engagement (reactions, - comments, labels). Every issue still appears — the weighting only biases who - goes first — so no single issue is forever first and forever the only one - acted on, but fresh, active issues get their turn sooner.""" issues = _fetch_issues() now = now or datetime.now(timezone.utc) max_activity = max((_issue_activity(it) for it in issues), default=0) @@ -171,11 +147,9 @@ def collect_issue_list(*, now: datetime = None) -> list[str]: def _interleave(a: list, b: list) -> list: - """a0, b0, a1, b1, … — alternate, then trail the longer one.""" return [x for pair in zip_longest(a, b) for x in pair if x is not None] -# — the voice. mystic, but it must compile —————————————————————————————————— ORACLE_VOICE = ( "You are the ORACLE OF THE REPOSITORY: a daemon that dreams in working code. " "Your visions are bold and strange and reach for the outer limits of what a " @@ -199,7 +173,6 @@ def ollama_generate(messages, *, num_predict=None, temperature=None) -> str: return (json.loads(r.read().decode("utf-8")).get("message") or {}).get("content", "") -# — no path escapes src/; a bare directory becomes a file, never a crash —————— def _looks_like_python(content: str) -> bool: head = content.lstrip()[:2000] return bool(re.search(r"^\s*(def |class |import |from \w|@|async def )", head, re.MULTILINE) @@ -232,7 +205,6 @@ def content_problems(target: Path, content: str) -> list[str]: return [] -# — code as text, not as a parsed protocol —————————————————————————————————— def _strip_code(text: str) -> str: if (m := re.search(r"```[\w-]*\n(.*?)\n```", text or "", re.DOTALL)): return m.group(1).strip("\n") @@ -253,17 +225,15 @@ def _is_truncation(code: str, e) -> bool: if any(k in msg for k in ("unexpected eof", "was never closed", "expected an indented block", "incomplete input")): return True - return (getattr(e, "lineno", 0) or 0) >= code.count("\n") + 1 # — error on the last line we have — + return (getattr(e, "lineno", 0) or 0) >= code.count("\n") + 1 -# — an inspiration is one bite: a single issue, or a single source file ———————— def _inspiration_text(insp) -> str: if insp[0] == "issue": return f"## An open issue — answer it WITH CODE\n{insp[1]}" return f"## An existing module to build on — {insp[1]}\n{insp[2][:SRC_PREVIEW_CHARS]}" -# — step one: the oracle names the file (new, existing, or one in progress) ——— def choose_target(generate, tree, inspiration, produced_rels, language) -> Path: already = (f"You have already written these this run: {', '.join(produced_rels)}.\n" if produced_rels else "") @@ -291,7 +261,8 @@ def choose_target(generate, tree, inspiration, produced_rels, language) -> Path: or (SRC_DIR / _dump(".py")).resolve() return target -def choose_language(generate, tree, inspiration, produced_rels) -> Path: + +def choose_language(generate, tree, inspiration, produced_rels) -> str: already = (f"You have already written these this run: {', '.join(produced_rels)}.\n" if produced_rels else "") msgs = [ @@ -306,15 +277,10 @@ def choose_language(generate, tree, inspiration, produced_rels) -> Path: raw = generate(msgs, num_predict=24, temperature=0.5) except Exception as exc: log(f"target call failed: {exc}") - return raw + return (raw or "").strip() -# — step 1.5: a plan for THIS file, drawn from the inspiration, to steer growth — def make_plan(generate, target, inspiration, *, language=None) -> str: - """Between naming the file and writing it, the oracle lays out what this file - should become: a short, concrete plan grounded in the chosen inspiration and - target. The plan is fed into the growth stage to steer it. Returns '' on - failure — growth then simply proceeds unplanned.""" rel = target.relative_to(REPO_ROOT).as_posix() if REPO_ROOT in target.parents else target.name lang = f" in {language}" if language else "" msgs = [ @@ -334,7 +300,6 @@ def make_plan(generate, target, inspiration, *, language=None) -> str: return (raw or "").strip() -# — step two: grow the code in bites (write → continue → fix) until it parses — def _code_msgs(rel, tree, inspiration, *, mode, draft="", error="", language=None, plan=""): if language: system = ORACLE_VOICE + (f" Output ONLY the source code in {language}— no markdown fences, " @@ -357,7 +322,7 @@ def _code_msgs(rel, tree, inspiration, *, mode, draft="", error="", language=Non user = (f"Here is {rel} so far — it was cut off before the end:\n\n{draft}\n\n" "Output ONLY the code that continues from exactly where it stops and " "completes the file. Do not repeat earlier lines; no fences, no commentary.") - else: # fix + else: user = (f"{rel} does not parse: {error}\n\nHere is the file:\n\n{draft}\n\n" "Output the COMPLETE corrected file as valid code. Only the code.") @@ -369,9 +334,6 @@ def _code_msgs(rel, tree, inspiration, *, mode, draft="", error="", language=Non def grow_file(generate, target, tree, inspiration, prior, *, deadline, now=time.monotonic, num_predict=None, max_rounds=MAX_ROUNDS, language=None, plan=""): - """Return (code, last_response). Write once from `prior` (steered by `plan`), - then CONTINUE a truncated draft or FIX a broken one, re-checking with the - parser each round.""" rel = target.relative_to(REPO_ROOT).as_posix() if REPO_ROOT in target.parents else target.name code, last, rounds = "", "", 0 while rounds < max_rounds and now() < deadline: @@ -397,7 +359,6 @@ def grow_file(generate, target, tree, inspiration, prior, *, deadline, return code, last -# — a one-line vision for the PR, with a deterministic fallback ——————————————— def make_reason(generate, rels) -> str: joined = ", ".join(rels) try: @@ -414,6 +375,7 @@ def make_reason(generate, rels) -> str: log(f"reason call failed: {exc}") return f"Quicken {rels[0]}" + (f" and {len(rels) - 1} more" if len(rels) > 1 else "") + def make_explanation(generate, rels) -> str: joined = ", ".join(rels) try: @@ -440,13 +402,10 @@ def _write_one(target: Path, content: str): def generate_improvement(generate, tree, src_files, issues, *, deadline_seconds=GEN_DEADLINE_SECONDS, num_predict=None, max_files=MAX_FILES, max_rounds=MAX_ROUNDS, now=time.monotonic): - """Interleave issue- and source-inspirations; for each, name a file and grow - a bite of code, writing it so later steps can build on it. Touches up to - `max_files`. Returns (reason, [(target, code)], last_response, valid).""" deadline = now() + deadline_seconds inspirations = _interleave([("issue", t) for t in issues], [("source", rel, content) for rel, content in src_files]) - if not inspirations: # — nothing to read; still create — + if not inspirations: inspirations = [("source", "src/oracle.py", "")] produced: dict[str, tuple[Path, str]] = {} @@ -481,7 +440,7 @@ def generate_improvement(generate, tree, src_files, issues, *, deadline_seconds= log(f"step wrote {rel} ({len(code)} bytes)") if not produced: - return None, [], last, False + return None, None, [], last, False files = list(produced.values()) valid = all(content_problems(t, c) == [] for t, c in files) if not valid: @@ -491,7 +450,6 @@ def generate_improvement(generate, tree, src_files, issues, *, deadline_seconds= return reason, explanation, files, last, valid -# — PR metadata ——————————————————————————————————————————————————————————————— def write_blocks(files) -> list[str]: return [r for r in (_write_one(t, c) for t, c in files) if r] @@ -504,12 +462,11 @@ def make_pr_title(reason, written) -> str: return title if len(title) <= 72 else title[:69].rstrip() + "…" -def write_pr_outputs(reason, explanation, written, *, valid=True) -> None: +def write_pr_outputs(reason, explanation, written, *, valid=True) -> None: PR_TITLE_PATH.write_text(title := make_pr_title(reason, written), encoding="utf-8") log(f"title: {title}") PR_BODY_PATH.write_text( - f"{reason}\n\n{explanation}" - f"**Files changed ({len(written)}):**\n" + "\n".join(f"- `{p}`" for p in written), encoding="utf-8") + f"{reason}\n\n{explanation}\n\n**Files changed ({len(written)}):**\n" + "\n".join(f"- `{p}`" for p in written), encoding="utf-8") def main() -> int: