From 70bcefdee2e4a4f5d2326299bfd7bf15260cadd5 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sun, 7 Jun 2026 16:08:12 -0500 Subject: [PATCH 01/13] feat(node): withheld-paths endpoint reporting a caller's denied globs --- crates/gitlawb-node/src/api/visibility.rs | 27 +++++++++++++++ crates/gitlawb-node/src/server.rs | 4 +++ crates/gitlawb-node/src/visibility.rs | 41 +++++++++++++++++++++++ 3 files changed, 72 insertions(+) diff --git a/crates/gitlawb-node/src/api/visibility.rs b/crates/gitlawb-node/src/api/visibility.rs index 531c724..5a36648 100644 --- a/crates/gitlawb-node/src/api/visibility.rs +++ b/crates/gitlawb-node/src/api/visibility.rs @@ -185,6 +185,33 @@ pub async fn list_visibility( }))) } +/// GET /api/v1/repos/{owner}/{repo}/withheld-paths +/// +/// Returns only the path globs the (optionally authenticated) caller is denied, +/// so a clean-clone client can sparse-exclude them. Unlike `list_visibility` +/// this is not owner-gated and never exposes reader_dids. +pub async fn withheld_paths( + State(state): State, + auth: Option>, + Path((owner, repo)): Path<(String, String)>, +) -> Result> { + let record = state + .db + .get_repo(&owner, &repo) + .await? 
+ .ok_or_else(|| AppError::RepoNotFound(format!("{owner}/{repo}")))?; + + let rules = state.db.list_visibility_rules(&record.id).await?; + let caller = auth.as_ref().map(|e| e.0 .0.as_str()); + let withheld = + crate::visibility::withheld_globs(&rules, record.is_public, &record.owner_did, caller); + + Ok(Json(serde_json::json!({ + "repo": format!("{owner}/{repo}"), + "withheld": withheld, + }))) +} + #[cfg(test)] mod tests { use super::validate_path_glob; diff --git a/crates/gitlawb-node/src/server.rs b/crates/gitlawb-node/src/server.rs index 4a8ec37..9baea20 100644 --- a/crates/gitlawb-node/src/server.rs +++ b/crates/gitlawb-node/src/server.rs @@ -352,6 +352,10 @@ pub fn build_router(state: AppState) -> Router { "/{owner}/{repo}/git-upload-pack", post(repos::git_upload_pack), ) + .route( + "/api/v1/repos/{owner}/{repo}/withheld-paths", + axum::routing::get(visibility::withheld_paths), + ) .layer(DefaultBodyLimit::disable()) .layer(RequestBodyLimitLayer::new(pack_limit)) .layer(middleware::from_fn(auth::optional_signature)); diff --git a/crates/gitlawb-node/src/visibility.rs b/crates/gitlawb-node/src/visibility.rs index b246dbf..6cc6445 100644 --- a/crates/gitlawb-node/src/visibility.rs +++ b/crates/gitlawb-node/src/visibility.rs @@ -96,6 +96,29 @@ pub fn visibility_check( } } +/// The subtree path globs that `caller` (None = anonymous) may NOT read, given +/// the repo's rules. Whole-repo ("/") rules are excluded: a denied whole-repo +/// read is handled by the 404 gate before a clone ever starts. Each remaining +/// rule is reported when `visibility_check` denies the caller at the glob's +/// representative path. Used by the clean-clone client to sparse-exclude the +/// private paths from checkout. 
+pub fn withheld_globs( + rules: &[VisibilityRule], + is_public: bool, + owner_did: &str, + caller: Option<&str>, +) -> Vec { + rules + .iter() + .filter(|r| r.path_glob != "/") + .filter(|r| { + let probe = glob_prefix(&r.path_glob); + visibility_check(rules, is_public, owner_did, caller, probe) == Decision::Deny + }) + .map(|r| r.path_glob.clone()) + .collect() +} + #[cfg(test)] mod tests { use super::*; @@ -116,6 +139,24 @@ mod tests { const OWNER: &str = "did:key:z6MkOwner"; + #[test] + fn withheld_globs_lists_only_denied_subtrees() { + let rules = [ + rule("/secret/**", VisibilityMode::B, &["did:key:z6MkFriend"]), + rule("/docs/**", VisibilityMode::B, &["did:key:z6MkStranger"]), + ]; + // Stranger is denied /secret but allowed /docs. + let mut got = withheld_globs(&rules, true, OWNER, Some("did:key:z6MkStranger")); + got.sort(); + assert_eq!(got, vec!["/secret/**".to_string()]); + // Owner is denied nothing. + assert!(withheld_globs(&rules, true, OWNER, Some(OWNER)).is_empty()); + // Anonymous is denied both. + let mut anon = withheld_globs(&rules, true, OWNER, None); + anon.sort(); + assert_eq!(anon, vec!["/docs/**".to_string(), "/secret/**".to_string()]); + } + #[test] fn no_rules_public_allows_anonymous() { assert_eq!( From 3e1a2038fa758fae7e27735d7dfe7c783cb81698 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sun, 7 Jun 2026 16:15:20 -0500 Subject: [PATCH 02/13] feat(gl): gl clone with promisor + sparse-exclude for private subtrees --- crates/gl/src/clone.rs | 248 +++++++++++++++++++++++++++++++++++++++++ crates/gl/src/main.rs | 5 + 2 files changed, 253 insertions(+) create mode 100644 crates/gl/src/clone.rs diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs new file mode 100644 index 0000000..9b1ef3e --- /dev/null +++ b/crates/gl/src/clone.rs @@ -0,0 +1,248 @@ +//! `gl clone`: clean partial clone of a gitlawb repo with private subtrees. +//! +//! 
A repo may withhold blob content under some path globs from the caller +//! (Phase 3). The resulting pack is not closed under reachability, so a stock +//! `git clone` is refused at fetch. This command clones as a promisor +//! (`--filter=blob:none`) and sparse-excludes the caller's withheld globs, +//! producing a clean checkout: public files present, private paths absent. + +use anyhow::{bail, Context, Result}; +use clap::Args; +use serde_json::Value; +use std::path::Path; +use std::process::Command; + +use crate::http::NodeClient; +use crate::identity::load_keypair_from_dir; + +#[derive(Args)] +pub struct CloneArgs { + /// Repo to clone: gitlawb:/// or /. + pub repo: String, + + /// Destination directory (default: the repo name). + pub dir: Option, + + /// Branch to check out (default: the remote's default branch). + #[arg(long)] + pub branch: Option, + + #[arg(long, default_value = "https://node.gitlawb.com", env = "GITLAWB_NODE")] + pub node: String, +} + +/// Run a git command inside `dir`, erroring with stderr on failure. +fn git(dir: &Path, args: &[&str]) -> Result<()> { + let out = Command::new("git") + .args(args) + .current_dir(dir) + .output() + .with_context(|| format!("running git {args:?}"))?; + if !out.status.success() { + bail!( + "git {args:?} failed: {}", + String::from_utf8_lossy(&out.stderr) + ); + } + Ok(()) +} + +/// Run a git command not tied to a working tree (e.g. `clone`). +fn git_global(args: &[&str]) -> Result<()> { + let out = Command::new("git") + .args(args) + .output() + .with_context(|| format!("running git {args:?}"))?; + if !out.status.success() { + bail!( + "git {args:?} failed: {}", + String::from_utf8_lossy(&out.stderr) + ); + } + Ok(()) +} + +/// Clone `remote_url` into `dest`, excluding `withheld_globs` from checkout. +/// `dest` must not already exist. With nothing withheld this is a plain full +/// clone. 
With globs withheld it clones as a promisor (`--filter=blob:none`, +/// marking the repo a promisor so the node's non-closed pack is accepted) +/// without checkout, sparse-excludes each glob, then checks out so the absent +/// blobs are never materialized. `--no-cone` is required for negated excludes. +pub fn setup_partial_clone( + dest: &Path, + remote_url: &str, + withheld_globs: &[String], + branch: Option<&str>, +) -> Result<()> { + let dest_str = dest + .to_str() + .context("destination path is not valid UTF-8")?; + + if withheld_globs.is_empty() { + match branch { + Some(b) => git_global(&["clone", "-q", "--branch", b, remote_url, dest_str])?, + None => git_global(&["clone", "-q", remote_url, dest_str])?, + } + return Ok(()); + } + + git_global(&[ + "clone", + "-q", + "--filter=blob:none", + "--no-checkout", + remote_url, + dest_str, + ])?; + git(dest, &["sparse-checkout", "init", "--no-cone"])?; + let mut spec = String::from("/*\n"); + for g in withheld_globs { + // "/secret/**" -> "!/secret/" + let dir = g.trim_end_matches("**").trim_end_matches('/'); + spec.push('!'); + spec.push_str(dir); + spec.push_str("/\n"); + } + std::fs::write(dest.join(".git/info/sparse-checkout"), spec) + .context("writing sparse-checkout spec")?; + + match branch { + Some(b) => git(dest, &["checkout", "-q", b])?, + None => { + let out = Command::new("git") + .args(["remote", "show", "origin"]) + .current_dir(dest) + .output()?; + let text = String::from_utf8_lossy(&out.stdout); + let head = text + .lines() + .find_map(|l| l.trim().strip_prefix("HEAD branch: ")) + .map(|s| s.to_string()) + .context("could not determine default branch")?; + git(dest, &["checkout", "-q", &head])?; + } + } + Ok(()) +} + +/// Parse `repo` into (gitlawb_url, owner, name). Accepts a full +/// `gitlawb:///` URL or a bare `/`. The owner DID may +/// itself contain colons but no slash, so split on the first slash. 
+fn parse_repo(repo: &str) -> Result<(String, String, String)> { + let stripped = repo.strip_prefix("gitlawb://").unwrap_or(repo); + let (owner, name) = stripped + .trim_end_matches('/') + .split_once('/') + .context("repo must be / or gitlawb:///")?; + if owner.is_empty() || name.is_empty() { + bail!("repo must be / or gitlawb:///"); + } + Ok(( + format!("gitlawb://{owner}/{name}"), + owner.to_string(), + name.to_string(), + )) +} + +/// Ask the node which globs are withheld for this caller. Any error or non-2xx +/// is treated as "nothing withheld" so public repos clone normally. +async fn fetch_withheld(node: &str, owner: &str, name: &str) -> Vec { + let kp = load_keypair_from_dir(None).ok(); + let signed = kp.is_some(); + let client = NodeClient::new(node, kp); + let path = format!("/api/v1/repos/{owner}/{name}/withheld-paths"); + let resp = if signed { + client.get_signed(&path).await + } else { + client.get(&path).await + }; + let resp = match resp { + Ok(r) if r.status().is_success() => r, + _ => return Vec::new(), + }; + let body: Value = resp.json().await.unwrap_or_default(); + body.get("withheld") + .and_then(|w| w.as_array()) + .map(|a| { + a.iter() + .filter_map(|x| x.as_str().map(String::from)) + .collect() + }) + .unwrap_or_default() +} + +pub async fn run(args: CloneArgs) -> Result<()> { + let (url, owner, name) = parse_repo(&args.repo)?; + let dest_name = args.dir.unwrap_or_else(|| name.clone()); + let dest = std::path::PathBuf::from(&dest_name); + if dest.exists() { + bail!("destination '{dest_name}' already exists"); + } + + let withheld = fetch_withheld(&args.node, &owner, &name).await; + if withheld.is_empty() { + println!("Cloning {url} into {dest_name}"); + } else { + println!( + "Cloning {url} into {dest_name} ({} private path(s) excluded)", + withheld.len() + ); + } + + setup_partial_clone(&dest, &url, &withheld, args.branch.as_deref())?; + println!("Done. 
Cloned into {dest_name}"); + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::process::Command; + use tempfile::TempDir; + + fn g(args: &[&str], dir: &Path) { + assert!(Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success()); + } + + #[test] + fn setup_partial_clone_excludes_withheld_path() { + let td = TempDir::new().unwrap(); + let origin = td.path().join("origin"); + let bare = td.path().join("bare.git"); + std::fs::create_dir_all(origin.join("secret")).unwrap(); + std::fs::create_dir_all(origin.join("public")).unwrap(); + std::fs::write(origin.join("public/a.txt"), b"pub\n").unwrap(); + std::fs::write(origin.join("secret/b.txt"), b"SECRET\n").unwrap(); + g(&["init", "-q"], &origin); + g(&["config", "user.email", "t@t"], &origin); + g(&["config", "user.name", "t"], &origin); + g(&["add", "."], &origin); + g(&["commit", "-qm", "init"], &origin); + g( + &[ + "clone", + "-q", + "--bare", + origin.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + + // file:// so --filter is honored (local-path clones ignore it). 
+ let dest = td.path().join("dest"); + let url = format!("file://{}", bare.display()); + setup_partial_clone(&dest, &url, &["/secret/**".to_string()], None).unwrap(); + + assert!(dest.join("public/a.txt").exists(), "public file present"); + assert!( + !dest.join("secret/b.txt").exists(), + "withheld path must be excluded from checkout" + ); + } +} diff --git a/crates/gl/src/main.rs b/crates/gl/src/main.rs index 0af7398..1c1a50d 100644 --- a/crates/gl/src/main.rs +++ b/crates/gl/src/main.rs @@ -7,6 +7,7 @@ mod agent; mod bounty; mod cert; mod changelog; +mod clone; mod doctor; mod http; mod identity; @@ -57,6 +58,9 @@ enum Commands { /// Register this agent with a gitlawb node Register(register::RegisterArgs), + /// Clone a gitlawb repo, handling private subtrees cleanly + Clone(clone::CloneArgs), + /// Manage repositories Repo(repo::RepoArgs), @@ -150,6 +154,7 @@ async fn main() -> Result<()> { match cli.command { Commands::Identity { cmd } => identity::run(cmd).await, Commands::Register(args) => register::run(args).await, + Commands::Clone(args) => clone::run(args).await, Commands::Repo(args) => repo::run(args).await, Commands::Issue(args) => issue::run(args).await, Commands::Pr(args) => pr::run(args).await, From 2fd4fe131c893bd71751f6d61e914084fc2dd8af Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sun, 7 Jun 2026 16:15:30 -0500 Subject: [PATCH 03/13] test(gl): gl clone repo-argument parsing --- crates/gl/src/clone.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs index 9b1ef3e..d06135e 100644 --- a/crates/gl/src/clone.rs +++ b/crates/gl/src/clone.rs @@ -245,4 +245,22 @@ mod tests { "withheld path must be excluded from checkout" ); } + + #[test] + fn parse_repo_accepts_url_and_bare() { + let (url, o, n) = parse_repo("gitlawb://did:key:zAbc/myrepo").unwrap(); + assert_eq!(url, "gitlawb://did:key:zAbc/myrepo"); + assert_eq!((o.as_str(), 
n.as_str()), ("did:key:zAbc", "myrepo")); + + let (url2, o2, n2) = parse_repo("did:key:zAbc/myrepo").unwrap(); + assert_eq!(url2, "gitlawb://did:key:zAbc/myrepo"); + assert_eq!((o2.as_str(), n2.as_str()), ("did:key:zAbc", "myrepo")); + } + + #[test] + fn parse_repo_rejects_malformed() { + assert!(parse_repo("noslash").is_err()); + assert!(parse_repo("gitlawb://owner/").is_err()); + assert!(parse_repo("/name").is_err()); + } } From af972499a94a8a644389ceef92fac34047866224 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sun, 7 Jun 2026 17:24:00 -0500 Subject: [PATCH 04/13] fix(visibility): gate withheld-paths and handle nested/exact globs Three fixes from the PR #33 review: - withheld_paths now applies the whole-repo "/" read gate (returns repo-not-found when the caller cannot read the root), matching the git read endpoints. Without it the endpoint disclosed a private repo's existence and path layout to unauthorized callers. The withheld_globs doc already assumed this gate existed; now it does. - A nested allow under a denied parent (e.g. "/secret/public/**" allowed, "/secret/**" denied) was over-withheld: the client sparse-excluded the whole parent and hid paths the caller may read. The endpoint now also returns a "reinclude" list (allowed globs strictly under a denied one) and gl clone re-includes them in the sparse spec after the excludes. - Wildcard-free globs like "/docs/private" match both the exact path and a subtree (per glob_matches), but the client only emitted the subtree exclude. sparse_patterns now emits both "/docs/private" and "/docs/private/". Verified the exclude-then-reinclude sparse ordering checks out cleanly with real git, plus unit tests for reincluded_globs, the nested re-include, the exact-path exclude, and sparse_patterns. 
--- crates/gitlawb-node/src/api/visibility.rs | 21 ++- crates/gitlawb-node/src/visibility.rs | 68 ++++++++++ crates/gl/src/clone.rs | 148 +++++++++++++++++++--- 3 files changed, 214 insertions(+), 23 deletions(-) diff --git a/crates/gitlawb-node/src/api/visibility.rs b/crates/gitlawb-node/src/api/visibility.rs index 5a36648..6665b9e 100644 --- a/crates/gitlawb-node/src/api/visibility.rs +++ b/crates/gitlawb-node/src/api/visibility.rs @@ -187,9 +187,11 @@ pub async fn list_visibility( /// GET /api/v1/repos/{owner}/{repo}/withheld-paths /// -/// Returns only the path globs the (optionally authenticated) caller is denied, -/// so a clean-clone client can sparse-exclude them. Unlike `list_visibility` -/// this is not owner-gated and never exposes reader_dids. +/// Returns the path globs the (optionally authenticated) caller is denied +/// (`withheld`) plus any more-specific globs that are allowed underneath a +/// denied one (`reinclude`), so a clean-clone client can sparse-exclude the +/// denied subtrees while re-including the allowed nested paths. Unlike +/// `list_visibility` this is not owner-gated and never exposes reader_dids. pub async fn withheld_paths( State(state): State, auth: Option>, @@ -203,12 +205,25 @@ pub async fn withheld_paths( let rules = state.db.list_visibility_rules(&record.id).await?; let caller = auth.as_ref().map(|e| e.0 .0.as_str()); + + // Whole-repo read gate: a caller who cannot read "/" gets repo-not-found, + // matching the git read endpoints, so this never discloses a private repo's + // existence or its path layout to an unauthorized caller. 
+ if crate::visibility::visibility_check(&rules, record.is_public, &record.owner_did, caller, "/") + == crate::visibility::Decision::Deny + { + return Err(AppError::RepoNotFound(format!("{owner}/{repo}"))); + } + let withheld = crate::visibility::withheld_globs(&rules, record.is_public, &record.owner_did, caller); + let reinclude = + crate::visibility::reincluded_globs(&rules, record.is_public, &record.owner_did, caller); Ok(Json(serde_json::json!({ "repo": format!("{owner}/{repo}"), "withheld": withheld, + "reinclude": reinclude, }))) } diff --git a/crates/gitlawb-node/src/visibility.rs b/crates/gitlawb-node/src/visibility.rs index 6cc6445..345f41d 100644 --- a/crates/gitlawb-node/src/visibility.rs +++ b/crates/gitlawb-node/src/visibility.rs @@ -119,6 +119,55 @@ pub fn withheld_globs( .collect() } +/// The allowed globs that sit strictly underneath a denied glob. A clean-clone +/// client sparse-excludes everything in `withheld_globs`, which would also hide +/// these nested allowed paths; re-including them restores the caller's access. +/// Example: with `/secret/**` denied and `/secret/public/**` allowed for the +/// same caller, `/secret/public/**` is returned here so the client re-includes +/// it after excluding `/secret/`. 
+pub fn reincluded_globs( + rules: &[VisibilityRule], + is_public: bool, + owner_did: &str, + caller: Option<&str>, +) -> Vec { + let denied: Vec<&str> = rules + .iter() + .filter(|r| r.path_glob != "/") + .filter(|r| { + visibility_check( + rules, + is_public, + owner_did, + caller, + glob_prefix(&r.path_glob), + ) == Decision::Deny + }) + .map(|r| glob_prefix(&r.path_glob)) + .collect(); + + rules + .iter() + .filter(|r| r.path_glob != "/") + .filter(|r| { + visibility_check( + rules, + is_public, + owner_did, + caller, + glob_prefix(&r.path_glob), + ) == Decision::Allow + }) + .filter(|r| { + let p = glob_prefix(&r.path_glob); + denied + .iter() + .any(|d| *d != p && p.starts_with(&format!("{d}/"))) + }) + .map(|r| r.path_glob.clone()) + .collect() +} + #[cfg(test)] mod tests { use super::*; @@ -157,6 +206,25 @@ mod tests { assert_eq!(anon, vec!["/docs/**".to_string(), "/secret/**".to_string()]); } + #[test] + fn reincluded_globs_restores_allowed_nested_path() { + let rules = [ + rule("/secret/**", VisibilityMode::B, &["did:key:z6MkFriend"]), + rule( + "/secret/public/**", + VisibilityMode::B, + &["did:key:z6MkFriend", "did:key:z6MkStranger"], + ), + ]; + // Stranger is denied /secret/** but allowed the nested /secret/public/**. + let withheld = withheld_globs(&rules, true, OWNER, Some("did:key:z6MkStranger")); + assert_eq!(withheld, vec!["/secret/**".to_string()]); + let reinc = reincluded_globs(&rules, true, OWNER, Some("did:key:z6MkStranger")); + assert_eq!(reinc, vec!["/secret/public/**".to_string()]); + // Owner is denied nothing, so there is nothing to re-include. 
+ assert!(reincluded_globs(&rules, true, OWNER, Some(OWNER)).is_empty()); + } + #[test] fn no_rules_public_allows_anonymous() { assert_eq!( diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs index d06135e..7bb481b 100644 --- a/crates/gl/src/clone.rs +++ b/crates/gl/src/clone.rs @@ -62,6 +62,18 @@ fn git_global(args: &[&str]) -> Result<()> { Ok(()) } +/// Sparse-checkout pattern(s) for a visibility glob. A subtree glob +/// (`/secret/**`) maps to the directory `/secret/`. A wildcard-free glob +/// (`/docs/private`) matches both the exact path and a subtree at that path +/// (mirroring the node's `glob_matches`), so it maps to both `/docs/private` +/// and `/docs/private/`. Callers prefix these with `!` to exclude. +fn sparse_patterns(glob: &str) -> Vec { + match glob.strip_suffix("/**") { + Some(base) => vec![format!("{base}/")], + None => vec![glob.to_string(), format!("{glob}/")], + } +} + /// Clone `remote_url` into `dest`, excluding `withheld_globs` from checkout. /// `dest` must not already exist. With nothing withheld this is a plain full /// clone. With globs withheld it clones as a promisor (`--filter=blob:none`, @@ -72,6 +84,7 @@ pub fn setup_partial_clone( dest: &Path, remote_url: &str, withheld_globs: &[String], + reinclude_globs: &[String], branch: Option<&str>, ) -> Result<()> { let dest_str = dest @@ -95,13 +108,22 @@ pub fn setup_partial_clone( dest_str, ])?; git(dest, &["sparse-checkout", "init", "--no-cone"])?; + // Non-cone sparse-checkout, gitignore-style and order-sensitive: include + // everything, exclude the withheld globs, then re-include any allowed globs + // nested under an excluded one (later patterns win). 
let mut spec = String::from("/*\n"); for g in withheld_globs { - // "/secret/**" -> "!/secret/" - let dir = g.trim_end_matches("**").trim_end_matches('/'); - spec.push('!'); - spec.push_str(dir); - spec.push_str("/\n"); + for pat in sparse_patterns(g) { + spec.push('!'); + spec.push_str(&pat); + spec.push('\n'); + } + } + for g in reinclude_globs { + for pat in sparse_patterns(g) { + spec.push_str(&pat); + spec.push('\n'); + } } std::fs::write(dest.join(".git/info/sparse-checkout"), spec) .context("writing sparse-checkout spec")?; @@ -144,9 +166,11 @@ fn parse_repo(repo: &str) -> Result<(String, String, String)> { )) } -/// Ask the node which globs are withheld for this caller. Any error or non-2xx -/// is treated as "nothing withheld" so public repos clone normally. -async fn fetch_withheld(node: &str, owner: &str, name: &str) -> Vec { +/// Ask the node which globs are withheld for this caller and which allowed globs +/// nested under them must be re-included. Returns `(withheld, reinclude)`. Any +/// error or non-2xx is treated as "nothing withheld" so public repos clone +/// normally. 
+async fn fetch_withheld(node: &str, owner: &str, name: &str) -> (Vec, Vec) { let kp = load_keypair_from_dir(None).ok(); let signed = kp.is_some(); let client = NodeClient::new(node, kp); @@ -158,17 +182,20 @@ async fn fetch_withheld(node: &str, owner: &str, name: &str) -> Vec { }; let resp = match resp { Ok(r) if r.status().is_success() => r, - _ => return Vec::new(), + _ => return (Vec::new(), Vec::new()), }; let body: Value = resp.json().await.unwrap_or_default(); - body.get("withheld") - .and_then(|w| w.as_array()) - .map(|a| { - a.iter() - .filter_map(|x| x.as_str().map(String::from)) - .collect() - }) - .unwrap_or_default() + let globs = |field: &str| -> Vec { + body.get(field) + .and_then(|w| w.as_array()) + .map(|a| { + a.iter() + .filter_map(|x| x.as_str().map(String::from)) + .collect() + }) + .unwrap_or_default() + }; + (globs("withheld"), globs("reinclude")) } pub async fn run(args: CloneArgs) -> Result<()> { @@ -179,7 +206,7 @@ pub async fn run(args: CloneArgs) -> Result<()> { bail!("destination '{dest_name}' already exists"); } - let withheld = fetch_withheld(&args.node, &owner, &name).await; + let (withheld, reinclude) = fetch_withheld(&args.node, &owner, &name).await; if withheld.is_empty() { println!("Cloning {url} into {dest_name}"); } else { @@ -189,7 +216,7 @@ pub async fn run(args: CloneArgs) -> Result<()> { ); } - setup_partial_clone(&dest, &url, &withheld, args.branch.as_deref())?; + setup_partial_clone(&dest, &url, &withheld, &reinclude, args.branch.as_deref())?; println!("Done. Cloned into {dest_name}"); Ok(()) } @@ -237,7 +264,7 @@ mod tests { // file:// so --filter is honored (local-path clones ignore it). 
let dest = td.path().join("dest"); let url = format!("file://{}", bare.display()); - setup_partial_clone(&dest, &url, &["/secret/**".to_string()], None).unwrap(); + setup_partial_clone(&dest, &url, &["/secret/**".to_string()], &[], None).unwrap(); assert!(dest.join("public/a.txt").exists(), "public file present"); assert!( @@ -246,6 +273,87 @@ mod tests { ); } + /// Build a bare remote with the given files (relative path -> contents), + /// committed on one branch. Returns (tempdir, file:// url). + fn bare_remote(files: &[(&str, &[u8])]) -> (TempDir, String) { + let td = TempDir::new().unwrap(); + let origin = td.path().join("origin"); + let bare = td.path().join("bare.git"); + for (path, contents) in files { + let full = origin.join(path); + std::fs::create_dir_all(full.parent().unwrap()).unwrap(); + std::fs::write(full, contents).unwrap(); + } + g(&["init", "-q"], &origin); + g(&["config", "user.email", "t@t"], &origin); + g(&["config", "user.name", "t"], &origin); + g(&["add", "."], &origin); + g(&["commit", "-qm", "init"], &origin); + g( + &[ + "clone", + "-q", + "--bare", + origin.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + let url = format!("file://{}", bare.display()); + (td, url) + } + + #[test] + fn reinclude_restores_allowed_nested_path() { + let (td, url) = bare_remote(&[ + ("public/a.txt", b"pub\n"), + ("secret/private/p.txt", b"PRIV\n"), + ("secret/public/s.txt", b"SHARED\n"), + ]); + let dest = td.path().join("dest"); + setup_partial_clone( + &dest, + &url, + &["/secret/**".to_string()], + &["/secret/public/**".to_string()], + None, + ) + .unwrap(); + + assert!(dest.join("public/a.txt").exists(), "public present"); + assert!( + dest.join("secret/public/s.txt").exists(), + "allowed nested path must be re-included" + ); + assert!( + !dest.join("secret/private/p.txt").exists(), + "denied nested path must stay excluded" + ); + } + + #[test] + fn exact_path_glob_is_excluded() { + // A wildcard-free glob must exclude the exact 
file, not just a subtree. + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n"), ("docs/private", b"SECRET\n")]); + let dest = td.path().join("dest"); + setup_partial_clone(&dest, &url, &["/docs/private".to_string()], &[], None).unwrap(); + + assert!(dest.join("public/a.txt").exists(), "public present"); + assert!( + !dest.join("docs/private").exists(), + "exact-path withheld file must be excluded" + ); + } + + #[test] + fn sparse_patterns_subtree_and_exact() { + assert_eq!(sparse_patterns("/secret/**"), vec!["/secret/".to_string()]); + assert_eq!( + sparse_patterns("/docs/private"), + vec!["/docs/private".to_string(), "/docs/private/".to_string()] + ); + } + #[test] fn parse_repo_accepts_url_and_bare() { let (url, o, n) = parse_repo("gitlawb://did:key:zAbc/myrepo").unwrap(); From 8f7060ad349754569ac821618670a5d6088184e3 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Mon, 8 Jun 2026 09:12:25 -0500 Subject: [PATCH 05/13] fix(gl): reject multi-slash repo input and stop failing open on withheld-path errors split_once('/') accepted owner/name/extra, smuggling a path segment into the repo name that then flowed into the API path and clone URL; reject it. fetch_withheld swallowed every network/auth/5xx/JSON error into an empty result, dropping to a stock clone that the node refuses once blobs are withheld. Now only 404/501 (endpoint unsupported) fall back to empty; the rest propagate so the real cause surfaces. 
--- crates/gl/src/clone.rs | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs index 7bb481b..3f9af70 100644 --- a/crates/gl/src/clone.rs +++ b/crates/gl/src/clone.rs @@ -156,7 +156,7 @@ fn parse_repo(repo: &str) -> Result<(String, String, String)> { .trim_end_matches('/') .split_once('/') .context("repo must be / or gitlawb:///")?; - if owner.is_empty() || name.is_empty() { + if owner.is_empty() || name.is_empty() || name.contains('/') { bail!("repo must be / or gitlawb:///"); } Ok(( @@ -167,10 +167,13 @@ fn parse_repo(repo: &str) -> Result<(String, String, String)> { } /// Ask the node which globs are withheld for this caller and which allowed globs -/// nested under them must be re-included. Returns `(withheld, reinclude)`. Any -/// error or non-2xx is treated as "nothing withheld" so public repos clone -/// normally. -async fn fetch_withheld(node: &str, owner: &str, name: &str) -> (Vec, Vec) { +/// nested under them must be re-included. Returns `(withheld, reinclude)`. A +/// node that does not implement the endpoint (404/501) yields empties so public +/// repos on older nodes still clone normally. Other failures (network, auth, +/// 5xx, malformed JSON) are propagated: failing open here would silently fall +/// back to a stock clone, which the node refuses once blobs are withheld, hiding +/// the real cause behind a confusing fetch error. 
+async fn fetch_withheld(node: &str, owner: &str, name: &str) -> Result<(Vec, Vec)> { let kp = load_keypair_from_dir(None).ok(); let signed = kp.is_some(); let client = NodeClient::new(node, kp); @@ -182,9 +185,14 @@ async fn fetch_withheld(node: &str, owner: &str, name: &str) -> (Vec, Ve }; let resp = match resp { Ok(r) if r.status().is_success() => r, - _ => return (Vec::new(), Vec::new()), + Ok(r) if matches!(r.status().as_u16(), 404 | 501) => return Ok((Vec::new(), Vec::new())), + Ok(r) => bail!("withheld-paths lookup failed: {}", r.status()), + Err(err) => return Err(err).context("fetching withheld paths"), }; - let body: Value = resp.json().await.unwrap_or_default(); + let body: Value = resp + .json() + .await + .context("parsing withheld-paths response")?; let globs = |field: &str| -> Vec { body.get(field) .and_then(|w| w.as_array()) @@ -195,7 +203,7 @@ async fn fetch_withheld(node: &str, owner: &str, name: &str) -> (Vec, Ve }) .unwrap_or_default() }; - (globs("withheld"), globs("reinclude")) + Ok((globs("withheld"), globs("reinclude"))) } pub async fn run(args: CloneArgs) -> Result<()> { @@ -206,7 +214,7 @@ pub async fn run(args: CloneArgs) -> Result<()> { bail!("destination '{dest_name}' already exists"); } - let (withheld, reinclude) = fetch_withheld(&args.node, &owner, &name).await; + let (withheld, reinclude) = fetch_withheld(&args.node, &owner, &name).await?; if withheld.is_empty() { println!("Cloning {url} into {dest_name}"); } else { @@ -370,5 +378,7 @@ mod tests { assert!(parse_repo("noslash").is_err()); assert!(parse_repo("gitlawb://owner/").is_err()); assert!(parse_repo("/name").is_err()); + // An extra slash would otherwise smuggle a path segment into the name. 
+ assert!(parse_repo("owner/name/extra").is_err()); } } From 95a223e3bbc520dcb74ef46df5e18f981b16a623 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 11:47:39 -0500 Subject: [PATCH 06/13] test(gl): guard three-level nested visibility in sparse clone Add a regression test for deny /secret, allow /secret/public, deny /secret/public/admin and clarify the sparse-checkout comment. git does not re-traverse an explicitly excluded directory, so emitting all excludes before re-includes keeps the deepest deny in force; the broader parent re-include does not resurrect it. --- crates/gl/src/clone.rs | 48 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs index 3f9af70..5b9403f 100644 --- a/crates/gl/src/clone.rs +++ b/crates/gl/src/clone.rs @@ -108,9 +108,12 @@ pub fn setup_partial_clone( dest_str, ])?; git(dest, &["sparse-checkout", "init", "--no-cone"])?; - // Non-cone sparse-checkout, gitignore-style and order-sensitive: include - // everything, exclude the withheld globs, then re-include any allowed globs - // nested under an excluded one (later patterns win). + // Non-cone sparse-checkout, gitignore-style: include everything, exclude the + // withheld globs, then re-include any allowed globs nested under an excluded + // one. Emitting all excludes before the re-includes is safe even for deeper + // re-denials (deny /secret, allow /secret/public, deny /secret/public/admin): + // git does not re-traverse an explicitly excluded directory, so a broader + // parent re-include never resurrects a more specific excluded subtree. let mut spec = String::from("/*\n"); for g in withheld_globs { for pat in sparse_patterns(g) { @@ -339,6 +342,45 @@ mod tests { ); } + #[test] + fn three_level_alternating_nesting_respects_specificity() { + // deny /secret, allow /secret/public, deny /secret/public/admin. 
+ // The deepest deny must stay excluded even though a shallower allow + // re-includes its parent and all excludes are emitted before the re-includes. + let (td, url) = bare_remote(&[ + ("public/a.txt", b"pub\n"), + ("secret/private/p.txt", b"PRIV\n"), + ("secret/public/s.txt", b"SHARED\n"), + ("secret/public/admin/k.txt", b"ADMIN\n"), + ]); + let dest = td.path().join("dest"); + setup_partial_clone( + &dest, + &url, + &[ + "/secret/**".to_string(), + "/secret/public/admin/**".to_string(), + ], + &["/secret/public/**".to_string()], + None, + ) + .unwrap(); + + assert!(dest.join("public/a.txt").exists(), "public present"); + assert!( + dest.join("secret/public/s.txt").exists(), + "allowed middle path must be re-included" + ); + assert!( + !dest.join("secret/private/p.txt").exists(), + "denied sibling must stay excluded" + ); + assert!( + !dest.join("secret/public/admin/k.txt").exists(), + "deepest denied path must stay excluded despite the shallower re-include" + ); + } + #[test] fn exact_path_glob_is_excluded() { // A wildcard-free glob must exclude the exact file, not just a subtree. From 720f8ef6878a651cb0ce906f6e814c69ed60648f Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 11:56:10 -0500 Subject: [PATCH 07/13] fix(gl): robust default-branch detection and strict withheld-paths schema Read the default branch from the local origin/HEAD symref clone already set, instead of parsing the localized, network-dependent output of git remote show origin. Deserialize the withheld-paths body into a typed struct so a missing or mistyped withheld/reinclude field is a hard error rather than silently becoming an empty list, which would mask a server regression behind a later clone failure.
--- crates/gl/src/clone.rs | 66 ++++++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 21 deletions(-) diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs index 5b9403f..b5fe39d 100644 --- a/crates/gl/src/clone.rs +++ b/crates/gl/src/clone.rs @@ -8,7 +8,7 @@ use anyhow::{bail, Context, Result}; use clap::Args; -use serde_json::Value; +use serde::Deserialize; use std::path::Path; use std::process::Command; @@ -134,17 +134,25 @@ pub fn setup_partial_clone( match branch { Some(b) => git(dest, &["checkout", "-q", b])?, None => { + // Read the default branch from the local `origin/HEAD` symref that + // clone just set, instead of parsing `git remote show origin`, whose + // "HEAD branch:" line is localized and needs a network round-trip. let out = Command::new("git") - .args(["remote", "show", "origin"]) + .args(["symbolic-ref", "--short", "refs/remotes/origin/HEAD"]) .current_dir(dest) .output()?; - let text = String::from_utf8_lossy(&out.stdout); - let head = text - .lines() - .find_map(|l| l.trim().strip_prefix("HEAD branch: ")) - .map(|s| s.to_string()) - .context("could not determine default branch")?; - git(dest, &["checkout", "-q", &head])?; + if !out.status.success() { + bail!( + "could not determine default branch: {}", + String::from_utf8_lossy(&out.stderr) + ); + } + let symref = String::from_utf8_lossy(&out.stdout); + let head = symref + .trim() + .strip_prefix("origin/") + .context("unexpected origin/HEAD format")?; + git(dest, &["checkout", "-q", head])?; } } Ok(()) @@ -192,21 +200,21 @@ async fn fetch_withheld(node: &str, owner: &str, name: &str) -> Result<(Vec bail!("withheld-paths lookup failed: {}", r.status()), Err(err) => return Err(err).context("fetching withheld paths"), }; - let body: Value = resp + let body: WithheldPathsResponse = resp .json() .await .context("parsing withheld-paths response")?; - let globs = |field: &str| -> Vec { - body.get(field) - .and_then(|w| w.as_array()) - .map(|a| { - a.iter() - 
.filter_map(|x| x.as_str().map(String::from)) - .collect() - }) - .unwrap_or_default() - }; - Ok((globs("withheld"), globs("reinclude"))) + Ok((body.withheld, body.reinclude)) +} + +/// The node's `/withheld-paths` 200 body. Both fields are always emitted as JSON +/// arrays; deserializing into this struct (rather than poking at a `Value`) makes +/// a missing or mistyped field a hard error instead of silently becoming `[]`, +/// which would mask a server regression behind a confusing later clone failure. +#[derive(Deserialize)] +struct WithheldPathsResponse { + withheld: Vec, + reinclude: Vec, } pub async fn run(args: CloneArgs) -> Result<()> { @@ -404,6 +412,22 @@ mod tests { ); } + #[test] + fn withheld_response_requires_both_fields() { + let ok: WithheldPathsResponse = + serde_json::from_str(r#"{"withheld":["/secret/**"],"reinclude":[]}"#).unwrap(); + assert_eq!(ok.withheld, vec!["/secret/**".to_string()]); + assert!(ok.reinclude.is_empty()); + + // A missing field is a schema mismatch: it must error, not default to []. + assert!(serde_json::from_str::(r#"{"withheld":[]}"#).is_err()); + // A wrong-typed field must error too. 
+ assert!(serde_json::from_str::( + r#"{"withheld":"nope","reinclude":[]}"# + ) + .is_err()); + } + #[test] fn parse_repo_accepts_url_and_bare() { let (url, o, n) = parse_repo("gitlawb://did:key:zAbc/myrepo").unwrap(); From 53902435d77295b88ecc1ec7922854c2d7e5ad59 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 09:37:30 -0500 Subject: [PATCH 08/13] feat(node): classify mirror mode from origin withheld-paths --- crates/gitlawb-node/src/sync.rs | 51 +++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/crates/gitlawb-node/src/sync.rs b/crates/gitlawb-node/src/sync.rs index cdcfd3e..348568a 100644 --- a/crates/gitlawb-node/src/sync.rs +++ b/crates/gitlawb-node/src/sync.rs @@ -18,6 +18,32 @@ use tracing::{info, warn}; use crate::config::Config; use crate::db::Db; +/// How to mirror a repo, decided from the origin's `withheld-paths` answer. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum MirrorMode { + /// No withheld content: a normal full mirror. + Plain, + /// Withheld content present: a promisor mirror that tolerates the blobs the + /// origin omits for an anonymous caller. + Promisor, +} + +/// Decide the mirror mode from the origin's `withheld-paths` response. +/// +/// `Some(non-empty)` → the repo has a private subtree → `Promisor`. +/// `Some(empty)` → fully public → `Plain`. +/// `None` → the lookup 404'd or failed. Attempt a `Plain` mirror; a +/// mode-A repo also 404s the git read endpoint, so the clone +/// fails and nothing is mirrored (fail-closed at the git +/// layer), while a public repo on a peer that predates the +/// `withheld-paths` route still gets mirrored. +fn classify_mirror(withheld: Option>) -> MirrorMode { + match withheld { + Some(globs) if !globs.is_empty() => MirrorMode::Promisor, + _ => MirrorMode::Plain, + } +} + /// Start the background sync worker. 
Returns immediately; the worker runs /// as a detached tokio task that exits cleanly when `shutdown_rx` flips /// to `true`. @@ -174,3 +200,28 @@ async fn fetch_repo(local_path: &Path, remote_url: &str) -> anyhow::Result<()> { } Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn classify_promisor_when_withheld_nonempty() { + let mode = classify_mirror(Some(vec!["/secret/**".to_string()])); + assert!(matches!(mode, MirrorMode::Promisor)); + } + + #[test] + fn classify_plain_when_withheld_empty() { + let mode = classify_mirror(Some(vec![])); + assert!(matches!(mode, MirrorMode::Plain)); + } + + #[test] + fn classify_plain_when_lookup_failed() { + // None == 404 / network error / parse failure: attempt a plain mirror + // and let the git read endpoint fail-close a mode-A repo. + let mode = classify_mirror(None); + assert!(matches!(mode, MirrorMode::Plain)); + } +} From 469a1a43cb7671daa068d33b6d9e072600ade0b4 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 09:40:11 -0500 Subject: [PATCH 09/13] feat(node): promisor-aware mirror clone for withheld repos --- crates/gitlawb-node/src/sync.rs | 105 +++++++++++++++++++++++++++++--- 1 file changed, 97 insertions(+), 8 deletions(-) diff --git a/crates/gitlawb-node/src/sync.rs b/crates/gitlawb-node/src/sync.rs index 348568a..c93c084 100644 --- a/crates/gitlawb-node/src/sync.rs +++ b/crates/gitlawb-node/src/sync.rs @@ -132,7 +132,7 @@ async fn process_batch(db: &Db, config: &Config, machine_id: Option<&str>) { let result = if local_path.exists() { fetch_repo(&local_path, &remote_url).await } else { - clone_repo(&remote_url, &local_path).await + clone_repo(&remote_url, &local_path, MirrorMode::Plain).await }; match result { @@ -160,14 +160,20 @@ async fn process_batch(db: &Db, config: &Config, machine_id: Option<&str>) { } /// Mirror-clone a repo from a remote URL into a local bare repo. 
-async fn clone_repo(remote_url: &str, local_path: &Path) -> anyhow::Result<()> { +/// `Promisor` mode adds `--filter=blob:limit=10g`, which marks the repo a git +/// promisor (so a pack with origin-omitted withheld blobs is accepted) while +/// the huge size limit means every blob the origin *does* send is kept. +async fn clone_repo(remote_url: &str, local_path: &Path, mode: MirrorMode) -> anyhow::Result<()> { + let local_str = local_path.to_str().unwrap_or("."); + let mut args = vec!["clone", "--mirror"]; + if mode == MirrorMode::Promisor { + args.push("--filter=blob:limit=10g"); + } + args.push(remote_url); + args.push(local_str); + let out = tokio::process::Command::new("git") - .args([ - "clone", - "--mirror", - remote_url, - local_path.to_str().unwrap_or("."), - ]) + .args(&args) .output() .await .map_err(|e| anyhow::anyhow!("git clone failed to spawn: {e}"))?; @@ -204,6 +210,8 @@ async fn fetch_repo(local_path: &Path, remote_url: &str) -> anyhow::Result<()> { #[cfg(test)] mod tests { use super::*; + use std::process::Command; + use tempfile::TempDir; #[test] fn classify_promisor_when_withheld_nonempty() { @@ -224,4 +232,85 @@ mod tests { let mode = classify_mirror(None); assert!(matches!(mode, MirrorMode::Plain)); } + + fn g(args: &[&str], dir: &Path) { + assert!(Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success()); + } + + /// Build a bare remote containing `files`, committed on one branch. + /// Returns (tempdir, file:// url). file:// makes git honor --filter. 
+ fn bare_remote(files: &[(&str, &[u8])]) -> (TempDir, String) { + let td = TempDir::new().unwrap(); + let origin = td.path().join("origin"); + let bare = td.path().join("bare.git"); + for (path, contents) in files { + let full = origin.join(path); + std::fs::create_dir_all(full.parent().unwrap()).unwrap(); + std::fs::write(full, contents).unwrap(); + } + g(&["init", "-q"], &origin); + g(&["config", "user.email", "t@t"], &origin); + g(&["config", "user.name", "t"], &origin); + g(&["add", "."], &origin); + g(&["commit", "-qm", "init"], &origin); + g( + &["clone", "-q", "--bare", origin.to_str().unwrap(), bare.to_str().unwrap()], + td.path(), + ); + let url = format!("file://{}", bare.display()); + (td, url) + } + + fn git_config(repo: &Path, key: &str) -> String { + let out = Command::new("git") + .args(["-C", repo.to_str().unwrap(), "config", "--get", key]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + } + + fn object_count(repo: &Path) -> usize { + let out = Command::new("git") + .args([ + "-C", + repo.to_str().unwrap(), + "cat-file", + "--batch-all-objects", + "--batch-check=%(objectname)", + ]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout) + .lines() + .filter(|l| !l.trim().is_empty()) + .count() + } + + #[tokio::test] + async fn promisor_clone_marks_promisor_and_keeps_objects() { + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n"), ("secret/b.txt", b"SECRET\n")]); + let dest = td.path().join("mirror.git"); + clone_repo(&url, &dest, MirrorMode::Promisor).await.unwrap(); + + assert_eq!(git_config(&dest, "remote.origin.promisor"), "true"); + assert_eq!(git_config(&dest, "remote.origin.mirror"), "true"); + // No withholding on a plain bare origin, so every object is present: + // 1 commit + 1 root tree + 2 subtrees + 2 blobs = 6. 
+ assert_eq!(object_count(&dest), 6); + } + + #[tokio::test] + async fn plain_clone_is_not_promisor() { + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n")]); + let dest = td.path().join("mirror.git"); + clone_repo(&url, &dest, MirrorMode::Plain).await.unwrap(); + + assert_eq!(git_config(&dest, "remote.origin.promisor"), ""); + assert_eq!(git_config(&dest, "remote.origin.mirror"), "true"); + } } From 97c82d55350ed515446cfaf3bef51079b993a179 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 09:43:15 -0500 Subject: [PATCH 10/13] feat(node): promisor-aware mirror fetch via origin remote --- crates/gitlawb-node/src/sync.rs | 73 ++++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 19 deletions(-) diff --git a/crates/gitlawb-node/src/sync.rs b/crates/gitlawb-node/src/sync.rs index c93c084..b043149 100644 --- a/crates/gitlawb-node/src/sync.rs +++ b/crates/gitlawb-node/src/sync.rs @@ -130,7 +130,7 @@ async fn process_batch(db: &Db, config: &Config, machine_id: Option<&str>) { let remote_url = format!("{}/{}", origin_url, item.repo); let result = if local_path.exists() { - fetch_repo(&local_path, &remote_url).await + fetch_repo(&local_path, &remote_url, MirrorMode::Plain).await } else { clone_repo(&remote_url, &local_path, MirrorMode::Plain).await }; @@ -159,6 +159,20 @@ async fn process_batch(db: &Db, config: &Config, machine_id: Option<&str>) { } } +/// Run a git subprocess, returning an error with stderr on non-zero exit. +async fn git_run(args: &[&str]) -> anyhow::Result<()> { + let out = tokio::process::Command::new("git") + .args(args) + .output() + .await + .map_err(|e| anyhow::anyhow!("git failed to spawn: {e}"))?; + if !out.status.success() { + let stderr = String::from_utf8_lossy(&out.stderr); + return Err(anyhow::anyhow!("git {args:?} failed: {stderr}")); + } + Ok(()) +} + /// Mirror-clone a repo from a remote URL into a local bare repo. 
/// `Promisor` mode adds `--filter=blob:limit=10g`, which marks the repo a git /// promisor (so a pack with origin-omitted withheld blobs is accepted) while @@ -185,26 +199,26 @@ async fn clone_repo(remote_url: &str, local_path: &Path, mode: MirrorMode) -> an Ok(()) } -/// Fetch all refs from the remote into an existing mirror repo. -async fn fetch_repo(local_path: &Path, remote_url: &str) -> anyhow::Result<()> { - let out = tokio::process::Command::new("git") - .args([ - "-C", - local_path.to_str().unwrap_or("."), - "fetch", - "--prune", - remote_url, - "+refs/*:refs/*", - ]) - .output() - .await - .map_err(|e| anyhow::anyhow!("git fetch failed to spawn: {e}"))?; +/// Fetch all refs from the remote into an existing mirror repo. Refreshes the +/// stored `origin` URL (the peer's URL may have changed), applies promisor +/// config when `Promisor` (covers a repo that became mode-B after a plain +/// initial mirror), and fetches via the `origin` remote so any stored promisor +/// settings are honored. 
+async fn fetch_repo(local_path: &Path, remote_url: &str, mode: MirrorMode) -> anyhow::Result<()> { + let local_str = local_path.to_str().unwrap_or("."); - if !out.status.success() { - let stderr = String::from_utf8_lossy(&out.stderr); - return Err(anyhow::anyhow!("git fetch failed: {stderr}")); + git_run(&["-C", local_str, "remote", "set-url", "origin", remote_url]).await?; + + if mode == MirrorMode::Promisor { + git_run(&["-C", local_str, "config", "remote.origin.promisor", "true"]).await?; + git_run(&[ + "-C", local_str, + "config", "remote.origin.partialclonefilter", "blob:limit=10g", + ]) + .await?; } - Ok(()) + + git_run(&["-C", local_str, "fetch", "--prune", "origin"]).await } #[cfg(test)] @@ -313,4 +327,25 @@ mod tests { assert_eq!(git_config(&dest, "remote.origin.promisor"), ""); assert_eq!(git_config(&dest, "remote.origin.mirror"), "true"); } + + #[tokio::test] + async fn promisor_fetch_updates_existing_mirror() { + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n")]); + let dest = td.path().join("mirror.git"); + clone_repo(&url, &dest, MirrorMode::Promisor).await.unwrap(); + let before = object_count(&dest); + + // Add a second commit to the origin working tree and push to the bare + // (the working repo has no named remote, so push via the file:// URL). 
+ let origin = td.path().join("origin"); + std::fs::write(origin.join("public/c.txt"), b"more\n").unwrap(); + g(&["add", "."], &origin); + g(&["commit", "-qm", "second"], &origin); + g(&["push", "-q", &url, "HEAD"], &origin); + + fetch_repo(&dest, &url, MirrorMode::Promisor).await.unwrap(); + + assert_eq!(git_config(&dest, "remote.origin.promisor"), "true"); + assert!(object_count(&dest) > before, "fetch pulled the new commit"); + } } From 1e8ae3c2b5c235fae72bb7464cfcd0673a9a8ea3 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 09:46:53 -0500 Subject: [PATCH 11/13] feat(node): classify and mirror peers per withheld-paths --- crates/gitlawb-node/src/sync.rs | 36 +++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/crates/gitlawb-node/src/sync.rs b/crates/gitlawb-node/src/sync.rs index b043149..54dcb65 100644 --- a/crates/gitlawb-node/src/sync.rs +++ b/crates/gitlawb-node/src/sync.rs @@ -63,12 +63,13 @@ async fn run( shutdown_rx: &mut tokio::sync::watch::Receiver, ) { let machine_id = std::env::var("FLY_MACHINE_ID").ok(); + let client = reqwest::Client::new(); info!("sync worker started (auto_sync=true)"); let mut interval = tokio::time::interval(std::time::Duration::from_secs(30)); loop { tokio::select! 
{ _ = interval.tick() => { - process_batch(&db, &config, machine_id.as_deref()).await; + process_batch(&db, &config, machine_id.as_deref(), &client).await; } _ = shutdown_rx.changed() => { if *shutdown_rx.borrow() { @@ -80,7 +81,7 @@ async fn run( } } -async fn process_batch(db: &Db, config: &Config, machine_id: Option<&str>) { +async fn process_batch(db: &Db, config: &Config, machine_id: Option<&str>, client: &reqwest::Client) { let items = match db.dequeue_pending_syncs(10).await { Ok(v) => v, Err(e) => { @@ -129,10 +130,13 @@ async fn process_batch(db: &Db, config: &Config, machine_id: Option<&str>) { // (no .git suffix — the server routes don't include it) let remote_url = format!("{}/{}", origin_url, item.repo); + let withheld = fetch_withheld(client, &origin_url, owner_short, repo_name).await; + let mode = classify_mirror(withheld); + let result = if local_path.exists() { - fetch_repo(&local_path, &remote_url, MirrorMode::Plain).await + fetch_repo(&local_path, &remote_url, mode).await } else { - clone_repo(&remote_url, &local_path, MirrorMode::Plain).await + clone_repo(&remote_url, &local_path, mode).await }; match result { @@ -159,6 +163,30 @@ async fn process_batch(db: &Db, config: &Config, machine_id: Option<&str>) { } } +/// Query the origin's anonymous `withheld-paths` endpoint. Returns the withheld +/// glob list on a 2xx, or `None` on any non-success / network / parse error +/// (treated as "unknown" by `classify_mirror`). +async fn fetch_withheld( + client: &reqwest::Client, + origin_url: &str, + owner: &str, + repo: &str, +) -> Option> { + let url = format!("{origin_url}/api/v1/repos/{owner}/{repo}/withheld-paths"); + let resp = client.get(&url).send().await.ok()?; + if !resp.status().is_success() { + return None; + } + let body: serde_json::Value = resp.json().await.ok()?; + let globs = body + .get("withheld")? + .as_array()? 
+ .iter() + .filter_map(|v| v.as_str().map(str::to_string)) + .collect(); + Some(globs) +} + /// Run a git subprocess, returning an error with stderr on non-zero exit. async fn git_run(args: &[&str]) -> anyhow::Result<()> { let out = tokio::process::Command::new("git") From f1e2207301f4b4136a5129cda5cfc33bbae2232c Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 10:52:27 -0500 Subject: [PATCH 12/13] style: cargo fmt --- crates/gitlawb-node/src/sync.rs | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/crates/gitlawb-node/src/sync.rs b/crates/gitlawb-node/src/sync.rs index 54dcb65..718cd69 100644 --- a/crates/gitlawb-node/src/sync.rs +++ b/crates/gitlawb-node/src/sync.rs @@ -81,7 +81,12 @@ async fn run( } } -async fn process_batch(db: &Db, config: &Config, machine_id: Option<&str>, client: &reqwest::Client) { +async fn process_batch( + db: &Db, + config: &Config, + machine_id: Option<&str>, + client: &reqwest::Client, +) { let items = match db.dequeue_pending_syncs(10).await { Ok(v) => v, Err(e) => { @@ -240,8 +245,11 @@ async fn fetch_repo(local_path: &Path, remote_url: &str, mode: MirrorMode) -> an if mode == MirrorMode::Promisor { git_run(&["-C", local_str, "config", "remote.origin.promisor", "true"]).await?; git_run(&[ - "-C", local_str, - "config", "remote.origin.partialclonefilter", "blob:limit=10g", + "-C", + local_str, + "config", + "remote.origin.partialclonefilter", + "blob:limit=10g", ]) .await?; } @@ -301,7 +309,13 @@ mod tests { g(&["add", "."], &origin); g(&["commit", "-qm", "init"], &origin); g( - &["clone", "-q", "--bare", origin.to_str().unwrap(), bare.to_str().unwrap()], + &[ + "clone", + "-q", + "--bare", + origin.to_str().unwrap(), + bare.to_str().unwrap(), + ], td.path(), ); let url = format!("file://{}", bare.display()); From 25390bfed9240e90757cb917b3b74144a8b885a8 Mon Sep 17 00:00:00 2001 From: beardthelion 
<56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 11:38:38 -0500 Subject: [PATCH 13/13] fix(node): bound sync withheld lookup and rehydrate on promisor->plain Give the sync worker's HTTP client a 30s timeout so a stalled peer withheld-paths lookup cannot hang the worker loop. When a repo that was mirrored as a promisor (mode B) becomes fully public, fetch_repo now clears remote.origin.promisor and partialclonefilter and refetches, so the once-withheld blobs are backfilled instead of the mirror staying permanently partial. --- crates/gitlawb-node/src/sync.rs | 109 +++++++++++++++++++++++++++----- 1 file changed, 92 insertions(+), 17 deletions(-) diff --git a/crates/gitlawb-node/src/sync.rs b/crates/gitlawb-node/src/sync.rs index 718cd69..df41470 100644 --- a/crates/gitlawb-node/src/sync.rs +++ b/crates/gitlawb-node/src/sync.rs @@ -63,7 +63,11 @@ async fn run( shutdown_rx: &mut tokio::sync::watch::Receiver, ) { let machine_id = std::env::var("FLY_MACHINE_ID").ok(); - let client = reqwest::Client::new(); + // Bound each withheld-paths lookup so a stalled peer cannot hang the worker. + let client = reqwest::Client::builder() + .timeout(std::time::Duration::from_secs(30)) + .build() + .unwrap_or_else(|_| reqwest::Client::new()); info!("sync worker started (auto_sync=true)"); let mut interval = tokio::time::interval(std::time::Duration::from_secs(30)); loop { @@ -206,6 +210,29 @@ async fn git_run(args: &[&str]) -> anyhow::Result<()> { Ok(()) } +/// Run a git subprocess, ignoring a non-zero exit. Used for idempotent +/// `config --unset`, which exits non-zero when the key is already absent. +async fn git_run_lenient(args: &[&str]) { + let _ = tokio::process::Command::new("git") + .args(args) + .output() + .await; +} + +/// Read a single git config value; `None` if unset or on error. 
+async fn git_config_get(repo: &str, key: &str) -> Option { + let out = tokio::process::Command::new("git") + .args(["-C", repo, "config", "--get", key]) + .output() + .await + .ok()?; + if !out.status.success() { + return None; + } + let value = String::from_utf8_lossy(&out.stdout).trim().to_string(); + (!value.is_empty()).then_some(value) +} + /// Mirror-clone a repo from a remote URL into a local bare repo. /// `Promisor` mode adds `--filter=blob:limit=10g`, which marks the repo a git /// promisor (so a pack with origin-omitted withheld blobs is accepted) while @@ -233,28 +260,60 @@ async fn clone_repo(remote_url: &str, local_path: &Path, mode: MirrorMode) -> an } /// Fetch all refs from the remote into an existing mirror repo. Refreshes the -/// stored `origin` URL (the peer's URL may have changed), applies promisor -/// config when `Promisor` (covers a repo that became mode-B after a plain -/// initial mirror), and fetches via the `origin` remote so any stored promisor -/// settings are honored. +/// stored `origin` URL (the peer's URL may have changed) and fetches via the +/// `origin` remote so any stored promisor settings are honored. +/// +/// `Promisor` applies the promisor config first (covers a repo that became +/// mode-B after a plain initial mirror). `Plain` on a mirror that was previously +/// a promisor (the repo went private -> public) clears the partial-clone config +/// and `--refetch`es, so the once-withheld, now-public blobs are backfilled +/// rather than left permanently missing. 
async fn fetch_repo(local_path: &Path, remote_url: &str, mode: MirrorMode) -> anyhow::Result<()> { let local_str = local_path.to_str().unwrap_or("."); git_run(&["-C", local_str, "remote", "set-url", "origin", remote_url]).await?; - if mode == MirrorMode::Promisor { - git_run(&["-C", local_str, "config", "remote.origin.promisor", "true"]).await?; - git_run(&[ - "-C", - local_str, - "config", - "remote.origin.partialclonefilter", - "blob:limit=10g", - ]) - .await?; + match mode { + MirrorMode::Promisor => { + git_run(&["-C", local_str, "config", "remote.origin.promisor", "true"]).await?; + git_run(&[ + "-C", + local_str, + "config", + "remote.origin.partialclonefilter", + "blob:limit=10g", + ]) + .await?; + git_run(&["-C", local_str, "fetch", "--prune", "origin"]).await + } + MirrorMode::Plain => { + let was_promisor = git_config_get(local_str, "remote.origin.promisor") + .await + .as_deref() + == Some("true"); + if was_promisor { + git_run_lenient(&[ + "-C", + local_str, + "config", + "--unset", + "remote.origin.promisor", + ]) + .await; + git_run_lenient(&[ + "-C", + local_str, + "config", + "--unset", + "remote.origin.partialclonefilter", + ]) + .await; + git_run(&["-C", local_str, "fetch", "--refetch", "--prune", "origin"]).await + } else { + git_run(&["-C", local_str, "fetch", "--prune", "origin"]).await + } + } } - - git_run(&["-C", local_str, "fetch", "--prune", "origin"]).await } #[cfg(test)] @@ -390,4 +449,20 @@ mod tests { assert_eq!(git_config(&dest, "remote.origin.promisor"), "true"); assert!(object_count(&dest) > before, "fetch pulled the new commit"); } + + #[tokio::test] + async fn plain_fetch_clears_promisor_config_on_transition() { + // Repo started mode-B (promisor mirror), then went fully public, so the + // next sync classifies Plain. fetch_repo must drop the partial-clone + // config and refetch instead of leaving the mirror a promisor forever. 
+ let (td, url) = bare_remote(&[("public/a.txt", b"pub\n")]); + let dest = td.path().join("mirror.git"); + clone_repo(&url, &dest, MirrorMode::Promisor).await.unwrap(); + assert_eq!(git_config(&dest, "remote.origin.promisor"), "true"); + + fetch_repo(&dest, &url, MirrorMode::Plain).await.unwrap(); + + assert_eq!(git_config(&dest, "remote.origin.promisor"), ""); + assert_eq!(git_config(&dest, "remote.origin.partialclonefilter"), ""); + } }