fix(webfetch): keep body read inside the timeout scope to stop "context canceled" (#59)

ysyneu · web-flow · commit 4e4cf6fd5d6b · 2026-06-09T15:31:29.000+08:00
fetchURL created a per-request timeout context and deferred its cancel, then
returned the *http.Response. The cancel fired the moment fetchURL returned —
before WebFetch read the body via readResponseBody. net/http ties the body
read to the request context, so any response still streaming when Do()
returned aborted with "failed to read response: context canceled".

Large/slow pages (e.g. Aliyun help docs) hit this every time because their
body is still in flight when Do() returns; small pages got lucky only because
their body was already buffered. The error is "context canceled", not
"deadline exceeded" — it was never a real timeout, just a premature cancel.

Merge the request and the body read into one timeout-scoped function
(fetchBody), mirroring Safari's correct localWebFetch, so the cancel cannot
fire before the read. Add a regression test that streams a slow body and
asserts it is read in full (reproduces the exact error against the old
structure).
diff --git a/environment/webfetch.go b/environment/webfetch.go
@@ -57,37 +57,49 @@ func (e *Environment) WebFetch(ctx context.Context, args *protocol.WebFetchArgs)
 		format = "markdown"
 	}
 
-	resp, err := e.fetchURL(ctx, args.URL, format, timeout)
+	page, err := e.fetchBody(ctx, args.URL, format, timeout)
 	if err != nil {
 		return nil, err
 	}
-	defer func() {
-		_ = resp.Body.Close()
-	}()
 
-	body, err := readResponseBody(resp)
-	if err != nil {
-		return nil, err
-	}
-
-	content := convertContent(string(body), format, resp.Header.Get("Content-Type"))
+	content := convertContent(string(page.body), format, page.contentType)
 	processor := NewLargeOutputProcessor(e, DefaultLargeOutputConfig())
+	// processor.Process runs on the outer ctx: the body is already fully read
+	// into memory, so it must not be bound to the per-fetch timeout.
 	processed, err := processor.Process(ctx, content, "webfetch")
 	if err != nil {
 		return nil, err
 	}
 
 	return &protocol.WebFetchResult{
 		Content:   processed.Content,
-		URL:       resp.Request.URL.String(),
+		URL:       page.finalURL,
 		Truncated: processed.Truncated,
 		FilePath:  processed.FilePath,
 		TotalSize: processed.TotalSize,
 	}, nil
 }
 
-// fetchURL performs the HTTP request via the netsafe-wrapped client.
-func (e *Environment) fetchURL(ctx context.Context, url, format string, timeout time.Duration) (*http.Response, error) {
+// fetchedPage is the raw result of a single HTTP fetch: the response body plus
+// the metadata WebFetch still needs after the connection has been closed.
+type fetchedPage struct {
+	body        []byte
+	finalURL    string
+	contentType string
+}
+
+// fetchBody performs the HTTP request AND reads the full response body within
+// a single timeout-scoped context.
+//
+// Keeping the body read inside this scope is load-bearing: net/http ties the
+// lifetime of resp.Body to the request context, so a cancel() that fires
+// before the read — e.g. a helper that returns the *http.Response and defers
+// its cancel — aborts the read with "failed to read response: context
+// canceled". Large or slow pages (e.g. Aliyun help docs) trip this every time
+// because their body is still streaming when Do() returns; small pages got
+// lucky only because their body was already buffered. Do() and the body read
+// must therefore share one cancel scope.
+func (e *Environment) fetchBody(ctx context.Context, url, format string, timeout time.Duration) (*fetchedPage, error) {
 	httpCtx, cancel := context.WithTimeout(ctx, timeout)
 	defer cancel()
 
@@ -105,8 +117,23 @@ func (e *Environment) fetchURL(ctx context.Context, url, format string, timeout
 		}
 		return nil, fmt.Errorf("request failed: %w", err)
 	}
+	defer func() { _ = resp.Body.Close() }()
 
-	return resp, nil
+	body, err := readResponseBody(resp)
+	if err != nil {
+		// The deadline can still be hit mid-read; surface it as a timeout
+		// rather than a raw "context deadline exceeded" read error.
+		if httpCtx.Err() == context.DeadlineExceeded {
+			return nil, fmt.Errorf("request timed out")
+		}
+		return nil, err
+	}
+
+	return &fetchedPage{
+		body:        body,
+		finalURL:    resp.Request.URL.String(),
+		contentType: resp.Header.Get("Content-Type"),
+	}, nil
 }
 
 // setRequestHeaders sets appropriate headers for the request.
diff --git a/environment/webfetch_test.go b/environment/webfetch_test.go
@@ -0,0 +1,62 @@
+package environment
+
+import (
+	"context"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+)
+
+// TestFetchBody_ReadsSlowStreamedBody is the regression test for the
+// "context canceled" bug. The old code split the fetch across the cancel
+// boundary: fetchURL created a per-request timeout context, deferred its
+// cancel, and returned the *http.Response — so cancel() fired the moment
+// fetchURL returned, BEFORE WebFetch read the body. net/http ties the body
+// read to the request context, so any response still streaming when Do()
+// returned aborted with "failed to read response: context canceled". Large or
+// slow pages (e.g. Aliyun help docs) hit this every time; small pages got lucky
+// because their body was already buffered.
+//
+// The fix keeps Do() and the body read inside one cancel scope (fetchBody).
+// This test streams a body slowly enough that it cannot be pre-buffered, then
+// asserts the full body is still read.
+func TestFetchBody_ReadsSlowStreamedBody(t *testing.T) {
+	const (
+		chunk  = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef\n" // 65 bytes
+		chunks = 400
+	)
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/plain")
+		w.WriteHeader(http.StatusOK)
+		flusher, ok := w.(http.Flusher)
+		require.True(t, ok)
+		flusher.Flush() // deliver headers so Do() returns before the body finishes
+		for i := 0; i < chunks; i++ {
+			if _, err := io.WriteString(w, chunk); err != nil {
+				return
+			}
+			flusher.Flush()
+			time.Sleep(time.Millisecond) // keep the body streaming, not pre-buffered
+		}
+	}))
+	t.Cleanup(srv.Close)
+
+	// The production fetchClient dials through netsafe, which refuses loopback
+	// (see security_test.go). Swap in the test server's plain client so we can
+	// exercise the body-read / context-lifecycle path against a real stream;
+	// dial-time SSRF protection is orthogonal and covered separately.
+	orig := fetchClient
+	fetchClient = srv.Client()
+	t.Cleanup(func() { fetchClient = orig })
+
+	env := newTestEnvironment(t)
+	page, err := env.fetchBody(context.Background(), srv.URL, "text", 30*time.Second)
+	require.NoError(t, err) // pre-fix: "failed to read response: context canceled"
+	require.Equal(t, chunks*len(chunk), len(page.body))
+	require.True(t, strings.HasPrefix(string(page.body), "0123456789abcdef"))
+}