diff --git a/packages/data/src/cache/functions/hashing/blob-to-hash.ts b/packages/data/src/cache/functions/hashing/blob-to-hash.ts
index 47f2df29..af397af6 100644
--- a/packages/data/src/cache/functions/hashing/blob-to-hash.ts
+++ b/packages/data/src/cache/functions/hashing/blob-to-hash.ts
@@ -1,26 +1,77 @@
 // © 2026 Adobe. MIT License. See /LICENSE for details.
-import { createSHA256 } from "hash-wasm";
 
-export async function blobToHash(blob: Blob): Promise<string> {
-  const hasher = await createSHA256();
-  hasher.init();
+// Performance assumptions
+// -----------------------
+// Blobs hashed here are frequently NOT fully in memory (disk- or
+// stream-backed), so each `reader.read()` is genuine I/O latency, not a
+// microtask hop over resident bytes. This makes the path I/O-bound, and the
+// design optimizes for overlapping those waits across concurrent calls:
+//
+// - A single WASM hasher instance is reused for every call (instantiation
+//   is not free, and we don't want one per call).
+// - Concurrency is achieved via hash-wasm's resumable save()/load() rather
+//   than a pool of instances: each call keeps its own `state` and only
+//   touches the shared instance in synchronous critical sections, so calls
+//   interleave at `read()` without serializing or corrupting each other.
+//
+// The trade-off is a save()/load() pair per chunk (cheap for SHA-256). If
+// blobs were instead known to be fully in memory, this would be compute-bound,
+// the I/O overlap would buy nothing, and buffering then hashing synchronously
+// (one init→update→digest block, no await) would suffice.
+//
+// Note on the "single global hasher" decision: even compute-bound, extra
+// instances would NOT help on this thread. WASM has no threads of its own and
+// hasher.update() is synchronous, so on one JS thread only one hash advances
+// at a time regardless of how many instances exist — a pool buys nothing here.
+// Servicing multiple CPU-bound hashes truly in parallel requires Web Workers,
+// each with its OWN instance on its OWN thread. That is the only thing a second
+// instance is ever good for, and it lives at the worker boundary, not here. So
+// within this thread, one global hasher is strictly correct and loses nothing.
+import { type IHasher, createSHA256 } from "hash-wasm";
 
-  // Encode MIME type as UTF-16LE
-  const tCodes = new Uint16Array(blob.type.length);
-  for (let i = 0; i < blob.type.length; i++) {
-    tCodes[i] = blob.type.charCodeAt(i);
+let hasherPromise: Promise<IHasher> | undefined;
+
+export async function blobToHash(blob: Blob): Promise<string> {
+  if (hasherPromise === undefined) {
+    hasherPromise = createSHA256();
+    // Allow a later call to retry if instantiation failed, rather than
+    // poisoning the module with a permanently-rejected promise.
+    hasherPromise.catch(() => {
+      hasherPromise = undefined;
+    });
   }
-  hasher.update(new Uint8Array(tCodes.buffer));
+  const hasher = await hasherPromise;
+
+  // One shared WASM instance serves all concurrent calls. The instance is
+  // touched only in synchronous init→…→save / load→update→save sequences,
+  // never held across an `await`, so each call carries its own `state` and
+  // their `reader.read()` I/O waits overlap freely without corrupting one
+  // another. See hash-wasm save()/load() resumable hashing.
+  hasher.init();
+  hasher.update(mimeTypeBytes(blob.type));
+  let state = hasher.save();
 
   const reader = blob.stream().getReader();
   let done = false;
   while (!done) {
-    const result = await reader.read();
-    done = result.done === true;
-    if (!done && result.value != null) {
-      hasher.update(result.value);
+    const chunk = await reader.read();
+    done = chunk.done === true;
+    if (!done && chunk.value != null) {
+      hasher.load(state);
+      hasher.update(chunk.value);
+      state = hasher.save();
     }
   }
 
+  hasher.load(state);
   return hasher.digest("hex");
 }
+
+function mimeTypeBytes(type: string): Uint8Array {
+  // Encode MIME type as UTF-16LE
+  const codes = new Uint16Array(type.length);
+  for (let i = 0; i < type.length; i++) {
+    codes[i] = type.charCodeAt(i);
+  }
+  return new Uint8Array(codes.buffer);
+}
diff --git a/packages/data/src/cache/functions/hashing/hashing.test.ts b/packages/data/src/cache/functions/hashing/hashing.test.ts
index a7bb2047..2d695149 100644
--- a/packages/data/src/cache/functions/hashing/hashing.test.ts
+++ b/packages/data/src/cache/functions/hashing/hashing.test.ts
@@ -3,6 +3,43 @@ import { blobToHash } from "./blob-to-hash.js";
 import { jsonToHash } from "./json-to-hash.js";
 import { describe, expect, it } from "vitest";
 
+const bytes = (s: string): Uint8Array => new TextEncoder().encode(s);
+
+// Lets all pending microtasks (and the WASM-instantiation promise) settle, so
+// each in-flight blobToHash call advances to its next `await reader.read()`.
+const flush = (): Promise<void> => new Promise((resolve) => setTimeout(resolve, 0));
+
+// A stand-in Blob whose stream yields `chunks` one `read()` at a time, but only
+// when the test releases each gate. This hands the test control of the exact
+// interleaving across concurrent calls — impossible with a real in-memory Blob,
+// whose reads resolve on their own schedule.
+function gatedBlob(type: string, chunks: Uint8Array[]) {
+  const gates: Array<() => void> = [];
+  let i = 0;
+  const reader = {
+    read: () =>
+      new Promise<ReadableStreamReadResult<Uint8Array>>((resolve) => {
+        gates.push(() =>
+          resolve(
+            i < chunks.length
+              ? { done: false, value: chunks[i++] }
+              : { done: true, value: undefined },
+          ),
+        );
+      }),
+  };
+  return {
+    // Case 1 cast: this implements the only members blobToHash reads off a
+    // Blob — `type` and `stream().getReader().read()`.
+    blob: { type, stream: () => ({ getReader: () => reader }) } as unknown as Blob,
+    releaseNext: (): boolean => {
+      const gate = gates.shift();
+      gate?.();
+      return gate !== undefined;
+    },
+  };
+}
+
 describe("test hashing", () => {
   describe("blobToHash", () => {
     it("should avoid collisions based on content and type", async () => {
@@ -108,6 +145,40 @@ describe("test hashing", () => {
         expect(hash).toMatch(/^[a-f0-9]{64}$/);
       }
     });
+
+    it("interleaved concurrent reads match serial hashes", async () => {
+      const inputs = [
+        { type: "text/plain", chunks: ["alpha-", "one-", "end"] },
+        { type: "application/octet-stream", chunks: ["BETA-", "two-", "END"] },
+        { type: "", chunks: ["g", "amma", "!!!"] },
+      ];
+
+      // Oracle: hash each input serially. SHA-256 is over the byte stream, so a
+      // real Blob of the concatenated chunks yields the same digest the gated
+      // blob must produce regardless of chunk boundaries.
+      const oracle: string[] = [];
+      for (const { type, chunks } of inputs) {
+        oracle.push(await blobToHash(new Blob(chunks, { type })));
+      }
+
+      // Concurrent: start every call, then drive the gates round-robin so the
+      // calls interleave between chunks — the exact pattern that corrupts a
+      // naively shared hasher.
+      const gated = inputs.map(({ type, chunks }) => gatedBlob(type, chunks.map(bytes)));
+      const results = gated.map((g) => blobToHash(g.blob));
+
+      let progressed = true;
+      while (progressed) {
+        await flush();
+        progressed = false;
+        for (const g of gated) {
+          if (g.releaseNext()) progressed = true;
+        }
+      }
+      await flush();
+
+      expect(await Promise.all(results)).toEqual(oracle);
+    });
   });
 
   describe("jsonToHash", () => {