diff --git a/.changeset/indexnow-incremental.md b/.changeset/indexnow-incremental.md new file mode 100644 index 0000000..9a8059e --- /dev/null +++ b/.changeset/indexnow-incremental.md @@ -0,0 +1,29 @@ +--- +'@jdevalk/astro-seo-graph': minor +--- + +**Incremental IndexNow submission via a published content-hash manifest.** + +The `indexNow` integration option submitted every URL on the site on every +build. The IndexNow spec asks senders to submit only added/updated/deleted +URLs, and full resubmits can trip per-host rate limits (HTTP 429). + +New opt-in `indexNow.incremental` (`true` or an options object). When enabled, +each build hashes every eligible page into a manifest, fetches the previously +published manifest from the live site, diffs them, and submits only the URLs +that changed (added + updated + deleted) — then writes the new manifest into +the build output so it ships with the deploy and becomes the next baseline. + +The previous state lives on the live site, so this works identically whether +you build locally or in CI and needs no external store. A clean `404` is +treated as a first run (baseline); any other fetch/parse failure is handled by +`onError` (`'skip'` by default, so a transient blip can't trigger a full +resubmit; `'full'` to fall back to submitting everything). A `normalize` hook +lets you strip per-build volatile markup (nonces, timestamps) before hashing. + +Default behavior is unchanged — without `incremental`, the integration still +submits the full set. + +Also exports the underlying pure helpers (`buildUrlManifest`, `diffManifests`, +`changedUrls`, `hashContent`, `serializeManifest`, `parseManifest`) for callers +who want to compute the changed set themselves. 
diff --git a/packages/astro-seo-graph/README.md b/packages/astro-seo-graph/README.md index ca094d0..5641a61 100644 --- a/packages/astro-seo-graph/README.md +++ b/packages/astro-seo-graph/README.md @@ -591,7 +591,8 @@ Options: `example.com`), `siteUrl` (absolute origin), `keyLocation?` (defaults to `https://<host>/<key>.txt`), `endpoint?` (defaults to `api.indexnow.org`), `filter?` (drop URLs for which the callback returns `false`; composed on -top of the built-in `/404` exclusion). +top of the built-in `/404` exclusion), `incremental?` (submit only changed +URLs — see [Incremental IndexNow submission](#incremental-indexnow-submission)). `validateMetadataLength` accepts `true`/`false` for the defaults, or an object to override bounds. Length is measured on the whitespace-collapsed, @@ -678,6 +679,47 @@ own deploy hook. > have to rotate it. Ship the route, deploy, confirm the `.txt` loads > over HTTPS, _then_ enable `indexNow` in the integration. +## Incremental IndexNow submission + +By default the integration submits **every** eligible URL on every build. The +[IndexNow spec](https://www.indexnow.org/documentation) asks senders to submit +only URLs that were **added, updated, or deleted**, and full resubmits can trip +a host's rate limit (HTTP 429). Set `indexNow.incremental` to submit just the +difference: + +```js +seoGraph({ + indexNow: { + key: process.env.INDEXNOW_KEY, + host: 'example.com', + siteUrl: 'https://example.com', + incremental: true, + }, +}); +``` + +On each build the integration hashes every eligible page into a manifest, +**fetches the previously published manifest from the live site**, diffs the two, +and submits only the changed URLs (added + updated + deleted). It then writes +the new manifest into the build output (default `indexnow-manifest.json` at the +site root) so it ships with the deploy and becomes the next build's baseline. 
+ +Because the previous state lives on the live site — not local disk or a +key/value store — this behaves identically whether you build locally or in CI, +and adds no infrastructure (the manifest is a static file; the only network call +is one `GET` at build time). The manifest just lists URLs and opaque hashes — +the same URLs your sitemap already exposes. + +`incremental` sub-options (all optional): `manifestPath` (build-output path and +served URL path; default `indexnow-manifest.json`), `manifestUrl` (absolute URL +to fetch the previous manifest from; default `<siteUrl>/<manifestPath>`), +`normalize` (`(html, url) => string` to strip per-build volatile markup — CSP +nonces, timestamps — before hashing, so unchanged pages don't read as modified), +and `onError` (`'skip'` (default) or `'full'` — what to do when the previous +manifest can't be fetched or parsed for a reason other than a clean `404`; a +`404` is always treated as a first run and submits everything once). `'skip'` +means a transient fetch failure can never trigger an accidental full resubmit. + ## Validating your output The build-time integration checks only catch a narrow set of issues. 
diff --git a/packages/astro-seo-graph/src/index.ts b/packages/astro-seo-graph/src/index.ts index f54db99..0923c4e 100644 --- a/packages/astro-seo-graph/src/index.ts +++ b/packages/astro-seo-graph/src/index.ts @@ -25,6 +25,23 @@ export type { BreadcrumbsFromUrlInput } from './breadcrumbs.js'; export { createIndexNowKeyRoute, submitToIndexNow, validateIndexNowKey } from './indexnow.js'; export type { IndexNowKeyRouteOptions, IndexNowSubmitResult } from './indexnow.js'; +export { + buildUrlManifest, + diffManifests, + changedUrls, + hashContent, + serializeManifest, + parseManifest, + DEFAULT_HASH_ALGORITHM, + MANIFEST_VERSION, +} from './indexnow-manifest.js'; +export type { + UrlManifest, + ManifestEntry, + ManifestDiff, + SerializedManifest, +} from './indexnow-manifest.js'; + export { renderLlmsTxt } from './llms-txt.js'; export type { LlmsTxtInput, LlmsTxtSection, LlmsTxtLink } from './llms-txt.js'; diff --git a/packages/astro-seo-graph/src/indexnow-manifest.ts b/packages/astro-seo-graph/src/indexnow-manifest.ts new file mode 100644 index 0000000..df52bf4 --- /dev/null +++ b/packages/astro-seo-graph/src/indexnow-manifest.ts @@ -0,0 +1,143 @@ +import { createHash } from 'node:crypto'; + +/** + * Incremental-IndexNow manifest helpers. + * + * The integration submits the whole site to IndexNow on every build, which + * the IndexNow spec discourages (submit only added/updated/deleted URLs) and + * which can trip per-host rate limits. These pure helpers let a caller compute + * a content-hash manifest of the built pages, diff it against the previously + * published manifest, and submit only the URLs that actually changed. + * + * The helpers are runtime-agnostic and IO-free: the caller decides how to + * read the built pages, where to fetch the previous manifest from, and where + * to publish the new one. The integration wires them to the Astro build + * output and the live site (see `seoGraph({ indexNow: { incremental } })`). 
+ */ + +/** Mapping of each eligible page URL to a short content hash. */ +export type UrlManifest = Record<string, string>; + +/** A single page's URL plus the content to hash (typically built HTML). */ +export interface ManifestEntry { + url: string; + content: string; +} + +/** Identifier for the default hash, recorded in the serialized manifest. */ +export const DEFAULT_HASH_ALGORITHM = 'sha256-16'; + +/** Current serialized-manifest format version. */ +export const MANIFEST_VERSION = 1; + +/** The on-disk / over-the-wire manifest shape. */ +export interface SerializedManifest { + version: number; + algorithm: string; + urls: UrlManifest; +} + +/** SHA-256 of `content` as hex, truncated to 16 chars (64 bits of collision space). */ +export function hashContent(content: string): string { + return createHash('sha256').update(content, 'utf8').digest('hex').slice(0, 16); +} + +/** + * Build a `{ url: hash }` manifest from the eligible page entries. + * + * Pass a custom `hash` to normalize volatile markup (per-build nonces, + * timestamps, build ids) before hashing, so unchanged content doesn't read as + * "updated". The default hashes the raw content. + */ +export function buildUrlManifest( + entries: readonly ManifestEntry[], + hash: (content: string, url: string) => string = (content) => hashContent(content), +): UrlManifest { + const manifest: UrlManifest = {}; + for (const entry of entries) { + manifest[entry.url] = hash(entry.content, entry.url); + } + return manifest; +} + +/** The three change sets produced by comparing two manifests. */ +export interface ManifestDiff { + /** URLs present now but not in the previous manifest. */ + added: string[]; + /** URLs present in both, with a different hash. */ + updated: string[]; + /** URLs present previously but gone now. */ + deleted: string[]; +} + +/** + * Diff a previous manifest against the current one. Result arrays are sorted + * for stable logs and deterministic output. 
+ */ +export function diffManifests(prev: UrlManifest, curr: UrlManifest): ManifestDiff { + const added: string[] = []; + const updated: string[] = []; + const deleted: string[] = []; + + for (const url of Object.keys(curr)) { + if (!(url in prev)) added.push(url); + else if (prev[url] !== curr[url]) updated.push(url); + } + for (const url of Object.keys(prev)) { + if (!(url in curr)) deleted.push(url); + } + + added.sort(); + updated.sort(); + deleted.sort(); + return { added, updated, deleted }; +} + +/** + * Flatten a diff into the URL set to submit to IndexNow: added + updated + + * deleted. IndexNow accepts removed URLs (the engine recrawls and drops the + * 404/410), so deletions are included. + */ +export function changedUrls(diff: ManifestDiff): string[] { + return [...diff.added, ...diff.updated, ...diff.deleted]; +} + +/** + * Serialize a manifest to stable JSON with sorted keys (so the published file + * diffs cleanly between deploys). + */ +export function serializeManifest( + urls: UrlManifest, + algorithm: string = DEFAULT_HASH_ALGORITHM, +): string { + const sorted: UrlManifest = {}; + for (const url of Object.keys(urls).sort()) { + const hash = urls[url]; + if (hash !== undefined) sorted[url] = hash; + } + const payload: SerializedManifest = { version: MANIFEST_VERSION, algorithm, urls: sorted }; + return JSON.stringify(payload, null, 2); +} + +/** + * Parse a previously published manifest. Returns the `{ url: hash }` map, or + * `null` when the input is missing or not a recognizable manifest — the caller + * decides whether that means "first run" or "fetch failed, skip submission". 
+ */ +export function parseManifest(text: string | null | undefined): UrlManifest | null { + if (!text) return null; + let data: unknown; + try { + data = JSON.parse(text); + } catch { + return null; + } + if (typeof data !== 'object' || data === null) return null; + const urls = (data as Partial<SerializedManifest>).urls; + if (typeof urls !== 'object' || urls === null || Array.isArray(urls)) return null; + const out: UrlManifest = {}; + for (const [url, hash] of Object.entries(urls)) { + if (typeof hash === 'string') out[url] = hash; + } + return out; +} diff --git a/packages/astro-seo-graph/src/integration.ts b/packages/astro-seo-graph/src/integration.ts index 59bd95c..fc8f205 100644 --- a/packages/astro-seo-graph/src/integration.ts +++ b/packages/astro-seo-graph/src/integration.ts @@ -3,6 +3,16 @@ import { join, relative } from 'node:path'; import { fileURLToPath } from 'node:url'; import { submitToIndexNow } from '@jdevalk/seo-graph-core'; import { renderLlmsTxt, type LlmsTxtSection } from './llms-txt.js'; +import { + buildUrlManifest, + changedUrls, + diffManifests, + hashContent, + parseManifest, + serializeManifest, + type ManifestEntry, + type UrlManifest, +} from './indexnow-manifest.js'; // Narrow shape of the Astro integration hook we use. We don't import // Astro types here to keep this package installable without `astro` @@ -68,6 +78,55 @@ export interface IndexNowIntegrationOptions { * ``` */ filter?: (url: string) => boolean; + /** + * Submit only URLs that changed since the last build, instead of the + * whole site every time (the IndexNow spec asks for added/updated/ + * deleted URLs only, and full resubmits can trip per-host rate limits). + * + * Enabled with `true` (defaults) or an options object. 
On each build the + * integration hashes every eligible page into a manifest, fetches the + * previously published manifest from the live site, diffs them, and + * submits only the difference — then writes the new manifest into the + * build output so it ships with this deploy and becomes the next baseline. + * + * The previous state lives on the live site (not local disk or a + * key/value store), so this works the same whether you build locally or + * in CI, and adds no infrastructure. Default is off (full submit). + */ + incremental?: boolean | IndexNowIncrementalOptions; +} + +/** + * Options for incremental IndexNow submission (`indexNow.incremental`). + */ +export interface IndexNowIncrementalOptions { + /** + * Build-output path the manifest is written to, and the URL path it is + * served from. Defaults to `indexnow-manifest.json` at the site root. + */ + manifestPath?: string; + /** + * Absolute URL to fetch the previous manifest from. Defaults to + * `<siteUrl>/<manifestPath>`. Override if the manifest is served from a + * different host or path than where it's written. + */ + manifestUrl?: string; + /** + * Normalize a page's HTML before hashing, to strip per-build volatile + * markup (CSP nonces, timestamps, build ids) that would otherwise make + * unchanged pages look modified. Receives the raw HTML and the page URL. + */ + normalize?: (html: string, url: string) => string; + /** + * What to do when the previous manifest can't be fetched or parsed for a + * reason other than a clean 404 (network error, 5xx, malformed body). A + * 404 is always treated as "first run" (submit everything once). + * + * - `'skip'` (default): submit nothing this build, so a transient fetch + * failure can't trigger an accidental full resubmit. + * - `'full'`: fall back to submitting every eligible URL. 
+ */ + onError?: 'skip' | 'full'; } /** @@ -1024,14 +1083,16 @@ export default function seoGraph(options: SeoGraphIntegrationOptions = {}): Astr } if (indexNow) { - const urls = htmlFiles - .map((f) => htmlFileToUrl(f, indexNow.siteUrl)) - .filter((u) => !isDefaultExcludedFromIndexNow(u)) - .filter((u) => (indexNow.filter ? indexNow.filter(u) : true)); + const eligible = htmlFiles + .map((f) => ({ file: f, url: htmlFileToUrl(f, indexNow.siteUrl) })) + .filter((e) => !isDefaultExcludedFromIndexNow(e.url)) + .filter((e) => (indexNow.filter ? indexNow.filter(e.url) : true)); - if (urls.length === 0) { - logger.info('IndexNow: no URLs to submit.'); - } else { + const submit = async (urls: string[]) => { + if (urls.length === 0) { + logger.info('IndexNow: no URLs to submit.'); + return; + } const results = await submitToIndexNow({ host: indexNow.host, key: indexNow.key, @@ -1050,6 +1111,76 @@ export default function seoGraph(options: SeoGraphIntegrationOptions = {}): Astr ); } } + }; + + if (!indexNow.incremental) { + await submit(eligible.map((e) => e.url)); + } else { + const opts: IndexNowIncrementalOptions = + indexNow.incremental === true ? {} : indexNow.incremental; + const manifestRelPath = ( + opts.manifestPath ?? 'indexnow-manifest.json' + ).replace(/^\//, ''); + const siteBase = indexNow.siteUrl.replace(/\/$/, ''); + const manifestUrl = opts.manifestUrl ?? `${siteBase}/${manifestRelPath}`; + const onError = opts.onError ?? 'skip'; + const normalize = opts.normalize; + + // Current manifest: hash every eligible page's built HTML. + const entries: ManifestEntry[] = []; + for (const e of eligible) { + const html = await readFile(join(buildDir, e.file), 'utf8'); + entries.push({ url: e.url, content: html }); + } + const current = buildUrlManifest( + entries, + normalize + ? (content, url) => hashContent(normalize(content, url)) + : undefined, + ); + + // Previous state lives on the live site. A clean 404 means + // "first run" (baseline). 
Any other failure leaves `previous` + // null so we never accidentally full-resubmit on a transient + // network blip. + let previous: UrlManifest | null = null; + try { + const res = await fetch(manifestUrl); + if (res.status === 404) { + previous = {}; + } else if (res.ok) { + previous = parseManifest(await res.text()); + } + } catch { + previous = null; + } + + // Always publish the new manifest so it ships with this + // deploy and becomes the next build's baseline. + await writeFile( + join(buildDir, manifestRelPath), + serializeManifest(current), + 'utf8', + ); + + if (previous === null) { + if (onError === 'full') { + logger.warn( + `IndexNow: could not read previous manifest at ${manifestUrl}; submitting all URLs.`, + ); + await submit(Object.keys(current)); + } else { + logger.warn( + `IndexNow: could not read previous manifest at ${manifestUrl}; skipping submission this build.`, + ); + } + } else { + const diff = diffManifests(previous, current); + logger.info( + `IndexNow: ${diff.added.length} added, ${diff.updated.length} updated, ${diff.deleted.length} deleted.`, + ); + await submit(changedUrls(diff)); + } } } diff --git a/packages/astro-seo-graph/test/indexnow-manifest.test.ts b/packages/astro-seo-graph/test/indexnow-manifest.test.ts new file mode 100644 index 0000000..9a21be2 --- /dev/null +++ b/packages/astro-seo-graph/test/indexnow-manifest.test.ts @@ -0,0 +1,102 @@ +import { describe, expect, it } from 'vitest'; +import { + buildUrlManifest, + changedUrls, + diffManifests, + hashContent, + parseManifest, + serializeManifest, + MANIFEST_VERSION, + DEFAULT_HASH_ALGORITHM, +} from '../src/indexnow-manifest.js'; + +describe('hashContent', () => { + it('is stable and content-sensitive', () => { + expect(hashContent('abc')).toBe(hashContent('abc')); + expect(hashContent('abc')).not.toBe(hashContent('abd')); + }); + + it('returns a 16-char hex string', () => { + expect(hashContent('hello world')).toMatch(/^[0-9a-f]{16}$/); + }); +}); + 
+describe('buildUrlManifest', () => { + it('maps each url to its content hash', () => { + const manifest = buildUrlManifest([ + { url: 'https://x.com/a/', content: 'A' }, + { url: 'https://x.com/b/', content: 'B' }, + ]); + expect(manifest['https://x.com/a/']).toBe(hashContent('A')); + expect(manifest['https://x.com/b/']).toBe(hashContent('B')); + }); + + it('supports a custom hash for normalizing volatile markup', () => { + const stripNonce = (html: string) => hashContent(html.replace(/nonce="[^"]*"/g, '')); + const a = buildUrlManifest([{ url: '/p/', content: '<script nonce="a">hi</script>' }], stripNonce); + const b = buildUrlManifest([{ url: '/p/', content: '<script nonce="b">hi</script>' }], stripNonce); + expect(a['/p/']).toBe(b['/p/']); + }); +}); + +describe('diffManifests', () => { + it('classifies added, updated, and deleted, sorted', () => { + const prev = { '/keep/': 'h1', '/change/': 'h2', '/gone/': 'h3' }; + const curr = { '/keep/': 'h1', '/change/': 'h2-new', '/new/': 'h4' }; + expect(diffManifests(prev, curr)).toEqual({ + added: ['/new/'], + updated: ['/change/'], + deleted: ['/gone/'], + }); + }); + + it('treats an empty previous manifest as all-added (first run)', () => { + const curr = { '/b/': 'h', '/a/': 'h' }; + const diff = diffManifests({}, curr); + expect(diff.added).toEqual(['/a/', '/b/']); + expect(diff.updated).toEqual([]); + expect(diff.deleted).toEqual([]); + }); + + it('reports no changes when manifests match', () => { + const m = { '/a/': 'h1', '/b/': 'h2' }; + expect(changedUrls(diffManifests(m, { ...m }))).toEqual([]); + }); +}); + +describe('changedUrls', () => { + it('concatenates added, updated, and deleted', () => { + expect(changedUrls({ added: ['/a/'], updated: ['/b/'], deleted: ['/c/'] })).toEqual([ + '/a/', + '/b/', + '/c/', + ]); + }); +}); + +describe('serializeManifest / parseManifest', () => { + it('round-trips a manifest', () => { + const urls = { '/a/': 'h1', '/b/': 'h2' }; + expect(parseManifest(serializeManifest(urls))).toEqual(urls); + }); + + it('serializes with 
sorted keys and a version + algorithm header', () => { + const json = serializeManifest({ '/b/': 'h2', '/a/': 'h1' }); + const parsed = JSON.parse(json); + expect(parsed.version).toBe(MANIFEST_VERSION); + expect(parsed.algorithm).toBe(DEFAULT_HASH_ALGORITHM); + expect(Object.keys(parsed.urls)).toEqual(['/a/', '/b/']); + }); + + it('returns null for missing, malformed, or non-manifest input', () => { + expect(parseManifest(null)).toBeNull(); + expect(parseManifest('')).toBeNull(); + expect(parseManifest('not json')).toBeNull(); + expect(parseManifest('"a string"')).toBeNull(); + expect(parseManifest('{"version":1}')).toBeNull(); + }); + + it('ignores non-string hash values when parsing', () => { + expect(parseManifest('{"urls":{"/a/":"h1","/b/":123}}')).toEqual({ '/a/': 'h1' }); + }); +});