From 491f51bc07520317f31416a68a9a221ccade03f9 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 5 May 2026 17:38:25 +0200 Subject: [PATCH 01/41] minimal changes for direct from 4CAT mapping --- js/lib.js | 16 +++++++++++++++- modules/_loader.js | 6 +++++- popup/interface.js | 2 +- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/js/lib.js b/js/lib.js index 6199d01..1579195 100644 --- a/js/lib.js +++ b/js/lib.js @@ -57,4 +57,18 @@ class MissingMappedField { toString() { return `${this.value}`; } -} \ No newline at end of file +} + +/** + * Wrap a Zeeschuimer stored item to match the shape a 4CAT map_item expects. + * + * 4CAT's importer constructs: + * { ...item.data, __import_meta: { ...everything in item except data } } + * + * Mirroring that here means map_item functions auto-generated from 4CAT + * data sources can run against Zeeschuimer-stored items without translation. + */ +function wrap_for_map_item(stored_item) { + const { data, ...meta } = stored_item; + return { ...data, __import_meta: meta }; +} diff --git a/modules/_loader.js b/modules/_loader.js index 47697ca..afae2d7 100644 --- a/modules/_loader.js +++ b/modules/_loader.js @@ -17,11 +17,15 @@ async function load() { ]; for(const module of imported_modules) { + const mapper = module.map_item + ? (stored_item) => module.map_item(wrap_for_map_item(stored_item)) + : null; + zeeschuimer.register_module( module.MODULE_NAME, module.DOMAIN, module.capture, - module.map_item, + mapper, module.MODULE_ID ? module.MODULE_ID : module.MODULE_DOMAIN, module.overwrite_partial, module.TOOLTIP ? 
module.TOOLTIP : null, diff --git a/popup/interface.js b/popup/interface.js index 5cc7864..1ae60a2 100644 --- a/popup/interface.js +++ b/popup/interface.js @@ -619,7 +619,7 @@ async function get_csv_blob(platform) { let csv = []; const module = background.zeeschuimer.modules[platform]; await iterate_items(platform, function(item) { - item = module.mapper(item.data); + item = module.mapper(item); if(csv.length === 0) { csv.push(Object.keys(item).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n"); } From b06805f711a97fad6e9e3f6615db3a0cf936205e Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 6 May 2026 10:54:13 +0200 Subject: [PATCH 02/41] give me some standard helper functions --- js/lib.js | 54 +++++++++++++++++++++ modules/tiktok.js | 119 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 172 insertions(+), 1 deletion(-) diff --git a/js/lib.js b/js/lib.js index 1579195..3b144d2 100644 --- a/js/lib.js +++ b/js/lib.js @@ -72,3 +72,57 @@ function wrap_for_map_item(stored_item) { const { data, ...meta } = stored_item; return { ...data, __import_meta: meta }; } + +/** + * Ports of 4CAT functions commonly used by `map_item` below + */ + +/** + * Strip HTML tags from a string. + * @param {string} html + * @param {boolean} convertNewlines Convert
<br> and </p>
tags to \n before stripping. + * @returns {string} + */ +function strip_tags(html, convertNewlines = true) { + if (!html) return ""; + if (convertNewlines) { + html = html.replace(/<br\s*\/?>/gi, "\n").replace(/<\/p>/gi, "</p>
\n"); + html = html.replace(/\n+/g, "\n"); + } + const doc = new DOMParser().parseFromString(html, "text/html"); + return doc.body.textContent || ""; +} + +/** + * Normalize URL encoding for display and linking. + * Decodes percent-encoded URLs and re-encodes the query string canonically. + * Returns the original URL on parse failure. + * @param {string} url + * @returns {string} + */ +function normalize_url_encoding(url) { + if (!url) return ""; + try { + // Iterative decode handles double-encoded inputs. + let decoded = url; + let prev; + do { + prev = decoded; + try { + decoded = decodeURIComponent(prev); + } catch { + decoded = prev; + break; + } + } while (decoded !== prev); + const parsed = new URL(decoded); + // URL.toString() re-encodes the query/fragment correctly. + return parsed.toString(); + } catch { + return url; + } +} + +function formatUtcTimestamp(unixSeconds) { + return new Date(unixSeconds * 1000).toISOString().replace('T', ' ').slice(0, 19); +} \ No newline at end of file diff --git a/modules/tiktok.js b/modules/tiktok.js index 55e6fbf..ea52532 100644 --- a/modules/tiktok.js +++ b/modules/tiktok.js @@ -1,3 +1,4 @@ + export const MODULE_NAME = 'TikTok (posts)'; export const DOMAIN = 'tiktok.com'; @@ -103,4 +104,120 @@ export function capture(response, source_platform_url, source_url) { } else { return []; } -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — DO NOT EDIT BY HAND === +// (regenerated from datasources/tiktok/search_tiktok.py) +export function map_item(post) { + // Zeeschuimer metadata + const metadata = post.__import_meta || {}; + + const challenges = Array.isArray(post.challenges) + ? post.challenges.map(ch => ch.title).filter(Boolean) + : []; + + const hashtags = Array.isArray(post.textExtra) + ? post.textExtra + .filter(e => e.hasOwnProperty('hashtagName') && e.hashtagName) + .map(e => e.hashtagName) + : []; + + const diversificationLabels = Array.isArray(post.diversificationLabels) + ? 
post.diversificationLabels.join(',') + : ''; + + let user_nickname = ''; + let user_fullname = ''; + let user_thumbnail = ''; + + if (post.author && typeof post.author === 'object') { + user_nickname = post.author.uniqueId || ''; + user_fullname = post.author.nickname || ''; + user_thumbnail = post.author.avatarThumb || ''; + } else if (post.author) { + user_nickname = post.author || ''; + user_fullname = post.nickname || ''; + user_thumbnail = ''; + } + + const thumbnailOptions = []; + + if (post.video && Array.isArray(post.video.shareCover)) { + thumbnailOptions.push(...post.video.shareCover); + } + + if (post.video && post.video.cover) { + thumbnailOptions.push(post.video.cover); + } + + const now = Math.floor(Date.now() / 1000); + + const validThumbnails = thumbnailOptions.filter(url => { + try { + const parsedUrl = new URL(url); + const expires = parseInt(parsedUrl.searchParams.get('x-expires'), 10) || 0; + return expires >= now; + } catch (e) { + return false; + } + }); + + const thumbnail_url = validThumbnails.length ? validThumbnails[validThumbnails.length - 1] : ''; + + return new MappedItem({ + collected_from_url: metadata.source_platform_url + ? normalize_url_encoding(metadata.source_platform_url) + : '', + id: post.id || '', + thread_id: post.id || '', + author: user_nickname, + author_full: user_fullname, + author_followers: post.authorStats?.followerCount ?? '', + author_likes: post.authorStats?.diggCount ?? '', + author_videos: post.authorStats?.videoCount ?? '', + author_avatar: user_thumbnail, + body: post.desc || '', + stickers: Array.isArray(post.stickersOnItem) + ? post.stickersOnItem + .map(s => (Array.isArray(s.stickerText) ? s.stickerText.join(' ') : '')) + .filter(Boolean) + .join('') + : '', + timestamp: post.createTime + ? formatUtcTimestamp(parseInt(post.createTime, 10)) + : '', + unix_timestamp: post.createTime ? 
parseInt(post.createTime, 10) : 0, + is_duet: + post.duetInfo && post.duetInfo.duetFromId && post.duetInfo.duetFromId !== '0' + ? 'yes' + : 'no', + is_ad: post.isAd ? 'yes' : 'no', + is_paid_partnership: post.adAuthorization ? 'yes' : 'no', + is_sensitive: post.maskType === 3 ? 'yes' : 'no', + is_photosensitive: post.maskType === 4 ? 'yes' : 'no', + music_name: post.music?.title ?? '', + music_id: post.music?.id ?? '', + music_url: post.music?.playUrl ?? '', + music_thumbnail: post.music?.coverLarge ?? '', + music_author: post.music?.authorName ?? '', + video_url: post.video?.downloadAddr ?? '', + tiktok_url: `https://www.tiktok.com/@${user_nickname}/video/${post.id}`, + thumbnail_url: thumbnail_url, + likes: post.stats?.diggCount ?? '', + comments: post.stats?.commentCount ?? '', + shares: post.stats?.shareCount ?? '', + plays: post.stats?.playCount ?? '', + hashtags: hashtags.join(','), + challenges: challenges.join(','), + diversification_labels: diversificationLabels, + location_created: post.locationCreated ?? '', + effects: Array.isArray(post.effectStickers) + ? post.effectStickers.map(e => e.name).join(',') + : '', + warning: Array.isArray(post.warnInfo) + ? 
post.warnInfo.map(w => w.text).join(',') + : '', + }); +} +// === end auto-generated === +// === end auto-generated === From f9a2405a0703bcadfdee7492ccd57af12917733e Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 6 May 2026 13:07:43 +0200 Subject: [PATCH 03/41] fix csv export --- popup/interface.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/popup/interface.js b/popup/interface.js index 1ae60a2..8afd1b1 100644 --- a/popup/interface.js +++ b/popup/interface.js @@ -595,7 +595,7 @@ const CSV_ESCAPED = `"${CSV_SEPARATOR}\n`; function csv_escape(value) { value = String(value); let needs_escape = false; - for(const character in CSV_ESCAPED) { + for(const character of CSV_ESCAPED) { if(value.indexOf(character) >= 0) { needs_escape = true; } From 2f084b9352c25a1034429bb05d8390b5961d35ef Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 6 May 2026 15:19:18 +0200 Subject: [PATCH 04/41] another to CSV fix --- popup/interface.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/popup/interface.js b/popup/interface.js index 8afd1b1..94fff77 100644 --- a/popup/interface.js +++ b/popup/interface.js @@ -626,7 +626,7 @@ async function get_csv_blob(platform) { csv.push(Object.values(item).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n"); }) - return new Blob([csv], {type: 'text/csv'}); + return new Blob(csv, {type: 'text/csv'}); } /** From d7870426c7765a6107c47c4fff062f5643725167 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 6 May 2026 15:25:42 +0200 Subject: [PATCH 05/41] revert tiktok (mistaken test result commited) --- modules/tiktok.js | 119 +--------------------------------------------- 1 file changed, 1 insertion(+), 118 deletions(-) diff --git a/modules/tiktok.js b/modules/tiktok.js index ea52532..55e6fbf 100644 --- a/modules/tiktok.js +++ b/modules/tiktok.js @@ -1,4 +1,3 @@ - export const MODULE_NAME = 'TikTok (posts)'; export const DOMAIN = 'tiktok.com'; @@ -104,120 +103,4 @@ export function capture(response, 
source_platform_url, source_url) { } else { return []; } -} - -// === auto-generated by 4cat map_item sync — DO NOT EDIT BY HAND === -// (regenerated from datasources/tiktok/search_tiktok.py) -export function map_item(post) { - // Zeeschuimer metadata - const metadata = post.__import_meta || {}; - - const challenges = Array.isArray(post.challenges) - ? post.challenges.map(ch => ch.title).filter(Boolean) - : []; - - const hashtags = Array.isArray(post.textExtra) - ? post.textExtra - .filter(e => e.hasOwnProperty('hashtagName') && e.hashtagName) - .map(e => e.hashtagName) - : []; - - const diversificationLabels = Array.isArray(post.diversificationLabels) - ? post.diversificationLabels.join(',') - : ''; - - let user_nickname = ''; - let user_fullname = ''; - let user_thumbnail = ''; - - if (post.author && typeof post.author === 'object') { - user_nickname = post.author.uniqueId || ''; - user_fullname = post.author.nickname || ''; - user_thumbnail = post.author.avatarThumb || ''; - } else if (post.author) { - user_nickname = post.author || ''; - user_fullname = post.nickname || ''; - user_thumbnail = ''; - } - - const thumbnailOptions = []; - - if (post.video && Array.isArray(post.video.shareCover)) { - thumbnailOptions.push(...post.video.shareCover); - } - - if (post.video && post.video.cover) { - thumbnailOptions.push(post.video.cover); - } - - const now = Math.floor(Date.now() / 1000); - - const validThumbnails = thumbnailOptions.filter(url => { - try { - const parsedUrl = new URL(url); - const expires = parseInt(parsedUrl.searchParams.get('x-expires'), 10) || 0; - return expires >= now; - } catch (e) { - return false; - } - }); - - const thumbnail_url = validThumbnails.length ? validThumbnails[validThumbnails.length - 1] : ''; - - return new MappedItem({ - collected_from_url: metadata.source_platform_url - ? 
normalize_url_encoding(metadata.source_platform_url) - : '', - id: post.id || '', - thread_id: post.id || '', - author: user_nickname, - author_full: user_fullname, - author_followers: post.authorStats?.followerCount ?? '', - author_likes: post.authorStats?.diggCount ?? '', - author_videos: post.authorStats?.videoCount ?? '', - author_avatar: user_thumbnail, - body: post.desc || '', - stickers: Array.isArray(post.stickersOnItem) - ? post.stickersOnItem - .map(s => (Array.isArray(s.stickerText) ? s.stickerText.join(' ') : '')) - .filter(Boolean) - .join('') - : '', - timestamp: post.createTime - ? formatUtcTimestamp(parseInt(post.createTime, 10)) - : '', - unix_timestamp: post.createTime ? parseInt(post.createTime, 10) : 0, - is_duet: - post.duetInfo && post.duetInfo.duetFromId && post.duetInfo.duetFromId !== '0' - ? 'yes' - : 'no', - is_ad: post.isAd ? 'yes' : 'no', - is_paid_partnership: post.adAuthorization ? 'yes' : 'no', - is_sensitive: post.maskType === 3 ? 'yes' : 'no', - is_photosensitive: post.maskType === 4 ? 'yes' : 'no', - music_name: post.music?.title ?? '', - music_id: post.music?.id ?? '', - music_url: post.music?.playUrl ?? '', - music_thumbnail: post.music?.coverLarge ?? '', - music_author: post.music?.authorName ?? '', - video_url: post.video?.downloadAddr ?? '', - tiktok_url: `https://www.tiktok.com/@${user_nickname}/video/${post.id}`, - thumbnail_url: thumbnail_url, - likes: post.stats?.diggCount ?? '', - comments: post.stats?.commentCount ?? '', - shares: post.stats?.shareCount ?? '', - plays: post.stats?.playCount ?? '', - hashtags: hashtags.join(','), - challenges: challenges.join(','), - diversification_labels: diversificationLabels, - location_created: post.locationCreated ?? '', - effects: Array.isArray(post.effectStickers) - ? post.effectStickers.map(e => e.name).join(',') - : '', - warning: Array.isArray(post.warnInfo) - ? 
post.warnInfo.map(w => w.text).join(',') - : '', - }); -} -// === end auto-generated === -// === end auto-generated === +} \ No newline at end of file From a9fba9a9caee86d8799ee35d11374fbb602c9a41 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 6 May 2026 15:57:45 +0200 Subject: [PATCH 06/41] clean up UI (make download menu button) --- popup/interface.html | 32 +++++++++++++++++++++- popup/interface.js | 63 +++++++++++++++++++++++++++++++++----------- 2 files changed, 78 insertions(+), 17 deletions(-) diff --git a/popup/interface.html b/popup/interface.html index 356f2b5..e9d9b3f 100644 --- a/popup/interface.html +++ b/popup/interface.html @@ -215,10 +215,39 @@ text-indent: 2em; } - td > button:not(:last-child) { + td > button:not(:last-child), + td > .download-menu:not(:last-child) { margin-right: 0.25em; } + /* download chooser: trigger is a regular button (inherits all button + styles); */ + .download-menu { + display: inline-block; + position: relative; + } + + /* :not([hidden]) so the explicit display:flex doesn't override the + [hidden] attribute's default display:none */ + .download-menu > .download-options:not([hidden]) { + position: absolute; + top: calc(100% + 0.25em); + left: 0; + display: flex; + flex-direction: column; + gap: 0.25em; + padding: 0.25em; + background: var(--neutral-contrast-alt); + border: 2px solid var(--neutral-contrast); + border-radius: 0.5em; + z-index: 10; + white-space: nowrap; + } + + .download-menu > .download-options > button { + margin: 0; + } + input:not([type=checkbox]):not([type=radio]), button { background: var(--neutral-contrast-alt); color: var(--accent); @@ -302,6 +331,7 @@ .toggle-switch input { -moz-appearance: none; + appearance: none; opacity: 0; } diff --git a/popup/interface.js b/popup/interface.js index 94fff77..3b8aaa9 100644 --- a/popup/interface.js +++ b/popup/interface.js @@ -119,7 +119,7 @@ async function set_4cat_url(e) { function activate_buttons() { document.querySelectorAll("td 
button").forEach(button => { let current = button.disabled; - let items = parseInt(button.parentNode.parentNode.querySelector('.num-items').innerText); + let items = parseInt(button.closest('tr').querySelector('.num-items').innerText); let new_status = current; if(button.classList.contains('upload-to-4cat') && !is_uploading) { @@ -132,7 +132,7 @@ function activate_buttons() { button.setAttribute('title', ''); } - } else if(button.classList.contains('download-ndjson') || button.classList.contains('reset') || button.classList.contains('download-csv')) { + } else if(button.classList.contains('download-format') || button.classList.contains('download-menu-trigger') || button.classList.contains('reset')) { new_status = !(items > 0); } @@ -234,21 +234,32 @@ async function get_stats() { let actions = createElement("td"); const clear_button = createElement("button", {"data-platform": platform, "class": "reset"}, "Delete"); - const csv_button = createElement("button", {"data-platform": platform, 'class': 'download-csv'}, '.csv'); - const download_button = createElement("button", { - "data-platform": platform, - "class": "download-ndjson" - }, ".ndjson"); + + // Render the download chooser as a button + popover panel, + // (even when only NDJSON is available as visual consistent) + const download_widget = createElement("span", {"class": "download-menu"}); + const trigger = createElement("button", { + "data-platform": platform, "class": "download-menu-trigger" + }, "Download"); + const options = createElement("div", {"class": "download-options", "hidden": ""}); + options.appendChild(createElement("button", { + "data-platform": platform, "data-format": "ndjson", "class": "download-format" + }, ".ndjson (original)")); + if(module.mapper) { + options.appendChild(createElement("button", { + "data-platform": platform, "data-format": "csv", "class": "download-format" + }, ".csv")); + } + download_widget.appendChild(trigger); + download_widget.appendChild(options); + const 
fourcat_button = createElement("button", { "data-platform": platform, "class": "upload-to-4cat", }, "to 4CAT"); actions.appendChild(clear_button); - if(module.mapper) { - actions.appendChild(csv_button); - } - actions.appendChild(download_button); + actions.appendChild(download_widget); actions.appendChild(fourcat_button); row.appendChild(actions); @@ -317,22 +328,38 @@ async function get_stats() { async function button_handler(event) { let status = document.getElementById('upload-status'); - if (event.target.matches('.reset')) { + // Close any open download-format popovers when clicking outside their host. + // Skip if the click is on a trigger or inside an options panel + if(!event.target.matches('.download-menu-trigger') && !event.target.closest('.download-options')) { + document.querySelectorAll('.download-options:not([hidden])').forEach(el => el.hidden = true); + } + + if (event.target.matches('.download-menu-trigger')) { + const widget = event.target.closest('.download-menu'); + const options = widget.querySelector('.download-options'); + const opening = options.hidden; + // close any other menus before opening this one + document.querySelectorAll('.download-options:not([hidden])').forEach(el => { + if(el !== options) el.hidden = true; + }); + options.hidden = !opening; + + } else if (event.target.matches('.reset')) { let platform = event.target.getAttribute('data-platform'); await background.db.items.where("source_platform").equals(platform).delete(); } else if (event.target.matches('.reset-all')) { await background.db.items.clear(); - } else if (event.target.matches('.download-ndjson') || event.target.matches('.download-csv')) { - const blobber = event.target.matches('.download-ndjson') ? get_ndjson_blob : get_csv_blob; - const extension = event.target.matches('.download-ndjson') ? 'ndjson' : 'csv'; + } else if (event.target.matches('.download-format')) { + const format = event.target.getAttribute('data-format'); + const blobber = format === 'csv' ? 
get_csv_blob : get_ndjson_blob; + const extension = format; let platform = event.target.getAttribute('data-platform'); let date = new Date(); event.target.classList.add('loading'); - //let blob = await download_blob(platform, 'zeeschuimer-export-' + platform + '-' + date.toISOString().split(".")[0].replace(/:/g, "") + '.ndjson'); let blob = await blobber(platform); let filename = 'zeeschuimer-export-' + platform + '-' + date.toISOString().split(".")[0].replace(/:/g, "") + '.' + extension; const downloadUrl = window.URL.createObjectURL(blob); @@ -345,6 +372,10 @@ async function button_handler(event) { event.target.classList.remove('loading'); + // collapse the popover menu after the download fires + const widget = event.target.closest('.download-menu'); + if(widget) widget.querySelector('.download-options').hidden = true; + } else if (event.target.matches('.upload-to-4cat')) { let platform = event.target.getAttribute('data-platform'); status.innerText = 'Creating data file for uploading...'; From 0980a56f0ba6872884bfc1e891efc2cb9f4e4c33 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 6 May 2026 16:13:52 +0200 Subject: [PATCH 07/41] testing is hard in JS --- docs/test-plan.md | 162 ++++++++++++++++++++++ modules/package.json | 3 + tests/__pycache__/test.cpython-39.pyc | Bin 0 -> 7345 bytes tests/duplicate-behavior.test.js | 3 +- tests/{jest.config.js => jest.config.cjs} | 3 +- tests/map_item.test.js | 130 +++++++++++++++++ tests/package.json | 5 +- tests/setup-globals.cjs | 41 ++++++ 8 files changed, 343 insertions(+), 4 deletions(-) create mode 100644 docs/test-plan.md create mode 100644 modules/package.json create mode 100644 tests/__pycache__/test.cpython-39.pyc rename tests/{jest.config.js => jest.config.cjs} (64%) create mode 100644 tests/map_item.test.js create mode 100644 tests/setup-globals.cjs diff --git a/docs/test-plan.md b/docs/test-plan.md new file mode 100644 index 0000000..249a7e0 --- /dev/null +++ b/docs/test-plan.md @@ -0,0 +1,162 @@ +# Selenium 
Test Harness — Improvement Plan + +Date: 2026-04-30 + +Overview + +This document captures an actionable plan to improve the Selenium-based integration tests in `tests/test.py` for the Zeeschuimer Firefox extension. The goals are to: + +- Make profile handling reliable and reusable (so logged-in sessions persist across runs). +- Preserve and export captured data per platform for offline analysis and for passing to 4CAT. +- Add optional automated upload to a 4CAT instance for mapping/validation tests. +- Reduce fragility caused by popups and interactive dialogs (pausing/dismissal patterns). +- Improve robustness, error handling, and machine-readable results. + +Scope + +All changes are confined to the test harness and test metadata (`tests/test.py` and `tests/tests.json`) and to this planning document. No changes are required in the extension source for the planned items (the test harness will interact with the extension's UI pages and background DB). + +Phases & Changes + +Phase 1 — Profile management + +- Problem: copying an entire profile can race with a running Firefox and the current ignore rule hides potentially useful session data. +- Changes: + - Detect if the selected profile directory appears locked (presence of `lock` or `.parentlock`) and warn if Firefox is running. + - Replace the naive ignore lambda used in `shutil.copytree` with a function that only excludes `storage`, `extensions`, and `signedInUser.json` at the profile root. + - Add CLI flags: `--profile-name NAME` (choose profile by display name from `profiles.ini`), `--save-profile PATH` (save the temp profile for reuse), and `--no-cleanup` (do not remove `.temp-profile` after run). 
+ +Implementation note (copytree ignore example): + +```python +def _profile_ignore(root, names): + # Only ignore these entries in the root profile dir + if os.path.abspath(root) == os.path.abspath(profile_dir): + return {"storage", "extensions", "signedInUser.json"} + return set() + +shutil.copytree(profile_dir, profile_file, ignore=_profile_ignore) +``` + +Phase 2 — Data preservation & export + +- Problem: `reset-all` wipes the DB before each URL; no artifacts are kept for post-mortem or mapping tests. +- Decision: export a single combined NDJSON file per platform containing items collected while testing that platform. +- Changes: + - Add CLI `--export-dir PATH` (default `./zeeschuimer-exports/{timestamp}/`). + - Before clicking `reset-all` for each URL, read the current DB contents from the extension background page (Dexie) via `execute_async_script` and append those items to a per-platform in-memory list in Python. After all URLs for a platform are done, write `{export-dir}/{platform}.ndjson`. + - Optionally add `--no-reset` to skip the `reset-all` call entirely (default behavior remains to reset before each URL). + +Execute_async_script pattern (example): + +```python +script = ''' +const cb = arguments[0]; +background.db.items.toArray().then(items => cb(JSON.stringify(items))).catch(e => cb(JSON.stringify({error: String(e)}))); +''' +items_json = driver.execute_async_script(script) +items = json.loads(items_json) +``` + +Phase 3 — 4CAT integration (optional) + +- Problem: mapping tests live in 4CAT and need NDJSON input. +- Changes: + - Add CLI flags: `--4cat-url URL` and `--4cat-key KEY` (API key). Require both for upload. + - After writing the per-platform NDJSON, POST it to `{4cat_url.rstrip('/')}/api/import-dataset/` with header `X-Zeeschuimer-Platform: {platform}` and `Authorization: Bearer {key}` (confirm header with your 4CAT instance; alternative is to trigger the extension UI upload button when cookie-based auth is required). 
+ - Do not fail the test run on 4CAT errors — print status and continue. + +Example upload with `requests`: + +```python +import requests +with open(ndjson_path, 'rb') as f: + headers = { + 'X-Zeeschuimer-Platform': platform, + 'Authorization': f'Bearer {fourcat_key}' + } + r = requests.post(f"{fourcat_url.rstrip('/')}/api/import-dataset/", headers=headers, data=f) + # check r.status_code and r.text for details +``` + +Phase 4 — Interactive controls & popup dismissals + +- Problem: cookie banners, paywall prompts, and other popups frequently interfere with automated navigation and can cause false failures. +- Decision: pause by default **once per platform** (not before every URL) so the tester can clear residual prompts; provide opt-out and finer-grained options. +- Changes: + - CLI flags: `--no-interactive` (disable all pauses), `--pause-before-url` (pause before each URL), `--pause-on-fail` (pause on failure), `--extra-wait N` (add N seconds to every wait), `--screenshot-dir PATH` (capture screenshots on fail/warning). + - Add a `dismiss-selectors` optional field in `tests.json` per URL: a list of CSS selectors to click to dismiss known popups. Example: + +```json +"dismiss-selectors": ["button.cookie-accept", ".modal .close"] +``` + + - Add per-URL `timeout` (page load timeout override). + +Phase 5 — Runner robustness & reporting + +- Problem: unhandled exceptions abort the run; final runtime is calculated incorrectly; no machine-readable results. +- Changes: + - Wrap each URL test body in try/except, increment `failed` on exceptions, and continue. + - Move the global `start_time = time.time()` to before the outer platform loop so the final elapsed time is for the full run. + - Add CLI flags: `--results-file PATH` (write JSON summary), `--resume-from PLATFORM` (skip earlier platforms), and `--screenshot-dir PATH` (as noted). + - Fix small test metadata issues (e.g., `more-after-scrolll` typo in `tests.json`). 
+ +tests.json schema additions + +- Per-URL optional fields: + - `dismiss-selectors`: array of CSS selectors to click after page load + - `timeout`: numeric page load timeout seconds for this URL + - `extra-wait`: per-URL additional wait seconds + +CLI flags (summary) + +- `--profiledir PATH` — explicit profile path (existing) +- `--profile-name NAME` — choose Firefox profile by display name +- `--save-profile PATH` — persist the copied profile for reuse +- `--no-cleanup` — keep `.temp-profile` +- `--export-dir PATH` — where to write NDJSON exports +- `--no-reset` — do not click `reset-all` between URLs +- `--4cat-url URL` — base URL for 4CAT server +- `--4cat-key KEY` — API key for 4CAT uploads +- `--4cat-per-url` — upload per URL instead of per platform (optional) +- `--no-interactive` — disable pausing (default is to pause per-platform) +- `--pause-before-url` — pause before each URL +- `--pause-on-fail` — pause when a test fails +- `--extra-wait N` — add N seconds to every URL wait +- `--screenshot-dir PATH` — save screenshots on fail/warning +- `--results-file PATH` — write machine-readable results JSON +- `--resume-from PLATFORM` — resume a run from a platform + +Verification checklist + +1. `python tests/test.py --sources instagram.com --export-dir ./exports` -> `exports/instagram.com.ndjson` exists and contains NDJSON with captured items. +2. `python tests/test.py --save-profile .saved-profile --login` -> create a saved profile that can be reused with `--profiledir .saved-profile`. +3. Run with default interactive behavior and confirm one pause per platform. +4. `python tests/test.py --results-file results.json` -> JSON summary produced with per-URL status and counts. +5. Test 4CAT upload using a local mock server and `--4cat-url http://localhost:8000 --4cat-key KEY`. + +Implementation steps (recommended order) + +1. Docs and small fixes (this document + tests.json typo fix). +2. 
Profile management changes (`--profile-name`, improved copy ignore, `--save-profile`, lock detection). +3. Export behavior: `--export-dir` + `execute_async_script` collection and NDJSON write. +4. Runner robustness: try/except around URL loop, `--results-file`, fix `start_time` placement. +5. Interactive and dismissal features (`dismiss-selectors`, pause flags, screenshots). +6. 4CAT upload integration (optional, requires confirmation of auth header). + +Estimated effort: 6–10 hours of focused work to implement and test everything end-to-end; can be split into 3-4 incremental PRs. + +Open questions / confirmations needed + +- Confirm 4CAT API key header format (currently suggested: `Authorization: Bearer {key}`). If your 4CAT requires cookie-based auth, we should emulate the extension upload button via Selenium instead. +- Confirm desired default for interactive mode. (Current recommendation: pause once per platform by default; provide `--no-interactive` to run fully headless.) + +Next steps + +- I have created a matching TODO list in the session tracker and written this document to `docs/test-plan.md`. +- If you want, I can start implementing Phase 1 (profile management) in `tests/test.py` now and submit incremental changes. 
+ +--- + +Requested file: `docs/test-plan.md` diff --git a/modules/package.json b/modules/package.json new file mode 100644 index 0000000..3dbc1ca --- /dev/null +++ b/modules/package.json @@ -0,0 +1,3 @@ +{ + "type": "module" +} diff --git a/tests/__pycache__/test.cpython-39.pyc b/tests/__pycache__/test.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..745e2b4aaad921a459372bb50b39980c50a68136 GIT binary patch literal 7345 zcmai3-E$k)b>CeqKnNl!lATnGrZeqKUpk()>F?ZK zfYgUo?Cjn9aqc#`iup9_Rb`1mDjmADRio3;Y0oamS9A_{-Zn zI^O|iiy!3Q;nuH=&MRVKL*uV(8SId->aT93{IL20jve)WFJ2Ur;?Rb^p|fd`7tSsMTX) znjhuUpKCnNk3CEQ%k)KU>2-EojCW>4x>FFxxAb~(OJ^nQIW9__6MSY%2S-@>LVIKq zmQDT6@3C5>zBh(m30UiHNgf0MuUxrUvyol_h;E^3h5X>j#U=M10ioE2|%-rdr+^exB}bWd*@ z@=L6qQfsH_)bV+|EM`w9Cpn|O*$s`I6MH-7#e3r91~5Q^$2%AJSw6940N3%(#Vv3k zj*D~U-e*Pih zy4?9dyvg6g{P&4lHFJeuNP1WK#iVzQ&#B%NzodA)+be-L|A4>0 zWz=u5dH#Wzhi?br&a$LC;2bI__ahHBlP7u zTfn=BcS)EX6T1C`e+ZqHx3v1Fd~Q-3*O(1?FK=lVwaxU>_@cq~u}TjXr@z(V;v=%6 zgwGH7^?vUMVqZt_*ZGYtW534d_s1|(!d5(--ZbT}#XfQRuC97hVjs?QGuGXvd8a5U z30-sPFF|*ORYkQw^ApU}m?Pp{x5Nte-V!zFKCQ4ni?bgp&hla(v~+u1;~(wUSiOIS z!|@p+OG&h*V4(rwKp!7NM+VK@*1R`OP3)Gy*k-{)9n^1&QO@`RU##kU>0t^p zpTvImvE~^kG=-;3cD9+8{}P=B71F62&%>Gx))0-3hnSL zN45rQTUV#9ku+Fav^NLjcppom2esGfX@S>OkMKQrZd+UY@ML0>PuS@5xm!sc9`}3p!khawwR?dd#&^N{w-`$T|E1{d{8H__vu$ZyiU9ltk5UPSC52(h`@24E z6I>rkFi?+ZKaC-L3PWwjS+cU;$EO2dUEwoS?@4^ECpfVbMaJ*;PrH^tEGk|Xz288l z>rY58EC28q#XtPNGW)a-Z*K0AxoAtIKgC}x)2I)b@Cm=XNAU&m>34X4qvF)% z&V7|NE_WWN?{-e?#kwi+nOG2uVyOxbP(F4q$+=VfejH83CzBd~0Pp)OsUfELkD#R= ziv{I}#s7=PzEJD>iuUYpN%&<@N5sht1IOy@0y&Lf&;M0$t<1 zHrkw6uZZpCitqYTa4hUv6U$sWcZI|t%k5BvAcB4-Xa}Ka#Cw~z$6JAdF|XTU4YOv| z^_Qc;LZK=B6~`6akf++jadli&!CGwR(x=f)J-Ehq5K|DM=-@+=Hf@ zu`48y2zWx0WOS%U<%9VTuUW*kRq?&hc09+cS}{yceTww}8+R27f!~r95kzm!#~mU| z5Hr}>XxN27G;Il~@l)1v+e%J1qd^Li`JS-EI}l6-EOYzp zJ3#TDcr`hR<|eE%wJO@(0V!brfrwHCYdRX8`nWLFC`@r{>hkQ=)!C`}$jDpKI~Quc zACR3i{2;WXs0fdw0bArsLer2n797tCYr=9J^zCK;uEo 
z)QUk8NPg6(SE(e6P8C(?U+YkZ0d=qe@+`Y5D1L;(3n;uqqk&WP1b^QnMJPv6x`8zH z^{(DlgSgobgi6%csq+FF3M|+ScYVwDnQ_ zjbPTg_aHK6tY|m}fmmhBo3)~`8!b=82)J|{CyLSN6}#nCYNUq5lcT9XR9eyr+eL~k z=oN|0N?7)#gE&@h`tYGPxE_GyC_Z-w{5U8_MzNUJ*{;40BLp>JyI~Ci{+goM*Xz`z z(L@L5ya32Sct={WoB(>aQRhWdu)9_`%f)i53bNH#OE?9&QYoxCK_Q6=)nEP<{r>UY z=whi9Ai^)43*q15%%L4v4=t5S#bT+%!-WtgFr7bh=zT8~(z9Lb)~)w1_Sbndf|yh2 zo5^gc6zwZD{bs9Ka%jhjT@l4v*l<}Yw3nlyoqD0Q_<1q z5P3BU3c|COkrNA1YH_g`z40wHUU@bVkN^!U8nk9-tzgw@Hlf7Gq)~M8*@UN$F1Nza z_lgjs2n#S%kW6?!^^I;tDeUf=YsjN5vY~~q{^qw!EYlX0Pno%lJWXC@hoKCjBUTeW z51IO_5WZK1GQye-@xz2*SR`z+v}mMaHz7t!g0YNS*MOSXT=?pjoqzw`&z7tUF$5B0 z2rUSXh-5uS)q(Hcg{n!(3GQwdfl+OZvdbb|6T-8iw5kQZ(F;*#;gb`Ecb1|fQiLt( zS>Gm|S1G3ig=oK3^jeLA13wEOA;ez&&Xa=3CureOP>8i+J8lb**0f?hQTk(e^Z6fj2g+p_ zYq`v_wydIt34U@zl41wj+nX(K-zq3|$%eXDNRm9u_JHI7hLUQF zZnhV>!SMIP4HA6vf=ZxszUj7l81M<_N`s)AZX$z*T@%BQ?e2l2)?(RY5OhrELE;@g_PIL)_tTH9E+jERIzQFF{%()Mm9qD zI?Gm2&{3~z`c2`nw91Ms<9GztH7VSp7>lnUD1K0jhisT6OhL7*pt~8WVNf}nM73+8 zvJK=>CB~#axgJZQn*xKhJ1jS0KSBzxBG_1~Nw>cpQvut(<2ELb5UPO$75?#{E{$x+ z@gN4*Eki55$ABHD_UHCEaPat|QKoE# z!@5E=+29I-aTy1u%+AsuT|a;GGF$|qo0+Ya!`Yr4^?$h~UA8Mu^5xKHIZv#W<7m!? 
z<8K)Wjk_XX1HDdK-6p$fRTQvH95Gmi+*$B$wihkvwrR^yG29)6twH({7ZBSm2Tp{F z4nELocmXqN638ggkZ+(GMGVCyEhbNQ4mbnkp6#vS9MTO~ig<_fgdxQyG67B}9=RDO z-?`pBo(*1@pD)kPU75RZ^TQi#Pm(YDA{umpGIwApb{ob!am}A^Rcdpw{Q5{B!FH4E z3oao`q~IYBWhwkR&N^VkUO_DbTTYR;q=MEm+l35Tjt@3MU6mI|k4TzqD3tA_5B|Gp zS#Jf{*9g$BxVYAI%{wiq6_^@3s&=~ENxClJvexi<%N1wowivi*&2(1pKgOW`jK|38 zBY4JCn;kH8<9{=$%>U3%n#~&7)Sy00SPW`0gZH43GsdtJk2$92^ju~f*pC?#`lO!G zC$J}r?=a|P4UF~S)PdAk3UGREAWN+=q7Py`sAmjNAS$}4o-FVZ)e*zgjX#>8jTUbP z7)OB7#OjznnHm9>5wr%r6Z(j8BsFXp`Zz}Pj_4Wlm_C~tS9@}Q%%)65i+1&%3Ggti r)7nAsdQd+C-g9`Zeye5-%!0d@)ao#;8W#8)2S0R@tbPE`VLbl_#VpU) literal 0 HcmV?d00001 diff --git a/tests/duplicate-behavior.test.js b/tests/duplicate-behavior.test.js index 031f663..9f0662b 100644 --- a/tests/duplicate-behavior.test.js +++ b/tests/duplicate-behavior.test.js @@ -5,8 +5,9 @@ * update or merge behaviors to duplicates across navigation boundaries. */ +import 'fake-indexeddb/auto'; + let Dexie; -require('fake-indexeddb/auto'); // Mock browser extension APIs global.browser = { diff --git a/tests/jest.config.js b/tests/jest.config.cjs similarity index 64% rename from tests/jest.config.js rename to tests/jest.config.cjs index 7dd5b02..ea72b10 100644 --- a/tests/jest.config.js +++ b/tests/jest.config.cjs @@ -3,6 +3,7 @@ module.exports = { testMatch: ['**/*.test.js'], transform: {}, moduleFileExtensions: ['js', 'json'], - collectCoverageFrom: ['duplicate-behavior.test.js'], + collectCoverageFrom: ['*.test.js'], + setupFiles: ['/setup-globals.cjs'], verbose: true }; diff --git a/tests/map_item.test.js b/tests/map_item.test.js new file mode 100644 index 0000000..9dee6e8 --- /dev/null +++ b/tests/map_item.test.js @@ -0,0 +1,130 @@ +/** + * Auto-discovery test driver for module `map_item` functions. + * + * Convention: + * tests/fixtures//*.ndjson + * + * matches a file in modules/ (e.g. "tiktok" maps to modules/tiktok.js). + * Each .ndjson line is one Zeeschuimer-stored item exported from the popup. 
+ * + * Each item is wrapped via wrap_for_map_item to mirror how 4CAT's importer + * presents items to a map_item function, then run through the module's + * map_item. Tests assert: function returns a non-null object, and any fields + * listed in REQUIRED_NON_EMPTY for that module are present and non-empty. + */ + +import { readdirSync, readFileSync, statSync, existsSync } from 'node:fs'; +import { spawnSync } from 'node:child_process'; +import { join, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; + +/** + * Local mirror of wrap_for_map_item from js/lib.js. + * + * lib.js is loaded by the browser as a plain script (it defines globals + * like traverse_data, MappedItem, wrap_for_map_item) and so cannot be + * imported from Node. The wrap is three trivial lines with no dependencies + * — duplicating it here is cheaper than restructuring lib.js into a module. + * If lib.js's wrap_for_map_item ever gains real logic, this needs to track. + */ +function wrap_for_map_item(stored_item) { + const { data, ...meta } = stored_item; + return { ...data, __import_meta: meta }; +} + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const FIXTURE_ROOT = join(__dirname, 'fixtures'); +const MODULES_ROOT = join(__dirname, '..', 'modules'); + +/** + * Pre-validate module syntax before dynamic import. + * + * `await import()` on a module with a syntax error throws inside V8's module + * linker in a way Jest's experimental-vm-modules can't always recover from + * (worker retry loop or Node process exit). Running `node --check` first + * gives us a clean error string we can fail the test with. 
+ */ +function check_module_syntax(module_name) { + const module_path = join(MODULES_ROOT, `${module_name}.js`); + const result = spawnSync(process.execPath, ['--check', module_path], { + encoding: 'utf8', + }); + if (result.status === 0) return null; + return (result.stderr || result.stdout || `exit code ${result.status}`).trim(); +} + +const REQUIRED_NON_EMPTY = { + tiktok: ['id', 'author', 'unix_timestamp'], +}; + +function list_module_dirs() { + if (!existsSync(FIXTURE_ROOT)) return []; + return readdirSync(FIXTURE_ROOT).filter(name => { + try { return statSync(join(FIXTURE_ROOT, name)).isDirectory(); } + catch { return false; } + }); +} + +const module_dirs = list_module_dirs(); +let total_fixtures = 0; + +for (const module_name of module_dirs) { + const fixture_dir = join(FIXTURE_ROOT, module_name); + const fixture_files = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson')); + + if (fixture_files.length === 0) continue; + total_fixtures += fixture_files.length; + + describe(`map_item: ${module_name}`, () => { + let map_item; + let import_error; + + beforeAll(async () => { + const syntax_error = check_module_syntax(module_name); + if (syntax_error) { + import_error = new Error(`syntax error:\n${syntax_error}`); + return; + } + try { + const mod = await import(`../modules/${module_name}.js`); + map_item = mod.map_item; + if (typeof map_item !== 'function') { + import_error = new Error(`modules/${module_name}.js does not export a map_item function`); + } + } catch (e) { + import_error = e; + } + }); + + for (const fixture_file of fixture_files) { + const lines = readFileSync(join(fixture_dir, fixture_file), 'utf8') + .split('\n') + .filter(line => line.trim().length > 0); + + describe(fixture_file, () => { + lines.forEach((line, i) => { + test(`item ${i} maps without throwing`, () => { + if (import_error) { + throw new Error(`failed to import modules/${module_name}.js: ${import_error.message}`); + } + const stored_item = JSON.parse(line); + const mapped 
= map_item(wrap_for_map_item(stored_item)); + expect(mapped).not.toBeNull(); + expect(typeof mapped).toBe('object'); + for (const field of REQUIRED_NON_EMPTY[module_name] ?? []) { + expect(mapped[field]).toBeDefined(); + expect(mapped[field]).not.toBe(''); + expect(mapped[field]).not.toBeNull(); + } + }); + }); + }); + } + }); +} + +if (total_fixtures === 0) { + describe('map_item', () => { + test.skip('no fixtures found under tests/fixtures//*.ndjson', () => {}); + }); +} diff --git a/tests/package.json b/tests/package.json index dc3654c..6dd35fb 100644 --- a/tests/package.json +++ b/tests/package.json @@ -2,9 +2,10 @@ "name": "zeeschuimer-db-tests", "version": "1.0.0", "description": "Unit tests for Zeeschuimer duplicate handling logic", + "type": "module", "scripts": { - "test": "jest", - "test:watch": "jest --watch" + "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js", + "test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --watch" }, "devDependencies": { "dexie": "^3.2.4", diff --git a/tests/setup-globals.cjs b/tests/setup-globals.cjs new file mode 100644 index 0000000..a19fb09 --- /dev/null +++ b/tests/setup-globals.cjs @@ -0,0 +1,41 @@ +/** + * Make js/lib.js's helpers available as globals inside the Jest test + * environment, mirroring how the browser sees them after the manifest + * loads lib.js as a plain script. + * + * map_item bodies reference these as free identifiers (MappedItem, + * MissingMappedField, strip_tags, normalize_url_encoding, ...). Without this + * shim they'd hit ReferenceError as soon as a test invokes map_item. + * + * Approach: read lib.js, wrap it in a new Function() body that returns the + * named helpers, call the function, and assign the returned object onto + * globalThis. (Earlier attempt with vm.runInThisContext failed because in + * the jsdom env the vm context's global differs from jsdom's window.) + * + * If a new helper is added to lib.js, append its name to EXPOSED_NAMES. 
+ */ + +const fs = require('node:fs'); +const path = require('node:path'); + +const EXPOSED_NAMES = [ + 'traverse_data', + 'MappedItem', + 'MissingMappedField', + 'wrap_for_map_item', + 'strip_tags', + 'normalize_url_encoding', + 'formatUtcTimestamp', +]; + +const lib_source = fs.readFileSync( + path.join(__dirname, '..', 'js', 'lib.js'), + 'utf8', +); + +const factory = new Function(` +${lib_source} +return { ${EXPOSED_NAMES.join(', ')} }; +`); + +Object.assign(globalThis, factory()); From 46b96c77ffd45f465f90880915e1f6d2836bd87e Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 6 May 2026 16:25:56 +0200 Subject: [PATCH 08/41] add fixtures folder and README.md to explain what I did --- tests/fixtures/.gitignore | 5 +++++ tests/fixtures/README.md | 29 +++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 tests/fixtures/.gitignore create mode 100644 tests/fixtures/README.md diff --git a/tests/fixtures/.gitignore b/tests/fixtures/.gitignore new file mode 100644 index 0000000..8e89a83 --- /dev/null +++ b/tests/fixtures/.gitignore @@ -0,0 +1,5 @@ +# Ignore everything in this directory +* +# Except these files +!.gitignore +!README.md \ No newline at end of file diff --git a/tests/fixtures/README.md b/tests/fixtures/README.md new file mode 100644 index 0000000..d24fe06 --- /dev/null +++ b/tests/fixtures/README.md @@ -0,0 +1,29 @@ +# Test fixtures for `map_item` + +Real captured items used to exercise each module's auto-generated `map_item` +function. + +## Layout + +``` +tests/fixtures/ + / + .ndjson + .ndjson +``` + +`` matches the filename in `modules/` without `.js` — +e.g. `tiktok/` → `modules/tiktok.js`, `pinterest/` → `modules/pinterest.js`. +You can drop multiple `.ndjson` files in a module folder; each gets its own +`describe` block and each line becomes its own `test`. + +Filenames are free-form — the auto-export filename from the popup +(`zeeschuimer-export--.ndjson`) is fine. 
+ +## Privacy / committing + +These files contain real captured platform data — usernames, post +content, URLs, sometimes images and other PII. + +If we want to create test exports or anonymize real exports, add them to +.gitignore. \ No newline at end of file From 487b5b618e4a989cbfca7dbfe2b30b1e78dc62ad Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 7 May 2026 15:53:22 +0200 Subject: [PATCH 09/41] add MapItemException --- js/lib.js | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/js/lib.js b/js/lib.js index 3b144d2..e38430e 100644 --- a/js/lib.js +++ b/js/lib.js @@ -59,6 +59,19 @@ class MissingMappedField { } } +/** + * Raised by `map_item` to signal a known mapping failure. + * + * Mirrors 4CAT's MapItemException: callers should catch it, skip the item, + * and warn the user that the platform's format may have shifted. + */ +class MapItemException extends Error { + constructor(message) { + super(message); + this.name = "MapItemException"; + } +} + /** * Wrap a Zeeschuimer stored item to match the shape a 4CAT map_item expects. 
* From b6f487dbfa017a79207726f04f059078aaf4c4b5 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 7 May 2026 15:56:14 +0200 Subject: [PATCH 10/41] make a warning pop up --- popup/interface.html | 42 ++++++++++++++++++++++++++++++ popup/interface.js | 62 +++++++++++++++++++++++++++++++++++++++----- 2 files changed, 97 insertions(+), 7 deletions(-) diff --git a/popup/interface.html b/popup/interface.html index e9d9b3f..0570e40 100644 --- a/popup/interface.html +++ b/popup/interface.html @@ -303,6 +303,42 @@ text-align: center; } + #csv-warning { + position: fixed; + inset: 0; + background: rgba(60, 60, 59, 0.55); + display: flex; + align-items: center; + justify-content: center; + z-index: 1000; + } + + #csv-warning[hidden] { + display: none; + } + + #csv-warning .csv-warning-content { + background: var(--accent); + color: var(--neutral-contrast); + border: 2px solid var(--accent-alt); + border-radius: 6px; + padding: 1.25em 1.25em 1em 1.25em; + max-width: 24em; + text-align: center; + box-shadow: 0 0 20px var(--neutral-contrast); + } + + #csv-warning .csv-warning-content p { + margin: 0 0 1em 0; + line-height: 1.4; + } + + #csv-warning .dismiss-csv-warning { + display: block; + margin: 0 auto; + padding: 0.3em 1.25em; + } + .tooltippable:not(a):not(button) { display: inline-block; background: var(--neutral-contrast); @@ -409,6 +445,12 @@ +

Zeeschuimer

diff --git a/popup/interface.js b/popup/interface.js index 3b8aaa9..c56375a 100644 --- a/popup/interface.js +++ b/popup/interface.js @@ -351,16 +351,29 @@ async function button_handler(event) { } else if (event.target.matches('.reset-all')) { await background.db.items.clear(); + } else if (event.target.matches('.dismiss-csv-warning')) { + const warning = document.getElementById('csv-warning'); + if(warning) warning.hidden = true; + } else if (event.target.matches('.download-format')) { const format = event.target.getAttribute('data-format'); - const blobber = format === 'csv' ? get_csv_blob : get_ndjson_blob; const extension = format; let platform = event.target.getAttribute('data-platform'); let date = new Date(); event.target.classList.add('loading'); - let blob = await blobber(platform); + let blob; + if(format === 'csv') { + const result = await get_csv_blob(platform); + blob = result.blob; + if(result.skipped > 0) { + console.warn(`Zeeschuimer: skipped ${result.skipped} ${platform} item(s) during CSV export. First reason: ${result.firstReason}`); + show_csv_warning(platform, result.skipped); + } + } else { + blob = await get_ndjson_blob(platform); + } let filename = 'zeeschuimer-export-' + platform + '-' + date.toISOString().split(".")[0].replace(/:/g, "") + '.' + extension; const downloadUrl = window.URL.createObjectURL(blob); const downloadId = await browser.downloads.download({ @@ -637,27 +650,62 @@ function csv_escape(value) { return value; } +/** + * Surface a CSV-export skip warning in the popup. + * + * Shown when the platform's `map_item` raised MapItemException for one or + * more items — typically the platform's response shape has shifted and the + * mapper no longer recognises every field. The user is steered to the + * .ndjson export, which is unaffected because it skips the mapper entirely. 
+ */ +function show_csv_warning(platform, skipped) { + const warning = document.getElementById('csv-warning'); + if(!warning) return; + const message = warning.querySelector('p'); + message.innerText = `Skipped ${skipped} ${platform} item${skipped === 1 ? '' : 's'} in the CSV export — the platform's data format may have changed. Use the .ndjson export to get the full dataset until Zeeschuimer is updated.`; + warning.hidden = false; +} + /** * Get a CSV dump of items * * Returns a Blob with all items in it as CSV rows, mapped via the module's * registered mapper function. A header row is included. * + * Items whose mapper raises MapItemException are skipped and counted; any + * other error propagates. Skip count and the first skip reason are returned + * alongside the blob so the caller can warn the user. Just like 4CAT! + * * @param platform - * @returns {Promise} + * @returns {Promise<{blob: Blob, skipped: number, firstReason: string|null}>} */ async function get_csv_blob(platform) { let csv = []; + let skipped = 0; + let firstReason = null; const module = background.zeeschuimer.modules[platform]; await iterate_items(platform, function(item) { - item = module.mapper(item); + let mapped; + try { + mapped = module.mapper(item); + } catch(e) { + // More JS fun: Check tag rather than `instanceof`. + // Actual Exception lives in some other realm (where modules and lib.js live), and cross-realm + // `instanceof` is unreliable under Firefox's wrappers. 
+ if(e && e.name === 'MapItemException') { + skipped++; + if(firstReason === null) firstReason = e.message; + return; + } + throw e; + } if(csv.length === 0) { - csv.push(Object.keys(item).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n"); + csv.push(Object.keys(mapped).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n"); } - csv.push(Object.values(item).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n"); + csv.push(Object.values(mapped).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n"); }) - return new Blob(csv, {type: 'text/csv'}); + return {blob: new Blob(csv, {type: 'text/csv'}), skipped, firstReason}; } /** From f28e310c8893bb49ac535d33cc94089e8d0686b2 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 7 May 2026 16:42:19 +0200 Subject: [PATCH 11/41] add MapItemException --- tests/setup-globals.cjs | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/setup-globals.cjs b/tests/setup-globals.cjs index a19fb09..4f54e34 100644 --- a/tests/setup-globals.cjs +++ b/tests/setup-globals.cjs @@ -22,6 +22,7 @@ const EXPOSED_NAMES = [ 'traverse_data', 'MappedItem', 'MissingMappedField', + 'MapItemException', 'wrap_for_map_item', 'strip_tags', 'normalize_url_encoding', From 5baff31ae49167d215a56cf16ead326b22d975f3 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 15:16:06 +0200 Subject: [PATCH 12/41] add env variables for tests (to connect to 4CAT) --- .gitignore | 2 ++ tests/.env.example | 9 +++++++++ tests/package-lock.json | 14 ++++++++++++++ tests/package.json | 4 +++- 4 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 tests/.env.example diff --git a/.gitignore b/.gitignore index 6cf9326..fea65f3 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ # Testing artefacts .temp-profile +tests/.env +tests/.env.local # logs geckodriver.log diff --git a/tests/.env.example b/tests/.env.example new file mode 100644 index 0000000..2e021bb --- /dev/null +++ b/tests/.env.example @@ -0,0 +1,9 @@ +# 4CAT API config for the map_item 
comparison tests. +# Copy this file to .env in this directory and fill in real values. +# .env is gitignored; .env.example is the committed template. + +# Base URL of the 4CAT instance to hit. No trailing slash. +FOURCAT_URL=http://localhost + +# API key for that 4CAT instance. Get one from the 4CAT UI; tied to your user. +FOURCAT_API_KEY=your-api-key-here diff --git a/tests/package-lock.json b/tests/package-lock.json index cc8f457..d055883 100644 --- a/tests/package-lock.json +++ b/tests/package-lock.json @@ -9,6 +9,7 @@ "version": "1.0.0", "devDependencies": { "dexie": "^3.2.4", + "dotenv": "^16.4.5", "fake-indexeddb": "^5.0.1", "jest": "^29.7.0", "jest-environment-jsdom": "^29.7.0" @@ -1758,6 +1759,19 @@ "node": ">=12" } }, + "node_modules/dotenv": { + "version": "16.6.1", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.6.1.tgz", + "integrity": "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow==", + "dev": true, + "license": "BSD-2-Clause", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://dotenvx.com" + } + }, "node_modules/dunder-proto": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", diff --git a/tests/package.json b/tests/package.json index 6dd35fb..333564a 100644 --- a/tests/package.json +++ b/tests/package.json @@ -5,10 +5,12 @@ "type": "module", "scripts": { "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js", - "test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --watch" + "test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --watch", + "probe": "node probe-4cat.mjs" }, "devDependencies": { "dexie": "^3.2.4", + "dotenv": "^16.4.5", "fake-indexeddb": "^5.0.1", "jest": "^29.7.0", "jest-environment-jsdom": "^29.7.0" From 6a8ce3870f4e0b6c050d68573d8affa4cc46e37b Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 15:16:34 +0200 Subject: [PATCH 13/41] 
mirror 4CAT API missing value --- js/lib.js | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/js/lib.js b/js/lib.js index e38430e..c618a6a 100644 --- a/js/lib.js +++ b/js/lib.js @@ -57,6 +57,12 @@ class MissingMappedField { toString() { return `${this.value}`; } + + // Mirror 4CAT's API serialization so JSON.stringify produces the same + // tagged form on both sides. See docs/4cat-map-item-api.md. + toJSON() { + return { __missing: true, value: this.value }; + } } /** From 0c3140376ebd6e37cb1706fc48a105168d84d089 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 18:41:52 +0200 Subject: [PATCH 14/41] test the 4cat API endpoint --- tests/probe-4cat.mjs | 140 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 tests/probe-4cat.mjs diff --git a/tests/probe-4cat.mjs b/tests/probe-4cat.mjs new file mode 100644 index 0000000..0bf4e4d --- /dev/null +++ b/tests/probe-4cat.mjs @@ -0,0 +1,140 @@ +/** + * Manually exercise 4CAT's /api/map-item/ endpoint against a fixture item. + * + * Usage: + * node probe-4cat.mjs [] [--index N] + * + * is the Zeeschuimer module filename without `.js` (e.g. + * "tiktok", "pinterest"). If is omitted, the first + * .ndjson in tests/fixtures// is used. --index selects which + * line of the fixture to send (default 0). + * + * Requires tests/.env with FOURCAT_URL and FOURCAT_API_KEY. 
+ */ + +import 'dotenv/config'; +import { readFileSync, existsSync, readdirSync } from 'node:fs'; +import { join, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); + +const FOURCAT_URL = process.env.FOURCAT_URL?.replace(/\/$/, ''); +const FOURCAT_API_KEY = process.env.FOURCAT_API_KEY; + +if (!FOURCAT_URL || !FOURCAT_API_KEY || FOURCAT_API_KEY === 'your-api-key-here') { + console.error('error: FOURCAT_URL and FOURCAT_API_KEY must be set in tests/.env'); + console.error(' (copy tests/.env.example to tests/.env and fill in real values)'); + process.exit(1); +} + +const ID_MAP_PATH = join(__dirname, 'zeeschuimer-to-4cat.json'); +const ID_MAP = existsSync(ID_MAP_PATH) + ? JSON.parse(readFileSync(ID_MAP_PATH, 'utf8')) + : {}; + +function auth_headers() { + return { 'Authorization': `${FOURCAT_API_KEY}` }; +} + +async function list_datasources() { + const res = await fetch(`${FOURCAT_URL}/api/datasources/`, { headers: auth_headers() }); + if (!res.ok) { + throw new Error(`GET /api/datasources/ → ${res.status}: ${await res.text()}`); + } + const body = await res.json(); + return body.datasources ?? 
[]; +} + +async function map_item(datasource_id, item) { + const res = await fetch(`${FOURCAT_URL}/api/map-item/${datasource_id}/`, { + method: 'POST', + headers: { ...auth_headers(), 'Content-Type': 'application/json' }, + body: JSON.stringify({ item }), + }); + const text = await res.text(); + let body; + try { body = JSON.parse(text); } catch { body = { raw: text }; } + return { status_code: res.status, body }; +} + +function parse_args(argv) { + const args = { module: null, fixture: null, index: 0 }; + const positional = []; + for (let i = 2; i < argv.length; i++) { + if (argv[i] === '--index') { + args.index = parseInt(argv[++i], 10); + } else if (argv[i].startsWith('--index=')) { + args.index = parseInt(argv[i].split('=')[1], 10); + } else { + positional.push(argv[i]); + } + } + args.module = positional[0]; + args.fixture = positional[1]; + return args; +} + +async function main() { + const args = parse_args(process.argv); + if (!args.module) { + console.error('Usage: node probe-4cat.mjs [] [--index N]'); + process.exit(1); + } + + const datasource_id = ID_MAP[args.module] ?? args.module; + const fixture_dir = join(__dirname, 'fixtures', args.module); + + if (!existsSync(fixture_dir)) { + console.error(`error: no fixture dir at ${fixture_dir}`); + process.exit(1); + } + + const candidates = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson')); + if (candidates.length === 0) { + console.error(`error: no .ndjson fixtures under ${fixture_dir}`); + process.exit(1); + } + const fixture_name = args.fixture ?? 
candidates[0]; + const fixture_path = join(fixture_dir, fixture_name); + if (!existsSync(fixture_path)) { + console.error(`error: fixture ${fixture_path} not found`); + process.exit(1); + } + + const lines = readFileSync(fixture_path, 'utf8').split('\n').filter(l => l.trim().length > 0); + if (args.index >= lines.length) { + console.error(`error: --index ${args.index} but fixture has ${lines.length} items`); + process.exit(1); + } + const item = JSON.parse(lines[args.index]); + + console.log(`Module: ${args.module}`); + console.log(`Datasource id: ${datasource_id}${ID_MAP[args.module] ? ' (mapped via zeeschuimer-to-4cat.json)' : ''}`); + console.log(`URL: ${FOURCAT_URL}/api/map-item/${datasource_id}/`); + console.log(`Fixture: ${fixture_name}, item ${args.index} (item_id=${item.item_id ?? item.id})`); + console.log(''); + + const { status_code, body } = await map_item(datasource_id, item); + console.log(`HTTP ${status_code}`); + console.log(JSON.stringify(body, null, 2)); + + if (status_code === 404) { + console.error(''); + console.error('Hint: datasource id may be wrong. 
Available Zeeschuimer-origin datasources:'); + try { + const datasources = await list_datasources(); + datasources + .filter(d => d.is_from_zeeschuimer && d.has_map_item) + .forEach(d => console.error(` - ${d.id} (${d.name})`)); + } catch (e) { + console.error(` (couldn't fetch list: ${e.message})`); + } + process.exit(2); + } +} + +main().catch(e => { + console.error(`probe failed: ${e.message}`); + process.exit(2); +}); From be2f3087d8dd5af07175101a808903604c84d78b Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 18:43:04 +0200 Subject: [PATCH 15/41] update docs and packages --- docs/test-plan.md | 6 +++--- tests/package-lock.json | 13 ++++++++++++- tests/setup-globals.cjs | 11 +++++++++++ 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/docs/test-plan.md b/docs/test-plan.md index 249a7e0..a4265eb 100644 --- a/docs/test-plan.md +++ b/docs/test-plan.md @@ -63,7 +63,7 @@ Phase 3 — 4CAT integration (optional) - Problem: mapping tests live in 4CAT and need NDJSON input. - Changes: - Add CLI flags: `--4cat-url URL` and `--4cat-key KEY` (API key). Require both for upload. - - After writing the per-platform NDJSON, POST it to `{4cat_url.rstrip('/')}/api/import-dataset/` with header `X-Zeeschuimer-Platform: {platform}` and `Authorization: Bearer {key}` (confirm header with your 4CAT instance; alternative is to trigger the extension UI upload button when cookie-based auth is required). + - After writing the per-platform NDJSON, POST it to `{4cat_url.rstrip('/')}/api/import-dataset/` with header `X-Zeeschuimer-Platform: {platform}` and `Authorization: {key}` (confirm header with your 4CAT instance; alternative is to trigger the extension UI upload button when cookie-based auth is required). - Do not fail the test run on 4CAT errors — print status and continue. 
Example upload with `requests`: @@ -73,7 +73,7 @@ import requests with open(ndjson_path, 'rb') as f: headers = { 'X-Zeeschuimer-Platform': platform, - 'Authorization': f'Bearer {fourcat_key}' + 'Authorization': f'{fourcat_key}' } r = requests.post(f"{fourcat_url.rstrip('/')}/api/import-dataset/", headers=headers, data=f) # check r.status_code and r.text for details @@ -149,7 +149,7 @@ Estimated effort: 6–10 hours of focused work to implement and test everything Open questions / confirmations needed -- Confirm 4CAT API key header format (currently suggested: `Authorization: Bearer {key}`). If your 4CAT requires cookie-based auth, we should emulate the extension upload button via Selenium instead. +- Confirm 4CAT API key header format (currently suggested: `Authorization: {key}`). If your 4CAT requires cookie-based auth, we should emulate the extension upload button via Selenium instead. - Confirm desired default for interactive mode. (Current recommendation: pause once per platform by default; provide `--no-interactive` to run fully headless.) 
Next steps diff --git a/tests/package-lock.json b/tests/package-lock.json index d055883..7758e9f 100644 --- a/tests/package-lock.json +++ b/tests/package-lock.json @@ -12,7 +12,8 @@ "dotenv": "^16.4.5", "fake-indexeddb": "^5.0.1", "jest": "^29.7.0", - "jest-environment-jsdom": "^29.7.0" + "jest-environment-jsdom": "^29.7.0", + "undici": "^6.20.0" } }, "node_modules/@babel/code-frame": { @@ -4197,6 +4198,16 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/undici": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/undici/-/undici-6.26.0.tgz", + "integrity": "sha512-4yqz8a3n5HmGTlsbADNtr/dJlhkh/55Rq798G6ibiULcXbDtaLpTl1pvdqcbFfeoj3iSi52lePFM7h9H21cw/A==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=18.17" + } + }, "node_modules/undici-types": { "version": "7.16.0", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz", diff --git a/tests/setup-globals.cjs b/tests/setup-globals.cjs index 4f54e34..6793cc0 100644 --- a/tests/setup-globals.cjs +++ b/tests/setup-globals.cjs @@ -40,3 +40,14 @@ return { ${EXPOSED_NAMES.join(', ')} }; `); Object.assign(globalThis, factory()); + +// jsdom doesn't expose fetch and Jest's jsdom env shadows Node's global +// fetch, so the comparator can't hit 4CAT without help. Polyfill from +// undici (a Node-friendly HTTP client, separately installable on npm — +// distinct from the undici bundled internally by Node, which isn't +// require()-able by name). +// Note: tests that use fetch (e.g. map_item_compare.test.js) declare +// `@jest-environment node` at the top of the file. Node env has fetch +// natively. Don't try to polyfill into jsdom — undici's internals use +// Node-specific globals that jsdom shadows (clearImmediate, +// markResourceTiming, fast timers), and polyfilling them all is brittle. 
From caf1c7f48a19524282c06b688c08001e534791db Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 18:43:17 +0200 Subject: [PATCH 16/41] some mapping for odd datasource names --- tests/zeeschuimer-to-4cat.json | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 tests/zeeschuimer-to-4cat.json diff --git a/tests/zeeschuimer-to-4cat.json b/tests/zeeschuimer-to-4cat.json new file mode 100644 index 0000000..f7de942 --- /dev/null +++ b/tests/zeeschuimer-to-4cat.json @@ -0,0 +1,7 @@ +{ + "_comment": "Maps Zeeschuimer module filenames (without .js) to 4CAT datasource ids when they differ. Default behavior is identity — only include entries where the two diverge. Discovered via http://localhost/api/datasources/.", + "9gag": "ninegag", + "truth": "truthsocial", + "rednote": "xiaohongshu", + "rednote-comments": "xiaohongshu-comments" +} From f10fc492845051c87b96b75561eb91de2af99d18 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 18:44:05 +0200 Subject: [PATCH 17/41] update existing map_item tests and add helper --- tests/_module-info.js | 45 ++++++++++++++++++ tests/map_item.test.js | 105 +++++++++++++++++++---------------------- 2 files changed, 93 insertions(+), 57 deletions(-) create mode 100644 tests/_module-info.js diff --git a/tests/_module-info.js b/tests/_module-info.js new file mode 100644 index 0000000..e261e4e --- /dev/null +++ b/tests/_module-info.js @@ -0,0 +1,45 @@ +/** + * Shared helper for the map_item test drivers. + * + * Pre-validates a module by: + * 1. Running `node --check` on its file (syntax check; avoids the + * worker-killing experimental-ESM crash when a syntax error reaches + * the dynamic importer). + * 2. Dynamically importing it and checking for a `map_item` export. 
+ * + * Returns one of four states the test driver can branch on: + * { state: 'ok', map_item: <function> } + * { state: 'no_map_item' } + * { state: 'syntax_error', error: <string> } + * { state: 'import_error', error: <Error> } + */ + +import { spawnSync } from 'node:child_process'; +import { join, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const MODULES_ROOT = join(__dirname, '..', 'modules'); + +function check_module_syntax(module_name) { + const module_path = join(MODULES_ROOT, `${module_name}.js`); + const result = spawnSync(process.execPath, ['--check', module_path], { encoding: 'utf8' }); + if (result.status === 0) return null; + return (result.stderr || result.stdout || `exit code ${result.status}`).trim(); +} + +export async function inspect_module(module_name) { + const syntax_error = check_module_syntax(module_name); + if (syntax_error) { + return { state: 'syntax_error', error: syntax_error }; + } + try { + const mod = await import(`../modules/${module_name}.js`); + if (typeof mod.map_item !== 'function') { + return { state: 'no_map_item' }; + } + return { state: 'ok', map_item: mod.map_item }; + } catch (e) { + return { state: 'import_error', error: e }; + } +} diff --git a/tests/map_item.test.js b/tests/map_item.test.js index 9dee6e8..2dc1bb6 100644 --- a/tests/map_item.test.js +++ b/tests/map_item.test.js @@ -1,5 +1,5 @@ /** - * Auto-discovery test driver for module `map_item` functions. + * Smoke test driver for module `map_item` functions. * * Convention: * tests/fixtures/<module>/*.ndjson @@ -11,52 +11,36 @@ * presents items to a map_item function, then run through the module's * map_item. Tests assert: function returns a non-null object, and any fields * listed in REQUIRED_NON_EMPTY for that module are present and non-empty.
+ * + * Module-level state is determined upfront by inspect_module(): + * - 'ok' → register per-item tests + * - 'no_map_item' → register a single skipped test (not applicable) + * - 'syntax_error' → register a single failing test pointing at the line + * - 'import_error' → register a single failing test with the message */ import { readdirSync, readFileSync, statSync, existsSync } from 'node:fs'; -import { spawnSync } from 'node:child_process'; import { join, dirname } from 'node:path'; import { fileURLToPath } from 'node:url'; - -/** - * Local mirror of wrap_for_map_item from js/lib.js. - * - * lib.js is loaded by the browser as a plain script (it defines globals - * like traverse_data, MappedItem, wrap_for_map_item) and so cannot be - * imported from Node. The wrap is three trivial lines with no dependencies - * — duplicating it here is cheaper than restructuring lib.js into a module. - * If lib.js's wrap_for_map_item ever gains real logic, this needs to track. - */ -function wrap_for_map_item(stored_item) { - const { data, ...meta } = stored_item; - return { ...data, __import_meta: meta }; -} +import { inspect_module } from './_module-info.js'; const __dirname = dirname(fileURLToPath(import.meta.url)); const FIXTURE_ROOT = join(__dirname, 'fixtures'); -const MODULES_ROOT = join(__dirname, '..', 'modules'); - -/** - * Pre-validate module syntax before dynamic import. - * - * `await import()` on a module with a syntax error throws inside V8's module - * linker in a way Jest's experimental-vm-modules can't always recover from - * (worker retry loop or Node process exit). Running `node --check` first - * gives us a clean error string we can fail the test with. 
- */ -function check_module_syntax(module_name) { - const module_path = join(MODULES_ROOT, `${module_name}.js`); - const result = spawnSync(process.execPath, ['--check', module_path], { - encoding: 'utf8', - }); - if (result.status === 0) return null; - return (result.stderr || result.stdout || `exit code ${result.status}`).trim(); -} const REQUIRED_NON_EMPTY = { tiktok: ['id', 'author', 'unix_timestamp'], }; +/** + * Local mirror of wrap_for_map_item from js/lib.js. lib.js is loaded by + * the browser as a plain script and so cannot be imported from Node; this + * three-line mirror is cheaper than restructuring lib.js into a module. + */ +function wrap_for_map_item(stored_item) { + const { data, ...meta } = stored_item; + return { ...data, __import_meta: meta }; +} + function list_module_dirs() { if (!existsSync(FIXTURE_ROOT)) return []; return readdirSync(FIXTURE_ROOT).filter(name => { @@ -66,36 +50,46 @@ function list_module_dirs() { } const module_dirs = list_module_dirs(); + +// Pre-pass: synchronously determine each module's state so we can branch +// on it at describe/test registration time. Top-level await is supported +// in Jest's experimental-vm-modules mode. 
+const module_info = {}; +for (const module_name of module_dirs) { + module_info[module_name] = await inspect_module(module_name); +} + let total_fixtures = 0; for (const module_name of module_dirs) { const fixture_dir = join(FIXTURE_ROOT, module_name); const fixture_files = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson')); - if (fixture_files.length === 0) continue; total_fixtures += fixture_files.length; - describe(`map_item: ${module_name}`, () => { - let map_item; - let import_error; - - beforeAll(async () => { - const syntax_error = check_module_syntax(module_name); - if (syntax_error) { - import_error = new Error(`syntax error:\n${syntax_error}`); - return; - } - try { - const mod = await import(`../modules/${module_name}.js`); - map_item = mod.map_item; - if (typeof map_item !== 'function') { - import_error = new Error(`modules/${module_name}.js does not export a map_item function`); - } - } catch (e) { - import_error = e; - } + const info = module_info[module_name]; + + if (info.state === 'no_map_item') { + describe(`map_item: ${module_name}`, () => { + test.skip(`modules/${module_name}.js does not export a map_item function — nothing to smoke test`, () => {}); + }); + continue; + } + + if (info.state === 'syntax_error' || info.state === 'import_error') { + const msg = info.state === 'syntax_error' + ? 
`syntax error:\n${info.error}` + : `import failed: ${info.error.message}`; + describe(`map_item: ${module_name}`, () => { + test(`module loads`, () => { throw new Error(msg); }); }); + continue; + } + + // state === 'ok' — register per-item tests + const map_item = info.map_item; + describe(`map_item: ${module_name}`, () => { for (const fixture_file of fixture_files) { const lines = readFileSync(join(fixture_dir, fixture_file), 'utf8') .split('\n') @@ -104,9 +98,6 @@ for (const module_name of module_dirs) { describe(fixture_file, () => { lines.forEach((line, i) => { test(`item ${i} maps without throwing`, () => { - if (import_error) { - throw new Error(`failed to import modules/${module_name}.js: ${import_error.message}`); - } const stored_item = JSON.parse(line); const mapped = map_item(wrap_for_map_item(stored_item)); expect(mapped).not.toBeNull(); From 3633cde656da3f70880ae49a2909deba3a044953 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 18:44:23 +0200 Subject: [PATCH 18/41] comparison testing for datasources --- tests/map_item_compare.test.js | 283 +++++++++++++++++++++++++++++++++ 1 file changed, 283 insertions(+) create mode 100644 tests/map_item_compare.test.js diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js new file mode 100644 index 0000000..37e3e4c --- /dev/null +++ b/tests/map_item_compare.test.js @@ -0,0 +1,283 @@ +/** + * @jest-environment node + * + * This file runs in Node test environment (not jsdom) because undici's + * fetch implementation uses Node-internal APIs (`clearImmediate`, + * `markResourceTiming`, fast-now timers, etc.) that jsdom shadows or + * doesn't expose. Polyfilling them into jsdom is whack-a-mole; node env + * has them all natively. + * + * Trade-off: no DOMParser in node env. The four modules that use + * `strip_tags` (gab, pinterest, rednote, truth) will need a DOMParser + * polyfill (e.g. via linkedom) before the comparator can run against + * them. 
Other modules (including instagram) work as-is. + */ +/** + * Compare JS map_item output against 4CAT's Python map_item via the API. + * + * For every line in every fixture, runs the JS map_item locally AND sends + * the same stored item to 4CAT's /api/map-item/<datasource_id>/ endpoint, then + * diffs the two outputs field-by-field. Each item is its own Jest test — + * failures point at exactly which item and which fields diverge. + * + * Skips itself entirely if FOURCAT_URL / FOURCAT_API_KEY aren't set, so + * `npm test` keeps working without 4CAT configuration. Drop real values in + * tests/.env to enable. + * + * Datasource id mapping: tests/zeeschuimer-to-4cat.json (Zeeschuimer + * module filename → 4CAT datasource id, for the few names that diverge). + * + * Module-level state is determined upfront by inspect_module() (no + * map_item / syntax errors / import errors are handled before tests are + * registered, so they appear once per module, not once per item). + */ + +import 'dotenv/config'; +import { jest } from '@jest/globals'; +import { readdirSync, readFileSync, statSync, existsSync } from 'node:fs'; +import { join, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { inspect_module } from './_module-info.js'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); + +const FOURCAT_URL = process.env.FOURCAT_URL?.replace(/\/$/, ''); +const FOURCAT_API_KEY = process.env.FOURCAT_API_KEY; +const HAS_4CAT = Boolean( + FOURCAT_URL && FOURCAT_API_KEY && FOURCAT_API_KEY !== 'your-api-key-here' +); + +// When true (default), once any item in a module fails, subsequent items +// in that same module skip the HTTP + map_item work and fail fast with a +// "halted" message. Saves time when generator output is broken at the top. +// Set FAIL_FAST=0 in env to run all items regardless. +// Trim because cmd.exe's `set FAIL_FAST=0 && ...` includes the trailing +// space in the variable value, which would otherwise defeat `!== '0'`.
+const FAIL_FAST = (process.env.FAIL_FAST ?? '').trim() !== '0'; +const halted_modules = new Set(); + +const FIXTURE_ROOT = join(__dirname, 'fixtures'); +const ID_MAP_PATH = join(__dirname, 'zeeschuimer-to-4cat.json'); +const ID_MAP = existsSync(ID_MAP_PATH) + ? JSON.parse(readFileSync(ID_MAP_PATH, 'utf8')) + : {}; + +function wrap_for_map_item(stored_item) { + const { data, ...meta } = stored_item; + return { ...data, __import_meta: meta }; +} + +async function call_4cat_map_item(datasource_id, item) { + const res = await fetch(`${FOURCAT_URL}/api/map-item/${datasource_id}/`, { + method: 'POST', + headers: { + // 4CAT accepts the raw key without a `Bearer ` prefix, per probe + 'Authorization': FOURCAT_API_KEY, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ item }), + }); + const text = await res.text(); + if (!res.ok) { + throw new Error(`HTTP ${res.status} from 4CAT: ${text}`); + } + return JSON.parse(text); +} + +// Round-trip a value through JSON so MappedItem, MissingMappedField, etc. +// become plain JSON-compatible objects matching what 4CAT emits. +function normalize(value) { + return JSON.parse(JSON.stringify(value)); +} + +// Recursive structural equality. Doesn't care about object key order, which +// matters for nested values like {__missing: true, value: ""} where JS and +// Python might emit keys in different orders. 
+function deep_equal(a, b) { + if (a === b) return true; + if (a === null || b === null) return a === b; + if (typeof a !== typeof b) return false; + if (typeof a !== 'object') return false; + if (Array.isArray(a) !== Array.isArray(b)) return false; + if (Array.isArray(a)) { + if (a.length !== b.length) return false; + return a.every((v, i) => deep_equal(v, b[i])); + } + const a_keys = Object.keys(a); + const b_keys = Object.keys(b); + if (a_keys.length !== b_keys.length) return false; + return a_keys.every(k => k in b && deep_equal(a[k], b[k])); +} + +function diff_objects(js_obj, py_obj) { + const diffs = []; + const keys = new Set([...Object.keys(js_obj ?? {}), ...Object.keys(py_obj ?? {})]); + for (const key of keys) { + const in_js = js_obj && key in js_obj; + const in_py = py_obj && key in py_obj; + if (!in_js) { + diffs.push({ key, kind: 'only_python', python: py_obj[key] }); + } else if (!in_py) { + diffs.push({ key, kind: 'only_js', js: js_obj[key] }); + } else if (!deep_equal(js_obj[key], py_obj[key])) { + diffs.push({ key, kind: 'mismatch', js: js_obj[key], python: py_obj[key] }); + } + } + return diffs; +} + +function format_diffs(diffs) { + return diffs.map(d => { + if (d.kind === 'only_js') { + return ` + only in JS: ${d.key} = ${JSON.stringify(d.js)}`; + } + if (d.kind === 'only_python') { + return ` - only in Python: ${d.key} = ${JSON.stringify(d.python)}`; + } + return ` ~ ${d.key}\n JS: ${JSON.stringify(d.js)}\n Python: ${JSON.stringify(d.python)}`; + }).join('\n'); +} + +// Pull out the first few module-frame lines from an error's stack so the +// failure message points at where in modules/.js the throw happened. +function format_error_with_location(err) { + if (!err) return String(err); + const message = err.message || String(err); + const stack = err.stack || ''; + const module_frames = stack.split('\n') + .filter(l => l.includes('/modules/') || l.includes('\\modules\\')) + .slice(0, 3) + .map(l => l.trim()); + return module_frames.length + ? 
`${message}\n ${module_frames.join('\n ')}` + : message; +} + +function list_module_dirs() { + if (!existsSync(FIXTURE_ROOT)) return []; + return readdirSync(FIXTURE_ROOT).filter(name => { + try { return statSync(join(FIXTURE_ROOT, name)).isDirectory(); } + catch { return false; } + }); +} + +// Per-test timeout: each test does one HTTP round-trip to 4CAT. Jest's +// default 5s is tight under load. +jest.setTimeout(30000); + +if (!HAS_4CAT) { + describe('map_item compare (JS vs 4CAT Python)', () => { + test.skip('FOURCAT_URL / FOURCAT_API_KEY not configured — set them in tests/.env to enable', () => {}); + }); +} else { + const module_dirs = list_module_dirs(); + + // Pre-pass: synchronously determine each module's state so we can branch + // on it at registration time. + const module_info = {}; + for (const module_name of module_dirs) { + module_info[module_name] = await inspect_module(module_name); + } + + let any_fixtures = false; + + for (const module_name of module_dirs) { + const fixture_dir = join(FIXTURE_ROOT, module_name); + const fixture_files = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson')); + if (fixture_files.length === 0) continue; + any_fixtures = true; + + const datasource_id = ID_MAP[module_name] ?? module_name; + const info = module_info[module_name]; + + if (info.state === 'no_map_item') { + // eslint-disable-next-line no-console + console.log(`[compare] skipping ${module_name}: modules/${module_name}.js does not export a map_item`); + continue; + } + + if (info.state === 'syntax_error' || info.state === 'import_error') { + const msg = info.state === 'syntax_error' + ? 
`syntax error:\n${info.error}` + : `import failed: ${info.error.message}`; + describe(`map_item compare: ${module_name}`, () => { + test(`module loads`, () => { throw new Error(msg); }); + }); + continue; + } + + // state === 'ok' — register per-item comparison tests + const map_item = info.map_item; + + describe(`map_item compare: ${module_name} (4CAT id: ${datasource_id})`, () => { + for (const fixture_file of fixture_files) { + const lines = readFileSync(join(fixture_dir, fixture_file), 'utf8') + .split('\n') + .filter(line => line.trim().length > 0); + + describe(fixture_file, () => { + lines.forEach((line, i) => { + test(`item ${i}`, async () => { + if (FAIL_FAST && halted_modules.has(module_name)) { + throw new Error( + '[halted after prior failure in this module — set FAIL_FAST=0 to run all items]' + ); + } + try { + const stored_item = JSON.parse(line); + + // 4CAT side + const response = await call_4cat_map_item(datasource_id, stored_item); + + // JS side + let js_result; + let js_error; + try { + js_result = map_item(wrap_for_map_item(stored_item)); + } catch (e) { + js_error = e; + } + + if (response.status === 'mapped') { + if (js_error) { + throw new Error( + `4CAT mapped this item but JS threw: ${format_error_with_location(js_error)}` + ); + } + const js_obj = normalize(js_result); + const py_obj = normalize(response.item); + const diffs = diff_objects(js_obj, py_obj); + if (diffs.length > 0) { + throw new Error( + `${diffs.length} field(s) differ between JS and 4CAT:\n${format_diffs(diffs)}` + ); + } + } else if (response.status === 'skipped') { + if (!js_error) { + throw new Error( + `4CAT skipped this item ("${response.reason}") but JS produced a result` + ); + } + // Both rejected — good. Skip reasons may differ in wording. 
+ } else if (response.status === 'error') { + throw new Error(`4CAT errored on this item: ${response.message}`); + } else { + throw new Error(`unexpected 4CAT response status: ${JSON.stringify(response)}`); + } + } catch (e) { + if (FAIL_FAST) halted_modules.add(module_name); + throw e; + } + }); + }); + }); + } + }); + } + + if (!any_fixtures) { + describe('map_item compare (JS vs 4CAT Python)', () => { + test.skip('no fixtures under tests/fixtures//*.ndjson', () => {}); + }); + } +} From 7d97a0fe342e3b7f932c79fe22e9b8c6b3c25bb3 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 18:44:35 +0200 Subject: [PATCH 19/41] list common translation errors --- tests/translation-errors.md | 430 ++++++++++++++++++++++++++++++++++++ 1 file changed, 430 insertions(+) create mode 100644 tests/translation-errors.md diff --git a/tests/translation-errors.md b/tests/translation-errors.md new file mode 100644 index 0000000..fcc160d --- /dev/null +++ b/tests/translation-errors.md @@ -0,0 +1,430 @@ +# Auto-generator translation errors + +Patterns of incorrect Python → JavaScript translation observed in +auto-generated `modules/*.js` files. Each entry has a search pattern so +this doc doubles as a checklist when reviewing a new auto-generator PR. + +When an entry is fixed at the generator level (no longer appears in +fresh output), mark it `[fixed]` and keep the entry around — useful +history when something regresses. + +## How to use + +- Found a new pattern? Add an entry below following the template. +- Reviewing a generator PR? `grep` each `Search pattern` against the + changed module files. Anything that hits is worth a manual look. +- Iterating on the generator prompt? The "Why" lines are the + feedback to add — they describe the exact Python-vs-JS semantic + difference the LLM keeps missing. 
+ +## Template + +``` +### + +**Status:** open | fixed in generator | accepted + +**Why it happens:** + +**Wrong JS:** +```js + +``` + +**Correct JS:** +```js + +``` + +**Example:** `modules/.js:` + +**Search pattern:** `` +``` + +--- + +## Observed patterns + +### `in` operator on strings + +**Status:** open + +**Why it happens:** In Python, `"x" in some_string` is a substring check. +In JavaScript, the `in` operator only works on **objects** and checks for +property/key existence; using it with a string on the right-hand side +throws `TypeError: cannot use 'in' operator to search for "x" in `. + +**Wrong JS:** +```js +const is_polaris = '__typename' in item && 'polaris' in item.__typename.toLowerCase(); +``` + +**Correct JS:** +```js +const is_polaris = '__typename' in item && item.__typename.toLowerCase().includes('polaris'); +``` + +**Example:** `modules/instagram.js:513` + +**Search pattern:** `'[^']+' in [a-zA-Z_$][\w$]*\.` — quoted string followed +by `in` followed by a method call. Quick rough check: `grep -E "' in [a-zA-Z]" modules/` + +**Watch out for partial fixes:** seen as `'polaris' in (item.__typename ?? '').toLowerCase()` +— adding `?? ''` guards against `undefined` but the `in` operator itself +still throws on the resulting *string*. The fix is `.includes()`, not just +defaulting the operand. + +--- + +### Python f-string syntax left in single-quoted JS strings + +**Status:** open + +**Why it happens:** Python `f"... {var} ..."` interpolates. JS uses +template literals (backticks) with `${var}`. The auto-generator leaves the +`{var}` notation in a regular single- or double-quoted JS string, which is +just literal text — no interpolation happens. 
+ +**Wrong JS:** +```js +throw new MapItemException('Unable to parse item: different user {user.id} and owner {owner.id}'); +``` + +**Correct JS:** +```js +throw new MapItemException(`Unable to parse item: different user ${user.id} and owner ${owner.id}`); +``` + +**Example:** `modules/instagram.js:754` + +**Search pattern:** `'[^']*\{[a-zA-Z_$][\w$.]*\}[^']*'` or `"[^"]*\{[a-zA-Z_$][\w$.]*\}[^"]*"` +— a non-template-literal string containing `{identifier}` or `{identifier.path}`. +Quick check: `grep -nE "['\"][^'\"]*\{[a-zA-Z_][a-zA-Z0-9_.]*\}[^'\"]*['\"]" modules/` + +--- + +### `?? {}` default that defeats subsequent truthy checks + +**Status:** open + +**Why it happens:** When porting Python's `node.get('user') or {}` (which is +intended to make subsequent code safe to call), the generator emits +`node.user ?? {}`. That's a *valid* Python-equivalent, **but** any following +`if (user && owner) { ... }` guard then never short-circuits because both +`{}` references are truthy. The check ends up reading "if user and owner +*objects* exist" when the intent was "if user and owner data exist." +Subsequent property accesses then compare real ids/usernames against +`undefined` on the missing side, often throwing. + +**Wrong JS:** +```js +const user = node.user ?? {}; +const owner = node.owner ?? {}; +if (user && owner) { + if (user.id === owner.id) { /* … */ } + else if (user.username !== owner.username) { + throw new MapItemException('different user and owner'); + } +} +``` + +**Correct JS** (depending on intent — pick one): +```js +// (a) drop the defaults so truthy guard means "both present" +const user = node.user; +const owner = node.owner; +if (user && owner) { /* compare */ } +``` +```js +// (b) check for actual content, not just object identity +const user = node.user ?? {}; +const owner = node.owner ?? 
{}; +if (Object.keys(user).length && Object.keys(owner).length) { /* compare */ } +``` + +**Example:** `modules/instagram.js:748-756` + +**Search pattern:** `\?\?\s*\{\s*\}` — any `?? {}` occurrence is worth a +review of subsequent guards. Quick check: `grep -nE "\?\?\s*\{\s*\}" modules/` + +--- + +### Bare relative path as a statement (junk auto-imports section) + +**Status:** open + +**Why it happens:** The generator emits an "auto-generated imports" marker +block at the top of the module but writes the import target as a bare +relative path on its own line (`../js/lib.js`) instead of a real `import` +statement. JS parses that as `..` then `.` then `/js/lib.js` — syntax error. + +**Wrong JS:** +```js +// === auto-generated imports for map_item — DO NOT EDIT BY HAND === +../js/lib.js +// === end auto-generated imports === +``` + +**Correct JS** (one of): +```js +// === auto-generated imports — DO NOT EDIT BY HAND === +// Provided as globals by js/lib.js (loaded via manifest.json): +// MappedItem, MissingMappedField, MapItemException, traverse_data, +// strip_tags, normalize_url_encoding, formatUtcTimestamp +// === end auto-generated imports === +``` + +Or, if a real import is intended, an ESM import with named bindings: +```js +import { MappedItem, MissingMappedField } from '../js/lib.js'; +``` + +**Example:** seen historically in `modules/tiktok.js:2` + +**Search pattern:** `^\.\./` at the start of a line in module files. +Quick check: `grep -nE "^\.\." modules/*.js` + +--- + +### Key-existence check (`'X' in obj`) used where Python intended value-truthiness (`obj.get('X')`) + +**Status:** open + +**Why it happens:** Python's `if node.get('usertags'):` is a *truthy check on +the value* — returns False if the key is missing **or** if the value is +`None`/empty/falsy. The generator translates this to `if ('usertags' in +node)`, which in JS is a *key-existence check* — returns True even when +the value is `null`. 
Subsequent property accesses on the null value then +throw `Cannot read properties of null`. + +**Wrong JS:** +```js +const usertags = 'usertags' in node ? node.usertags.in.map(...).join(',') : ''; +// node.usertags can be null → .in.map blows up +``` + +**Correct JS:** +```js +const usertags = node.usertags ? node.usertags.in.map(...).join(',') : ''; +``` + +**Example:** `modules/instagram.js:777` + +**Search pattern:** `'[^']+' in [a-zA-Z_$][\w$]*\s*\?` — quoted-string `in` +identifier followed by `?` (ternary). Quick check: +`grep -nE "'[^']+' in [a-zA-Z_]+ \?" modules/` + +--- + +### Datetime serialization format mismatch + +**Status:** open + +**Why it happens:** Python's `datetime.utcfromtimestamp(t).strftime('%Y-%m-%d %H:%M:%S')` +produces `"2026-05-13 21:27:31"` — space-separated, no timezone marker. JS's +`new Date(t * 1000).toISOString()` produces `"2026-05-13T21:27:31.000Z"` — T +separator, milliseconds, Z. The generator emits the JS `.toISOString()` form +instead of using the existing `formatUtcTimestamp` helper from lib.js that +mimics Python's output exactly. + +**Wrong JS:** +```js +collected_at = new Date(node.taken_at * 1000).toISOString(); +``` + +**Correct JS:** +```js +collected_at = formatUtcTimestamp(node.taken_at); +// formatUtcTimestamp is defined in js/lib.js as: +// new Date(unixSeconds * 1000).toISOString().replace('T', ' ').slice(0, 19) +``` + +**Example:** `modules/instagram.js:782` + +**Search pattern:** `new Date\([^)]+\)\.toISOString\(\)` — any use of +`.toISOString()`. The helper should be used instead. Quick check: +`grep -nE "\.toISOString\(\)" modules/` + +--- + +### `re.findall` capture groups vs JS `.match` with /g flag + +**Status:** open + +**Why it happens:** Python's `re.findall(r'#(\w+)', s)` returns the **capture +group contents**: `['lotr', 'woodart']`. JS's `s.match(/#(\w+)/g)` (with the +global flag) returns the **full matches**: `['#lotr', '#woodart']` — capture +groups are ignored. 
The generator translates the regex literally without +adjusting for this semantic difference, so the resulting strings keep +prefixes/wrappers that Python would have stripped. + +**Wrong JS:** +```js +hashtags: caption.match(/#([^\s!@#$%^&*()_+{}:"|<>?;',./`~]+)/g)?.join(',') +// produces "#lotr,#woodart" +``` + +**Correct JS:** +```js +// Option A: strip the literal prefix from each full match +hashtags: caption.match(/#([^\s...]+)/g)?.map(h => h.slice(1)).join(',') ?? '' +// Option B: use matchAll to get capture groups properly +hashtags: [...caption.matchAll(/#([^\s...]+)/g)].map(m => m[1]).join(',') ?? '' +``` + +**Example:** `modules/instagram.js:812` (also 766, 870 — three copies) + +**Search pattern:** `\.match\(/[^/]*\([^/]*\)[^/]*/g\)` — any `.match()` with +a global-flag regex containing a capture group. Quick check: +`grep -nE "\.match\(/.*\(.*\).*\/g\)" modules/` + +--- + +### `undefined` field values get dropped from JSON, but Python's `None` becomes `null` + +**Status:** open + +**Why it happens:** When `JSON.stringify` encounters an object property whose +value is `undefined`, it **omits the key entirely** from the output. Python's +`json.dumps` serializes `None` as `null`, keeping the key. The generator +writes assignments like `location.city = node.location.city` where the +right-hand side can be `undefined`, producing missing keys in JS output +that show up as `only in Python: = null` diffs against 4CAT. + +**Wrong JS:** +```js +location.city = node.location.city; // undefined if .city missing +// JSON.stringify({location_city: undefined}) → "{}" (key omitted) + +body: caption, // null if no caption — Python returns "" here, not null +``` + +**Correct JS:** +```js +// Whichever fallback Python uses for that specific field: +location.city = node.location.city ?? null; // some fields → null +body: caption ?? 
'', // other fields → "" +``` + +**Example:** `modules/instagram.js:745, 853` (`null` flavor), +559, 648, 798 (`""` flavor for `body`) + +**Note:** Python's choice of `None` vs `""` is per-field — there's no +universal rule. When the comparator reports `~ X JS: null Python: ""` use +`?? ''`. When it reports `- only in Python: X = null` use `?? null`. The +distinction matters because the JS output should match Python's choice +exactly for that field. + +**Search pattern:** harder to grep automatically — any property assignment +where the RHS could be `undefined`/`null` and the resulting field is +expected to appear in the mapped output. Look at "only in Python: X = null" +and "~ X JS: null Python: \"\"" diffs in the comparator output to find +specific cases. + +--- + +### Object-reference inequality used as type check + +**Status:** open + +**Why it happens:** The generator emits `caption !== new MissingMappedField('')` +to mean "caption is not a missing-marker", but `new MissingMappedField('')` +creates a fresh object every time, and `!==` on objects compares references. +The expression is **always true**, so the conditional never takes the +"missing" branch. Likely originates from Python idioms like `caption != ""` +or `caption is not None`, mistranslated through the MissingMappedField +abstraction. + +**Wrong JS:** +```js +hashtags: caption !== new MissingMappedField('') ? caption.match(...) : '', +// !== between two different object references is always true +``` + +**Correct JS:** +```js +// If the intent was "if caption has content", just truthy-check it: +hashtags: caption ? caption.match(...) : '', +// If the intent was "if caption is not a MissingMappedField instance": +hashtags: !(caption instanceof MissingMappedField) ? caption.match(...) : '', +``` + +**Example:** `modules/instagram.js:812` (and two other copies) + +**Search pattern:** `!== new [A-Z]` or `=== new [A-Z]` — any equality +comparison with a freshly-constructed object. 
Quick check: +`grep -nE "(!==|===) new [A-Z]" modules/` + +--- + +### `.method()` chain on potentially-null result + +**Status:** open + +**Why it happens:** In Python, calling a method on `None` raises +`AttributeError`, which 4CAT sometimes catches. In JS, calling a method on +`null`/`undefined` throws `TypeError: Cannot read properties of null +(reading '<method>')`. The generator emits the same dotted chain without +optional-chaining (`?.`) protection. + +**Wrong JS:** +```js +hashtags: caption !== new MissingMappedField('') + ? caption.match(/#([^\s!@#$%^&*()_+{}:"|<>?;',./`~]+)/g)?.join(',') + : '', +``` +(here `caption` is allowed to be `null`, so `caption.match(...)` blows up +on null caption) + +**Correct JS:** +```js +hashtags: caption + ? caption.match(/#([^\s!@#$%^&*()_+{}:"|<>?;',./`~]+)/g)?.join(',') ?? '' + : '', +``` + +**Example:** `modules/instagram.js:809` + +**Search pattern:** harder to grep — needs reading. Worth manual review of +any field that uses `caption.match`, `something.split`, `something.join` +without `?.` on a value that could be null/undefined. + +--- + +## Generator prompt feedback (running list) + +Concrete things to fold into the generator's prompt over time: + +1. **Python `x in y` where `y` is a string** → use `y.includes(x)` in JS, + never `x in y`. +2. **Python f-strings** → use JS template literals (backticks) with + `${...}` syntax. Never leave `{...}` in single- or double-quoted strings. +3. **`?? {}` after a `.get(...) or {}` translation** → only use this if the + following code does property-access. If the following code does a + truthy guard (`if (x && y)`), drop the default and use just `node.user`. +4. **Method chains on possibly-null values** → use `?.` (optional + chaining) instead of `.` whenever the receiver could be null/undefined. +5. **The auto-imports header block** → emit either real `import { ... }` + statements with valid relative paths, or a comment-only header. + Never emit bare paths as JS statements. +6.
**Python `node.get('X')` truthy check** → in JS, use `node.X` (or + `node.X != null`), not `'X' in node`. The `in` operator checks key + existence, which is True even for explicit-null values. +7. **Datetime serialization** → use the `formatUtcTimestamp` helper from + lib.js (which mimics Python's `strftime('%Y-%m-%d %H:%M:%S')` format), + not `new Date(...).toISOString()` (which has a different output shape: + T separator, milliseconds, Z suffix). +8. **`re.findall` with capture groups** → in JS, `.match(/.../g)` returns + full matches, NOT capture groups. To get capture-group behavior, use + either `[...s.matchAll(/.../g)].map(m => m[1])` or post-process the + full matches with `.map(...)` to strip the literal parts. +9. **Object-reference equality (`!== new X(...)`)** → never. Creating an + object with `new` produces a fresh reference; `===`/`!==` compares + identity. Use `instanceof X` for type checks, or compare values + directly. The MissingMappedField "is this missing?" check should be + `caption instanceof MissingMappedField` or just truthy-check the value. +10. **Python `None` → JSON `null` vs JS `undefined` → omitted** — when a + field's value could be missing and Python returns `null` for it, + JS must explicitly assign `null` (not leave the value as `undefined`). + `JSON.stringify` drops `undefined` keys silently. Use `value ?? null` + when the field is expected to appear in the mapped output. 
From 6ad4c134cf35d0993b2968f3b2dc832e2766794d Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 18:45:52 +0200 Subject: [PATCH 20/41] package.json fix --- tests/package.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/package.json b/tests/package.json index 333564a..390fdd3 100644 --- a/tests/package.json +++ b/tests/package.json @@ -13,6 +13,7 @@ "dotenv": "^16.4.5", "fake-indexeddb": "^5.0.1", "jest": "^29.7.0", - "jest-environment-jsdom": "^29.7.0" + "jest-environment-jsdom": "^29.7.0", + "undici": "^6.20.0" } } From 11ffffbdea4b853fd88e219d719d6d7947fab6df Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 3 Jun 2026 15:04:51 +0200 Subject: [PATCH 21/41] rm other test doc --- docs/test-plan.md | 162 ---------------------------------------------- 1 file changed, 162 deletions(-) delete mode 100644 docs/test-plan.md diff --git a/docs/test-plan.md b/docs/test-plan.md deleted file mode 100644 index a4265eb..0000000 --- a/docs/test-plan.md +++ /dev/null @@ -1,162 +0,0 @@ -# Selenium Test Harness — Improvement Plan - -Date: 2026-04-30 - -Overview - -This document captures an actionable plan to improve the Selenium-based integration tests in `tests/test.py` for the Zeeschuimer Firefox extension. The goals are to: - -- Make profile handling reliable and reusable (so logged-in sessions persist across runs). -- Preserve and export captured data per platform for offline analysis and for passing to 4CAT. -- Add optional automated upload to a 4CAT instance for mapping/validation tests. -- Reduce fragility caused by popups and interactive dialogs (pausing/dismissal patterns). -- Improve robustness, error handling, and machine-readable results. - -Scope - -All changes are confined to the test harness and test metadata (`tests/test.py` and `tests/tests.json`) and to this planning document. 
No changes are required in the extension source for the planned items (the test harness will interact with the extension's UI pages and background DB). - -Phases & Changes - -Phase 1 — Profile management - -- Problem: copying an entire profile can race with a running Firefox and the current ignore rule hides potentially useful session data. -- Changes: - - Detect if the selected profile directory appears locked (presence of `lock` or `.parentlock`) and warn if Firefox is running. - - Replace the naive ignore lambda used in `shutil.copytree` with a function that only excludes `storage`, `extensions`, and `signedInUser.json` at the profile root. - - Add CLI flags: `--profile-name NAME` (choose profile by display name from `profiles.ini`), `--save-profile PATH` (save the temp profile for reuse), and `--no-cleanup` (do not remove `.temp-profile` after run). - -Implementation note (copytree ignore example): - -```python -def _profile_ignore(root, names): - # Only ignore these entries in the root profile dir - if os.path.abspath(root) == os.path.abspath(profile_dir): - return {"storage", "extensions", "signedInUser.json"} - return set() - -shutil.copytree(profile_dir, profile_file, ignore=_profile_ignore) -``` - -Phase 2 — Data preservation & export - -- Problem: `reset-all` wipes the DB before each URL; no artifacts are kept for post-mortem or mapping tests. -- Decision: export a single combined NDJSON file per platform containing items collected while testing that platform. -- Changes: - - Add CLI `--export-dir PATH` (default `./zeeschuimer-exports/{timestamp}/`). - - Before clicking `reset-all` for each URL, read the current DB contents from the extension background page (Dexie) via `execute_async_script` and append those items to a per-platform in-memory list in Python. After all URLs for a platform are done, write `{export-dir}/{platform}.ndjson`. 
- - Optionally add `--no-reset` to skip the `reset-all` call entirely (default behavior remains to reset before each URL). - -Execute_async_script pattern (example): - -```python -script = ''' -const cb = arguments[0]; -background.db.items.toArray().then(items => cb(JSON.stringify(items))).catch(e => cb(JSON.stringify({error: String(e)}))); -''' -items_json = driver.execute_async_script(script) -items = json.loads(items_json) -``` - -Phase 3 — 4CAT integration (optional) - -- Problem: mapping tests live in 4CAT and need NDJSON input. -- Changes: - - Add CLI flags: `--4cat-url URL` and `--4cat-key KEY` (API key). Require both for upload. - - After writing the per-platform NDJSON, POST it to `{4cat_url.rstrip('/')}/api/import-dataset/` with header `X-Zeeschuimer-Platform: {platform}` and `Authorization: {key}` (confirm header with your 4CAT instance; alternative is to trigger the extension UI upload button when cookie-based auth is required). - - Do not fail the test run on 4CAT errors — print status and continue. - -Example upload with `requests`: - -```python -import requests -with open(ndjson_path, 'rb') as f: - headers = { - 'X-Zeeschuimer-Platform': platform, - 'Authorization': f'{fourcat_key}' - } - r = requests.post(f"{fourcat_url.rstrip('/')}/api/import-dataset/", headers=headers, data=f) - # check r.status_code and r.text for details -``` - -Phase 4 — Interactive controls & popup dismissals - -- Problem: cookie banners, paywall prompts, and other popups frequently interfere with automated navigation and can cause false failures. -- Decision: pause by default **once per platform** (not before every URL) so the tester can clear residual prompts; provide opt-out and finer-grained options. -- Changes: - - CLI flags: `--no-interactive` (disable all pauses), `--pause-before-url` (pause before each URL), `--pause-on-fail` (pause on failure), `--extra-wait N` (add N seconds to every wait), `--screenshot-dir PATH` (capture screenshots on fail/warning). 
- - Add a `dismiss-selectors` optional field in `tests.json` per URL: a list of CSS selectors to click to dismiss known popups. Example: - -```json -"dismiss-selectors": ["button.cookie-accept", ".modal .close"] -``` - - - Add per-URL `timeout` (page load timeout override). - -Phase 5 — Runner robustness & reporting - -- Problem: unhandled exceptions abort the run; final runtime is calculated incorrectly; no machine-readable results. -- Changes: - - Wrap each URL test body in try/except, increment `failed` on exceptions, and continue. - - Move the global `start_time = time.time()` to before the outer platform loop so the final elapsed time is for the full run. - - Add CLI flags: `--results-file PATH` (write JSON summary), `--resume-from PLATFORM` (skip earlier platforms), and `--screenshot-dir PATH` (as noted). - - Fix small test metadata issues (e.g., `more-after-scrolll` typo in `tests.json`). - -tests.json schema additions - -- Per-URL optional fields: - - `dismiss-selectors`: array of CSS selectors to click after page load - - `timeout`: numeric page load timeout seconds for this URL - - `extra-wait`: per-URL additional wait seconds - -CLI flags (summary) - -- `--profiledir PATH` — explicit profile path (existing) -- `--profile-name NAME` — choose Firefox profile by display name -- `--save-profile PATH` — persist the copied profile for reuse -- `--no-cleanup` — keep `.temp-profile` -- `--export-dir PATH` — where to write NDJSON exports -- `--no-reset` — do not click `reset-all` between URLs -- `--4cat-url URL` — base URL for 4CAT server -- `--4cat-key KEY` — API key for 4CAT uploads -- `--4cat-per-url` — upload per URL instead of per platform (optional) -- `--no-interactive` — disable pausing (default is to pause per-platform) -- `--pause-before-url` — pause before each URL -- `--pause-on-fail` — pause when a test fails -- `--extra-wait N` — add N seconds to every URL wait -- `--screenshot-dir PATH` — save screenshots on fail/warning -- `--results-file PATH` — 
write machine-readable results JSON -- `--resume-from PLATFORM` — resume a run from a platform - -Verification checklist - -1. `python tests/test.py --sources instagram.com --export-dir ./exports` -> `exports/instagram.com.ndjson` exists and contains NDJSON with captured items. -2. `python tests/test.py --save-profile .saved-profile --login` -> create a saved profile that can be reused with `--profiledir .saved-profile`. -3. Run with default interactive behavior and confirm one pause per platform. -4. `python tests/test.py --results-file results.json` -> JSON summary produced with per-URL status and counts. -5. Test 4CAT upload using a local mock server and `--4cat-url http://localhost:8000 --4cat-key KEY`. - -Implementation steps (recommended order) - -1. Docs and small fixes (this document + tests.json typo fix). -2. Profile management changes (`--profile-name`, improved copy ignore, `--save-profile`, lock detection). -3. Export behavior: `--export-dir` + `execute_async_script` collection and NDJSON write. -4. Runner robustness: try/except around URL loop, `--results-file`, fix `start_time` placement. -5. Interactive and dismissal features (`dismiss-selectors`, pause flags, screenshots). -6. 4CAT upload integration (optional, requires confirmation of auth header). - -Estimated effort: 6–10 hours of focused work to implement and test everything end-to-end; can be split into 3-4 incremental PRs. - -Open questions / confirmations needed - -- Confirm 4CAT API key header format (currently suggested: `Authorization: {key}`). If your 4CAT requires cookie-based auth, we should emulate the extension upload button via Selenium instead. -- Confirm desired default for interactive mode. (Current recommendation: pause once per platform by default; provide `--no-interactive` to run fully headless.) - -Next steps - -- I have created a matching TODO list in the session tracker and written this document to `docs/test-plan.md`. 
-- If you want, I can start implementing Phase 1 (profile management) in `tests/test.py` now and submit incremental changes. - ---- - -Requested file: `docs/test-plan.md` From 6cc61003e95be381b191baae1486f989a2ed3e71 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 3 Jun 2026 15:05:55 +0200 Subject: [PATCH 22/41] map_item.test.js verify modules import and map_item exists only --- tests/map_item.test.js | 134 ++++++++++------------------------------- 1 file changed, 31 insertions(+), 103 deletions(-) diff --git a/tests/map_item.test.js b/tests/map_item.test.js index 2dc1bb6..774c083 100644 --- a/tests/map_item.test.js +++ b/tests/map_item.test.js @@ -1,121 +1,49 @@ /** - * Smoke test driver for module `map_item` functions. + * Load-only smoke for every module under `modules/*.js`. * - * Convention: - * tests/fixtures//*.ndjson + * For each module file, runs `inspect_module()` and asserts the module: + * - parses (no SyntaxError) + * - imports without throwing + * - either exports a `map_item` function, or doesn't (both are fine here) * - * matches a file in modules/ (e.g. "tiktok" maps to modules/tiktok.js). - * Each .ndjson line is one Zeeschuimer-stored item exported from the popup. + * No data is fed through `map_item`. That work belongs in the comparator + * (Tier 2 — `npm run test:compare`), where real items pulled from a 4CAT + * dataset provide both the input and the expected output. * - * Each item is wrapped via wrap_for_map_item to mirror how 4CAT's importer - * presents items to a map_item function, then run through the module's - * map_item. Tests assert: function returns a non-null object, and any fields - * listed in REQUIRED_NON_EMPTY for that module are present and non-empty. 
- * - * Module-level state is determined upfront by inspect_module(): - * - 'ok' → register per-item tests - * - 'no_map_item' → register a single skipped test (not applicable) - * - 'syntax_error' → register a single failing test pointing at the line - * - 'import_error' → register a single failing test with the message + * Catches: parse errors, import-time throws, broken top-level statements. + * Does NOT catch: anything that requires running `map_item` on real input. */ -import { readdirSync, readFileSync, statSync, existsSync } from 'node:fs'; +import { readdirSync } from 'node:fs'; import { join, dirname } from 'node:path'; import { fileURLToPath } from 'node:url'; import { inspect_module } from './_module-info.js'; const __dirname = dirname(fileURLToPath(import.meta.url)); -const FIXTURE_ROOT = join(__dirname, 'fixtures'); - -const REQUIRED_NON_EMPTY = { - tiktok: ['id', 'author', 'unix_timestamp'], -}; - -/** - * Local mirror of wrap_for_map_item from js/lib.js. lib.js is loaded by - * the browser as a plain script and so cannot be imported from Node; this - * three-line mirror is cheaper than restructuring lib.js into a module. - */ -function wrap_for_map_item(stored_item) { - const { data, ...meta } = stored_item; - return { ...data, __import_meta: meta }; -} - -function list_module_dirs() { - if (!existsSync(FIXTURE_ROOT)) return []; - return readdirSync(FIXTURE_ROOT).filter(name => { - try { return statSync(join(FIXTURE_ROOT, name)).isDirectory(); } - catch { return false; } - }); -} +const MODULES_ROOT = join(__dirname, '..', 'modules'); -const module_dirs = list_module_dirs(); +const module_files = readdirSync(MODULES_ROOT) + .filter(f => f.endsWith('.js') && !f.startsWith('_')); -// Pre-pass: synchronously determine each module's state so we can branch -// on it at describe/test registration time. Top-level await is supported -// in Jest's experimental-vm-modules mode. 
const module_info = {}; -for (const module_name of module_dirs) { - module_info[module_name] = await inspect_module(module_name); +for (const file of module_files) { + const name = file.replace(/\.js$/, ''); + module_info[name] = await inspect_module(name); } -let total_fixtures = 0; - -for (const module_name of module_dirs) { - const fixture_dir = join(FIXTURE_ROOT, module_name); - const fixture_files = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson')); - if (fixture_files.length === 0) continue; - total_fixtures += fixture_files.length; - - const info = module_info[module_name]; - - if (info.state === 'no_map_item') { - describe(`map_item: ${module_name}`, () => { - test.skip(`modules/${module_name}.js does not export a map_item function — nothing to smoke test`, () => {}); +describe('module load smoke', () => { + for (const file of module_files) { + const name = file.replace(/\.js$/, ''); + test(`modules/${file} loads cleanly`, () => { + const info = module_info[name]; + if (info.state === 'syntax_error') { + throw new Error(`syntax error in modules/${file}:\n${info.error}`); + } + if (info.state === 'import_error') { + throw new Error(`import failed for modules/${file}: ${info.error.message}`); + } + // 'ok' or 'no_map_item' — both acceptable at this tier. + expect(['ok', 'no_map_item']).toContain(info.state); }); - continue; } - - if (info.state === 'syntax_error' || info.state === 'import_error') { - const msg = info.state === 'syntax_error' - ? 
`syntax error:\n${info.error}` - : `import failed: ${info.error.message}`; - describe(`map_item: ${module_name}`, () => { - test(`module loads`, () => { throw new Error(msg); }); - }); - continue; - } - - // state === 'ok' — register per-item tests - const map_item = info.map_item; - - describe(`map_item: ${module_name}`, () => { - for (const fixture_file of fixture_files) { - const lines = readFileSync(join(fixture_dir, fixture_file), 'utf8') - .split('\n') - .filter(line => line.trim().length > 0); - - describe(fixture_file, () => { - lines.forEach((line, i) => { - test(`item ${i} maps without throwing`, () => { - const stored_item = JSON.parse(line); - const mapped = map_item(wrap_for_map_item(stored_item)); - expect(mapped).not.toBeNull(); - expect(typeof mapped).toBe('object'); - for (const field of REQUIRED_NON_EMPTY[module_name] ?? []) { - expect(mapped[field]).toBeDefined(); - expect(mapped[field]).not.toBe(''); - expect(mapped[field]).not.toBeNull(); - } - }); - }); - }); - } - }); -} - -if (total_fixtures === 0) { - describe('map_item', () => { - test.skip('no fixtures found under tests/fixtures//*.ndjson', () => {}); - }); -} +}); From a090675c162573b3ae8633584010464d3d264bdc Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 3 Jun 2026 15:06:24 +0200 Subject: [PATCH 23/41] remove old fixtures and 4cat probe --- tests/__pycache__/test.cpython-39.pyc | Bin 7345 -> 0 bytes tests/fixtures/.gitignore | 5 - tests/fixtures/README.md | 29 ------ tests/probe-4cat.mjs | 140 -------------------------- 4 files changed, 174 deletions(-) delete mode 100644 tests/__pycache__/test.cpython-39.pyc delete mode 100644 tests/fixtures/.gitignore delete mode 100644 tests/fixtures/README.md delete mode 100644 tests/probe-4cat.mjs diff --git a/tests/__pycache__/test.cpython-39.pyc b/tests/__pycache__/test.cpython-39.pyc deleted file mode 100644 index 745e2b4aaad921a459372bb50b39980c50a68136..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 
literal 7345 zcmai3-E$k)b>CeqKnNl!lATnGrZeqKUpk()>F?ZK zfYgUo?Cjn9aqc#`iup9_Rb`1mDjmADRio3;Y0oamS9A_{-Zn zI^O|iiy!3Q;nuH=&MRVKL*uV(8SId->aT93{IL20jve)WFJ2Ur;?Rb^p|fd`7tSsMTX) znjhuUpKCnNk3CEQ%k)KU>2-EojCW>4x>FFxxAb~(OJ^nQIW9__6MSY%2S-@>LVIKq zmQDT6@3C5>zBh(m30UiHNgf0MuUxrUvyol_h;E^3h5X>j#U=M10ioE2|%-rdr+^exB}bWd*@ z@=L6qQfsH_)bV+|EM`w9Cpn|O*$s`I6MH-7#e3r91~5Q^$2%AJSw6940N3%(#Vv3k zj*D~U-e*Pih zy4?9dyvg6g{P&4lHFJeuNP1WK#iVzQ&#B%NzodA)+be-L|A4>0 zWz=u5dH#Wzhi?br&a$LC;2bI__ahHBlP7u zTfn=BcS)EX6T1C`e+ZqHx3v1Fd~Q-3*O(1?FK=lVwaxU>_@cq~u}TjXr@z(V;v=%6 zgwGH7^?vUMVqZt_*ZGYtW534d_s1|(!d5(--ZbT}#XfQRuC97hVjs?QGuGXvd8a5U z30-sPFF|*ORYkQw^ApU}m?Pp{x5Nte-V!zFKCQ4ni?bgp&hla(v~+u1;~(wUSiOIS z!|@p+OG&h*V4(rwKp!7NM+VK@*1R`OP3)Gy*k-{)9n^1&QO@`RU##kU>0t^p zpTvImvE~^kG=-;3cD9+8{}P=B71F62&%>Gx))0-3hnSL zN45rQTUV#9ku+Fav^NLjcppom2esGfX@S>OkMKQrZd+UY@ML0>PuS@5xm!sc9`}3p!khawwR?dd#&^N{w-`$T|E1{d{8H__vu$ZyiU9ltk5UPSC52(h`@24E z6I>rkFi?+ZKaC-L3PWwjS+cU;$EO2dUEwoS?@4^ECpfVbMaJ*;PrH^tEGk|Xz288l z>rY58EC28q#XtPNGW)a-Z*K0AxoAtIKgC}x)2I)b@Cm=XNAU&m>34X4qvF)% z&V7|NE_WWN?{-e?#kwi+nOG2uVyOxbP(F4q$+=VfejH83CzBd~0Pp)OsUfELkD#R= ziv{I}#s7=PzEJD>iuUYpN%&<@N5sht1IOy@0y&Lf&;M0$t<1 zHrkw6uZZpCitqYTa4hUv6U$sWcZI|t%k5BvAcB4-Xa}Ka#Cw~z$6JAdF|XTU4YOv| z^_Qc;LZK=B6~`6akf++jadli&!CGwR(x=f)J-Ehq5K|DM=-@+=Hf@ zu`48y2zWx0WOS%U<%9VTuUW*kRq?&hc09+cS}{yceTww}8+R27f!~r95kzm!#~mU| z5Hr}>XxN27G;Il~@l)1v+e%J1qd^Li`JS-EI}l6-EOYzp zJ3#TDcr`hR<|eE%wJO@(0V!brfrwHCYdRX8`nWLFC`@r{>hkQ=)!C`}$jDpKI~Quc zACR3i{2;WXs0fdw0bArsLer2n797tCYr=9J^zCK;uEo z)QUk8NPg6(SE(e6P8C(?U+YkZ0d=qe@+`Y5D1L;(3n;uqqk&WP1b^QnMJPv6x`8zH z^{(DlgSgobgi6%csq+FF3M|+ScYVwDnQ_ zjbPTg_aHK6tY|m}fmmhBo3)~`8!b=82)J|{CyLSN6}#nCYNUq5lcT9XR9eyr+eL~k z=oN|0N?7)#gE&@h`tYGPxE_GyC_Z-w{5U8_MzNUJ*{;40BLp>JyI~Ci{+goM*Xz`z z(L@L5ya32Sct={WoB(>aQRhWdu)9_`%f)i53bNH#OE?9&QYoxCK_Q6=)nEP<{r>UY z=whi9Ai^)43*q15%%L4v4=t5S#bT+%!-WtgFr7bh=zT8~(z9Lb)~)w1_Sbndf|yh2 zo5^gc6zwZD{bs9Ka%jhjT@l4v*l<}Yw3nlyoqD0Q_<1q z5P3BU3c|COkrNA1YH_g`z40wHUU@bVkN^!U8nk9-tzgw@Hlf7Gq)~M8*@UN$F1Nza 
z_lgjs2n#S%kW6?!^^I;tDeUf=YsjN5vY~~q{^qw!EYlX0Pno%lJWXC@hoKCjBUTeW z51IO_5WZK1GQye-@xz2*SR`z+v}mMaHz7t!g0YNS*MOSXT=?pjoqzw`&z7tUF$5B0 z2rUSXh-5uS)q(Hcg{n!(3GQwdfl+OZvdbb|6T-8iw5kQZ(F;*#;gb`Ecb1|fQiLt( zS>Gm|S1G3ig=oK3^jeLA13wEOA;ez&&Xa=3CureOP>8i+J8lb**0f?hQTk(e^Z6fj2g+p_ zYq`v_wydIt34U@zl41wj+nX(K-zq3|$%eXDNRm9u_JHI7hLUQF zZnhV>!SMIP4HA6vf=ZxszUj7l81M<_N`s)AZX$z*T@%BQ?e2l2)?(RY5OhrELE;@g_PIL)_tTH9E+jERIzQFF{%()Mm9qD zI?Gm2&{3~z`c2`nw91Ms<9GztH7VSp7>lnUD1K0jhisT6OhL7*pt~8WVNf}nM73+8 zvJK=>CB~#axgJZQn*xKhJ1jS0KSBzxBG_1~Nw>cpQvut(<2ELb5UPO$75?#{E{$x+ z@gN4*Eki55$ABHD_UHCEaPat|QKoE# z!@5E=+29I-aTy1u%+AsuT|a;GGF$|qo0+Ya!`Yr4^?$h~UA8Mu^5xKHIZv#W<7m!? z<8K)Wjk_XX1HDdK-6p$fRTQvH95Gmi+*$B$wihkvwrR^yG29)6twH({7ZBSm2Tp{F z4nELocmXqN638ggkZ+(GMGVCyEhbNQ4mbnkp6#vS9MTO~ig<_fgdxQyG67B}9=RDO z-?`pBo(*1@pD)kPU75RZ^TQi#Pm(YDA{umpGIwApb{ob!am}A^Rcdpw{Q5{B!FH4E z3oao`q~IYBWhwkR&N^VkUO_DbTTYR;q=MEm+l35Tjt@3MU6mI|k4TzqD3tA_5B|Gp zS#Jf{*9g$BxVYAI%{wiq6_^@3s&=~ENxClJvexi<%N1wowivi*&2(1pKgOW`jK|38 zBY4JCn;kH8<9{=$%>U3%n#~&7)Sy00SPW`0gZH43GsdtJk2$92^ju~f*pC?#`lO!G zC$J}r?=a|P4UF~S)PdAk3UGREAWN+=q7Py`sAmjNAS$}4o-FVZ)e*zgjX#>8jTUbP z7)OB7#OjznnHm9>5wr%r6Z(j8BsFXp`Zz}Pj_4Wlm_C~tS9@}Q%%)65i+1&%3Ggti r)7nAsdQd+C-g9`Zeye5-%!0d@)ao#;8W#8)2S0R@tbPE`VLbl_#VpU) diff --git a/tests/fixtures/.gitignore b/tests/fixtures/.gitignore deleted file mode 100644 index 8e89a83..0000000 --- a/tests/fixtures/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -# Ignore everything in this directory -* -# Except these files -!.gitignore -!README.md \ No newline at end of file diff --git a/tests/fixtures/README.md b/tests/fixtures/README.md deleted file mode 100644 index d24fe06..0000000 --- a/tests/fixtures/README.md +++ /dev/null @@ -1,29 +0,0 @@ -# Test fixtures for `map_item` - -Real captured items used to exercise each module's auto-generated `map_item` -function. - -## Layout - -``` -tests/fixtures/ - / - .ndjson - .ndjson -``` - -`` matches the filename in `modules/` without `.js` — -e.g. 
`tiktok/` → `modules/tiktok.js`, `pinterest/` → `modules/pinterest.js`. -You can drop multiple `.ndjson` files in a module folder; each gets its own -`describe` block and each line becomes its own `test`. - -Filenames are free-form — the auto-export filename from the popup -(`zeeschuimer-export--.ndjson`) is fine. - -## Privacy / committing - -These files contain real captured platform data — usernames, post -content, URLs, sometimes images and other PII. - -If we want to create test exports or annonomize real exports, add them to -.gitignore. \ No newline at end of file diff --git a/tests/probe-4cat.mjs b/tests/probe-4cat.mjs deleted file mode 100644 index 0bf4e4d..0000000 --- a/tests/probe-4cat.mjs +++ /dev/null @@ -1,140 +0,0 @@ -/** - * Manually exercise 4CAT's /api/map-item/ endpoint against a fixture item. - * - * Usage: - * node probe-4cat.mjs [] [--index N] - * - * is the Zeeschuimer module filename without `.js` (e.g. - * "tiktok", "pinterest"). If is omitted, the first - * .ndjson in tests/fixtures// is used. --index selects which - * line of the fixture to send (default 0). - * - * Requires tests/.env with FOURCAT_URL and FOURCAT_API_KEY. - */ - -import 'dotenv/config'; -import { readFileSync, existsSync, readdirSync } from 'node:fs'; -import { join, dirname } from 'node:path'; -import { fileURLToPath } from 'node:url'; - -const __dirname = dirname(fileURLToPath(import.meta.url)); - -const FOURCAT_URL = process.env.FOURCAT_URL?.replace(/\/$/, ''); -const FOURCAT_API_KEY = process.env.FOURCAT_API_KEY; - -if (!FOURCAT_URL || !FOURCAT_API_KEY || FOURCAT_API_KEY === 'your-api-key-here') { - console.error('error: FOURCAT_URL and FOURCAT_API_KEY must be set in tests/.env'); - console.error(' (copy tests/.env.example to tests/.env and fill in real values)'); - process.exit(1); -} - -const ID_MAP_PATH = join(__dirname, 'zeeschuimer-to-4cat.json'); -const ID_MAP = existsSync(ID_MAP_PATH) - ? 
JSON.parse(readFileSync(ID_MAP_PATH, 'utf8')) - : {}; - -function auth_headers() { - return { 'Authorization': `${FOURCAT_API_KEY}` }; -} - -async function list_datasources() { - const res = await fetch(`${FOURCAT_URL}/api/datasources/`, { headers: auth_headers() }); - if (!res.ok) { - throw new Error(`GET /api/datasources/ → ${res.status}: ${await res.text()}`); - } - const body = await res.json(); - return body.datasources ?? []; -} - -async function map_item(datasource_id, item) { - const res = await fetch(`${FOURCAT_URL}/api/map-item/${datasource_id}/`, { - method: 'POST', - headers: { ...auth_headers(), 'Content-Type': 'application/json' }, - body: JSON.stringify({ item }), - }); - const text = await res.text(); - let body; - try { body = JSON.parse(text); } catch { body = { raw: text }; } - return { status_code: res.status, body }; -} - -function parse_args(argv) { - const args = { module: null, fixture: null, index: 0 }; - const positional = []; - for (let i = 2; i < argv.length; i++) { - if (argv[i] === '--index') { - args.index = parseInt(argv[++i], 10); - } else if (argv[i].startsWith('--index=')) { - args.index = parseInt(argv[i].split('=')[1], 10); - } else { - positional.push(argv[i]); - } - } - args.module = positional[0]; - args.fixture = positional[1]; - return args; -} - -async function main() { - const args = parse_args(process.argv); - if (!args.module) { - console.error('Usage: node probe-4cat.mjs [] [--index N]'); - process.exit(1); - } - - const datasource_id = ID_MAP[args.module] ?? args.module; - const fixture_dir = join(__dirname, 'fixtures', args.module); - - if (!existsSync(fixture_dir)) { - console.error(`error: no fixture dir at ${fixture_dir}`); - process.exit(1); - } - - const candidates = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson')); - if (candidates.length === 0) { - console.error(`error: no .ndjson fixtures under ${fixture_dir}`); - process.exit(1); - } - const fixture_name = args.fixture ?? 
candidates[0]; - const fixture_path = join(fixture_dir, fixture_name); - if (!existsSync(fixture_path)) { - console.error(`error: fixture ${fixture_path} not found`); - process.exit(1); - } - - const lines = readFileSync(fixture_path, 'utf8').split('\n').filter(l => l.trim().length > 0); - if (args.index >= lines.length) { - console.error(`error: --index ${args.index} but fixture has ${lines.length} items`); - process.exit(1); - } - const item = JSON.parse(lines[args.index]); - - console.log(`Module: ${args.module}`); - console.log(`Datasource id: ${datasource_id}${ID_MAP[args.module] ? ' (mapped via zeeschuimer-to-4cat.json)' : ''}`); - console.log(`URL: ${FOURCAT_URL}/api/map-item/${datasource_id}/`); - console.log(`Fixture: ${fixture_name}, item ${args.index} (item_id=${item.item_id ?? item.id})`); - console.log(''); - - const { status_code, body } = await map_item(datasource_id, item); - console.log(`HTTP ${status_code}`); - console.log(JSON.stringify(body, null, 2)); - - if (status_code === 404) { - console.error(''); - console.error('Hint: datasource id may be wrong. 
Available Zeeschuimer-origin datasources:'); - try { - const datasources = await list_datasources(); - datasources - .filter(d => d.is_from_zeeschuimer && d.has_map_item) - .forEach(d => console.error(` - ${d.id} (${d.name})`)); - } catch (e) { - console.error(` (couldn't fetch list: ${e.message})`); - } - process.exit(2); - } -} - -main().catch(e => { - console.error(`probe failed: ${e.message}`); - process.exit(2); -}); From c62a7e796db9bc3e1f7cb12f78fc50cbfa37e60c Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 3 Jun 2026 15:06:47 +0200 Subject: [PATCH 24/41] update lib.js note on new endpoint --- js/lib.js | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/js/lib.js b/js/lib.js index c618a6a..518a6fa 100644 --- a/js/lib.js +++ b/js/lib.js @@ -59,7 +59,11 @@ class MissingMappedField { } // Mirror 4CAT's API serialization so JSON.stringify produces the same - // tagged form on both sides. See docs/4cat-map-item-api.md. + // tagged form on both sides: 4CAT's /api/dataset/<key>/items/ endpoint, + // when called with `missing_fields=keep`, emits missing values as + // `{ __missing: true, value: <value> }`. Matching that shape here + // lets the map_item comparator deep-equal both sides without special + // handling. toJSON() { return { __missing: true, value: this.value }; } From 234f1ce4377ceedf64777054b303e01d84293a2c Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 3 Jun 2026 15:07:21 +0200 Subject: [PATCH 25/41] update tests/.env.example (comments and dataset keys) --- tests/.env.example | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/tests/.env.example b/tests/.env.example index 2e021bb..137a52b 100644 --- a/tests/.env.example +++ b/tests/.env.example @@ -1,9 +1,23 @@ -# 4CAT API config for the map_item comparison tests. +# 4CAT API config for the map_item comparator (`npm run test:compare`). # Copy this file to .env in this directory and fill in real values. 
# .env is gitignored; .env.example is the committed template. -# Base URL of the 4CAT instance to hit. No trailing slash. +# Base URL of the 4CAT instance to hit. No trailing slash. Default ports: +# :80 for nginx (production) +# :4000 for the Flask dev server FOURCAT_URL=http://localhost -# API key for that 4CAT instance. Get one from the 4CAT UI; tied to your user. +# API key for that 4CAT instance. Get one from the 4CAT UI; tied to your +# user. 4CAT accepts the raw key as the Authorization header value (no +# `Bearer ` prefix). FOURCAT_API_KEY=your-api-key-here + +# Comma-separated list of dataset keys (the 32-char ids from 4CAT dataset +# URLs) to compare. The comparator pulls inputs from /download/ and +# expected outputs from +# /api/dataset/<key>/items/?annotations=no&missing_fields=keep&stream=true +# for each. Datasource is read from each dataset's metadata. +# +# `npm run test:compare -- <key>` narrows a single run to one key; the key +# must still be listed here. +FOURCAT_DATASETS=key1,key2,key3 From e0d0fb834983456aafadf4f1f9708855aa502b1c Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 3 Jun 2026 15:08:31 +0200 Subject: [PATCH 26/41] note on _loader.js for `wrap_for_map_item` --- modules/_loader.js | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules/_loader.js b/modules/_loader.js index afae2d7..ceb0080 100644 --- a/modules/_loader.js +++ b/modules/_loader.js @@ -1,3 +1,8 @@ +// Load-order dependency: `wrap_for_map_item` (used below) is a free global +// defined in js/lib.js, which manifest.json loads as a plain background +// script before this module. There is no import for it here on purpose — +// MV2 background scripts share one global scope. If lib.js stops being +// loaded first, the mapper wrapper below will throw a ReferenceError. 
async function load() { const imported_modules = [ await import("./tiktok.js"), From f2341d6e798a39f777d13e5c60af81d360ae6714 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 3 Jun 2026 15:09:51 +0200 Subject: [PATCH 27/41] fix my test environment; scripts vs libraries --- .gitignore | 2 + tests/_module-info.js | 36 ++++++++++++------ tests/jest.compare.config.cjs | 20 ++++++++++ tests/jest.config.cjs | 3 ++ tests/package-lock.json | 70 +++++++++++++++++++++++++++++------ tests/package.json | 12 +++--- tests/run-compare.mjs | 43 +++++++++++++++++++++ tests/setup-globals.cjs | 52 +++++++++++--------------- 8 files changed, 179 insertions(+), 59 deletions(-) create mode 100644 tests/jest.compare.config.cjs create mode 100644 tests/run-compare.mjs diff --git a/.gitignore b/.gitignore index fea65f3..4d495c9 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,8 @@ .temp-profile tests/.env tests/.env.local +__pycache__/ +*.pyc # logs geckodriver.log diff --git a/tests/_module-info.js b/tests/_module-info.js index e261e4e..e6866a3 100644 --- a/tests/_module-info.js +++ b/tests/_module-info.js @@ -7,6 +7,9 @@ * the dynamic importer). * 2. Dynamically importing it and checking for a `map_item` export. * + * Results are cached per module name so test files that load this helper + * via separate Jest workers/files don't pay the spawnSync cost twice. 
+ * * Returns one of four states the test driver can branch on: * { state: 'ok', map_item: } * { state: 'no_map_item' } @@ -21,25 +24,36 @@ import { fileURLToPath } from 'node:url'; const __dirname = dirname(fileURLToPath(import.meta.url)); const MODULES_ROOT = join(__dirname, '..', 'modules'); +const syntax_cache = new Map(); +const inspect_cache = new Map(); + function check_module_syntax(module_name) { + if (syntax_cache.has(module_name)) return syntax_cache.get(module_name); const module_path = join(MODULES_ROOT, `${module_name}.js`); const result = spawnSync(process.execPath, ['--check', module_path], { encoding: 'utf8' }); - if (result.status === 0) return null; - return (result.stderr || result.stdout || `exit code ${result.status}`).trim(); + const out = result.status === 0 + ? null + : (result.stderr || result.stdout || `exit code ${result.status}`).trim(); + syntax_cache.set(module_name, out); + return out; } export async function inspect_module(module_name) { + if (inspect_cache.has(module_name)) return inspect_cache.get(module_name); const syntax_error = check_module_syntax(module_name); + let result; if (syntax_error) { - return { state: 'syntax_error', error: syntax_error }; - } - try { - const mod = await import(`../modules/${module_name}.js`); - if (typeof mod.map_item !== 'function') { - return { state: 'no_map_item' }; + result = { state: 'syntax_error', error: syntax_error }; + } else { + try { + const mod = await import(`../modules/${module_name}.js`); + result = typeof mod.map_item === 'function' + ? 
{ state: 'ok', map_item: mod.map_item } + : { state: 'no_map_item' }; + } catch (e) { + result = { state: 'import_error', error: e }; } - return { state: 'ok', map_item: mod.map_item }; - } catch (e) { - return { state: 'import_error', error: e }; } + inspect_cache.set(module_name, result); + return result; } diff --git a/tests/jest.compare.config.cjs b/tests/jest.compare.config.cjs new file mode 100644 index 0000000..070e2ff --- /dev/null +++ b/tests/jest.compare.config.cjs @@ -0,0 +1,20 @@ +// Tier 2 — live comparator against a 4CAT instance. +// +// Runs only `map_item_compare.test.js`. Requires FOURCAT_URL, +// FOURCAT_API_KEY, and FOURCAT_DATASETS to be set in tests/.env. Hard-errors +// rather than silently skipping if env is missing. +// +// Env is jsdom so that the four modules using `strip_tags` (gab, pinterest, +// rednote, truth) have a native DOMParser. The comparator uses cross-fetch +// to provide a jsdom-friendly fetch (jsdom doesn't ship fetch and undici +// crashes inside jsdom). +module.exports = { + testEnvironment: 'jsdom', + testMatch: ['**/map_item_compare.test.js'], + testPathIgnorePatterns: ['/node_modules/'], + transform: {}, + moduleFileExtensions: ['js', 'json'], + setupFiles: ['/setup-globals.cjs'], + testTimeout: 30000, + verbose: true +}; diff --git a/tests/jest.config.cjs b/tests/jest.config.cjs index ea72b10..239abbc 100644 --- a/tests/jest.config.cjs +++ b/tests/jest.config.cjs @@ -1,6 +1,9 @@ +// Default Jest config — Tier 1 only (duplicate-behavior + load-only smoke). +// The comparator is excluded; invoke it via `npm run test:compare`. 
module.exports = { testEnvironment: 'jsdom', testMatch: ['**/*.test.js'], + testPathIgnorePatterns: ['/node_modules/', 'map_item_compare\\.test\\.js$'], transform: {}, moduleFileExtensions: ['js', 'json'], collectCoverageFrom: ['*.test.js'], diff --git a/tests/package-lock.json b/tests/package-lock.json index 7758e9f..ada8011 100644 --- a/tests/package-lock.json +++ b/tests/package-lock.json @@ -8,12 +8,12 @@ "name": "zeeschuimer-db-tests", "version": "1.0.0", "devDependencies": { + "cross-fetch": "^4.0.0", "dexie": "^3.2.4", "dotenv": "^16.4.5", "fake-indexeddb": "^5.0.1", "jest": "^29.7.0", - "jest-environment-jsdom": "^29.7.0", - "undici": "^6.20.0" + "jest-environment-jsdom": "^29.7.0" } }, "node_modules/@babel/code-frame": { @@ -1599,6 +1599,16 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, + "node_modules/cross-fetch": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/cross-fetch/-/cross-fetch-4.1.0.tgz", + "integrity": "sha512-uKm5PU+MHTootlWEY+mZ4vvXoCn4fLQxT9dSc1sXVMSFkINTJVN8cAQROpwcKm8bJ/c7rgZVIBWzH5T78sNZZw==", + "dev": true, + "license": "MIT", + "dependencies": { + "node-fetch": "^2.7.0" + } + }, "node_modules/cross-spawn": { "version": "7.0.6", "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz", @@ -3481,6 +3491,52 @@ "dev": true, "license": "MIT" }, + "node_modules/node-fetch": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", + "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", + "dev": true, + "license": "MIT", + "dependencies": { + "whatwg-url": "^5.0.0" + }, + "engines": { + "node": "4.x || >=6.0.0" + }, + "peerDependencies": { + "encoding": "^0.1.0" + }, + "peerDependenciesMeta": { + "encoding": { + "optional": true + } + } + }, + "node_modules/node-fetch/node_modules/tr46": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", + "integrity": 
"sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==", + "dev": true, + "license": "MIT" + }, + "node_modules/node-fetch/node_modules/webidl-conversions": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", + "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==", + "dev": true, + "license": "BSD-2-Clause" + }, + "node_modules/node-fetch/node_modules/whatwg-url": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", + "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", + "dev": true, + "license": "MIT", + "dependencies": { + "tr46": "~0.0.3", + "webidl-conversions": "^3.0.0" + } + }, "node_modules/node-int64": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/node-int64/-/node-int64-0.4.0.tgz", @@ -4198,16 +4254,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/undici": { - "version": "6.26.0", - "resolved": "https://registry.npmjs.org/undici/-/undici-6.26.0.tgz", - "integrity": "sha512-4yqz8a3n5HmGTlsbADNtr/dJlhkh/55Rq798G6ibiULcXbDtaLpTl1pvdqcbFfeoj3iSi52lePFM7h9H21cw/A==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18.17" - } - }, "node_modules/undici-types": { "version": "7.16.0", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz", diff --git a/tests/package.json b/tests/package.json index 390fdd3..763321c 100644 --- a/tests/package.json +++ b/tests/package.json @@ -1,19 +1,19 @@ { "name": "zeeschuimer-db-tests", "version": "1.0.0", - "description": "Unit tests for Zeeschuimer duplicate handling logic", + "description": "Unit tests for Zeeschuimer duplicate handling logic and map_item generator output", "type": "module", "scripts": { - "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js", 
- "test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --watch", - "probe": "node probe-4cat.mjs" + "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --config jest.config.cjs", + "test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --config jest.config.cjs --watch", + "test:compare": "node run-compare.mjs" }, "devDependencies": { + "cross-fetch": "^4.0.0", "dexie": "^3.2.4", "dotenv": "^16.4.5", "fake-indexeddb": "^5.0.1", "jest": "^29.7.0", - "jest-environment-jsdom": "^29.7.0", - "undici": "^6.20.0" + "jest-environment-jsdom": "^29.7.0" } } diff --git a/tests/run-compare.mjs b/tests/run-compare.mjs new file mode 100644 index 0000000..69240ab --- /dev/null +++ b/tests/run-compare.mjs @@ -0,0 +1,43 @@ +/** + * Launcher for the Tier 2 map_item comparator (`npm run test:compare`). + * + * npm run test:compare -> compares every key in FOURCAT_DATASETS + * npm run test:compare -- -> narrows the run to a single key + * npm run test:compare -- -t "id=123" -> key + forwarded jest flags + * + * Why this exists instead of invoking jest directly: jest treats any bare + * positional argument as a test-path-pattern filter. A 4CAT dataset key + * (`5daeba72a2dfbb5ed8c855f824a61570`) matches no test file path, so + * `jest ` silently discovers zero tests and exits "green" having run + * nothing. This launcher intercepts the first non-flag argument, hands it to + * the comparator through the COMPARE_DATASET env var, and forwards only the + * remaining flags to jest — so the key never reaches jest's argv. + */ + +import { spawn } from 'node:child_process'; +import { fileURLToPath } from 'node:url'; +import { dirname, join } from 'node:path'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const args = process.argv.slice(2); + +// First non-flag arg (if any) is the dataset key to narrow to. Everything +// that looks like a flag is forwarded to jest verbatim. 
+const dataset_key = args.find(a => !a.startsWith('-')); +const jest_flags = args.filter(a => a !== dataset_key); + +const env = { ...process.env }; +if (dataset_key) env.COMPARE_DATASET = dataset_key; + +const jest_bin = join(__dirname, 'node_modules', 'jest', 'bin', 'jest.js'); +const child = spawn( + process.execPath, + ['--experimental-vm-modules', jest_bin, '--config', 'jest.compare.config.cjs', ...jest_flags], + { stdio: 'inherit', cwd: __dirname, env }, +); + +child.on('exit', code => process.exit(code ?? 1)); +child.on('error', err => { + console.error(`failed to launch jest: ${err.message}`); + process.exit(1); +}); diff --git a/tests/setup-globals.cjs b/tests/setup-globals.cjs index 6793cc0..b55e659 100644 --- a/tests/setup-globals.cjs +++ b/tests/setup-globals.cjs @@ -4,50 +4,42 @@ * loads lib.js as a plain script. * * map_item bodies reference these as free identifiers (MappedItem, - * MissingMappedField, strip_tags, normalize_url_encoding, ...). Without this - * shim they'd hit ReferenceError as soon as a test invokes map_item. + * MissingMappedField, strip_tags, normalize_url_encoding, ...). Without + * this shim they'd hit ReferenceError as soon as a test invokes map_item. * - * Approach: read lib.js, wrap it in a new Function() body that returns the - * named helpers, call the function, and assign the returned object onto - * globalThis. (Earlier attempt with vm.runInThisContext failed because in - * the jsdom env the vm context's global differs from jsdom's window.) - * - * If a new helper is added to lib.js, append its name to EXPOSED_NAMES. + * Names are auto-discovered from lib.js by regex-matching top-level + * `function name(...)` and `class Name ...` declarations. Adding a helper + * to lib.js makes it available to tests without touching this file. 
*/ const fs = require('node:fs'); const path = require('node:path'); -const EXPOSED_NAMES = [ - 'traverse_data', - 'MappedItem', - 'MissingMappedField', - 'MapItemException', - 'wrap_for_map_item', - 'strip_tags', - 'normalize_url_encoding', - 'formatUtcTimestamp', -]; - const lib_source = fs.readFileSync( path.join(__dirname, '..', 'js', 'lib.js'), 'utf8', ); +// Match `function name(` and `class Name {` / `class Name extends` at +// column 0 of a line. lib.js is a classic script with all top-level +// declarations unindented; requiring column 0 keeps nested helpers (like +// the `_traverse_data` IIFE inside `traverse_data`) from being exposed. +const NAME_PATTERN = /^(?:function|class)\s+([A-Za-z_$][A-Za-z0-9_$]*)\b/gm; +const EXPOSED_NAMES = Array.from( + lib_source.matchAll(NAME_PATTERN), + m => m[1], +); + +if (EXPOSED_NAMES.length === 0) { + throw new Error( + 'setup-globals.cjs: no top-level function/class declarations found in js/lib.js — ' + + 'auto-discovery regex may be broken. Tests will ReferenceError if not fixed.' + ); +} + const factory = new Function(` ${lib_source} return { ${EXPOSED_NAMES.join(', ')} }; `); Object.assign(globalThis, factory()); - -// jsdom doesn't expose fetch and Jest's jsdom env shadows Node's global -// fetch, so the comparator can't hit 4CAT without help. Polyfill from -// undici (a Node-friendly HTTP client, separately installable on npm — -// distinct from the undici bundled internally by Node, which isn't -// require()-able by name). -// Note: tests that use fetch (e.g. map_item_compare.test.js) declare -// `@jest-environment node` at the top of the file. Node env has fetch -// natively. Don't try to polyfill into jsdom — undici's internals use -// Node-specific globals that jsdom shadows (clearImmediate, -// markResourceTiming, fast timers), and polyfilling them all is brittle. 
From e39ad4276e93b7792d852a55c83ce2cbf9c805d4 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 3 Jun 2026 15:10:12 +0200 Subject: [PATCH 28/41] update map_item_compare.test.js for new 4CAT endpoints --- tests/README.md | 193 ++++++++++--- tests/map_item_compare.test.js | 505 +++++++++++++++++++++------------ 2 files changed, 478 insertions(+), 220 deletions(-) diff --git a/tests/README.md b/tests/README.md index f1188e2..cd35e0a 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,31 +1,42 @@ ## Tests for Zeeschuimer -This folder contains **testing** code for Zeeschuimer. +This folder contains testing code for Zeeschuimer. There are three suites, +each with a different purpose and a different runtime environment: -### Integration Tests (Selenium) +| Suite | Tests | Environment | When it runs | Needs | +|----------------------------------|-----------------------------------------------------------|--------------------|---------------------------------|----------------------------------------| +| Selenium integration | Page captures real items from each supported platform | Real Firefox | Reviewer-supervised, manual | Firefox profile, sometimes a human | +| Duplicate-behavior unit (Jest) | DB merge / keep / update semantics in isolation | jsdom + fake-IDB | `npm test` (every push) | None | +| Module load smoke (Jest, Tier 1) | Each `modules/*.js` parses and imports cleanly | jsdom | `npm test` (every push) | None | +| `map_item` comparator (Jest, Tier 2) | JS `map_item` output matches 4CAT's Python mapping per item | jsdom + cross-fetch | `npm run test:compare` (on demand) | Live 4CAT, API key, dataset key(s) | -The Python + Selenium tests visit pages on supported platforms -and see how many items are captured. If the amount of items captured is -unexpectedly low or high, this is flagged and may indicate that Zeeschuimer no -longer properly captures data from the platform. +Hermetic suites (no external dependencies) live in `npm test`. 
Anything that +requires a real browser, a 4CAT server, or a human in the loop is opt-in. -These tests are **supervised** i.e. they require monitoring by a human and +### Integration tests (Selenium) + +The Python + Selenium tests visit pages on supported platforms and see how +many items are captured. If the amount of items captured is unexpectedly +low or high, this is flagged and may indicate that Zeeschuimer no longer +properly captures data from the platform. + +These tests are **supervised** — they require monitoring by a human and cannot run fully autonomously, since some platforms (TikTok in particular) occasionally show CAPTCHAs that need to be completed for a test to run successfully. This is also why Selenium does not run a headless Firefox. -The amount of items returned per page is somewhat variable for most platforms, -so if the number is slightly lower or higher than expected this is not -necessarily a problem (but worth checking). +The amount of items returned per page is somewhat variable for most +platforms, so if the number is slightly lower or higher than expected this +is not necessarily a problem (but worth checking). -Additionally, most platforms require logging in before (full) access to the UI -is available. The testing script borrows a Firefox profile directory from -elsewhere on the system to do this. It will try to find one automatically but -you can also pass one with the `--profiledir` argument. The idea is that you -log in to the various sites (Instagram, etc) in your 'normal' Firefox, and the -tests then borrow that login to interface with the website. +Most platforms require logging in before (full) access to the UI is +available. The testing script borrows a Firefox profile directory from +elsewhere on the system to do this. It will try to find one automatically +but you can also pass one with the `--profiledir` argument. Log in to the +various sites (Instagram, etc) in your 'normal' Firefox, and the tests then +borrow that login. 
-Run `test.py` to run tests. Required non-standard libraries are in +Run `test.py` to run tests. Required non-standard libraries are in `requirements.txt`. Tests are defined in `tests.json` with the following structure: @@ -35,49 +46,139 @@ Tests are defined in `tests.json` with the following structure: "platform id as in zeeschuimer (e.g. 'tiktok.com')": { "test case (e.g. 'Home feed')": { "url": { - "expected": 0, # amount of items expected to be captured on this page - "more-after-scroll": false, # whether scrolling is supposed to load more items (currently unsupported) - "wait": 10 # wait time before checking number of items (optional, default 5) - } # more URLS can be added per test case + "expected": 0, + "more-after-scroll": false, + "wait": 10 + } } } } ``` -### Unit Tests (Jest) - -The JavaScript unit tests verify duplicate-handling logic in isolation using -a mocked Dexie database. These tests ensure that when the duplicate behavior -setting is changed, the correct existing record is selected for updates. +### Jest suites **Prerequisites** -- Node.js (v18 or later) and npm must be installed +- Node.js (v18 or later) and npm +- `cd tests && npm install` + +**Recommended: develop the tests inside Docker.** On Windows the global +permission model can make `npm install` / `npm test` awkward to run from +an arbitrary shell, and an agentic assistant working in auto-mode will +hit deny-rules before it can do a `cross-fetch`-style dependency spike. +Any minimal `node:20`-or-newer image with this repo mounted in is +enough — install what you need, run `npm install`, run `npm test` and +`npm run test:compare`. The host's `tests/.env` is picked up via the +mount, and `FOURCAT_URL` can point at a 4CAT reachable from the +container (`host.docker.internal` on Windows/Mac, the host IP on +Linux). + +#### Duplicate-behavior unit tests + +Verify duplicate-handling logic in isolation using a mocked Dexie database. 
+Ensures that when the duplicate behavior setting is changed, the correct +existing record is selected for updates. + +Coverage: +- Schema upgrade backfills `last_updated` from `timestamp_collected` +- Compound index correctly selects most recent item by `last_updated` +- Forward-looking behavior: "keep" → "update" targets newest record +- Forward-looking behavior: "update" → "keep" creates new records +- Merge: shallow merge preserves fields from both records +- Skip: no modifications occur when duplicate found +- Platform isolation: same `item_id` on different platforms are independent +- Tie-breaker: when `last_updated` is equal, prefer higher `id` + +#### Module load smoke (Tier 1) + +For every file under `modules/*.js`, `tests/map_item.test.js` asserts the +module parses and imports without throwing. Modules with a `map_item` +export and modules without one both pass this tier — the goal is purely to +catch a generator that emits a syntax error or an import-time throw. + +No data is run through `map_item` here; that work belongs in the +comparator. + +#### `map_item` comparator (Tier 2) + +For every 4CAT dataset key listed in `FOURCAT_DATASETS`, +`tests/map_item_compare.test.js`: -**Setup** +1. fetches `/api/dataset//metadata/` to learn the datasource id +2. translates that id to a Zeeschuimer module name via + `zeeschuimer-to-4cat.json` (used in reverse) +3. fetches `/download/` (NDJSON inputs, already wrapped via + `wrap_for_map_item` by Zeeschuimer pre-upload) and + `/api/dataset//items/?annotations=no&missing_fields=keep&stream=true` + (expected outputs from 4CAT's Python `map_item`, as NDJSON — `stream=true` + avoids the JSON form's `limit=100` pagination) +4. 
pairs items by `id` (or by index with a warning if `id` is missing on + either side), runs each input through the local `map_item`, and + field-by-field diffs against the expected output (4CAT's API-only + aggregate `missing_fields` key is excluded; per-field `{__missing:true}` + markers are still compared) -1. Install Node.js dependencies: - ```bash - cd tests - npm install - ``` +The comparator does **not** exercise `wrap_for_map_item` itself — Zeeschuimer +applies it pre-storage and `/download/` returns post-wrap items. This +is an accepted gap; see `docs/map-item-test-plan.md`. -**Running tests** +**Configuration:** copy `tests/.env.example` to `tests/.env` and set: +- `FOURCAT_URL` — base URL of the 4CAT instance (no trailing slash) +- `FOURCAT_API_KEY` — raw API key (no `Bearer ` prefix) +- `FOURCAT_DATASETS` — comma-separated list of dataset keys + +The comparator hard-errors at startup if any of these are missing. + +**Optional knob:** `FAIL_FAST=0` (or `FAIL_FAST=false`) runs every item in +every dataset; default is to halt subsequent items in a dataset once one +has failed. 
+ +### Running ```bash +# everything that's hermetic — duplicate-behavior unit + module load smoke npm test -``` -For watch mode during development: -```bash +# watch mode for the same npm run test:watch + +# the comparator — every dataset key in FOURCAT_DATASETS +npm run test:compare + +# the comparator narrowed to one dataset key (must still appear in +# FOURCAT_DATASETS — protects against typos) +npm run test:compare -- ``` -**Test coverage** -- Schema upgrade backfills `last_updated` from `timestamp_collected` -- Compound index correctly selects most recent item by `last_updated` -- Forward-looking behavior: switching from "keep" to "update" targets newest record -- Forward-looking behavior: switching from "update" to "keep" creates new records -- Merge behavior: shallow merge preserves fields from both records -- Skip behavior: no modifications occur when duplicate found -- Platform isolation: same `item_id` on different platforms are independent -- Tie-breaker: when `last_updated` is equal, prefer higher `id` +### Where does a new test go? + +- **Pure data transformation, no live external state, runs anywhere.** + Duplicate-behavior unit suite (DB logic) or the Tier 1 smoke + (`map_item` static checks). +- **Field-by-field correctness against 4CAT's Python `map_item`.** Tier 2 + comparator. Add a dataset to `FOURCAT_DATASETS` that covers the case; + the comparator will pick it up. +- **End-to-end user flow in the extension.** Selenium. + +### Why the environments differ + +The two Jest tiers run in **jsdom** rather than node env. The reasoning: + +- `map_item` bodies are pure data transformation, but four of them + (`gab`, `pinterest`, `rednote`, `truth`) call `strip_tags`, which + invokes `new DOMParser()`. jsdom provides a spec-compliant native + `DOMParser`; node env doesn't. +- jsdom doesn't ship `fetch`. 
The standard workaround + (`undici`) crashes inside jsdom because it pokes at + `clearImmediate` / `markResourceTiming` / fast-now timers that jsdom + shadows. `cross-fetch` wraps `node-fetch` v2 internally and doesn't + hit those Node internals, so it works in jsdom — the comparator + imports `cross-fetch/polyfill` to assign `globalThis.fetch`. + +The tradeoff is parser parity. `cross-fetch`-via-`node-fetch` and +jsdom's `DOMParser` are not byte-equal to Firefox's Gecko `DOMParser`, +which is what runs in production. Whitespace handling around `<br>
` and +block elements is the usual suspect. If the comparator emits false- +positive diffs on text fields for the four `strip_tags` modules, the +right fix is to normalise whitespace in the comparator's `deep_equal` +rather than chase parser parity. The Selenium tier sits above and +provides the real-Gecko fidelity check. diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js index 37e3e4c..86ab707 100644 --- a/tests/map_item_compare.test.js +++ b/tests/map_item_compare.test.js @@ -1,40 +1,60 @@ /** - * @jest-environment node + * Compare JS map_item output against 4CAT's Python map_item via dataset keys. * - * This file runs in Node test environment (not jsdom) because undici's - * fetch implementation uses Node-internal APIs (`clearImmediate`, - * `markResourceTiming`, fast-now timers, etc.) that jsdom shadows or - * doesn't expose. Polyfilling them into jsdom is whack-a-mole; node env - * has them all natively. + * For each 4CAT dataset key in FOURCAT_DATASETS, this test: + * 1. fetches /api/dataset//metadata/ to learn the datasource id + * 2. translates that id back to a Zeeschuimer module name via + * zeeschuimer-to-4cat.json (used in reverse) + * 3. inspects the local module (must export map_item) + * 4. fetches in parallel, both as NDJSON: + * /download/ -> INPUTS (post-wrap) + * /api/dataset//items/?annotations=no&missing_fields=keep&stream=true + * -> mapped EXPECTED OUTPUTS + * 5. pairs items by `id`, runs each input through the local map_item, and + * deep-equals the result against the corresponding expected output. * - * Trade-off: no DOMParser in node env. The four modules that use - * `strip_tags` (gab, pinterest, rednote, truth) will need a DOMParser - * polyfill (e.g. via linkedom) before the comparator can run against - * them. Other modules (including instagram) work as-is. - */ -/** - * Compare JS map_item output against 4CAT's Python map_item via the API. 
+ * The items endpoint is fetched with `stream=true` (NDJSON): its JSON-array + * form paginates at `limit=100`, silently dropping rows on larger datasets. + * `annotations=no` drops processor-added fields; `missing_fields=keep` keeps + * unmapped fields as `{ __missing: true, value: "" }` markers (matching the JS + * side) and additionally adds a comma-joined `missing_fields` summary key. + * That summary is API-only — the JS map_item never emits it — so it is + * excluded from the diff (see API_ONLY_FIELDS); the per-field markers it + * summarizes are still compared. * - * For every line in every fixture, runs the JS map_item locally AND sends - * the same stored item to 4CAT's /api/map-item// endpoint, then - * diffs the two outputs field-by-field. Each item is its own Jest test — - * failures point at exactly which item and which fields diverge. + * Items from /download/ already have `wrap_for_map_item` applied by + * Zeeschuimer pre-upload, so they're fed to map_item directly without + * re-wrapping. The trade-off is that this comparator does not exercise + * `wrap_for_map_item` itself — see docs/map-item-test-plan.md for the + * accepted-gap rationale. * - * Skips itself entirely if FOURCAT_URL / FOURCAT_API_KEY aren't set, so - * `npm test` keeps working without 4CAT configuration. Drop real values in - * tests/.env to enable. + * Environment notes (fetch + DOMParser): + * - jsdom env so `strip_tags` (used by gab/pinterest/rednote/truth) has + * a native DOMParser. + * - jsdom doesn't ship `fetch`. Spiked three candidates on 2026-06-03 + * under node:20-alpine: + * * `undici` — crashes at import in jsdom (pokes at + * clearImmediate/markResourceTiming/fast-now + * timers that jsdom shadows). + * * `node-fetch` v3 — imports clean but `res.text()` throws + * `ReferenceError: TextDecoder is not defined` + * (jsdom doesn't expose TextDecoder as a global). + * * `cross-fetch/polyfill` — clean import + working round-trip. 
+ * So this file imports `cross-fetch/polyfill`, which assigns + * `globalThis.fetch` when undefined. * - * Datasource id mapping: tests/zeeschuimer-to-4cat.json (Zeeschuimer - * module filename → 4CAT datasource id, for the few names that diverge). + * Invocation: + * npm run test:compare # runs every key in FOURCAT_DATASETS + * npm run test:compare -- # narrows to one key (must be in + * # FOURCAT_DATASETS to avoid typos) * - * Module-level state is determined upfront by inspect_module() (no - * map_item / syntax errors / import errors are handled before tests are - * registered, so they appear once per module, not once per item). + * Hard-errors at registration time if FOURCAT_URL, FOURCAT_API_KEY, or + * FOURCAT_DATASETS is missing — by Tier 2 contract these are required. */ +import 'cross-fetch/polyfill'; import 'dotenv/config'; -import { jest } from '@jest/globals'; -import { readdirSync, readFileSync, statSync, existsSync } from 'node:fs'; +import { readFileSync, existsSync } from 'node:fs'; import { join, dirname } from 'node:path'; import { fileURLToPath } from 'node:url'; import { inspect_module } from './_module-info.js'; @@ -43,56 +63,100 @@ const __dirname = dirname(fileURLToPath(import.meta.url)); const FOURCAT_URL = process.env.FOURCAT_URL?.replace(/\/$/, ''); const FOURCAT_API_KEY = process.env.FOURCAT_API_KEY; -const HAS_4CAT = Boolean( - FOURCAT_URL && FOURCAT_API_KEY && FOURCAT_API_KEY !== 'your-api-key-here' -); -// When true (default), once any item in a module fails, subsequent items -// in that same module skip the HTTP + map_item work and fail fast with a -// "halted" message. Saves time when generator output is broken at the top. -// Set FAIL_FAST=0 in env to run all items regardless. -// Trim because cmd.exe's `set FAIL_FAST=0 && ...` includes the trailing -// space in the variable value, which would otherwise defeat `!== '0'`. -const FAIL_FAST = (process.env.FAIL_FAST ?? 
'').trim() !== '0'; -const halted_modules = new Set(); +// Hard-fail if env is missing — Tier 2 contract. +function require_env(name, value, placeholder_values = []) { + if (!value || placeholder_values.includes(value)) { + throw new Error( + `${name} is not configured. Set it in tests/.env (see tests/.env.example).` + ); + } + return value; +} +require_env('FOURCAT_URL', FOURCAT_URL); +require_env('FOURCAT_API_KEY', FOURCAT_API_KEY, ['your-api-key-here']); + +const FOURCAT_DATASETS = require_env( + 'FOURCAT_DATASETS', + process.env.FOURCAT_DATASETS, + ['key1,key2,key3'], +) + .split(',') + .map(k => k.trim()) + .filter(k => k.length > 0); + +if (FOURCAT_DATASETS.length === 0) { + throw new Error('FOURCAT_DATASETS parsed as empty. Set a comma-separated list of dataset keys in tests/.env.'); +} + +// Optional narrowing to a single dataset key. The `npm run test:compare -- +// ` form is handled by run-compare.mjs, which sets COMPARE_DATASET; jest +// itself would mis-read a bare key as a test-path-pattern filter and silently +// run nothing. A narrowed key must still be declared in FOURCAT_DATASETS — +// erroring on an unlisted key catches typos and keeps the dataset list the +// single source of truth. +const COMPARE_DATASET = process.env.COMPARE_DATASET?.trim() || undefined; +if (COMPARE_DATASET && !FOURCAT_DATASETS.includes(COMPARE_DATASET)) { + throw new Error( + `COMPARE_DATASET=${COMPARE_DATASET} is not listed in FOURCAT_DATASETS. ` + + `Add it to tests/.env before narrowing the run to it.` + ); +} + +const DATASET_KEYS_TO_RUN = COMPARE_DATASET ? [COMPARE_DATASET] : FOURCAT_DATASETS; -const FIXTURE_ROOT = join(__dirname, 'fixtures'); +// 4CAT datasource id -> Zeeschuimer module name. The on-disk map is +// authored in the natural direction (zeeschuimer -> 4cat); flip here. const ID_MAP_PATH = join(__dirname, 'zeeschuimer-to-4cat.json'); -const ID_MAP = existsSync(ID_MAP_PATH) +const ZEESCHUIMER_TO_4CAT = existsSync(ID_MAP_PATH) ? 
JSON.parse(readFileSync(ID_MAP_PATH, 'utf8')) : {}; +const FOURCAT_TO_ZEESCHUIMER = Object.fromEntries( + Object.entries(ZEESCHUIMER_TO_4CAT) + .filter(([k]) => !k.startsWith('_')) + .map(([z, f]) => [f, z]) +); + +// When true (default), comparison of a dataset stops at its first failing +// item; the remaining items are reported as a single skipped "halted" +// placeholder rather than one failure each. Trim because `set FAIL_FAST=0 && +// ...` in cmd.exe includes the trailing space; treat both '0' and 'false' +// (case-insensitive) as off. +const FAIL_FAST_RAW = (process.env.FAIL_FAST ?? '').trim().toLowerCase(); +const FAIL_FAST = FAIL_FAST_RAW !== '0' && FAIL_FAST_RAW !== 'false'; -function wrap_for_map_item(stored_item) { - const { data, ...meta } = stored_item; - return { ...data, __import_meta: meta }; +function auth_headers(extra = {}) { + return { + // 4CAT accepts the raw key without a `Bearer ` prefix. + 'Authorization': FOURCAT_API_KEY, + ...extra, + }; } -async function call_4cat_map_item(datasource_id, item) { - const res = await fetch(`${FOURCAT_URL}/api/map-item/${datasource_id}/`, { - method: 'POST', - headers: { - // 4CAT accepts the raw key without a `Bearer ` prefix, per probe - 'Authorization': FOURCAT_API_KEY, - 'Content-Type': 'application/json', - }, - body: JSON.stringify({ item }), - }); +async function fetch_json(url) { + const res = await fetch(url, { headers: auth_headers() }); const text = await res.text(); - if (!res.ok) { - throw new Error(`HTTP ${res.status} from 4CAT: ${text}`); - } + if (!res.ok) throw new Error(`HTTP ${res.status} from ${url}: ${text}`); return JSON.parse(text); } -// Round-trip a value through JSON so MappedItem, MissingMappedField, etc. -// become plain JSON-compatible objects matching what 4CAT emits. 
+async function fetch_ndjson(url) { + const res = await fetch(url, { headers: auth_headers() }); + const text = await res.text(); + if (!res.ok) throw new Error(`HTTP ${res.status} from ${url}: ${text}`); + return text + .split('\n') + .filter(line => line.trim().length > 0) + .map((line, i) => { + try { return JSON.parse(line); } + catch (e) { throw new Error(`bad NDJSON at line ${i} of ${url}: ${e.message}`); } + }); +} + function normalize(value) { return JSON.parse(JSON.stringify(value)); } -// Recursive structural equality. Doesn't care about object key order, which -// matters for nested values like {__missing: true, value: ""} where JS and -// Python might emit keys in different orders. function deep_equal(a, b) { if (a === b) return true; if (a === null || b === null) return a === b; @@ -138,8 +202,6 @@ function format_diffs(diffs) { }).join('\n'); } -// Pull out the first few module-frame lines from an error's stack so the -// failure message points at where in modules/.js the throw happened. function format_error_with_location(err) { if (!err) return String(err); const message = err.message || String(err); @@ -153,131 +215,226 @@ function format_error_with_location(err) { : message; } -function list_module_dirs() { - if (!existsSync(FIXTURE_ROOT)) return []; - return readdirSync(FIXTURE_ROOT).filter(name => { - try { return statSync(join(FIXTURE_ROOT, name)).isDirectory(); } - catch { return false; } - }); -} +// Pair inputs and expected outputs by `id`. Falls back to index pairing +// (with a logged warning) if either side is missing the field on its +// first item. +function pair_items(inputs, outputs, dataset_key) { + const probe_in = inputs[0]; + const probe_out = outputs[0]; + const has_id_in = probe_in && 'id' in probe_in && probe_in.id != null; + const has_id_out = probe_out && 'id' in probe_out && probe_out.id != null; -// Per-test timeout: each test does one HTTP round-trip to 4CAT. Jest's -// default 5s is tight under load. 
-jest.setTimeout(30000); + if (!has_id_in || !has_id_out) { + // eslint-disable-next-line no-console + console.warn( + `[compare] ${dataset_key}: no usable 'id' on ${!has_id_in ? '/download' : '/items'} ` + + `side — falling back to index pairing for this dataset.` + ); + const n = Math.min(inputs.length, outputs.length); + return { + mode: 'index', + pairs: Array.from({ length: n }, (_, i) => ({ input: inputs[i], expected: outputs[i], id: i })), + input_count: inputs.length, + output_count: outputs.length, + unmatched_inputs: [], + unmatched_outputs: [], + }; + } -if (!HAS_4CAT) { - describe('map_item compare (JS vs 4CAT Python)', () => { - test.skip('FOURCAT_URL / FOURCAT_API_KEY not configured — set them in tests/.env to enable', () => {}); - }); -} else { - const module_dirs = list_module_dirs(); - - // Pre-pass: synchronously determine each module's state so we can branch - // on it at registration time. - const module_info = {}; - for (const module_name of module_dirs) { - module_info[module_name] = await inspect_module(module_name); + const by_id_out = new Map(); + for (const item of outputs) by_id_out.set(String(item.id), item); + + const pairs = []; + const unmatched_inputs = []; + for (const input of inputs) { + const expected = by_id_out.get(String(input.id)); + if (expected) { + pairs.push({ input, expected, id: input.id }); + by_id_out.delete(String(input.id)); + } else { + unmatched_inputs.push(input.id); + } } + return { + mode: 'id', + pairs, + input_count: inputs.length, + output_count: outputs.length, + unmatched_inputs, + unmatched_outputs: Array.from(by_id_out.keys()), + }; +} + +// 4CAT exposes the datasource via `metadata.type`, which is the datasource +// id with a `-search` or `-import` suffix appended (e.g. `tiktok-search`, +// `xiaohongshu-comments-import`). Strip the trailing suffix to get the bare +// id, which we then translate to a Zeeschuimer module via +// FOURCAT_TO_ZEESCHUIMER. 
Datasource ids themselves may contain hyphens +// (e.g. `xiaohongshu-comments`), so the strip is anchored to end-of-string. +function extract_datasource_id(metadata) { + const type = metadata?.type; + if (!type) return null; + return type.replace(/-(search|import)$/, ''); +} - let any_fixtures = false; +// Fields 4CAT's API attaches to every mapped item that the JS map_item never +// produces, so they would otherwise diff as spurious "only_python" entries. +// `missing_fields` is a comma-joined summary of which fields came back as +// MissingMappedField — redundant with the per-field `{__missing:true}` +// markers, which ARE compared. +const API_ONLY_FIELDS = new Set(['missing_fields']); - for (const module_name of module_dirs) { - const fixture_dir = join(FIXTURE_ROOT, module_name); - const fixture_files = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson')); - if (fixture_files.length === 0) continue; - any_fixtures = true; +function strip_api_fields(obj) { + if (!obj || typeof obj !== 'object' || Array.isArray(obj)) return obj; + const out = {}; + for (const k of Object.keys(obj)) { + if (!API_ONLY_FIELDS.has(k)) out[k] = obj[k]; + } + return out; +} - const datasource_id = ID_MAP[module_name] ?? module_name; - const info = module_info[module_name]; +// Run each paired input through the local map_item and diff the result +// against 4CAT's expected output. With FAIL_FAST on (default), stop at the +// first failing item and record how many were left unchecked — so one bad +// item yields a single failure plus one skipped "halted" placeholder, not N +// failures. 
+function compare_pairs(pairs, map_item) { + const results = []; + let halted_count = 0; + for (let i = 0; i < pairs.length; i++) { + const { input, expected, id } = pairs[i]; + let message = null; + try { + let js_result; + try { + js_result = map_item(input); + } catch (e) { + throw new Error(`JS map_item threw: ${format_error_with_location(e)}`); + } + const diffs = diff_objects( + strip_api_fields(normalize(js_result)), + strip_api_fields(normalize(expected)), + ); + if (diffs.length > 0) { + message = `${diffs.length} field(s) differ between JS and 4CAT:\n${format_diffs(diffs)}`; + } + } catch (e) { + message = e.message; + } + results.push({ id, ok: message === null, message }); + if (message !== null && FAIL_FAST) { + halted_count = pairs.length - (i + 1); + break; + } + } + return { results, halted_count }; +} - if (info.state === 'no_map_item') { - // eslint-disable-next-line no-console - console.log(`[compare] skipping ${module_name}: modules/${module_name}.js does not export a map_item`); - continue; +// Pre-pass: for each dataset, fetch metadata + items and run the comparison +// up front, so tests register with knowable counts and a deterministic +// pass/fail per item. Fetch/setup failures become a single "setup" failure +// inside that dataset's describe. +const dataset_state = {}; +for (const key of DATASET_KEYS_TO_RUN) { + try { + const metadata = await fetch_json(`${FOURCAT_URL}/api/dataset/${key}/metadata/`); + const datasource_id = extract_datasource_id(metadata); + if (!datasource_id) { + throw new Error( + `metadata for ${key} has no datasource id (checked parameters.datasource, datasource, type)` + ); } + const module_name = FOURCAT_TO_ZEESCHUIMER[datasource_id] ?? datasource_id; + const module_state = await inspect_module(module_name); - if (info.state === 'syntax_error' || info.state === 'import_error') { - const msg = info.state === 'syntax_error' - ? 
`syntax error:\n${info.error}` - : `import failed: ${info.error.message}`; - describe(`map_item compare: ${module_name}`, () => { - test(`module loads`, () => { throw new Error(msg); }); - }); - continue; + if (module_state.state === 'ok') { + // Both sides as NDJSON. `stream=true` on the items endpoint avoids + // the JSON-array form's default `limit=100` pagination, which would + // silently drop rows (and break id-pairing) on larger datasets. + const [inputs, outputs] = await Promise.all([ + fetch_ndjson(`${FOURCAT_URL}/download/${key}`), + fetch_ndjson(`${FOURCAT_URL}/api/dataset/${key}/items/?annotations=no&missing_fields=keep&stream=true`), + ]); + const pairing = pair_items(inputs, outputs, key); + const comparison = compare_pairs(pairing.pairs, module_state.map_item); + dataset_state[key] = { metadata, datasource_id, module_name, module_state, pairing, comparison }; + } else { + dataset_state[key] = { metadata, datasource_id, module_name, module_state }; } + } catch (e) { + dataset_state[key] = { error: e }; + } +} - // state === 'ok' — register per-item comparison tests - const map_item = info.map_item; - - describe(`map_item compare: ${module_name} (4CAT id: ${datasource_id})`, () => { - for (const fixture_file of fixture_files) { - const lines = readFileSync(join(fixture_dir, fixture_file), 'utf8') - .split('\n') - .filter(line => line.trim().length > 0); - - describe(fixture_file, () => { - lines.forEach((line, i) => { - test(`item ${i}`, async () => { - if (FAIL_FAST && halted_modules.has(module_name)) { - throw new Error( - '[halted after prior failure in this module — set FAIL_FAST=0 to run all items]' - ); - } - try { - const stored_item = JSON.parse(line); - - // 4CAT side - const response = await call_4cat_map_item(datasource_id, stored_item); - - // JS side - let js_result; - let js_error; - try { - js_result = map_item(wrap_for_map_item(stored_item)); - } catch (e) { - js_error = e; - } - - if (response.status === 'mapped') { - if (js_error) { - 
throw new Error( - `4CAT mapped this item but JS threw: ${format_error_with_location(js_error)}` - ); - } - const js_obj = normalize(js_result); - const py_obj = normalize(response.item); - const diffs = diff_objects(js_obj, py_obj); - if (diffs.length > 0) { - throw new Error( - `${diffs.length} field(s) differ between JS and 4CAT:\n${format_diffs(diffs)}` - ); - } - } else if (response.status === 'skipped') { - if (!js_error) { - throw new Error( - `4CAT skipped this item ("${response.reason}") but JS produced a result` - ); - } - // Both rejected — good. Skip reasons may differ in wording. - } else if (response.status === 'error') { - throw new Error(`4CAT errored on this item: ${response.message}`); - } else { - throw new Error(`unexpected 4CAT response status: ${JSON.stringify(response)}`); - } - } catch (e) { - if (FAIL_FAST) halted_modules.add(module_name); - throw e; - } - }); - }); - }); - } +for (const dataset_key of DATASET_KEYS_TO_RUN) { + const info = dataset_state[dataset_key]; + + if (info.error) { + describe(`map_item compare: dataset ${dataset_key}`, () => { + test('setup', () => { throw info.error; }); }); + continue; } - if (!any_fixtures) { - describe('map_item compare (JS vs 4CAT Python)', () => { - test.skip('no fixtures under tests/fixtures//*.ndjson', () => {}); + const { datasource_id, module_name, module_state, pairing, comparison } = info; + const label = `${dataset_key} (datasource: ${datasource_id}, module: ${module_name})`; + + if (module_state.state === 'no_map_item') { + describe(`map_item compare: ${label}`, () => { + test.skip(`modules/${module_name}.js has no map_item — nothing to compare`, () => {}); }); + continue; } + if (module_state.state === 'syntax_error' || module_state.state === 'import_error') { + const msg = module_state.state === 'syntax_error' + ? 
`syntax error:\n${module_state.error}` + : `import failed: ${module_state.error.message}`; + describe(`map_item compare: ${label}`, () => { + test('module loads', () => { throw new Error(msg); }); + }); + continue; + } + + describe(`map_item compare: ${label}`, () => { + test('pairing', () => { + const messages = []; + if (pairing.input_count !== pairing.output_count) { + messages.push( + `input count ${pairing.input_count} != output count ${pairing.output_count}` + ); + } + if (pairing.unmatched_inputs.length) { + const shown = pairing.unmatched_inputs.slice(0, 5).join(', '); + const extra = pairing.unmatched_inputs.length > 5 + ? ` (+${pairing.unmatched_inputs.length - 5} more)` + : ''; + messages.push(`unmatched input ids: ${shown}${extra}`); + } + if (pairing.unmatched_outputs.length) { + const shown = pairing.unmatched_outputs.slice(0, 5).join(', '); + const extra = pairing.unmatched_outputs.length > 5 + ? ` (+${pairing.unmatched_outputs.length - 5} more)` + : ''; + messages.push(`unmatched output ids: ${shown}${extra}`); + } + if (pairing.mode === 'index') { + messages.push(`paired by index (no usable 'id' field) — diffs may be misaligned`); + } + if (messages.length) throw new Error(messages.join('\n')); + }); + + comparison.results.forEach(({ id, ok, message }, i) => { + test(`item ${i} (id=${id})`, () => { + if (!ok) throw new Error(message); + }); + }); + + if (comparison.halted_count > 0) { + test.skip( + `halted after first failure — ${comparison.halted_count} later item(s) not compared ` + + `(set FAIL_FAST=0 to compare all)`, + () => {}, + ); + } + }); } From d7fcb4c72deb18de311d6056e521b156be299457 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 3 Jun 2026 15:48:12 +0200 Subject: [PATCH 29/41] fast_fail OR --all for tests --- tests/README.md | 18 +++++++++++++++--- tests/map_item_compare.test.js | 9 +++++---- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/tests/README.md b/tests/README.md index cd35e0a..beaee44 100644 --- 
a/tests/README.md +++ b/tests/README.md @@ -128,9 +128,18 @@ is an accepted gap; see `docs/map-item-test-plan.md`. The comparator hard-errors at startup if any of these are missing. -**Optional knob:** `FAIL_FAST=0` (or `FAIL_FAST=false`) runs every item in -every dataset; default is to halt subsequent items in a dataset once one -has failed. +**Optional knob:** by default the comparator halts a dataset at its first +failing item (reporting the rest as one skipped "halted" placeholder). To +compare *every* item, pass `--all`: + +```bash +npm run test:compare -- --all +``` + +`FAIL_FAST=0` (or `FAIL_FAST=false`) does the same, but prefer `--all`: an +inline `FAIL_FAST=0 npm run …` does not reliably reach node when npm/node is +the Windows binary run through WSL interop, and isn't env syntax in cmd.exe. +A CLI flag crosses every shell. ### Running @@ -147,6 +156,9 @@ npm run test:compare # the comparator narrowed to one dataset key (must still appear in # FOURCAT_DATASETS — protects against typos) npm run test:compare -- + +# compare every item instead of halting at the first failure +npm run test:compare -- --all ``` ### Where does a new test go? diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js index 86ab707..2ca1d27 100644 --- a/tests/map_item_compare.test.js +++ b/tests/map_item_compare.test.js @@ -119,9 +119,10 @@ const FOURCAT_TO_ZEESCHUIMER = Object.fromEntries( // When true (default), comparison of a dataset stops at its first failing // item; the remaining items are reported as a single skipped "halted" -// placeholder rather than one failure each. Trim because `set FAIL_FAST=0 && -// ...` in cmd.exe includes the trailing space; treat both '0' and 'false' -// (case-insensitive) as off. +// placeholder rather than one failure each. Disable it with the `--all` +// launcher flag (preferred — crosses every shell) or FAIL_FAST=0. 
Trim +// because `set FAIL_FAST=0 && ...` in cmd.exe includes the trailing space; +// treat both '0' and 'false' (case-insensitive) as off. const FAIL_FAST_RAW = (process.env.FAIL_FAST ?? '').trim().toLowerCase(); const FAIL_FAST = FAIL_FAST_RAW !== '0' && FAIL_FAST_RAW !== 'false'; @@ -432,7 +433,7 @@ for (const dataset_key of DATASET_KEYS_TO_RUN) { if (comparison.halted_count > 0) { test.skip( `halted after first failure — ${comparison.halted_count} later item(s) not compared ` + - `(set FAIL_FAST=0 to compare all)`, + `(pass --all, or set FAIL_FAST=0, to compare every item)`, () => {}, ); } From 4f9e69c3dc8e38ed98b4d0fe17f8f413a0b7c40a Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 3 Jun 2026 16:34:30 +0200 Subject: [PATCH 30/41] use headers for datasource --- tests/README.md | 3 +- tests/map_item_compare.test.js | 63 ++++++++++++++++++---------------- 2 files changed, 36 insertions(+), 30 deletions(-) diff --git a/tests/README.md b/tests/README.md index beaee44..f203b60 100644 --- a/tests/README.md +++ b/tests/README.md @@ -103,7 +103,8 @@ comparator. For every 4CAT dataset key listed in `FOURCAT_DATASETS`, `tests/map_item_compare.test.js`: -1. fetches `/api/dataset//metadata/` to learn the datasource id +1. sends a HEAD to the items endpoint and reads the datasource id from its + `X-4CAT-Dataset-Datasource` response header (no metadata-endpoint call) 2. translates that id to a Zeeschuimer module name via `zeeschuimer-to-4cat.json` (used in reverse) 3. fetches `/download/` (NDJSON inputs, already wrapped via diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js index 2ca1d27..681076c 100644 --- a/tests/map_item_compare.test.js +++ b/tests/map_item_compare.test.js @@ -2,7 +2,8 @@ * Compare JS map_item output against 4CAT's Python map_item via dataset keys. * * For each 4CAT dataset key in FOURCAT_DATASETS, this test: - * 1. fetches /api/dataset//metadata/ to learn the datasource id + * 1. 
HEADs the items endpoint to read the datasource id from the + * `X-4CAT-Dataset-*` response headers (no metadata-endpoint dependency) * 2. translates that id back to a Zeeschuimer module name via * zeeschuimer-to-4cat.json (used in reverse) * 3. inspects the local module (must export map_item) @@ -134,11 +135,10 @@ function auth_headers(extra = {}) { }; } -async function fetch_json(url) { - const res = await fetch(url, { headers: auth_headers() }); - const text = await res.text(); - if (!res.ok) throw new Error(`HTTP ${res.status} from ${url}: ${text}`); - return JSON.parse(text); +async function fetch_headers(url) { + const res = await fetch(url, { method: 'HEAD', headers: auth_headers() }); + if (!res.ok) throw new Error(`HTTP ${res.status} from HEAD ${url}`); + return res.headers; } async function fetch_ndjson(url) { @@ -266,16 +266,18 @@ function pair_items(inputs, outputs, dataset_key) { }; } -// 4CAT exposes the datasource via `metadata.type`, which is the datasource -// id with a `-search` or `-import` suffix appended (e.g. `tiktok-search`, -// `xiaohongshu-comments-import`). Strip the trailing suffix to get the bare -// id, which we then translate to a Zeeschuimer module via -// FOURCAT_TO_ZEESCHUIMER. Datasource ids themselves may contain hyphens -// (e.g. `xiaohongshu-comments`), so the strip is anchored to end-of-string. -function extract_datasource_id(metadata) { - const type = metadata?.type; - if (!type) return null; - return type.replace(/-(search|import)$/, ''); +// Recover the datasource id from a dataset's response headers. 4CAT exposes it +// directly as `X-4CAT-Dataset-Datasource`. Older responses may only carry +// `X-4CAT-Dataset-Type` (the datasource id with a `-search`/`-import` suffix), +// so fall back to stripping that — anchored to end-of-string because +// datasource ids can themselves contain hyphens (e.g. `xiaohongshu-comments`). +// The result is translated to a Zeeschuimer module via FOURCAT_TO_ZEESCHUIMER. 
+function datasource_id_from_headers(headers) { + const datasource = headers.get('x-4cat-dataset-datasource'); + if (datasource) return datasource.trim(); + const type = headers.get('x-4cat-dataset-type'); + if (type) return type.trim().replace(/-(search|import)$/, ''); + return null; } // Fields 4CAT's API attaches to every mapped item that the JS map_item never @@ -331,36 +333,39 @@ function compare_pairs(pairs, map_item) { return { results, halted_count }; } -// Pre-pass: for each dataset, fetch metadata + items and run the comparison -// up front, so tests register with knowable counts and a deterministic -// pass/fail per item. Fetch/setup failures become a single "setup" failure -// inside that dataset's describe. +// Pre-pass: for each dataset, resolve the datasource (HEAD), fetch items, and +// run the comparison up front, so tests register with knowable counts and a +// deterministic pass/fail per item. Fetch/setup failures become a single +// "setup" failure inside that dataset's describe. const dataset_state = {}; for (const key of DATASET_KEYS_TO_RUN) { try { - const metadata = await fetch_json(`${FOURCAT_URL}/api/dataset/${key}/metadata/`); - const datasource_id = extract_datasource_id(metadata); + // The same items URL serves double duty: a HEAD reveals the datasource + // (via X-4CAT-Dataset-* headers) with no body; the GET pulls the mapped + // rows. `stream=true` avoids the JSON form's limit=100 pagination, which + // would silently drop rows (and break id-pairing) on larger datasets. 
+ const items_url = `${FOURCAT_URL}/api/dataset/${key}/items/?annotations=no&missing_fields=keep&stream=true`; + const headers = await fetch_headers(items_url); + const datasource_id = datasource_id_from_headers(headers); if (!datasource_id) { throw new Error( - `metadata for ${key} has no datasource id (checked parameters.datasource, datasource, type)` + `no datasource id in response headers for ${key} ` + + `(looked for X-4CAT-Dataset-Datasource / X-4CAT-Dataset-Type)` ); } const module_name = FOURCAT_TO_ZEESCHUIMER[datasource_id] ?? datasource_id; const module_state = await inspect_module(module_name); if (module_state.state === 'ok') { - // Both sides as NDJSON. `stream=true` on the items endpoint avoids - // the JSON-array form's default `limit=100` pagination, which would - // silently drop rows (and break id-pairing) on larger datasets. const [inputs, outputs] = await Promise.all([ fetch_ndjson(`${FOURCAT_URL}/download/${key}`), - fetch_ndjson(`${FOURCAT_URL}/api/dataset/${key}/items/?annotations=no&missing_fields=keep&stream=true`), + fetch_ndjson(items_url), ]); const pairing = pair_items(inputs, outputs, key); const comparison = compare_pairs(pairing.pairs, module_state.map_item); - dataset_state[key] = { metadata, datasource_id, module_name, module_state, pairing, comparison }; + dataset_state[key] = { datasource_id, module_name, module_state, pairing, comparison }; } else { - dataset_state[key] = { metadata, datasource_id, module_name, module_state }; + dataset_state[key] = { datasource_id, module_name, module_state }; } } catch (e) { dataset_state[key] = { error: e }; From 8b918d46ba99f2939610a5f0e34fbf0e3aa434bd Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 3 Jun 2026 16:35:08 +0200 Subject: [PATCH 31/41] add the --all instead of just fail_fail --- tests/run-compare.mjs | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tests/run-compare.mjs b/tests/run-compare.mjs index 69240ab..57efb66 100644 --- 
a/tests/run-compare.mjs +++ b/tests/run-compare.mjs @@ -3,6 +3,7 @@ * * npm run test:compare -> compares every key in FOURCAT_DATASETS * npm run test:compare -- -> narrows the run to a single key + * npm run test:compare -- --all -> compare every item (no fail-fast) * npm run test:compare -- -t "id=123" -> key + forwarded jest flags * * Why this exists instead of invoking jest directly: jest treats any bare @@ -21,13 +22,22 @@ import { dirname, join } from 'node:path'; const __dirname = dirname(fileURLToPath(import.meta.url)); const args = process.argv.slice(2); -// First non-flag arg (if any) is the dataset key to narrow to. Everything -// that looks like a flag is forwarded to jest verbatim. +// First non-flag arg (if any) is the dataset key to narrow to. const dataset_key = args.find(a => !a.startsWith('-')); -const jest_flags = args.filter(a => a !== dataset_key); +const flags = args.filter(a => a !== dataset_key); + +// `--all` (alias `--no-fail-fast`) compares every item instead of halting at +// the first failure. It's offered as a flag, not only via the FAIL_FAST env +// var, because `FAIL_FAST=0 npm run ...` does not reliably reach node when +// npm/node is the Windows binary invoked through WSL interop, and isn't env +// syntax at all in cmd.exe. A CLI flag crosses every shell; the env var still +// works where it propagates. 
+const disable_fail_fast = flags.includes('--all') || flags.includes('--no-fail-fast'); +const jest_flags = flags.filter(f => f !== '--all' && f !== '--no-fail-fast'); const env = { ...process.env }; if (dataset_key) env.COMPARE_DATASET = dataset_key; +if (disable_fail_fast) env.FAIL_FAST = '0'; const jest_bin = join(__dirname, 'node_modules', 'jest', 'bin', 'jest.js'); const child = spawn( From 00f0369d12804e397202a7206d25b9b864414c82 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 4 Jun 2026 16:46:51 +0200 Subject: [PATCH 32/41] map_item_compare.test.js: compare based on mapped `id` field not raw `id` --- tests/map_item_compare.test.js | 98 ++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 35 deletions(-) diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js index 681076c..2d1403b 100644 --- a/tests/map_item_compare.test.js +++ b/tests/map_item_compare.test.js @@ -11,8 +11,11 @@ * /download/ -> INPUTS (post-wrap) * /api/dataset//items/?annotations=no&missing_fields=keep&stream=true * -> mapped EXPECTED OUTPUTS - * 5. pairs items by `id`, runs each input through the local map_item, and - * deep-equals the result against the corresponding expected output. + * 5. runs each input through the local map_item, then pairs by the + * resulting MAPPED `id` — which can differ from the raw input id (e.g. + * instagram maps to the post shortcode, not the numeric pk) — and + * deep-equals each mapped result against the corresponding expected + * output. * * The items endpoint is fetched with `stream=true` (NDJSON): its JSON-array * form paginates at `limit=100`, silently dropping rows on larger datasets. @@ -216,25 +219,49 @@ function format_error_with_location(err) { : message; } -// Pair inputs and expected outputs by `id`. Falls back to index pairing -// (with a logged warning) if either side is missing the field on its -// first item. 
-function pair_items(inputs, outputs, dataset_key) { - const probe_in = inputs[0]; +// Map each input through the local map_item, then pair the mapped result +// against the expected output by `id`. Pairing MUST key on the mapped id: +// some modules emit an `id` that differs from the raw input id — instagram, +// for instance, maps to the post shortcode (`node.code`), not the numeric pk +// — so pairing raw input ids against the API's already-mapped ids would match +// nothing. Falls back to index pairing (with a logged warning) if either side +// lacks a usable id. A throw inside map_item is captured per-item and surfaced +// later as that item's failure. +function map_and_pair(inputs, outputs, map_item, dataset_key) { + // Map every input up front so pairing can key on the mapped id. + const mapped = inputs.map(input => { + try { + return { input, js_result: map_item(input), error: null }; + } catch (e) { + return { + input, + js_result: null, + error: new Error(`JS map_item threw: ${format_error_with_location(e)}`), + }; + } + }); + + const probe_mapped = mapped.find(m => m.js_result)?.js_result; const probe_out = outputs[0]; - const has_id_in = probe_in && 'id' in probe_in && probe_in.id != null; + const has_id_mapped = probe_mapped && 'id' in probe_mapped && probe_mapped.id != null; const has_id_out = probe_out && 'id' in probe_out && probe_out.id != null; - if (!has_id_in || !has_id_out) { + if (!has_id_mapped || !has_id_out) { // eslint-disable-next-line no-console console.warn( - `[compare] ${dataset_key}: no usable 'id' on ${!has_id_in ? '/download' : '/items'} ` + + `[compare] ${dataset_key}: no usable 'id' on ${!has_id_mapped ? 
'map_item output' : '/items'} ` + `side — falling back to index pairing for this dataset.` ); - const n = Math.min(inputs.length, outputs.length); + const n = Math.min(mapped.length, outputs.length); return { mode: 'index', - pairs: Array.from({ length: n }, (_, i) => ({ input: inputs[i], expected: outputs[i], id: i })), + pairs: Array.from({ length: n }, (_, i) => ({ + input: mapped[i].input, + js_result: mapped[i].js_result, + error: mapped[i].error, + expected: outputs[i], + id: i, + })), input_count: inputs.length, output_count: outputs.length, unmatched_inputs: [], @@ -247,13 +274,19 @@ function pair_items(inputs, outputs, dataset_key) { const pairs = []; const unmatched_inputs = []; - for (const input of inputs) { - const expected = by_id_out.get(String(input.id)); + for (const m of mapped) { + // Key on the mapped id when mapping succeeded; for a throw (no mapped + // id available) fall back to the raw input id so a pass-through-id + // module still surfaces the failure against its expected output. + const lookup_id = m.js_result && m.js_result.id != null + ? String(m.js_result.id) + : (m.input && m.input.id != null ? String(m.input.id) : null); + const expected = lookup_id != null ? by_id_out.get(lookup_id) : undefined; if (expected) { - pairs.push({ input, expected, id: input.id }); - by_id_out.delete(String(input.id)); + pairs.push({ input: m.input, js_result: m.js_result, error: m.error, expected, id: lookup_id }); + by_id_out.delete(lookup_id); } else { - unmatched_inputs.push(input.id); + unmatched_inputs.push(lookup_id); } } return { @@ -296,24 +329,21 @@ function strip_api_fields(obj) { return out; } -// Run each paired input through the local map_item and diff the result -// against 4CAT's expected output. With FAIL_FAST on (default), stop at the -// first failing item and record how many were left unchecked — so one bad -// item yields a single failure plus one skipped "halted" placeholder, not N -// failures. 
-function compare_pairs(pairs, map_item) { +// Diff each paired (already-mapped) JS result against 4CAT's expected output. +// map_item was run up front during pairing — so we could key on the mapped id +// — so here we only diff, or report an input whose map_item threw. With +// FAIL_FAST on (default), stop at the first failing item and record how many +// were left unchecked — so one bad item yields a single failure plus one +// skipped "halted" placeholder, not N failures. +function compare_pairs(pairs) { const results = []; let halted_count = 0; for (let i = 0; i < pairs.length; i++) { - const { input, expected, id } = pairs[i]; + const { id, js_result, error, expected } = pairs[i]; let message = null; - try { - let js_result; - try { - js_result = map_item(input); - } catch (e) { - throw new Error(`JS map_item threw: ${format_error_with_location(e)}`); - } + if (error) { + message = error.message; + } else { const diffs = diff_objects( strip_api_fields(normalize(js_result)), strip_api_fields(normalize(expected)), @@ -321,8 +351,6 @@ function compare_pairs(pairs, map_item) { if (diffs.length > 0) { message = `${diffs.length} field(s) differ between JS and 4CAT:\n${format_diffs(diffs)}`; } - } catch (e) { - message = e.message; } results.push({ id, ok: message === null, message }); if (message !== null && FAIL_FAST) { @@ -361,8 +389,8 @@ for (const key of DATASET_KEYS_TO_RUN) { fetch_ndjson(`${FOURCAT_URL}/download/${key}`), fetch_ndjson(items_url), ]); - const pairing = pair_items(inputs, outputs, key); - const comparison = compare_pairs(pairing.pairs, module_state.map_item); + const pairing = map_and_pair(inputs, outputs, module_state.map_item, key); + const comparison = compare_pairs(pairing.pairs); dataset_state[key] = { datasource_id, module_name, module_state, pairing, comparison }; } else { dataset_state[key] = { datasource_id, module_name, module_state }; From c7bb9ac9b2c7e046ef25d15b1ea07217e3fbeabc Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 4 
Jun 2026 17:15:05 +0200 Subject: [PATCH 33/41] map_item_compare.test.js: still show errors on failed `id` matches --- tests/map_item_compare.test.js | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js index 2d1403b..8e06979 100644 --- a/tests/map_item_compare.test.js +++ b/tests/map_item_compare.test.js @@ -275,15 +275,21 @@ function map_and_pair(inputs, outputs, map_item, dataset_key) { const pairs = []; const unmatched_inputs = []; for (const m of mapped) { - // Key on the mapped id when mapping succeeded; for a throw (no mapped - // id available) fall back to the raw input id so a pass-through-id - // module still surfaces the failure against its expected output. - const lookup_id = m.js_result && m.js_result.id != null - ? String(m.js_result.id) - : (m.input && m.input.id != null ? String(m.input.id) : null); + // A throw produces no mapped id to pair on. Surface it as its own + // failing item (labelled with the raw input id) rather than burying it + // in the unmatched-id list — otherwise an id-transforming module hides + // the actual map_item error behind a generic "unmatched input" report. + if (m.error) { + const label = m.input && m.input.id != null ? String(m.input.id) : '(no id)'; + pairs.push({ input: m.input, js_result: null, error: m.error, expected: null, id: label }); + continue; + } + // Key on the mapped id; a successful map whose id matches no output is + // a genuine pairing miss and goes to unmatched_inputs. + const lookup_id = m.js_result && m.js_result.id != null ? String(m.js_result.id) : null; const expected = lookup_id != null ? 
by_id_out.get(lookup_id) : undefined; if (expected) { - pairs.push({ input: m.input, js_result: m.js_result, error: m.error, expected, id: lookup_id }); + pairs.push({ input: m.input, js_result: m.js_result, error: null, expected, id: lookup_id }); by_id_out.delete(lookup_id); } else { unmatched_inputs.push(lookup_id); From 53c2b6f1693897635f0b2f9f23ac0d2cb3c9b875 Mon Sep 17 00:00:00 2001 From: dale-wahl <32108944+dale-wahl@users.noreply.github.com> Date: Wed, 10 Jun 2026 13:50:57 +0000 Subject: [PATCH 34/41] chore: sync map_item for bootstrap from 4CAT 888f0a126ea70404034f265fefc1468568c73d8d --- modules/9gag.js | 68 ++++++- modules/douyin.js | 266 +++++++++++++++++++++++- modules/gab.js | 110 +++++++++- modules/imgur.js | 42 +++- modules/instagram.js | 394 +++++++++++++++++++++++++++++++++++- modules/linkedin.js | 228 ++++++++++++++++++++- modules/pinterest.js | 93 ++++++++- modules/rednote-comments.js | 34 +++- modules/rednote.js | 134 +++++++++++- modules/threads.js | 83 +++++++- modules/tiktok-comments.js | 34 +++- modules/tiktok.js | 96 ++++++++- modules/truth.js | 93 ++++++++- 13 files changed, 1662 insertions(+), 13 deletions(-) diff --git a/modules/9gag.js b/modules/9gag.js index a2d8bc5..213e798 100644 --- a/modules/9gag.js +++ b/modules/9gag.js @@ -40,4 +40,70 @@ export function capture(response, source_platform_url, source_url) { } return data["data"]["posts"]; -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/ninegag/search_9gag.py) +export function map_item(post) { + // Convert Unix timestamp (seconds) to Date object + const postTimestampSec = post.creationTs; + const postTimestamp = new Date(postTimestampSec * 1000); + + // Select the highest‑resolution image that is not a video + const images = Object.values(post.images ?? 
{}); + const imageCandidates = images.filter(v => !('hasAudio' in v)); + imageCandidates.sort((a, b) => (b.width * b.height) - (a.width * a.height)); + const image = imageCandidates[0] ?? {}; + + // Select the highest‑resolution video (if any) and pick the best URL format + const videoCandidates = images.filter(v => ('hasAudio' in v)); + videoCandidates.sort((a, b) => (b.width * b.height) - (a.width * a.height)); + let videoUrl = ""; + if (videoCandidates.length) { + const vid = videoCandidates[0]; + if (vid.av1Url) videoUrl = vid.av1Url; + else if (vid.h265Url) videoUrl = vid.h265Url; + else if (vid.vp9Url) videoUrl = vid.vp9Url; + else if (vid.vp8Url) videoUrl = vid.vp8Url; + } + + // Handle anonymous posts – they appear as the user "9GAGGER" + if (!post.creator) { + post.creator = { + username: "9GAGGER", + fullName: "", + emojiStatus: "", + isVerifiedAccount: "" + }; + } + + return new MappedItem({ + collected_from_url: normalize_url_encoding(post.__import_meta?.source_platform_url ?? ""), + id: post.id, + url: post.url, + subject: post.title, + body: post.description, + timestamp: formatUtcTimestamp(postTimestampSec), + author: post.creator?.username ?? "", + author_name: post.creator?.fullName ?? "", + author_status: post.creator?.emojiStatus ?? "", + author_verified: post.creator?.isVerifiedAccount ? "yes" : "no", + type: post.type, + image_url: image.url ?? "", + video_url: videoUrl, + is_nsfw: post.nsfw === 0 ? "no" : "yes", + is_promoted: post.promoted === 0 ? "no" : "yes", + is_vote_masked: post.isVoteMasked === 0 ? "no" : "yes", + is_anonymous: !post.isAnonymous ? "no" : "yes", + source_domain: post.sourceDomain, + source_url: post.sourceUrl, + upvotes: post.upVoteCount, + downvotes: post.downVoteCount, + score: (post.upVoteCount ?? 0) - (post.downVoteCount ?? 0), + comments: post.commentsCount, + tags: (post.tags ?? []).map(t => t.key).join(","), + tags_annotated: (post.annotationTags ?? 
[]).join(","), + unix_timestamp: postTimestampSec + }); +} +// === end auto-generated === diff --git a/modules/douyin.js b/modules/douyin.js index ef811d9..6c19a0e 100644 --- a/modules/douyin.js +++ b/modules/douyin.js @@ -339,4 +339,268 @@ export function capture(response, source_platform_url, source_url) { } else { // console.log("Detected expected object(s) by no usable items found") } -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/douyin/search_douyin.py) +function getChineseNumber(num) { + if (typeof num === "number") { + return num; + } + if (typeof num !== "string") { + return 0; + } + if (num.includes("万")) { + const cleaned = num.replace(/[^0-9.]/g, ""); + return parseFloat(cleaned) * 10000; + } + const cleaned = num.replace(/[^0-9.]/g, ""); + return cleaned ? parseInt(cleaned, 10) : 0; +} + +export function map_item(item) { + // Helper to safely access nested properties + const get = (obj, path, def) => { + return path.reduce((o, p) => (o && o[p] != null ? o[p] : undefined), obj) ?? def; + }; + + const metadata = item["__import_meta"] ?? {}; + let subject = "Post"; + let stream_data = {}; + let post_timestamp; + let video_url = ""; + let video_thumbnail = ""; + let video_description = ""; + let duration = "Unknown"; + let prevent_download = null; + let stats = {}; + let author = {}; + let video_tags = ""; + let aweme_id_key, group_id_key, text_extra_key, hashtag_key, mention_key, author_id_key; + let mix_info_key, mix_id_key, mix_name_key; + let author_sec_key, avatar_thumb_key, url_list_key, is_fake_key; + + if (item["ZS_collected_from_embed"]) { + // Embedded HTML format + if (item["cellRoom"] && item["cellRoom"] !== "$undefined") { + stream_data = item["cellRoom"]["rawdata"] ?? {}; + } + if (Object.keys(stream_data).length) { + // Stream embedded + subject = "Stream"; + const createtime = stream_data["createtime"] ?? (item["requestTime"] ? 
item["requestTime"] / 1000 : undefined); + post_timestamp = new Date((createtime ?? 0) * 1000); + video_url = stream_data["stream_url"]?.["flv_pull_url"]?.["FULL_HD1"] ?? ""; + video_thumbnail = stream_data["video"]?.["cover"] ?? ""; + video_description = stream_data["title"] ?? ""; + duration = "Unknown"; + stats = stream_data["stats"] ?? {}; + author = stream_data["owner"] ?? {}; + author_sec_key = "sec_uid"; + avatar_thumb_key = "avatar_thumb"; + url_list_key = "url_list"; + is_fake_key = "is_ad_fake"; + } else { + // Regular post embedded + post_timestamp = new Date(item["createTime"] * 1000); + const videos_list = item["video"]?.["bitRateList"]; + if (videos_list) { + const videos = [...videos_list].sort((a, b) => (b["bitRate"] ?? 0) - (a["bitRate"] ?? 0)); + video_url = "https" + (videos[0]["playApi"] ?? ""); + } else { + video_url = ""; + } + video_thumbnail = item["video"]?.["cover"] ?? ""; + video_description = item["desc"] ?? ""; + duration = item["duration"] ?? item["video"]?.["duration"] ?? "Unknown"; + prevent_download = item["download"]?.["prevent"] ? "yes" : "no"; + stats = item["stats"] ?? {}; + author = item["authorInfo"] ?? {}; + author_sec_key = "secUid"; + avatar_thumb_key = "avatarThumb"; + url_list_key = "urlList"; + is_fake_key = "isAdFake"; + } + // Embedded keys (same for both branches) + aweme_id_key = "awemeId"; + group_id_key = "groupId"; + text_extra_key = "textExtra"; + hashtag_key = "hashtagName"; + mention_key = "secUid"; + author_id_key = "authorUserId"; + mix_info_key = "mixInfo"; + mix_id_key = "mixId"; + mix_name_key = "mixName"; + // Stats (may be MissingMappedField) + const collect_count = stats["collectCount"] ?? new MissingMappedField("Unknown"); + const comment_count = stats["commentCount"] ?? new MissingMappedField("Unknown"); + const digg_count = stats["diggCount"] ?? new MissingMappedField("Unknown"); + const download_count = stats["downloadCount"] ?? 
new MissingMappedField("Unknown"); + const forward_count = stats["forwardCount"] ?? new MissingMappedField("Unknown"); + const play_count = stats["playCount"] ?? new MissingMappedField("Unknown"); + const share_count = stats["shareCount"] ?? new MissingMappedField("Unknown"); + // Video tags (guess) + video_tags = (item["videoTag"] ?? []).filter(t => t["tagName"]).map(t => t["tagName"]).join(","); + const mix_current_episode = (item[mix_info_key] ?? {})["currentEpisode"] ?? "N/A"; + // Build result later – keep intermediate values in closure variables + var __embed_collect_count = collect_count; + var __embed_comment_count = comment_count; + var __embed_digg_count = digg_count; + var __embed_download_count = download_count; + var __embed_forward_count = forward_count; + var __embed_play_count = play_count; + var __embed_share_count = share_count; + var __embed_mix_current_episode = mix_current_episode; + } else { + // Non‑embedded JSON format + stream_data = item["rawdata"] ?? item["cell_room"]?.["rawdata"]; + if (stream_data) { + // Stream (may be a JSON string) + if (typeof stream_data === "string") { + try { stream_data = JSON.parse(stream_data); } catch (e) { /* ignore */ } + } + subject = "Stream"; + const create_time = stream_data["create_time"] ?? item["create_time"] ?? (metadata["timestamp_collected"] ? metadata["timestamp_collected"] / 1000 : undefined); + post_timestamp = new Date((create_time ?? 0) * 1000); + video_url = stream_data["stream_url"]?.["flv_pull_url"]?.["FULL_HD1"] ?? ""; + video_thumbnail = stream_data["video"]?.["cover"] ?? ""; + video_description = stream_data["title"] ?? ""; + duration = "Unknown"; + author = stream_data["owner"] ?? {}; + video_tags = stream_data["video_feed_tag"] ?? ""; + stats = stream_data["stats"] ?? 
{}; + } else { + // Regular post + post_timestamp = new Date(item["create_time"] * 1000); + const videos_list = item["video"]?.["bit_rate"]; + if (!videos_list) { + video_url = ""; + video_thumbnail = ""; + } else { + const videos = [...videos_list].sort((a, b) => (b["bit_rate"] ?? 0) - (a["bit_rate"] ?? 0)); + video_url = videos[0]["play_addr"]?.["url_list"]?.[0] ?? ""; + video_thumbnail = item["video"]?.["cover"]?.["url_list"]?.[0] ?? ""; + } + video_description = item["desc"] ?? ""; + duration = item["duration"] ?? item["video"]?.["duration"] ?? "Unknown"; + author = item["author"] ?? {}; + stats = item["statistics"] ?? {}; + } + prevent_download = ("prevent_download" in item) ? (item["prevent_download"] ? "yes" : "no") : null; + // Keys for non‑embedded format + aweme_id_key = "aweme_id"; + group_id_key = "group_id"; + text_extra_key = "text_extra"; + hashtag_key = "hashtag_name"; + mention_key = "sec_uid"; + author_id_key = "author_user_id"; + mix_info_key = "mix_info"; + mix_id_key = "mix_id"; + mix_name_key = "mix_name"; + author_sec_key = "sec_uid"; + avatar_thumb_key = "avatar_thumb"; + url_list_key = "url_list"; + is_fake_key = "is_ad_fake"; + // Stats (may be MissingMappedField) + const collect_count = stats ? (stats["collect_count"] ?? new MissingMappedField("Unknown")) : new MissingMappedField("Unknown"); + const comment_count = stats ? (stats["comment_count"] ?? new MissingMappedField("Unknown")) : new MissingMappedField("Unknown"); + const digg_count = stats ? (stats["digg_count"] ?? new MissingMappedField("Unknown")) : new MissingMappedField("Unknown"); + const download_count = stats ? (stats["download_count"] ?? new MissingMappedField("Unknown")) : new MissingMappedField("Unknown"); + const forward_count = stats ? (stats["forward_count"] ?? new MissingMappedField("Unknown")) : new MissingMappedField("Unknown"); + const play_count = stats ? (stats["play_count"] ?? 
new MissingMappedField("Unknown")) : new MissingMappedField("Unknown"); + const share_count = stats ? (stats["share_count"] ?? new MissingMappedField("Unknown")) : new MissingMappedField("Unknown"); + // Video tags list + video_tags = (item["video_tag"] ?? []).filter(t => t["tag_name"]).map(t => t["tag_name"]).join(","); + const mix_current_episode = item[mix_info_key] ? (item[mix_info_key]["statis"]?.["current_episode"] ?? "N/A") : "N/A"; + var __embed_collect_count = collect_count; + var __embed_comment_count = comment_count; + var __embed_digg_count = digg_count; + var __embed_download_count = download_count; + var __embed_forward_count = forward_count; + var __embed_play_count = play_count; + var __embed_share_count = share_count; + var __embed_mix_current_episode = mix_current_episode; + } + + // Stream stats (common) + const count_total_streams_viewers = stats["total_user"] ?? "N/A"; + const count_current_stream_viewers = ("user_count_str" in stats) ? getChineseNumber(stats["user_count_str"]) : "N/A"; + + // Displayed flag for mix items + let displayed = true; + if (item["ZS_collected_from_mix"] && !item["ZS_first_mix_vid"]) { + displayed = false; + } + + // Image URLs + const image_urls = []; + if (Array.isArray(item["images"])) { + for (const img of item["images"]) { + if (Array.isArray(img["url_list"])) { + image_urls.push(img["url_list"][0]); + } else if (Array.isArray(img["urlList"])) { + image_urls.push(img["urlList"][0]); + } + } + } + + // Music fields + const music_obj = item["music"]; + const music_author = (music_obj && music_obj !== "$undefined") ? (music_obj["author"] ?? "") : ""; + const music_title = (music_obj && music_obj !== "$undefined") ? (music_obj["title"] ?? "") : ""; + const music_url = (music_obj && music_obj !== "$undefined") ? (music_obj["play_url"]?.["uri"] ?? 
"") : ""; + + // Collection / Mix handling + let mix_current_episode = __embed_mix_current_episode; + if (mix_current_episode === "$undefined") mix_current_episode = "N/A"; + const collection_id_raw = item[mix_info_key]?.[mix_id_key] ?? "N/A"; + const collection_id = collection_id_raw === "$undefined" ? "N/A" : collection_id_raw; + const collection_name_raw = item[mix_info_key]?.[mix_name_key] ?? "N/A"; + const collection_name = collection_name_raw === "$undefined" ? "N/A" : collection_name_raw; + const part_of_collection = (item[mix_info_key] && (mix_id_key in item[mix_info_key]) && collection_id !== "N/A") ? "yes" : "no"; + + // Build the mapped item + return new MappedItem({ + "collected_from_url": normalize_url_encoding(metadata["source_platform_url"] ?? ""), + "id": item[aweme_id_key], + "thread_id": item[group_id_key], + "subject": subject, + "body": video_description, + "timestamp": formatUtcTimestamp(Math.floor(post_timestamp.getTime() / 1000)), + "post_url": subject === "Post" ? `https://www.douyin.com/video/${item[aweme_id_key]}` : `https://live.douyin.com/${author["web_rid"]}`, + "region": item["region"] ?? "", + "hashtags": (item[text_extra_key] ?? []).filter(t => t[hashtag_key]).map(t => t[hashtag_key]).join(","), + "mentions": (item[text_extra_key] ?? 
[]).filter(t => t[mention_key]).map(t => `https://www.douyin.com/user/${t[mention_key]}`).join(","), + "video_tags": video_tags, + "prevent_download": prevent_download, + "video_url": video_url, + "video_thumbnail": video_thumbnail, + "video_duration": duration, + "image_urls": image_urls.join(","), + "music_author": music_author, + "music_title": music_title, + "music_url": music_url, + "collect_count": __embed_collect_count, + "comment_count": __embed_comment_count, + "digg_count": __embed_digg_count, + "download_count": __embed_download_count, + "forward_count": __embed_forward_count, + "play_count": __embed_play_count, + "share_count": __embed_share_count, + "count_total_streams_viewers": count_total_streams_viewers, + "count_current_stream_viewers": count_current_stream_viewers, + "author_user_id": item[author_id_key] ?? (author["uid"] ?? author["id"]), + "author_nickname": author["nickname"] ?? "", + "author_profile_url": `https://www.douyin.com/user/${author[author_sec_key]}`, + "author_thumbnail_url": author[avatar_thumb_key]?.[url_list_key]?.[0] ?? "", + "author_region": author["region"] ?? null, + "author_is_ad_fake": author[is_fake_key] ?? null, + "part_of_collection": part_of_collection, + "4CAT_first_video_displayed": displayed ? "yes" : "no", + "collection_id": collection_id, + "collection_name": collection_name, + "place_in_collection": mix_current_episode, + "unix_timestamp": Math.floor(post_timestamp.getTime() / 1000) + }); +} +// === end auto-generated === diff --git a/modules/gab.js b/modules/gab.js index a5eab6d..9c3e14e 100644 --- a/modules/gab.js +++ b/modules/gab.js @@ -72,4 +72,112 @@ export function capture(response, source_platform_url, source_url) { } } return items; -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/gab/search_gab.py) +export function map_item(item) { + const unknownData = []; + + const postId = item['i'] ?? 
item['id']; + const metadata = item['__import_meta'] ?? {}; + + let collectedAt; + if (metadata['timestamp_collected'] != null) { + const ts = metadata['timestamp_collected'] / 1000; + collectedAt = formatUtcTimestamp(ts); + } else { + collectedAt = new MissingMappedField('Unknown'); + } + + const reactions = item['rc'] ?? item['reactions_counts']; + let reactionCount; + if (typeof reactions === 'number') { + reactionCount = reactions; + } else { + reactionCount = Object.values(reactions ?? {}).reduce((sum, val) => sum + (val ?? 0), 0); + } + + const group = item['g'] ?? item['group'] ?? null; + const author = item['author_info'] ?? item['account'] ?? null; + const mentions = item['m'] ?? item['mentions'] ?? []; + const tags = item['tg'] ?? item['tags'] ?? []; + const card = item['card'] ?? item['link'] ?? null; + const mediaItems = item['image_info'] ?? item['media_attachments'] ?? []; + + const imageUrls = []; + const videoUrls = []; + + for (const media of mediaItems) { + const type = media['t'] ?? media['type']; + if (type === 'image') { + const url = media['u'] ?? media['url']; + if (url == null) { + unknownData.push(`Media missing URL: ${url}`); + } else { + imageUrls.push(url); + } + } else if (type === 'video') { + const url = media['smp4'] ?? media['source_mp4']; + if (url == null) { + unknownData.push(`Media missing URL: ${url}`); + } else { + videoUrls.push(url); + } + } else { + unknownData.push(`Unknown media type: ${JSON.stringify(media)}`); + } + } + + const createdAtRaw = item['ca'] ?? item['created_at']; + const postDate = new Date(createdAtRaw); + const postTimeStr = formatUtcTimestamp(postDate.getTime() / 1000); + + const mappedItem = { + collected_at: collectedAt, + collected_from_url: normalize_url_encoding(metadata['source_platform_url'] ?? ''), + id: postId, + created_at: postTimeStr, + body: item['c'] ?? item['content'], + url: item['ul'] ?? item['url'], + reaction_count: reactionCount, + favourites_count: item['fbc'] ?? 
item['favourites_count'], + replies_count: item['rc'] ?? item['replies_count'], + reblogs_count: item['rbc'] ?? item['reblogs_count'], + mentions: mentions.map(m => m['username']).join(','), + tags: tags.map(t => t['name']).join(','), + + group_id: group ? group['id'] ?? null : null, + group_title: group ? group['title'] ?? null : null, + group_description: group ? group['description'] ?? null : null, + group_member_count: group ? group['member_count'] ?? null : null, + group_is_private: group ? group['is_private'] ?? null : null, + group_url: group ? group['url'] ?? null : null, + group_created_at: group ? group['created_at'] ?? null : null, + + account_id: author ? (author['i'] ?? author['id']) : null, + account_username: author ? (author['un'] ?? author['username']) : null, + account_account: author ? (author['ac'] ?? author['acct']) : null, + account_display_name: author ? (author['dn'] ?? author['display_name']) : null, + account_note: author ? (author['nt'] ?? author['note']) : null, + + link_id: card ? card['id'] ?? null : null, + link_url: card ? card['url'] ?? null : null, + link_title: card ? card['title'] ?? null : null, + link_description: card ? card['description'] ?? null : null, + link_type: card ? card['type'] ?? null : null, + link_image: card ? card['image'] ?? null : null, + + image_urls: imageUrls.join(','), + video_urls: videoUrls.join(','), + + thread_id: item['i'] ?? 
item['conversation_id'], + timestamp: postTimeStr + }; + + if (unknownData.length) { + return new MappedItem(mappedItem, {message: unknownData.join('')}); + } + return new MappedItem(mappedItem); +} +// === end auto-generated === diff --git a/modules/imgur.js b/modules/imgur.js index 9cc662b..3d37892 100644 --- a/modules/imgur.js +++ b/modules/imgur.js @@ -30,4 +30,44 @@ export function capture(response, source_platform_url, source_url) { } return data["posts"]; -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/imgur/search_imgur.py) +export function map_item(item) { + // Parse created_at timestamp (ISO 8601) to Unix seconds + const createdAt = item.created_at; + if (!createdAt) { + throw new MapItemException('Missing created_at field'); + } + const unix_timestamp = Math.floor(Date.parse(createdAt) / 1000); + const timestamp = formatUtcTimestamp(unix_timestamp); + + const collected_from_url = normalize_url_encoding(item.__import_meta?.source_platform_url ?? ""); + + return new MappedItem({ + collected_from_url, + id: item.id, + subject: item.title, + body: item.description, + timestamp, + author: item.account_id, + type: item.cover?.type, + media_url: item.cover?.url, + post_url: item.url, + album_media: item.image_count, + is_ad: item.is_ad ? "yes" : "no", + is_album: item.is_album ? "yes" : "no", + is_mature: item.is_mature ? "yes" : "no", + is_viral: item.in_most_viral ? 
"yes" : "no", + views: item.view_count, + upvotes: item.upvote_count, + downvotes: item.downvote_count, + score: item.point_count, + comments: item.comment_count, + favourites: item.favorite_count, + virality_score: item.virality, + unix_timestamp, + }); +} +// === end auto-generated === diff --git a/modules/instagram.js b/modules/instagram.js index f14e6ef..b60358e 100644 --- a/modules/instagram.js +++ b/modules/instagram.js @@ -500,4 +500,396 @@ function extractEmbeddedInstagramJSON(response) { } return datas; -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/instagram/search_instagram.py) +const MEDIA_TYPE_PHOTO = 1; +const MEDIA_TYPE_VIDEO = 2; +const MEDIA_TYPE_CAROUSEL = 8; + +const HASHTAG_REGEX = /#([^\s!@#$%ˆ&*()_+{}:"|<>?\[\];'\,./`~'‘’]+)/g; + +function extractHashtags(caption) { + if (caption instanceof MissingMappedField) { + return ""; + } + const matches = [...caption.matchAll(HASHTAG_REGEX)]; + return matches.map(m => m[1]).join(","); +} + +function parsePolarisItem(node) { + const partial_item = node._zs_partial ?? false; + const collected_at = new MissingMappedField(0); + const unix_at = new MissingMappedField(0); + let caption; + if (!('caption' in node)) { + caption = new MissingMappedField(""); + } else if (!node.caption) { + caption = ""; + } else { + caption = node.caption.text; + } + + const user = node.user; + const owner = node.owner ?? {}; + if (user && owner) { + if (owner.id === user.id) { + // prefer user + } else if (user.username !== owner.username) { + throw new MapItemException(`Unable to parse item: different user and owner`); + } + } + const is_verified = ("is_verified" in user && user.is_verified != null) ? user.is_verified : new MissingMappedField(false); + + const typeMap = {"XIGPolarisPhotoMedia": "photo", "XIGPolarisVideoMedia": "video"}; + const media_type = typeMap[node.__typename] ?? 
"unknown"; + const num_media = node.__typename !== "XIGPolarisCarouselMedia" ? 1 : (node.carousel_media?.length ?? 0); + + const display_urls = node.display_uri ?? new MissingMappedField(""); + const missing_media = null; + let media_urls; + if ("video_versions" in node) { + media_urls = node.video_versions[0]?.url ?? new MissingMappedField(""); + } else { + media_urls = new MissingMappedField(""); + } + + return { + "collected_from_url": normalize_url_encoding(node.__import_meta?.source_platform_url), + "collected_from_view": node._zs_instagram_view ?? "", + "partial_item": partial_item, + "id": node.code, + "timestamp": collected_at, + "thread_id": node.code, + "parent_id": node.code, + "url": "https://www.instagram.com/p/" + node.code, + "body": caption, + + "author_id": user?.id ?? owner?.id ?? new MissingMappedField(""), + "author": user?.username ?? owner?.username ?? new MissingMappedField(""), + "author_fullname": user?.full_name ?? owner?.full_name ?? new MissingMappedField(""), + "verified": is_verified, + "author_avatar_url": user?.profile_pic_url ?? owner?.profile_pic_url ?? new MissingMappedField(""), + + "coauthors": new MissingMappedField(""), + "coauthor_fullnames": new MissingMappedField(""), + "coauthor_ids": new MissingMappedField(""), + + "media_type": media_type, + "num_media": num_media, + "image_urls": display_urls, + "media_urls": media_urls, + + "hashtags": extractHashtags(caption), + "usertags": new MissingMappedField(""), + "play_count": node.play_count ?? 
new MissingMappedField(0), + + "likes_hidden": new MissingMappedField(""), + "num_likes": new MissingMappedField(0), + "num_comments": new MissingMappedField(0), + + "location_name": new MissingMappedField(""), + "location_id": new MissingMappedField(""), + "location_latlong": new MissingMappedField(""), + "location_city": new MissingMappedField(""), + + "unix_timestamp": unix_at, + "missing_media": missing_media + }; +} + +function parseGraphItem(node) { + let caption; + try { + caption = node.edge_media_to_caption.edges[0].node.text; + } catch (e) { + caption = new MissingMappedField(""); + } + + const num_media = node.__typename !== "GraphSidecar" ? 1 : (node.edge_sidecar_to_children?.edges?.length ?? 0); + + let media_node; + if (node.__typename === "GraphSidecar") { + media_node = node.edge_sidecar_to_children.edges[0].node; + } else { + media_node = node; + } + + let media_url; + if (media_node.__typename === "GraphVideo") { + media_url = media_node.video_url ?? ""; + } else if (media_node.__typename === "GraphImage") { + const resources = media_node.display_resources ?? media_node.thumbnail_resources; + if (resources && resources.length) { + media_url = resources[resources.length - 1].src; + } else { + media_url = media_node.display_url ?? ""; + } + } else { + media_url = media_node.display_url ?? ""; + } + + const typeMap = {"GraphSidecar": "photo", "GraphVideo": "video"}; + let media_type; + if (node.__typename !== "GraphSidecar") { + media_type = typeMap[node.__typename] ?? "unknown"; + } else { + const childTypes = new Set(node.edge_sidecar_to_children.edges.map(e => e.node.__typename)); + if (childTypes.size > 1) { + media_type = "mixed"; + } else { + const single = childTypes.values().next().value; + media_type = typeMap[single] ?? "unknown"; + } + } + + const location = {name: "", latlong: "", city: "", location_id: ""}; + if (node.location) { + location.name = node.location.name ?? ""; + location.location_id = node.location.pk ?? 
""; + location.latlong = node.location.lat != null ? `${node.location.lat},${node.location.lng}` : ""; + location.city = node.location.city ?? ""; + } + + const no_likes = Boolean(node.like_and_view_counts_disabled); + const user = node.user; + const owner = node.owner; + if (user && owner) { + if (owner.id === user.id) { + // prefer user + } else if (user.username !== owner.username) { + throw new MapItemException(`Unable to parse item: different user and owner`); + } + } + + let play_count; + if (node.view_count != null) { + play_count = node.view_count; + } else if (node.play_count != null) { + play_count = node.play_count; + } else { + play_count = new MissingMappedField(0); + } + + let usertags = ""; + if (node.edge_media_to_tagged_user && Array.isArray(node.edge_media_to_tagged_user.edges)) { + usertags = node.edge_media_to_tagged_user.edges.map(e => e.node.user.username).join(","); + } + + return { + "id": node.shortcode, + "post_source_domain": node.__import_meta?.source_platform_url, + "collected_from_view": node._zs_instagram_view ?? new MissingMappedField(""), + "partial_item": node._zs_partial ?? new MissingMappedField(""), + "timestamp": formatUtcTimestamp(node.taken_at_timestamp), + "thread_id": node.shortcode, + "parent_id": node.shortcode, + "url": "https://www.instagram.com/p/" + node.shortcode, + "body": caption, + + "author": user?.username ?? owner?.username ?? new MissingMappedField(""), + "author_fullname": user?.full_name ?? owner?.full_name ?? new MissingMappedField(""), + "is_verified": Boolean(user?.is_verified), + "author_avatar_url": user?.profile_pic_url ?? owner?.profile_pic_url ?? new MissingMappedField(""), + "coauthors": new MissingMappedField(""), + "coauthor_fullnames": new MissingMappedField(""), + "coauthor_ids": new MissingMappedField(""), + + "media_type": media_type, + "num_media": num_media, + "image_urls": node.display_url ?? 
"", + "media_urls": media_url, + + "hashtags": extractHashtags(caption), + "usertags": usertags, + "play_count": play_count, + "likes_hidden": no_likes ? "yes" : "no", + "num_likes": no_likes ? new MissingMappedField(0) : (node.edge_media_preview_like?.count ?? new MissingMappedField(0)), + "num_comments": node.edge_media_preview_comment?.count ?? 0, + + "location_name": location.name, + "location_id": location.location_id, + "location_latlong": location.latlong, + "location_city": location.city, + + "unix_timestamp": node.taken_at_timestamp, + "missing_media": null + }; +} + +function parseItemlistItem(node) { + const partial_item = node._zs_partial ?? false; + const num_media = node.media_type !== MEDIA_TYPE_CAROUSEL ? 1 : (node.carousel_media?.length ?? 0); + let caption; + if (!('caption' in node)) { + caption = new MissingMappedField(""); + } else if (!node.caption) { + caption = ""; + } else { + caption = node.caption.text; + } + + const display_urls = []; + const media_urls = []; + let missing_media = null; + const typeMap = { [MEDIA_TYPE_PHOTO]: "photo", [MEDIA_TYPE_VIDEO]: "video" }; + const mediaTypesSet = new Set(); + + const media_nodes = node.media_type === MEDIA_TYPE_CAROUSEL ? 
node.carousel_media : [node]; + for (const media_node of media_nodes) { + if (media_node.media_type === MEDIA_TYPE_VIDEO) { + if (media_node.image_versions2) { + display_urls.push(media_node.image_versions2.candidates[0].url); + } else if (media_node.video_versions) { + display_urls.push(media_node.video_versions[0].url); + } else { + if (!partial_item) { + throw new MapItemException("Instagram item format change"); + } + } + if (media_node.video_versions) { + media_urls.push(media_node.video_versions[0].url); + } else { + if (!partial_item) { + throw new MapItemException("Instagram item format change"); + } + } + } else if (media_node.media_type === MEDIA_TYPE_PHOTO && media_node.image_versions2) { + const media_url = media_node.image_versions2.candidates[0].url; + display_urls.push(media_url); + media_urls.push(media_url); + } else { + missing_media = new MissingMappedField(""); + } + mediaTypesSet.add(typeMap[media_node.media_type] ?? "unknown"); + } + + const media_type = mediaTypesSet.size > 1 ? "mixed" : (mediaTypesSet.values().next().value); + + let num_comments; + if ("comment_count" in node) { + num_comments = node.comment_count; + } else if (Array.isArray(node.comments)) { + num_comments = node.comments.length; + } else { + num_comments = -1; + } + + const location = {name: "", latlong: "", city: "", location_id: ""}; + if (node.location) { + location.name = node.location.name ?? ""; + location.location_id = node.location.pk ?? ""; + location.latlong = node.location.lat != null ? `${node.location.lat},${node.location.lng}` : ""; + location.city = node.location.city ?? ""; + } + + const user = node.user ?? {}; + const owner = node.owner ?? 
{}; + if (user && owner) { + if (owner.id === user.id) { + // prefer user + } else if (user.username !== owner.username) { + throw new MapItemException(`Unable to parse item: different user and owner`); + } + } + + const coauthorsArr = []; + const coauthorFullnamesArr = []; + const coauthorIdsArr = []; + if (Array.isArray(node.coauthor_producers)) { + for (const cp of node.coauthor_producers) { + coauthorsArr.push(cp.username ?? new MissingMappedField("")); + coauthorFullnamesArr.push(cp.full_name ?? new MissingMappedField("")); + coauthorIdsArr.push(cp.id); + } + } + const coauthors = coauthorsArr.map(v => String(v)).join(","); + const coauthor_fullnames = coauthorFullnamesArr.map(v => String(v)).join(","); + const coauthor_ids = coauthorIdsArr.join(","); + + const no_likes = Boolean(node.like_and_view_counts_disabled); + let play_count; + if (node.view_count != null) { + play_count = node.view_count; + } else if (node.play_count != null) { + play_count = node.play_count; + } else { + play_count = new MissingMappedField(0); + } + + let usertags = ""; + if (node.usertags) { + usertags = node.usertags.in?.map(u => u.user.username).join(",") ?? ""; + } + + let collected_at; + let unix_at; + if (partial_item) { + collected_at = new MissingMappedField(0); + unix_at = new MissingMappedField(0); + } else { + collected_at = formatUtcTimestamp(node.taken_at); + unix_at = node.taken_at; + } + + return { + "collected_from_url": normalize_url_encoding(node.__import_meta?.source_platform_url), + "collected_from_view": node._zs_instagram_view ?? "", + "partial_item": node._zs_partial ?? "", + "id": node.code, + "timestamp": collected_at, + "thread_id": node.code, + "parent_id": node.code, + "url": "https://www.instagram.com/p/" + node.code, + "body": caption, + + "author_id": user.id ?? owner.id ?? new MissingMappedField(""), + "author": user.username ?? owner.username ?? new MissingMappedField(""), + "author_fullname": user.full_name ?? owner.full_name ?? 
new MissingMappedField(""), + "verified": Boolean(user.is_verified), + "author_avatar_url": user.profile_pic_url ?? owner.profile_pic_url ?? new MissingMappedField(""), + "coauthors": coauthors, + "coauthor_fullnames": coauthor_fullnames, + "coauthor_ids": coauthor_ids, + + "media_type": media_type, + "num_media": num_media, + "image_urls": display_urls.join(","), + "media_urls": media_urls.join(","), + + "hashtags": extractHashtags(caption), + "usertags": usertags, + "play_count": play_count, + "likes_hidden": no_likes ? "yes" : "no", + "num_likes": no_likes ? new MissingMappedField(0) : (node.like_count ?? new MissingMappedField(0)), + "num_comments": num_comments, + + "location_name": location.name, + "location_id": location.location_id, + "location_latlong": location.latlong, + "location_city": location.city, + + "unix_timestamp": unix_at, + "missing_media": missing_media + }; +} + +export function map_item(item) { + const link = item.link ?? ""; + if ((item.product_type === "ad") || (link && link.startsWith("https://www.facebook.com/ads/ig_redirect"))) { + throw new MapItemException("appears to be Instagram ad, check raw data to confirm and ensure Zeeschuimer is up to date."); + } + + const isPolaris = typeof item.__typename === "string" && item.__typename.toLowerCase().includes("polaris"); + const isGraph = typeof item.__typename === "string" && item.__typename !== "XDTMediaDict"; + + if (isPolaris) { + return new MappedItem(parsePolarisItem(item)); + } else if (isGraph) { + return new MappedItem(parseGraphItem(item)); + } else { + return new MappedItem(parseItemlistItem(item)); + } +} +// === end auto-generated === diff --git a/modules/linkedin.js b/modules/linkedin.js index f9b3e7a..75ed5bc 100644 --- a/modules/linkedin.js +++ b/modules/linkedin.js @@ -167,4 +167,230 @@ function recursively_enrich(object, mapped_objects) { } return object; -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// 
(regenerated from datasources/linkedin/search_linkedin.py) +function getAuthor(post) { + const author = { + username: post.actor.navigationContext.actionTarget.split("linkedin.com/").pop().split("?")[0], + name: post.actor.name.text, + description: post.actor.description?.text ?? "", + pronouns: "", + avatar_url: "", + is_company: "no", + url: post.actor.navigationContext.actionTarget.split("?")[0] + }; + + if (post.actor.name?.attributes && post.actor.name.attributes[0]) { + const attr0 = post.actor.name.attributes[0]; + if (attr0["*miniProfile"]) { + const profile = attr0["*miniProfile"]; + if (profile.picture) { + const artifacts = profile.picture.artifacts.slice().sort((a, b) => b.width - a.width); + author.avatar_url = profile.picture.rootUrl + artifacts[0].fileIdentifyingUrlPathSegment; + } + if (profile.customPronoun) { + author.pronouns = profile.customPronoun; + } else if (profile.standardizedPronoun) { + author.pronouns = profile.standardizedPronoun.toLowerCase(); + } + } else if (attr0["*miniCompany"]) { + const comp = attr0["*miniCompany"]; + const artifacts = comp.logo.artifacts.slice().sort((a, b) => b.width - a.width); + author.is_company = "yes"; + author.avatar_url = comp.logo.rootUrl + artifacts[0].fileIdentifyingUrlPathSegment; + } + } + + if (post.actor.name?.attributesV2 && post.actor.name.attributesV2[0]) { + const pron = post.actor.name.attributesV2[0].detailData?.["*profileFullName"]?.pronoun; + if (pron) { + if (pron.customPronoun) author.pronouns = pron.customPronoun; + else if (pron.standardizedPronoun) author.pronouns = pron.standardizedPronoun; + } + } + + const avatar = post.actor.image?.attributes?.[0]?.detailData?.nonEntityProfilePicture; + if (avatar && avatar.vectorImage) { + author.avatar_url = avatar.vectorImage.rootUrl + avatar.vectorImage.artifacts[0].fileIdentifyingUrlPathSegment; + } + + return author; +} + +function parseTimeAgo(time_ago) { + const part = time_ago.split("•")[0]; + const numbers = part.replace(/[^0-9]/g, 
"").trim(); + const letters = part.replace(/[0-9]/g, "").trim(); + + const periodLengths = { + s: 1, + m: 60, + h: 3600, + d: 86400, + w: 7 * 86400, + mo: 30.4375 * 86400, + mnd: 30.4375 * 86400, + yr: 365.25 * 86400, + j: 365.25 * 86400 + }; + + const num = numbers.length ? parseInt(numbers, 10) : 0; + const factor = periodLengths[letters] ?? 0; + return factor * num; +} + +export function map_item(item) { + if (!item.actor) { + return {}; + } + let time_collected; + if (item.__import_meta) { + time_collected = Math.floor(item.__import_meta.timestamp_collected / 1000); + } else { + time_collected = Math.floor(Date.now() / 1000); + } + const time_ago = item.actor.subDescription?.text ?? ""; + const timestamp = Math.floor(time_collected - parseTimeAgo(time_ago)); + + // images + const images = []; + if (item.content && item.content.images) { + for (const image of item.content.images) { + const image_data = image.attributes[0].vectorImage; + const artifacts = image_data.artifacts.slice().sort((a, b) => b.width - a.width); + const url = image_data.rootUrl + artifacts[0].fileIdentifyingUrlPathSegment; + images.push(url); + } + } + if (images.length === 0 && item.content && item.content.articleComponent && item.content.articleComponent.largeImage) { + const largeImg = item.content.articleComponent.largeImage; + const attr0 = largeImg.attributes[0]; + const image = attr0.detailData?.vectorImage; + if (!image && attr0.imageUrl) { + images.push(attr0.imageUrl.url); + } else if (image && image.artifacts) { + images.push(image.rootUrl + image.artifacts[0].fileIdentifyingUrlPathSegment); + } + } + + // video thumbnail + let video_thumb_url = ""; + let thumb_content = null; + if (item.content && "*videoPlayMetadata" in item.content) { + thumb_content = item.content["*videoPlayMetadata"].thumbnail; + } else if (item.content && item.content.linkedInVideoComponent && item.content.linkedInVideoComponent) { + thumb_content = 
item.content.linkedInVideoComponent["*videoPlayMetadata"].thumbnail; + } else if (item.content && item.content.externalVideoComponent && item.content.externalVideoComponent) { + thumb_content = item.content.externalVideoComponent["*videoPlayMetadata"].thumbnail; + } + if (thumb_content) { + video_thumb_url = thumb_content.rootUrl + thumb_content.artifacts[0].fileIdentifyingUrlPathSegment; + } + + const author = getAuthor(item); + + const meta_urn = (item.updateMetadata?.urn) ?? item.preDashEntityUrn; + const urn = "urn:li:activity:" + meta_urn.split("urn:li:activity:")[1].split(",")[0].split(")")[0]; + const item_id = urn.split(":").pop(); + + // hashtags + let hashtags = []; + if (item.commentary && item.commentary.text && item.commentary.text.attributes) { + hashtags = item.commentary.text.attributes + .filter(tag => tag.type === "HASHTAG") + .map(tag => tag.trackingUrn.split(":").pop()); + } else if (item.commentary && item.commentary.text && item.commentary.text.attributesV2) { + hashtags = item.commentary.text.attributesV2 + .filter(tag => tag.detailData && tag.detailData["*hashtag"]) + .map(tag => tag.detailData["*hashtag"].trackingUrn.split(":").pop()); + } + + // mentions + const author_mentions = []; + const author_name_mentions = []; + if (item.commentary && item.commentary.text && item.commentary.text.attributes) { + for (const mention of item.commentary.text.attributes) { + if (mention.type === "PROFILE_MENTION") { + const mini = mention["*miniProfile"]; + author_mentions.push(mini.publicIdentifier); + author_name_mentions.push([mini.firstName ?? "", mini.lastName ?? ""].join(" ").trim()); + } else if (mention.type === "COMPANY_NAME") { + const mini = mention["*miniCompany"]; + author_mentions.push(mini.universalName); + author_name_mentions.push(mini.name ?? 
""); + } + } + } + + // metrics + let metrics = {}; + if (item["*socialDetail"] && "*totalSocialActivityCounts" in item["*socialDetail"]) { + const counts = item["*socialDetail"]["*totalSocialActivityCounts"]; + metrics = { + comments: counts.numComments, + shares: counts.numShares, + reactions: counts.numLikes, + reaction_like: 0, + reaction_empathy: 0, + reaction_praise: 0, + reaction_entertainment: 0, + reaction_appreciation: 0, + reaction_interest: 0 + }; + if (Array.isArray(counts.reactionTypeCounts)) { + for (const rc of counts.reactionTypeCounts) { + const key = "reaction_" + rc.reactionType.toLowerCase(); + metrics[key] = rc.count; + } + } + } else { + const sd = item["*socialDetail"]; + metrics = { + comments: sd.comments?.paging?.total ?? 0, + shares: sd.totalShares ?? 0, + reactions: sd.likes?.paging?.total ?? 0 + }; + } + + // link url + let link_url = ""; + if (item.content && item.content.navigationContext) { + link_url = item.content.navigationContext.actionTarget ?? ""; + } else if (item.content && item.content.articleComponent && item.content.articleComponent.navigationContext) { + link_url = item.content.articleComponent.navigationContext.actionTarget ?? ""; + } + + // build result object + const result = { + collected_from_url: normalize_url_encoding(item.__import_meta?.source_platform_url ?? ""), + id: item_id, + thread_id: item_id, + body: item.commentary?.text?.text ?? "", + timestamp: formatUtcTimestamp(timestamp), + timestamp_collected: formatUtcTimestamp(time_collected), + timestamp_ago: time_ago.split("•")[0].trim(), + is_promoted: /\d/.test(time_ago) ? 
"no" : "yes", + // author fields (author_ prefix, drop trailing _username) + ...Object.fromEntries(Object.entries(author).map(([k, v]) => { + let field = "author_" + k; + field = field.replace("_username", ""); + return [field, v]; + })), + author_mentions: author_mentions.join(","), + author_name_mentions: author_name_mentions.join(","), + hashtags: hashtags.join(","), + image_urls: images.join(","), + video_thumb_url: video_thumb_url, + post_url: "https://www.linkedin.com/feed/update/" + urn, + link_url: link_url, + ...metrics, + inclusion_context: item.header?.text?.text ?? "", + unix_timestamp: timestamp, + unix_timestamp_collected: time_collected + }; + + return new MappedItem(result); +} +// === end auto-generated === diff --git a/modules/pinterest.js b/modules/pinterest.js index 5f9abcc..a67a0fe 100644 --- a/modules/pinterest.js +++ b/modules/pinterest.js @@ -91,4 +91,95 @@ export function capture(response, source_platform_url, source_url) { } return pins; -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/pinterest/search_pinterest.py) +export function map_item(item) { + function map_item_from_json(post) { + // Parse timestamp, handling missing or malformed values + let timestampStr = post['created_at'] ?? post['createdAt']; + let unix_timestamp; + let str_timestamp; + if (timestampStr) { + let date = new Date(timestampStr); + if (!isNaN(date)) { + unix_timestamp = Math.floor(date.getTime() / 1000); + str_timestamp = formatUtcTimestamp(unix_timestamp); + } else { + unix_timestamp = new MissingMappedField(""); + str_timestamp = new MissingMappedField(""); + } + } else { + unix_timestamp = new MissingMappedField(""); + str_timestamp = new MissingMappedField(""); + } + + let post_id = post['entityId'] ?? 
post['id']; + + let image_url; + if (post['imageSpec_orig']) { + image_url = post['imageSpec_orig']['url']; + } else if (post['images']?.orig?.url) { + image_url = post['images']['orig']['url']; + } else { + image_url = post['images']?.url; + } + + return new MappedItem({ + collected_from_url: normalize_url_encoding(post['__import_meta']?.source_platform_url ?? ""), + id: post_id, + thread_id: post_id, + author: post['pinner']?.username, + author_fullname: post['pinner']?.fullName ?? post['pinner']?.full_name ?? "", + author_original: post['nativeCreator'] ? post['nativeCreator'].username : post['pinner']?.username, + body: (post['description'] ?? "").trim(), + subject: (post['title'] ?? "").trim(), + ai_description: post['auto_alt_text'] ?? "", + pinner_original: post['originPinner'] ? post['originPinner'].fullName : "", + pinner_via: post['viaPinner'] ? post['viaPinner'].fullName : "", + board: post['board']?.name, + board_pins: post['board']?.pinCount ?? post['board']?.pin_count ?? null, + board_url: post['board']?.url ? `https://www.pinterest.com${post['board'].url}` : null, + timestamp: str_timestamp, + idea_tags: post['pinJoin'] ? (post['pinJoin']['visualAnnotation'] ?? []).join(",") : "", + url: `https://www.pinterest.com/pin/${post_id}`, + is_video: (post['isVideo'] ?? post['videos']) ? "yes" : "no", + image_url: image_url, + dominant_colour: post['dominantColor'] ?? post['dominant_color'] ?? null, + unix_timestamp: unix_timestamp + }); + } + + function map_item_from_html(post) { + return new MappedItem({ + collected_from_url: normalize_url_encoding(post['__import_meta']?.source_platform_url ?? ""), + id: parseInt(post['id'], 10), + thread_id: parseInt(post['id'], 10), + author: new MissingMappedField(""), + author_fullname: new MissingMappedField(""), + author_original: new MissingMappedField(""), + body: (post['body'] ?? "").trim(), + subject: (post['title'] ?? 
"").trim(), + ai_description: new MissingMappedField(""), + pinner_original: new MissingMappedField(""), + pinner_via: new MissingMappedField(""), + board: new MissingMappedField(""), + board_pins: new MissingMappedField(""), + board_url: new MissingMappedField(""), + timestamp: new MissingMappedField(""), + idea_tags: (post['tags'] ?? []).join(","), + url: `https://www.pinterest.com/pin/${post['id']}`, + is_video: new MissingMappedField(""), + image_url: post['image'], + dominant_colour: new MissingMappedField(""), + unix_timestamp: new MissingMappedField("") + }); + } + + if (item['_zs-origin'] === 'html') { + return map_item_from_html(item); + } + return map_item_from_json(item); +} +// === end auto-generated === diff --git a/modules/rednote-comments.js b/modules/rednote-comments.js index 47f9d79..46911a3 100644 --- a/modules/rednote-comments.js +++ b/modules/rednote-comments.js @@ -52,4 +52,36 @@ export function capture(response, source_platform_url, source_url) { // no posts, no data return []; -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/xiaohongshu_comments/search_rednote_comments.py) +export function map_item(item) { + // Convert create_time (milliseconds) to Unix timestamp (seconds) + const createTimeMs = Number(item["create_time"]); + const unix_timestamp = Math.floor(createTimeMs / 1000); + // Format as "YYYY-MM-DD HH:MM:SS" using the global helper + const timestamp = formatUtcTimestamp(unix_timestamp); + + // Resolve optional import metadata URL + const collected_from_url = normalize_url_encoding(item["__import_meta"]?.["source_platform_url"] ?? ""); + + // ip_location may be missing or empty – use MissingMappedField in that case + const ip_location = item["ip_location"] ? 
item["ip_location"] : new MissingMappedField(""); + + return new MappedItem({ + collected_from_url: collected_from_url, + id: item["id"], + thread_id: item["note_id"], + url: `https://www.xiaohongshu.com/explore/${item["note_id"]}`, + body: item["content"] ?? "", + timestamp: timestamp, + author: item["user_info"]?.["nickname"] ?? "", + author_avatar_url: item["user_info"]?.["image"] ?? "", + ip_location: ip_location, + likes: item["like_count"], + replies: item["sub_comment_count"], + unix_timestamp: unix_timestamp + }); +} +// === end auto-generated === diff --git a/modules/rednote.js b/modules/rednote.js index 7471c92..e42d04f 100644 --- a/modules/rednote.js +++ b/modules/rednote.js @@ -103,4 +103,136 @@ export function capture(response, source_platform_url, source_url) { // no posts, no data return []; -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/xiaohongshu/search_rednote.py) +function map_item_from_json_api_explore(post) { + const item = post.type !== 'video' ? post.note_card : post; + const item_id = post.id ?? post.note_id; + + // Images handling + let images; + if (item.image_list) { + images = []; + for (const image of item.image_list) { + if (image.url_default) { + images.push(image.url_default); + } else if (image.info_list && image.info_list.length) { + let found = false; + for (const imgInfo of image.info_list) { + if (imgInfo.image_scene === 'WB_DFT') { + images.push(imgInfo.url); + found = true; + break; + } + } + if (!found) { + images.push(image.info_list[0].url); + } + } + } + } else if (item.cover) { + images = [item.cover.url_default]; + } else { + images = new MissingMappedField(""); + } + + const xsec_bit = post.xsec_token ? `?xsec_token=${post.xsec_token}` : ""; + const video_url = item.video?.media ? item.video.media.stream.h264[0].master_url : new MissingMappedField(""); + const author = item.user.nickname ?? 
item.user.nick_name; + const timestamp = item.time ?? null; + const timestampStr = timestamp ? formatUtcTimestamp(timestamp / 1000) : new MissingMappedField(""); + const hashtags = item.desc ? [...item.desc.matchAll(/#([^\s!@#$%^&*()_+{}:"|<>?\[\];',.\/`~]+)/g)].map(m => m[1]).join(",") : new MissingMappedField(""); + const body = item.desc ?? new MissingMappedField(""); + const image_urls = Array.isArray(images) ? images.join(",") : images; + const likes = item.interact_info?.liked_count ?? null; + const unix_ts = timestamp ? Math.floor(timestamp / 1000) : new MissingMappedField(""); + + return new MappedItem({ + collected_from_url: normalize_url_encoding(post.__import_meta?.source_platform_url ?? ""), + id: item_id, + thread_id: item_id, + url: `https://www.xiaohongshu.com/explore/${post.id}${xsec_bit}`, + title: item.display_title ?? "", + body: body, + hashtags: hashtags, + timestamp: timestampStr, + author: author, + author_avatar_url: item.user.avatar, + image_urls: image_urls, + video_url: video_url, + likes: likes, + unix_timestamp: unix_ts, + }); +} + +function map_item_from_json_embedded(item) { + const note = item.note; + const image = note.imageList?.[0]?.urlDefault ?? new MissingMappedField(""); + const xsec_bit = `?xsec_token=${note.xsecToken}`; + const timestamp = note.time ?? null; + const timestampStr = timestamp ? formatUtcTimestamp(timestamp / 1000) : new MissingMappedField(""); + const hashtags = note.desc ? [...note.desc.matchAll(/#([^\s!@#$%^&*()_+{}:"|<>?\[\];',.\/`~]+)/g)].map(m => m[1]).join(",") : new MissingMappedField(""); + const body = note.desc ?? new MissingMappedField(""); + const author = note.user.nickname ?? note.user.nick_name; + const likes = note.interactInfo?.likedCount ?? + note.interact_info?.liked_count ?? + note.likes ?? + new MissingMappedField(""); + const unix_ts = timestamp ? 
Math.floor(timestamp / 1000) : new MissingMappedField(""); + + return new MappedItem({ + collected_from_url: normalize_url_encoding(item.__import_meta?.source_platform_url ?? ""), + id: item.id, + thread_id: item.id, + url: `https://www.xiaohongshu.com/explore/${item.id}${xsec_bit}`, + title: note.title ?? "", + body: body, + hashtags: hashtags, + timestamp: timestampStr, + author: author, + author_avatar_url: note.user.avatar, + image_url: image, + video_url: new MissingMappedField(""), + likes: likes, + unix_timestamp: unix_ts, + }); +} + +function map_item_from_html(item) { + return new MappedItem({ + collected_from_url: normalize_url_encoding(item.__import_meta?.source_platform_url ?? ""), + id: item.id, + thread_id: item.id, + url: `https://www.xiaohongshu.com${item.url}`, + title: item.title, + body: new MissingMappedField(""), + hashtags: new MissingMappedField(""), + timestamp: new MissingMappedField(""), + author: item.author_name, + author_avatar_url: item.author_avatar_url, + image_url: item.thumbnail_url, + video_url: new MissingMappedField(""), + likes: item.likes, + unix_timestamp: new MissingMappedField(""), + }); +} + +export function map_item(post) { + // Reject tile stub items – minimal thumbnail entries with no content + if (!post.note_card && !post.user && post['_zs-origin'] !== 'html' && !post.note) { + const source = post.__import_meta?.source_url ?? 
""; + throw new MapItemException(`Xiaohongshu tile stub without post content (source: ${source || 'unknown'})`); + } + if (post['_zs-origin'] === 'html') { + return map_item_from_html(post); + } else { + if (post.note) { + return map_item_from_json_embedded(post); + } else { + return map_item_from_json_api_explore(post); + } + } +} +// === end auto-generated === diff --git a/modules/threads.js b/modules/threads.js index 98ebfa5..4304e6e 100644 --- a/modules/threads.js +++ b/modules/threads.js @@ -69,4 +69,85 @@ export function capture(response, source_platform_url, source_url) { return item; } })] -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/threads/search_threads.py) +export function map_item(item) { + const post = item; + const timestampStr = post.taken_at != null ? formatUtcTimestamp(post.taken_at) : ""; + let imageUrls = []; + let videoUrls = []; + + if (post.carousel_media && post.carousel_media.length) { + for (const c of post.carousel_media) { + if (c.image_versions2 && c.image_versions2.candidates && c.image_versions2.candidates.length) { + const url = c.image_versions2.candidates[0].url; + if (url) imageUrls.push(url); + } + if (c.video_versions && c.video_versions.length) { + const vurl = c.video_versions[0].url; + if (vurl) videoUrls.push(vurl); + } + } + } else { + if (post.image_versions2 && post.image_versions2.candidates && post.image_versions2.candidates.length) { + const url = post.image_versions2.candidates[0].url; + if (url) imageUrls.push(url); + } + if (post.video_versions && post.video_versions.length) { + const vurl = post.video_versions[0].url; + if (vurl) videoUrls.push(vurl); + } + } + + const audioUrl = post.audio && post.audio.audio_src ? 
post.audio.audio_src : ""; + + let linkedUrl = ""; + let linkThumbnail = ""; + const linkPreview = post.text_post_app_info && post.text_post_app_info.link_preview_attachment; + if (linkPreview) { + linkedUrl = linkPreview.url || ""; + try { + const parsed = new URL(linkedUrl); + const uParam = parsed.searchParams.getAll('u'); + if (uParam.length) { + linkedUrl = uParam[0]; + linkThumbnail = linkPreview.image_url ?? ""; + } else { + linkThumbnail = linkedUrl; + } + } catch (e) { + linkThumbnail = linkedUrl; + } + } + + const hashtags = post.caption && post.caption.text + ? [...post.caption.text.matchAll(/#([^\\s!@#$%ˆ&*()_+{}:"|<>?\\[\\];',.\\/`~']+)/g)].map(m => m[1]).join(',') + : ""; + + return new MappedItem({ + collected_from_url: normalize_url_encoding(post.__import_meta?.source_platform_url ?? ""), + id: post.code, + thread_id: post.code, + url: `https://www.threads.com/@${post.user?.username ?? ""}/post/${post.code}`, + body: post.caption?.text ?? "", + timestamp: timestampStr, + author: post.user?.username ?? "", + author_is_verified: post.user?.is_verified ? "yes" : "no", + author_avatar: post.user?.profile_pic_url ?? null, + image_url: imageUrls.join(","), + video_url: videoUrls.join(","), + audio_url: audioUrl, + link_url: linkedUrl, + link_thumbnail_url: linkThumbnail ?? "", + is_paid_partnership: post.is_paid_partnership ? "yes" : "no", + likes: post.like_count, + reposts: post.text_post_app_info?.repost_count ?? 0, + replies: post.text_post_app_info?.direct_reply_count ?? 0, + quotes: post.text_post_app_info?.quote_count ?? 0, + hashtags: hashtags, + unix_timestamp: post.taken_at != null ? 
Math.floor(post.taken_at) : null + }); +} +// === end auto-generated === diff --git a/modules/tiktok-comments.js b/modules/tiktok-comments.js index 97b68b8..e41446a 100644 --- a/modules/tiktok-comments.js +++ b/modules/tiktok-comments.js @@ -29,4 +29,36 @@ export function capture(response, source_platform_url, source_url) { } return []; -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/tiktok_comments/search_tiktok_comments.py) +export function map_item(item) { + const timestamp = formatUtcTimestamp(item.create_time); + const thread_id = item.reply_id === "0" ? item.aweme_id : item.reply_id; + const avatar_url = item.user?.avatar_thumb?.url_list?.[0] ?? null; + const collected_from_url = normalize_url_encoding(item.__import_meta?.source_platform_url ?? ""); + const post_url = item.share_info?.url?.split(".html")[0] ?? null; + return new MappedItem({ + collected_from_url: collected_from_url, + id: item.cid, + thread_id: thread_id, + author: item.user?.unique_id ?? null, + author_full: item.user?.nickname ?? null, + author_avatar_url: avatar_url, + body: item.text ?? null, + timestamp: timestamp, + unix_timestamp: item.create_time, + likes: item.digg_count, + replies: item.reply_comment_total ?? 0, + post_id: item.aweme_id, + post_url: post_url, + post_body: item.share_info?.title ?? null, + comment_url: item.share_info?.url ?? null, + is_liked_by_post_author: !!item.author_pin ? "yes" : "no", + is_sticky: !!item.stick_position ? "yes" : "no", + is_comment_on_comment: item.reply_id === "0" ? "no" : "yes", + language_guess: item.comment_language ?? 
null + }); +} +// === end auto-generated === diff --git a/modules/tiktok.js b/modules/tiktok.js index 55e6fbf..586c9e1 100644 --- a/modules/tiktok.js +++ b/modules/tiktok.js @@ -103,4 +103,98 @@ export function capture(response, source_platform_url, source_url) { } else { return []; } -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/tiktok/search_tiktok.py) +export function map_item(post) { + // Zeeschuimer metadata + const metadata = post["__import_meta"] ?? {}; + + const challenges = (post["challenges"] ?? []).map(ch => ch.title); + + const hashtags = (post["textExtra"] ?? []).filter(extra => "hashtagName" in extra && extra.hashtagName).map(extra => extra.hashtagName); + + const labels = Array.isArray(post["diversificationLabels"]) ? post["diversificationLabels"].join(",") : ""; + + let user_nickname = ""; + let user_fullname = ""; + let user_thumbnail = ""; + if (post["author"] && typeof post["author"] === "object") { + // from intercepted API response + user_nickname = post["author"]["uniqueId"] ?? ""; + user_fullname = post["author"]["nickname"] ?? ""; + user_thumbnail = post["author"]["avatarThumb"] ?? ""; + } else if (post["author"]) { + // from embedded JSON object + user_nickname = post["author"] ?? ""; + user_fullname = post["nickname"] ?? 
""; + user_thumbnail = ""; + } + + // Determine the best thumbnail URL that hasn't expired yet + const thumbnail_options = []; + if (post["video"]?.shareCover) { + const shareCover = post["video"]["shareCover"]; + if (Array.isArray(shareCover) && shareCover.length) { + thumbnail_options.push(shareCover[shareCover.length - 1]); + } else if (typeof shareCover === "string") { + thumbnail_options.push(shareCover); + } + } + if (post["video"]?.cover) { + thumbnail_options.push(post["video"]["cover"]); + } + const now = Math.floor(Date.now() / 1000); + const validThumbnails = thumbnail_options.filter(url => { + if (!url) return false; + try { + const expiresStr = new URL(url).searchParams.get("x-expires"); + const expires = expiresStr ? parseInt(expiresStr, 10) : now; + return expires >= now; + } catch (e) { + return false; + } + }); + const thumbnail_url = validThumbnails.length ? validThumbnails[validThumbnails.length - 1] : ""; + + return new MappedItem({ + "collected_from_url": metadata["source_platform_url"] ? normalize_url_encoding(metadata["source_platform_url"]) : "", + "id": post["id"], + "thread_id": post["id"], + "author": user_nickname, + "author_full": user_fullname, + "author_followers": post["authorStats"]?.followerCount ?? "", + "author_likes": post["authorStats"]?.diggCount ?? "", + "author_videos": post["authorStats"]?.videoCount ?? "", + "author_avatar": user_thumbnail, + "body": post["desc"], + "stickers": (post["stickersOnItem"] ?? []).map(s => s.stickerText.join(" ")).join("\n"), + "timestamp": formatUtcTimestamp(parseInt(post["createTime"], 10)), + "unix_timestamp": parseInt(post["createTime"], 10), + "is_duet": (post["duetInfo"]?.duetFromId && post["duetInfo"]["duetFromId"] !== "0") ? "yes" : "no", + "is_ad": post["isAd"] ? "yes" : "no", + "is_paid_partnership": post["adAuthorization"] ? "yes" : "no", + "is_sensitive": post["maskType"] === 3 ? "yes" : "no", + "is_photosensitive": post["maskType"] === 4 ? 
"yes" : "no", + "music_name": post["music"]?.title ?? "", + "music_id": post["music"]?.id ?? "", + "music_url": post["music"]?.playUrl ?? "", + "music_thumbnail": post["music"]?.coverLarge ?? "", + "music_author": post["music"]?.authorName ?? "", + "video_url": post["video"]?.downloadAddr ?? "", + "tiktok_url": `https://www.tiktok.com/@${user_nickname}/video/${post["id"]}`, + "thumbnail_url": thumbnail_url, + "likes": post["stats"]?.diggCount, + "comments": post["stats"]?.commentCount, + "shares": post["stats"]?.shareCount, + "plays": post["stats"]?.playCount, + "hashtags": hashtags.join(","), + "challenges": challenges.join(","), + "diversification_labels": labels, + "location_created": post["locationCreated"] ?? "", + "effects": (post["effectStickers"] ?? []).map(e => e.name).join(","), + "warning": (post["warnInfo"] ?? []).map(w => w.text).join(",") + }); +} +// === end auto-generated === diff --git a/modules/truth.js b/modules/truth.js index fe626cf..ec6bdb1 100644 --- a/modules/truth.js +++ b/modules/truth.js @@ -35,4 +35,95 @@ export function capture(response, source_platform_url, source_url) { } return items; -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/truth/search_truth.py) +export function map_item(item) { + const errors = []; + const postTime = new Date(item["created_at"]); + const images = []; + const videos = []; + const videoThumbs = []; + + if (item.media_attachments) { + for (const media of item.media_attachments) { + const mtype = media.type; + if (mtype === "image") { + images.push(media.url); + } else if (mtype === "video") { + videos.push(media.url); + videoThumbs.push(media.preview_url); + } else if (mtype === "tv") { + // Truth social TV channels – only a thumbnail is provided + videoThumbs.push(media.url); + // preview_url is a smaller thumb (ignored) + } else { + errors.push(`New media type: ${mtype}`); + } + } + } + + const group = 
item.group ? item.group : {}; + + let thread_id; + if (item.quote_id != null) { + thread_id = item.quote_id; + } else if (item.in_reply_to != null) { + let reply_to = item.in_reply_to; + while (reply_to) { + if (reply_to.in_reply_to != null) { + reply_to = reply_to.in_reply_to; + } else { + thread_id = reply_to.id; + break; + } + } + } else { + thread_id = item.id; + } + + const mentions = (item.mentions ?? []).map(m => m.username); + const hashtags = (item.tags ?? []).map(t => t.name); + + // Format timestamp as "YYYY-MM-DD HH:MM:SS" in UTC + const pad = n => String(n).padStart(2, "0"); + const timestamp = `${postTime.getUTCFullYear()}-${pad(postTime.getUTCMonth() + 1)}-${pad(postTime.getUTCDate())} ${pad(postTime.getUTCHours())}:${pad(postTime.getUTCMinutes())}:${pad(postTime.getUTCSeconds())}`; + + const mapped_item = { + collected_from_url: normalize_url_encoding(item.__import_meta?.source_platform_url ?? ""), + id: item.id, + created_at: item.created_at, + body: item.content, + url: item.url ?? null, + reblogs_count: item.reblogs_count ?? 0, + replies_count: item.replies_count ?? 0, + + account_id: item.account.id, + account_username: item.account.username, + account_display_name: item.account.display_name, + account_avatar: item.account.avatar, + account_verified: item.account.verified, + account_followers: item.account.followers_count, + account_following: item.account.following_count, + + mentions: mentions.join(","), + hashtags: hashtags.join(","), + + images: images.join(","), + video_thumbs: videoThumbs.join(","), + video_urls: videos.join(","), + + group_id: group.id ?? null, + group_display_name: group.display_name ?? null, + group_avatar: group.avatar ?? null, + group_note: group.note ?? null, + group_members_count: group.members_count ?? 
0, + + thread_id: thread_id, + timestamp: timestamp + }; + + return new MappedItem(mapped_item, errors.join("; ")); +} +// === end auto-generated === From ce9ba39df9046669179f46e1706f666b86918d89 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 11 Jun 2026 11:44:25 +0200 Subject: [PATCH 35/41] instagram.js: fix {} is truthy, location_city null vs "" --- modules/instagram.js | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/instagram.js b/modules/instagram.js index b60358e..b621b96 100644 --- a/modules/instagram.js +++ b/modules/instagram.js @@ -532,7 +532,7 @@ function parsePolarisItem(node) { } const user = node.user; - const owner = node.owner ?? {}; + const owner = node.owner; if (user && owner) { if (owner.id === user.id) { // prefer user @@ -649,7 +649,7 @@ function parseGraphItem(node) { location.name = node.location.name ?? ""; location.location_id = node.location.pk ?? ""; location.latlong = node.location.lat != null ? `${node.location.lat},${node.location.lng}` : ""; - location.city = node.location.city ?? ""; + location.city = node.location.city ?? null; } const no_likes = Boolean(node.like_and_view_counts_disabled); @@ -781,11 +781,11 @@ function parseItemlistItem(node) { location.name = node.location.name ?? ""; location.location_id = node.location.pk ?? ""; location.latlong = node.location.lat != null ? `${node.location.lat},${node.location.lng}` : ""; - location.city = node.location.city ?? ""; + location.city = node.location.city ?? null; } - const user = node.user ?? {}; - const owner = node.owner ?? 
{}; + const user = node.user; + const owner = node.owner; if (user && owner) { if (owner.id === user.id) { // prefer user From a5d981c37030bfbcdce841e4231280dfb3c7b4bb Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 11 Jun 2026 12:24:39 +0200 Subject: [PATCH 36/41] douyin: "" vs null and Missing vs null --- modules/douyin.js | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/modules/douyin.js b/modules/douyin.js index 6c19a0e..bb33e3a 100644 --- a/modules/douyin.js +++ b/modules/douyin.js @@ -391,7 +391,7 @@ export function map_item(item) { const createtime = stream_data["createtime"] ?? (item["requestTime"] ? item["requestTime"] / 1000 : undefined); post_timestamp = new Date((createtime ?? 0) * 1000); video_url = stream_data["stream_url"]?.["flv_pull_url"]?.["FULL_HD1"] ?? ""; - video_thumbnail = stream_data["video"]?.["cover"] ?? ""; + video_thumbnail = stream_data["video"]?.["cover"] ?? null; video_description = stream_data["title"] ?? ""; duration = "Unknown"; stats = stream_data["stats"] ?? {}; @@ -410,7 +410,7 @@ export function map_item(item) { } else { video_url = ""; } - video_thumbnail = item["video"]?.["cover"] ?? ""; + video_thumbnail = item["video"]?.["cover"] ?? null; video_description = item["desc"] ?? ""; duration = item["duration"] ?? item["video"]?.["duration"] ?? "Unknown"; prevent_download = item["download"]?.["prevent"] ? "yes" : "no"; @@ -463,7 +463,7 @@ export function map_item(item) { const create_time = stream_data["create_time"] ?? item["create_time"] ?? (metadata["timestamp_collected"] ? metadata["timestamp_collected"] / 1000 : undefined); post_timestamp = new Date((create_time ?? 0) * 1000); video_url = stream_data["stream_url"]?.["flv_pull_url"]?.["FULL_HD1"] ?? ""; - video_thumbnail = stream_data["video"]?.["cover"] ?? ""; + video_thumbnail = stream_data["video"]?.["cover"] ?? null; video_description = stream_data["title"] ?? ""; duration = "Unknown"; author = stream_data["owner"] ?? 
{}; @@ -502,13 +502,13 @@ export function map_item(item) { url_list_key = "url_list"; is_fake_key = "is_ad_fake"; // Stats (may be MissingMappedField) - const collect_count = stats ? (stats["collect_count"] ?? new MissingMappedField("Unknown")) : new MissingMappedField("Unknown"); - const comment_count = stats ? (stats["comment_count"] ?? new MissingMappedField("Unknown")) : new MissingMappedField("Unknown"); - const digg_count = stats ? (stats["digg_count"] ?? new MissingMappedField("Unknown")) : new MissingMappedField("Unknown"); - const download_count = stats ? (stats["download_count"] ?? new MissingMappedField("Unknown")) : new MissingMappedField("Unknown"); - const forward_count = stats ? (stats["forward_count"] ?? new MissingMappedField("Unknown")) : new MissingMappedField("Unknown"); - const play_count = stats ? (stats["play_count"] ?? new MissingMappedField("Unknown")) : new MissingMappedField("Unknown"); - const share_count = stats ? (stats["share_count"] ?? new MissingMappedField("Unknown")) : new MissingMappedField("Unknown"); + const collect_count = stats ? (stats["collect_count"] ?? null) : new MissingMappedField("Unknown"); + const comment_count = stats ? (stats["comment_count"] ?? null) : new MissingMappedField("Unknown"); + const digg_count = stats ? (stats["digg_count"] ?? null) : new MissingMappedField("Unknown"); + const download_count = stats ? (stats["download_count"] ?? null) : new MissingMappedField("Unknown"); + const forward_count = stats ? (stats["forward_count"] ?? null) : new MissingMappedField("Unknown"); + const play_count = stats ? (stats["play_count"] ?? null) : new MissingMappedField("Unknown"); + const share_count = stats ? (stats["share_count"] ?? null) : new MissingMappedField("Unknown"); // Video tags list video_tags = (item["video_tag"] ?? []).filter(t => t["tag_name"]).map(t => t["tag_name"]).join(","); const mix_current_episode = item[mix_info_key] ? (item[mix_info_key]["statis"]?.["current_episode"] ?? 
"N/A") : "N/A"; From 25ab435bd84a6f73d98b49665f5b108bd4710d2a Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 11 Jun 2026 14:10:56 +0200 Subject: [PATCH 37/41] gab: key lost if undefined in JS --- modules/gab.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/gab.js b/modules/gab.js index 9c3e14e..9e8b4e2 100644 --- a/modules/gab.js +++ b/modules/gab.js @@ -141,7 +141,7 @@ export function map_item(item) { body: item['c'] ?? item['content'], url: item['ul'] ?? item['url'], reaction_count: reactionCount, - favourites_count: item['fbc'] ?? item['favourites_count'], + favourites_count: item['fbc'] ?? item['favourites_count'] ?? null, replies_count: item['rc'] ?? item['replies_count'], reblogs_count: item['rbc'] ?? item['reblogs_count'], mentions: mentions.map(m => m['username']).join(','), From a83ebe8790acc85eb1b33f78a6bf8666342773b7 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 11 Jun 2026 14:27:35 +0200 Subject: [PATCH 38/41] map_item_test: fix order issue on id comparison --- tests/map_item_compare.test.js | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js index 8e06979..05406c5 100644 --- a/tests/map_item_compare.test.js +++ b/tests/map_item_compare.test.js @@ -269,8 +269,21 @@ function map_and_pair(inputs, outputs, map_item, dataset_key) { }; } + // An id is NOT guaranteed unique: some datasources re-emit the same post + // across paginated/scroll responses (e.g. imgur gallery returns a post on + // every page it appears on), so a key can legitimately recur with a + // different `collected_from_url` per capture. Bucket outputs into a FIFO + // queue per id rather than a single slot — then the k-th input occurrence + // of an id pairs with the k-th output occurrence. Both endpoints stream the + // dataset in the same stored order, so occurrences line up. 
(A plain + // last-wins Map would cross-match occurrence #0 against the surviving + // occurrence #N, fabricating field diffs and bogus unmatched ids.) const by_id_out = new Map(); - for (const item of outputs) by_id_out.set(String(item.id), item); + for (const item of outputs) { + const k = String(item.id); + if (!by_id_out.has(k)) by_id_out.set(k, []); + by_id_out.get(k).push(item); + } const pairs = []; const unmatched_inputs = []; @@ -284,24 +297,29 @@ function map_and_pair(inputs, outputs, map_item, dataset_key) { pairs.push({ input: m.input, js_result: null, error: m.error, expected: null, id: label }); continue; } - // Key on the mapped id; a successful map whose id matches no output is - // a genuine pairing miss and goes to unmatched_inputs. + // Key on the mapped id; a successful map whose id matches no remaining + // output occurrence is a genuine pairing miss and goes to unmatched_inputs. const lookup_id = m.js_result && m.js_result.id != null ? String(m.js_result.id) : null; - const expected = lookup_id != null ? by_id_out.get(lookup_id) : undefined; + const queue = lookup_id != null ? by_id_out.get(lookup_id) : undefined; + const expected = queue && queue.length ? queue.shift() : undefined; if (expected) { pairs.push({ input: m.input, js_result: m.js_result, error: null, expected, id: lookup_id }); - by_id_out.delete(lookup_id); } else { unmatched_inputs.push(lookup_id); } } + // Any output occurrences left in the queues had no matching input. 
+ const unmatched_outputs = []; + for (const [id, queue] of by_id_out) { + for (let i = 0; i < queue.length; i++) unmatched_outputs.push(id); + } return { mode: 'id', pairs, input_count: inputs.length, output_count: outputs.length, unmatched_inputs, - unmatched_outputs: Array.from(by_id_out.keys()), + unmatched_outputs, }; } From 3b5d1576f3be98023ba15655e9fcb3cec1754db7 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 11 Jun 2026 14:49:06 +0200 Subject: [PATCH 39/41] map_item_compare.test: summarize datasources that pass/fail --- .gitignore | 1 + tests/map_item_compare.test.js | 69 +++++++++++++++++++++++++++++++++- tests/run-compare.mjs | 20 +++++++++- 3 files changed, 88 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 4d495c9..2a29aaf 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ .temp-profile tests/.env tests/.env.local +tests/.compare-summary.txt __pycache__/ *.pyc diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js index 05406c5..986a691 100644 --- a/tests/map_item_compare.test.js +++ b/tests/map_item_compare.test.js @@ -58,13 +58,19 @@ import 'cross-fetch/polyfill'; import 'dotenv/config'; -import { readFileSync, existsSync } from 'node:fs'; +import { readFileSync, existsSync, writeFileSync } from 'node:fs'; import { join, dirname } from 'node:path'; import { fileURLToPath } from 'node:url'; import { inspect_module } from './_module-info.js'; const __dirname = dirname(fileURLToPath(import.meta.url)); +// The end-of-run roll-up is written here, then printed by run-compare.mjs +// AFTER jest exits — jest buffers in-test stdout and hoists it above the +// result tree, so writing it from here directly would never land last. Keep +// in sync with the same constant in run-compare.mjs. 
+const SUMMARY_PATH = join(__dirname, '.compare-summary.txt'); + const FOURCAT_URL = process.env.FOURCAT_URL?.replace(/\/$/, ''); const FOURCAT_API_KEY = process.env.FOURCAT_API_KEY; @@ -496,3 +502,64 @@ for (const dataset_key of DATASET_KEYS_TO_RUN) { } }); } + +// Reduce a dataset's pre-computed state to a single verdict + one-line detail. +// Mirrors the assertions above exactly so the summary never disagrees with the +// per-test results: PASS only when pairing is clean AND every compared item +// matched; a FAIL_FAST halt leaves items unchecked, so it cannot be a PASS. +function summarize_dataset(key, info) { + if (info.error) { + return { key, status: 'FAIL', datasource: '?', module: '?', detail: `setup error: ${info.error.message}` }; + } + const { datasource_id, module_name, module_state, pairing, comparison } = info; + if (module_state.state === 'no_map_item') { + return { key, status: 'SKIP', datasource: datasource_id, module: module_name, detail: 'no map_item — nothing to compare' }; + } + if (module_state.state === 'syntax_error' || module_state.state === 'import_error') { + return { key, status: 'FAIL', datasource: datasource_id, module: module_name, detail: `module ${module_state.state.replace('_', ' ')}` }; + } + + const pairing_problems = []; + if (pairing.input_count !== pairing.output_count) { + pairing_problems.push(`count ${pairing.input_count}!=${pairing.output_count}`); + } + if (pairing.unmatched_inputs.length) pairing_problems.push(`${pairing.unmatched_inputs.length} unmatched input(s)`); + if (pairing.unmatched_outputs.length) pairing_problems.push(`${pairing.unmatched_outputs.length} unmatched output(s)`); + if (pairing.mode === 'index') pairing_problems.push(`paired by index`); + + const compared = comparison.results.length; + const failed_items = comparison.results.filter(r => !r.ok).length; + const total = pairing.pairs.length; + + if (pairing_problems.length || failed_items) { + const parts = []; + if (pairing_problems.length) 
parts.push(`pairing: ${pairing_problems.join(', ')}`); + if (failed_items) { + const halted = comparison.halted_count > 0 ? `, halted (+${comparison.halted_count} unchecked)` : ''; + parts.push(`${failed_items}/${compared} item(s) differ${halted}`); + } + return { key, status: 'FAIL', datasource: datasource_id, module: module_name, detail: parts.join('; ') }; + } + return { key, status: 'PASS', datasource: datasource_id, module: module_name, detail: `${total}/${total} items match` }; +} + +// Build the per-datasource roll-up once the whole file has run and stash it +// for run-compare.mjs to print as the genuine final output (see SUMMARY_PATH). +afterAll(() => { + const rows = DATASET_KEYS_TO_RUN.map(key => summarize_dataset(key, dataset_state[key])); + const w_status = 4; // PASS/FAIL/SKIP + const w_module = Math.max(6, ...rows.map(r => r.module.length)); + + const lines = ['', '=== map_item compare summary ===']; + for (const r of rows) { + const mark = r.status === 'PASS' ? '✓' : r.status === 'SKIP' ? 
'○' : '✗'; + lines.push( + ` ${mark} ${r.status.padEnd(w_status)} ${r.module.padEnd(w_module)} ${r.key} — ${r.detail}` + ); + } + const passed = rows.filter(r => r.status === 'PASS').length; + const failed = rows.filter(r => r.status === 'FAIL').length; + const skipped = rows.filter(r => r.status === 'SKIP').length; + lines.push(`${rows.length} datasource(s): ${passed} passed, ${failed} failed, ${skipped} skipped`); + writeFileSync(SUMMARY_PATH, lines.join('\n') + '\n'); +}); diff --git a/tests/run-compare.mjs b/tests/run-compare.mjs index 57efb66..bc7e88f 100644 --- a/tests/run-compare.mjs +++ b/tests/run-compare.mjs @@ -18,10 +18,19 @@ import { spawn } from 'node:child_process'; import { fileURLToPath } from 'node:url'; import { dirname, join } from 'node:path'; +import { readFileSync, rmSync } from 'node:fs'; const __dirname = dirname(fileURLToPath(import.meta.url)); const args = process.argv.slice(2); +// The comparator writes its roll-up here (jest buffers in-test stdout and +// hoists it above the result tree, so we print it from this launcher after +// jest exits to make it the genuine last output). Keep in sync with the same +// constant in map_item_compare.test.js. +const SUMMARY_PATH = join(__dirname, '.compare-summary.txt'); +// Drop any stale summary up front so a crashed run can't print the prior one. +rmSync(SUMMARY_PATH, { force: true }); + // First non-flag arg (if any) is the dataset key to narrow to. const dataset_key = args.find(a => !a.startsWith('-')); const flags = args.filter(a => a !== dataset_key); @@ -46,7 +55,16 @@ const child = spawn( { stdio: 'inherit', cwd: __dirname, env }, ); -child.on('exit', code => process.exit(code ?? 1)); +child.on('exit', code => { + // Print the roll-up after jest's own tally so it's the last thing on screen. + try { + process.stdout.write(readFileSync(SUMMARY_PATH, 'utf8')); + rmSync(SUMMARY_PATH, { force: true }); + } catch { + // No summary file (e.g. setup threw before afterAll) — nothing to print. 
+ } + process.exit(code ?? 1); +}); child.on('error', err => { console.error(`failed to launch jest: ${err.message}`); process.exit(1); From 8118d81d52ba55b9db8723965718dd6aac638002 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 11 Jun 2026 14:49:25 +0200 Subject: [PATCH 40/41] threads.js fix some regex --- modules/threads.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/threads.js b/modules/threads.js index 4304e6e..e906fa5 100644 --- a/modules/threads.js +++ b/modules/threads.js @@ -123,7 +123,7 @@ export function map_item(item) { } const hashtags = post.caption && post.caption.text - ? [...post.caption.text.matchAll(/#([^\\s!@#$%ˆ&*()_+{}:"|<>?\\[\\];',.\\/`~']+)/g)].map(m => m[1]).join(',') + ? [...post.caption.text.matchAll(/#([^\s!@#$%ˆ&*()_+{}:"|<>?\[\];',./`~']+)/g)].map(m => m[1]).join(',') : ""; return new MappedItem({ From 743af8535c4112ee4bc680ee4f3e00e38895cc2f Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 11 Jun 2026 15:34:12 +0200 Subject: [PATCH 41/41] map_item_compare.test: loosely test URLs (not byte for byte) --- tests/map_item_compare.test.js | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js index 986a691..43c5283 100644 --- a/tests/map_item_compare.test.js +++ b/tests/map_item_compare.test.js @@ -167,11 +167,36 @@ function normalize(value) { return JSON.parse(JSON.stringify(value)); } +function looks_like_url(v) { + return typeof v === 'string' && /^https?:\/\//i.test(v); +} + +// Percent-decode for encoding-insensitive URL comparison. Decode each maximal +// %XX run on its own so a malformed sequence doesn't throw and abort the rest. 
+function decode_url_loose(s) { + return s.replace(/(?:%[0-9A-Fa-f]{2})+/g, run => { + try { return decodeURIComponent(run); } catch { return run; } + }); +} + function deep_equal(a, b) { if (a === b) return true; if (a === null || b === null) return a === b; if (typeof a !== typeof b) return false; - if (typeof a !== 'object') return false; + if (typeof a !== 'object') { + // Treat encoding-equivalent URLs as equal. The comparator targets bad + // data, not cosmetic percent-encoding differences: `=` vs `%3D` in a + // query value (and the like) resolve to the same URL, so 4CAT emitting + // one form while the JS normalizer emits the other is not a defect. + // Applied at the leaf so it covers URLs nested in arrays/objects too. + // Tradeoff: this also collapses `%2F` vs `/`, which can be semantically + // distinct — accepted, as a genuinely different URL still differs once + // decoded. + if (looks_like_url(a) && looks_like_url(b)) { + return decode_url_loose(a) === decode_url_loose(b); + } + return false; + } if (Array.isArray(a) !== Array.isArray(b)) return false; if (Array.isArray(a)) { if (a.length !== b.length) return false;