diff --git a/.gitignore b/.gitignore index 6cf9326..2a29aaf 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,11 @@ # Testing artefacts .temp-profile +tests/.env +tests/.env.local +tests/.compare-summary.txt +__pycache__/ +*.pyc # logs geckodriver.log diff --git a/js/lib.js b/js/lib.js index e38430e..518a6fa 100644 --- a/js/lib.js +++ b/js/lib.js @@ -57,6 +57,16 @@ class MissingMappedField { toString() { return `${this.value}`; } + + // Mirror 4CAT's API serialization so JSON.stringify produces the same + // tagged form on both sides: 4CAT's /api/dataset//items/ endpoint, + // when called with `missing_fields=keep`, emits missing values as + // `{ __missing: true, value: }`. Matching that shape here + // lets the map_item comparator deep-equal both sides without special + // handling. + toJSON() { + return { __missing: true, value: this.value }; + } } /** diff --git a/modules/9gag.js b/modules/9gag.js index a2d8bc5..213e798 100644 --- a/modules/9gag.js +++ b/modules/9gag.js @@ -40,4 +40,70 @@ export function capture(response, source_platform_url, source_url) { } return data["data"]["posts"]; -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/ninegag/search_9gag.py) +export function map_item(post) { + // Convert Unix timestamp (seconds) to Date object + const postTimestampSec = post.creationTs; + const postTimestamp = new Date(postTimestampSec * 1000); + + // Select the highest‑resolution image that is not a video + const images = Object.values(post.images ?? {}); + const imageCandidates = images.filter(v => !('hasAudio' in v)); + imageCandidates.sort((a, b) => (b.width * b.height) - (a.width * a.height)); + const image = imageCandidates[0] ?? 
{}; + + // Select the highest‑resolution video (if any) and pick the best URL format + const videoCandidates = images.filter(v => ('hasAudio' in v)); + videoCandidates.sort((a, b) => (b.width * b.height) - (a.width * a.height)); + let videoUrl = ""; + if (videoCandidates.length) { + const vid = videoCandidates[0]; + if (vid.av1Url) videoUrl = vid.av1Url; + else if (vid.h265Url) videoUrl = vid.h265Url; + else if (vid.vp9Url) videoUrl = vid.vp9Url; + else if (vid.vp8Url) videoUrl = vid.vp8Url; + } + + // Handle anonymous posts – they appear as the user "9GAGGER" + if (!post.creator) { + post.creator = { + username: "9GAGGER", + fullName: "", + emojiStatus: "", + isVerifiedAccount: "" + }; + } + + return new MappedItem({ + collected_from_url: normalize_url_encoding(post.__import_meta?.source_platform_url ?? ""), + id: post.id, + url: post.url, + subject: post.title, + body: post.description, + timestamp: formatUtcTimestamp(postTimestampSec), + author: post.creator?.username ?? "", + author_name: post.creator?.fullName ?? "", + author_status: post.creator?.emojiStatus ?? "", + author_verified: post.creator?.isVerifiedAccount ? "yes" : "no", + type: post.type, + image_url: image.url ?? "", + video_url: videoUrl, + is_nsfw: post.nsfw === 0 ? "no" : "yes", + is_promoted: post.promoted === 0 ? "no" : "yes", + is_vote_masked: post.isVoteMasked === 0 ? "no" : "yes", + is_anonymous: !post.isAnonymous ? "no" : "yes", + source_domain: post.sourceDomain, + source_url: post.sourceUrl, + upvotes: post.upVoteCount, + downvotes: post.downVoteCount, + score: (post.upVoteCount ?? 0) - (post.downVoteCount ?? 0), + comments: post.commentsCount, + tags: (post.tags ?? []).map(t => t.key).join(","), + tags_annotated: (post.annotationTags ?? 
[]).join(","), + unix_timestamp: postTimestampSec + }); +} +// === end auto-generated === diff --git a/modules/_loader.js b/modules/_loader.js index afae2d7..ceb0080 100644 --- a/modules/_loader.js +++ b/modules/_loader.js @@ -1,3 +1,8 @@ +// Load-order dependency: `wrap_for_map_item` (used below) is a free global +// defined in js/lib.js, which manifest.json loads as a plain background +// script before this module. There is no import for it here on purpose — +// MV2 background scripts share one global scope. If lib.js stops being +// loaded first, the mapper wrapper below will ReferenceError. async function load() { const imported_modules = [ await import("./tiktok.js"), diff --git a/modules/douyin.js b/modules/douyin.js index ef811d9..bb33e3a 100644 --- a/modules/douyin.js +++ b/modules/douyin.js @@ -339,4 +339,268 @@ export function capture(response, source_platform_url, source_url) { } else { // console.log("Detected expected object(s) by no usable items found") } -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/douyin/search_douyin.py) +function getChineseNumber(num) { + if (typeof num === "number") { + return num; + } + if (typeof num !== "string") { + return 0; + } + if (num.includes("万")) { + const cleaned = num.replace(/[^0-9.]/g, ""); + return parseFloat(cleaned) * 10000; + } + const cleaned = num.replace(/[^0-9.]/g, ""); + return cleaned ? parseInt(cleaned, 10) : 0; +} + +export function map_item(item) { + // Helper to safely access nested properties + const get = (obj, path, def) => { + return path.reduce((o, p) => (o && o[p] != null ? o[p] : undefined), obj) ?? def; + }; + + const metadata = item["__import_meta"] ?? 
{}; + let subject = "Post"; + let stream_data = {}; + let post_timestamp; + let video_url = ""; + let video_thumbnail = ""; + let video_description = ""; + let duration = "Unknown"; + let prevent_download = null; + let stats = {}; + let author = {}; + let video_tags = ""; + let aweme_id_key, group_id_key, text_extra_key, hashtag_key, mention_key, author_id_key; + let mix_info_key, mix_id_key, mix_name_key; + let author_sec_key, avatar_thumb_key, url_list_key, is_fake_key; + + if (item["ZS_collected_from_embed"]) { + // Embedded HTML format + if (item["cellRoom"] && item["cellRoom"] !== "$undefined") { + stream_data = item["cellRoom"]["rawdata"] ?? {}; + } + if (Object.keys(stream_data).length) { + // Stream embedded + subject = "Stream"; + const createtime = stream_data["createtime"] ?? (item["requestTime"] ? item["requestTime"] / 1000 : undefined); + post_timestamp = new Date((createtime ?? 0) * 1000); + video_url = stream_data["stream_url"]?.["flv_pull_url"]?.["FULL_HD1"] ?? ""; + video_thumbnail = stream_data["video"]?.["cover"] ?? null; + video_description = stream_data["title"] ?? ""; + duration = "Unknown"; + stats = stream_data["stats"] ?? {}; + author = stream_data["owner"] ?? {}; + author_sec_key = "sec_uid"; + avatar_thumb_key = "avatar_thumb"; + url_list_key = "url_list"; + is_fake_key = "is_ad_fake"; + } else { + // Regular post embedded + post_timestamp = new Date(item["createTime"] * 1000); + const videos_list = item["video"]?.["bitRateList"]; + if (videos_list) { + const videos = [...videos_list].sort((a, b) => (b["bitRate"] ?? 0) - (a["bitRate"] ?? 0)); + video_url = "https" + (videos[0]["playApi"] ?? ""); + } else { + video_url = ""; + } + video_thumbnail = item["video"]?.["cover"] ?? null; + video_description = item["desc"] ?? ""; + duration = item["duration"] ?? item["video"]?.["duration"] ?? "Unknown"; + prevent_download = item["download"]?.["prevent"] ? "yes" : "no"; + stats = item["stats"] ?? {}; + author = item["authorInfo"] ?? 
{}; + author_sec_key = "secUid"; + avatar_thumb_key = "avatarThumb"; + url_list_key = "urlList"; + is_fake_key = "isAdFake"; + } + // Embedded keys (same for both branches) + aweme_id_key = "awemeId"; + group_id_key = "groupId"; + text_extra_key = "textExtra"; + hashtag_key = "hashtagName"; + mention_key = "secUid"; + author_id_key = "authorUserId"; + mix_info_key = "mixInfo"; + mix_id_key = "mixId"; + mix_name_key = "mixName"; + // Stats (may be MissingMappedField) + const collect_count = stats["collectCount"] ?? new MissingMappedField("Unknown"); + const comment_count = stats["commentCount"] ?? new MissingMappedField("Unknown"); + const digg_count = stats["diggCount"] ?? new MissingMappedField("Unknown"); + const download_count = stats["downloadCount"] ?? new MissingMappedField("Unknown"); + const forward_count = stats["forwardCount"] ?? new MissingMappedField("Unknown"); + const play_count = stats["playCount"] ?? new MissingMappedField("Unknown"); + const share_count = stats["shareCount"] ?? new MissingMappedField("Unknown"); + // Video tags (guess) + video_tags = (item["videoTag"] ?? []).filter(t => t["tagName"]).map(t => t["tagName"]).join(","); + const mix_current_episode = (item[mix_info_key] ?? {})["currentEpisode"] ?? "N/A"; + // Build result later – keep intermediate values in closure variables + var __embed_collect_count = collect_count; + var __embed_comment_count = comment_count; + var __embed_digg_count = digg_count; + var __embed_download_count = download_count; + var __embed_forward_count = forward_count; + var __embed_play_count = play_count; + var __embed_share_count = share_count; + var __embed_mix_current_episode = mix_current_episode; + } else { + // Non‑embedded JSON format + stream_data = item["rawdata"] ?? 
item["cell_room"]?.["rawdata"]; + if (stream_data) { + // Stream (may be a JSON string) + if (typeof stream_data === "string") { + try { stream_data = JSON.parse(stream_data); } catch (e) { /* ignore */ } + } + subject = "Stream"; + const create_time = stream_data["create_time"] ?? item["create_time"] ?? (metadata["timestamp_collected"] ? metadata["timestamp_collected"] / 1000 : undefined); + post_timestamp = new Date((create_time ?? 0) * 1000); + video_url = stream_data["stream_url"]?.["flv_pull_url"]?.["FULL_HD1"] ?? ""; + video_thumbnail = stream_data["video"]?.["cover"] ?? null; + video_description = stream_data["title"] ?? ""; + duration = "Unknown"; + author = stream_data["owner"] ?? {}; + video_tags = stream_data["video_feed_tag"] ?? ""; + stats = stream_data["stats"] ?? {}; + } else { + // Regular post + post_timestamp = new Date(item["create_time"] * 1000); + const videos_list = item["video"]?.["bit_rate"]; + if (!videos_list) { + video_url = ""; + video_thumbnail = ""; + } else { + const videos = [...videos_list].sort((a, b) => (b["bit_rate"] ?? 0) - (a["bit_rate"] ?? 0)); + video_url = videos[0]["play_addr"]?.["url_list"]?.[0] ?? ""; + video_thumbnail = item["video"]?.["cover"]?.["url_list"]?.[0] ?? ""; + } + video_description = item["desc"] ?? ""; + duration = item["duration"] ?? item["video"]?.["duration"] ?? "Unknown"; + author = item["author"] ?? {}; + stats = item["statistics"] ?? {}; + } + prevent_download = ("prevent_download" in item) ? (item["prevent_download"] ? 
"yes" : "no") : null; + // Keys for non‑embedded format + aweme_id_key = "aweme_id"; + group_id_key = "group_id"; + text_extra_key = "text_extra"; + hashtag_key = "hashtag_name"; + mention_key = "sec_uid"; + author_id_key = "author_user_id"; + mix_info_key = "mix_info"; + mix_id_key = "mix_id"; + mix_name_key = "mix_name"; + author_sec_key = "sec_uid"; + avatar_thumb_key = "avatar_thumb"; + url_list_key = "url_list"; + is_fake_key = "is_ad_fake"; + // Stats (may be MissingMappedField) + const collect_count = stats ? (stats["collect_count"] ?? null) : new MissingMappedField("Unknown"); + const comment_count = stats ? (stats["comment_count"] ?? null) : new MissingMappedField("Unknown"); + const digg_count = stats ? (stats["digg_count"] ?? null) : new MissingMappedField("Unknown"); + const download_count = stats ? (stats["download_count"] ?? null) : new MissingMappedField("Unknown"); + const forward_count = stats ? (stats["forward_count"] ?? null) : new MissingMappedField("Unknown"); + const play_count = stats ? (stats["play_count"] ?? null) : new MissingMappedField("Unknown"); + const share_count = stats ? (stats["share_count"] ?? null) : new MissingMappedField("Unknown"); + // Video tags list + video_tags = (item["video_tag"] ?? []).filter(t => t["tag_name"]).map(t => t["tag_name"]).join(","); + const mix_current_episode = item[mix_info_key] ? (item[mix_info_key]["statis"]?.["current_episode"] ?? "N/A") : "N/A"; + var __embed_collect_count = collect_count; + var __embed_comment_count = comment_count; + var __embed_digg_count = digg_count; + var __embed_download_count = download_count; + var __embed_forward_count = forward_count; + var __embed_play_count = play_count; + var __embed_share_count = share_count; + var __embed_mix_current_episode = mix_current_episode; + } + + // Stream stats (common) + const count_total_streams_viewers = stats["total_user"] ?? "N/A"; + const count_current_stream_viewers = ("user_count_str" in stats) ? 
getChineseNumber(stats["user_count_str"]) : "N/A"; + + // Displayed flag for mix items + let displayed = true; + if (item["ZS_collected_from_mix"] && !item["ZS_first_mix_vid"]) { + displayed = false; + } + + // Image URLs + const image_urls = []; + if (Array.isArray(item["images"])) { + for (const img of item["images"]) { + if (Array.isArray(img["url_list"])) { + image_urls.push(img["url_list"][0]); + } else if (Array.isArray(img["urlList"])) { + image_urls.push(img["urlList"][0]); + } + } + } + + // Music fields + const music_obj = item["music"]; + const music_author = (music_obj && music_obj !== "$undefined") ? (music_obj["author"] ?? "") : ""; + const music_title = (music_obj && music_obj !== "$undefined") ? (music_obj["title"] ?? "") : ""; + const music_url = (music_obj && music_obj !== "$undefined") ? (music_obj["play_url"]?.["uri"] ?? "") : ""; + + // Collection / Mix handling + let mix_current_episode = __embed_mix_current_episode; + if (mix_current_episode === "$undefined") mix_current_episode = "N/A"; + const collection_id_raw = item[mix_info_key]?.[mix_id_key] ?? "N/A"; + const collection_id = collection_id_raw === "$undefined" ? "N/A" : collection_id_raw; + const collection_name_raw = item[mix_info_key]?.[mix_name_key] ?? "N/A"; + const collection_name = collection_name_raw === "$undefined" ? "N/A" : collection_name_raw; + const part_of_collection = (item[mix_info_key] && (mix_id_key in item[mix_info_key]) && collection_id !== "N/A") ? "yes" : "no"; + + // Build the mapped item + return new MappedItem({ + "collected_from_url": normalize_url_encoding(metadata["source_platform_url"] ?? ""), + "id": item[aweme_id_key], + "thread_id": item[group_id_key], + "subject": subject, + "body": video_description, + "timestamp": formatUtcTimestamp(Math.floor(post_timestamp.getTime() / 1000)), + "post_url": subject === "Post" ? `https://www.douyin.com/video/${item[aweme_id_key]}` : `https://live.douyin.com/${author["web_rid"]}`, + "region": item["region"] ?? 
"", + "hashtags": (item[text_extra_key] ?? []).filter(t => t[hashtag_key]).map(t => t[hashtag_key]).join(","), + "mentions": (item[text_extra_key] ?? []).filter(t => t[mention_key]).map(t => `https://www.douyin.com/user/${t[mention_key]}`).join(","), + "video_tags": video_tags, + "prevent_download": prevent_download, + "video_url": video_url, + "video_thumbnail": video_thumbnail, + "video_duration": duration, + "image_urls": image_urls.join(","), + "music_author": music_author, + "music_title": music_title, + "music_url": music_url, + "collect_count": __embed_collect_count, + "comment_count": __embed_comment_count, + "digg_count": __embed_digg_count, + "download_count": __embed_download_count, + "forward_count": __embed_forward_count, + "play_count": __embed_play_count, + "share_count": __embed_share_count, + "count_total_streams_viewers": count_total_streams_viewers, + "count_current_stream_viewers": count_current_stream_viewers, + "author_user_id": item[author_id_key] ?? (author["uid"] ?? author["id"]), + "author_nickname": author["nickname"] ?? "", + "author_profile_url": `https://www.douyin.com/user/${author[author_sec_key]}`, + "author_thumbnail_url": author[avatar_thumb_key]?.[url_list_key]?.[0] ?? "", + "author_region": author["region"] ?? null, + "author_is_ad_fake": author[is_fake_key] ?? null, + "part_of_collection": part_of_collection, + "4CAT_first_video_displayed": displayed ? 
"yes" : "no", + "collection_id": collection_id, + "collection_name": collection_name, + "place_in_collection": mix_current_episode, + "unix_timestamp": Math.floor(post_timestamp.getTime() / 1000) + }); +} +// === end auto-generated === diff --git a/modules/gab.js b/modules/gab.js index a5eab6d..9e8b4e2 100644 --- a/modules/gab.js +++ b/modules/gab.js @@ -72,4 +72,112 @@ export function capture(response, source_platform_url, source_url) { } } return items; -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/gab/search_gab.py) +export function map_item(item) { + const unknownData = []; + + const postId = item['i'] ?? item['id']; + const metadata = item['__import_meta'] ?? {}; + + let collectedAt; + if (metadata['timestamp_collected'] != null) { + const ts = metadata['timestamp_collected'] / 1000; + collectedAt = formatUtcTimestamp(ts); + } else { + collectedAt = new MissingMappedField('Unknown'); + } + + const reactions = item['rc'] ?? item['reactions_counts']; + let reactionCount; + if (typeof reactions === 'number') { + reactionCount = reactions; + } else { + reactionCount = Object.values(reactions ?? {}).reduce((sum, val) => sum + (val ?? 0), 0); + } + + const group = item['g'] ?? item['group'] ?? null; + const author = item['author_info'] ?? item['account'] ?? null; + const mentions = item['m'] ?? item['mentions'] ?? []; + const tags = item['tg'] ?? item['tags'] ?? []; + const card = item['card'] ?? item['link'] ?? null; + const mediaItems = item['image_info'] ?? item['media_attachments'] ?? []; + + const imageUrls = []; + const videoUrls = []; + + for (const media of mediaItems) { + const type = media['t'] ?? media['type']; + if (type === 'image') { + const url = media['u'] ?? media['url']; + if (url == null) { + unknownData.push(`Media missing URL: ${url}`); + } else { + imageUrls.push(url); + } + } else if (type === 'video') { + const url = media['smp4'] ?? 
media['source_mp4']; + if (url == null) { + unknownData.push(`Media missing URL: ${url}`); + } else { + videoUrls.push(url); + } + } else { + unknownData.push(`Unknown media type: ${JSON.stringify(media)}`); + } + } + + const createdAtRaw = item['ca'] ?? item['created_at']; + const postDate = new Date(createdAtRaw); + const postTimeStr = formatUtcTimestamp(postDate.getTime() / 1000); + + const mappedItem = { + collected_at: collectedAt, + collected_from_url: normalize_url_encoding(metadata['source_platform_url'] ?? ''), + id: postId, + created_at: postTimeStr, + body: item['c'] ?? item['content'], + url: item['ul'] ?? item['url'], + reaction_count: reactionCount, + favourites_count: item['fbc'] ?? item['favourites_count'] ?? null, + replies_count: item['rc'] ?? item['replies_count'], + reblogs_count: item['rbc'] ?? item['reblogs_count'], + mentions: mentions.map(m => m['username']).join(','), + tags: tags.map(t => t['name']).join(','), + + group_id: group ? group['id'] ?? null : null, + group_title: group ? group['title'] ?? null : null, + group_description: group ? group['description'] ?? null : null, + group_member_count: group ? group['member_count'] ?? null : null, + group_is_private: group ? group['is_private'] ?? null : null, + group_url: group ? group['url'] ?? null : null, + group_created_at: group ? group['created_at'] ?? null : null, + + account_id: author ? (author['i'] ?? author['id']) : null, + account_username: author ? (author['un'] ?? author['username']) : null, + account_account: author ? (author['ac'] ?? author['acct']) : null, + account_display_name: author ? (author['dn'] ?? author['display_name']) : null, + account_note: author ? (author['nt'] ?? author['note']) : null, + + link_id: card ? card['id'] ?? null : null, + link_url: card ? card['url'] ?? null : null, + link_title: card ? card['title'] ?? null : null, + link_description: card ? card['description'] ?? null : null, + link_type: card ? card['type'] ?? null : null, + link_image: card ? 
card['image'] ?? null : null, + + image_urls: imageUrls.join(','), + video_urls: videoUrls.join(','), + + thread_id: item['i'] ?? item['conversation_id'], + timestamp: postTimeStr + }; + + if (unknownData.length) { + return new MappedItem(mappedItem, {message: unknownData.join('')}); + } + return new MappedItem(mappedItem); +} +// === end auto-generated === diff --git a/modules/imgur.js b/modules/imgur.js index 9cc662b..3d37892 100644 --- a/modules/imgur.js +++ b/modules/imgur.js @@ -30,4 +30,44 @@ export function capture(response, source_platform_url, source_url) { } return data["posts"]; -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/imgur/search_imgur.py) +export function map_item(item) { + // Parse created_at timestamp (ISO 8601) to Unix seconds + const createdAt = item.created_at; + if (!createdAt) { + throw new MapItemException('Missing created_at field'); + } + const unix_timestamp = Math.floor(Date.parse(createdAt) / 1000); + const timestamp = formatUtcTimestamp(unix_timestamp); + + const collected_from_url = normalize_url_encoding(item.__import_meta?.source_platform_url ?? ""); + + return new MappedItem({ + collected_from_url, + id: item.id, + subject: item.title, + body: item.description, + timestamp, + author: item.account_id, + type: item.cover?.type, + media_url: item.cover?.url, + post_url: item.url, + album_media: item.image_count, + is_ad: item.is_ad ? "yes" : "no", + is_album: item.is_album ? "yes" : "no", + is_mature: item.is_mature ? "yes" : "no", + is_viral: item.in_most_viral ? 
"yes" : "no", + views: item.view_count, + upvotes: item.upvote_count, + downvotes: item.downvote_count, + score: item.point_count, + comments: item.comment_count, + favourites: item.favorite_count, + virality_score: item.virality, + unix_timestamp, + }); +} +// === end auto-generated === diff --git a/modules/instagram.js b/modules/instagram.js index f14e6ef..b621b96 100644 --- a/modules/instagram.js +++ b/modules/instagram.js @@ -500,4 +500,396 @@ function extractEmbeddedInstagramJSON(response) { } return datas; -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/instagram/search_instagram.py) +const MEDIA_TYPE_PHOTO = 1; +const MEDIA_TYPE_VIDEO = 2; +const MEDIA_TYPE_CAROUSEL = 8; + +const HASHTAG_REGEX = /#([^\s!@#$%ˆ&*()_+{}:"|<>?\[\];'\,./`~'‘’]+)/g; + +function extractHashtags(caption) { + if (caption instanceof MissingMappedField) { + return ""; + } + const matches = [...caption.matchAll(HASHTAG_REGEX)]; + return matches.map(m => m[1]).join(","); +} + +function parsePolarisItem(node) { + const partial_item = node._zs_partial ?? false; + const collected_at = new MissingMappedField(0); + const unix_at = new MissingMappedField(0); + let caption; + if (!('caption' in node)) { + caption = new MissingMappedField(""); + } else if (!node.caption) { + caption = ""; + } else { + caption = node.caption.text; + } + + const user = node.user; + const owner = node.owner; + if (user && owner) { + if (owner.id === user.id) { + // prefer user + } else if (user.username !== owner.username) { + throw new MapItemException(`Unable to parse item: different user and owner`); + } + } + const is_verified = ("is_verified" in user && user.is_verified != null) ? user.is_verified : new MissingMappedField(false); + + const typeMap = {"XIGPolarisPhotoMedia": "photo", "XIGPolarisVideoMedia": "video"}; + const media_type = typeMap[node.__typename] ?? 
"unknown"; + const num_media = node.__typename !== "XIGPolarisCarouselMedia" ? 1 : (node.carousel_media?.length ?? 0); + + const display_urls = node.display_uri ?? new MissingMappedField(""); + const missing_media = null; + let media_urls; + if ("video_versions" in node) { + media_urls = node.video_versions[0]?.url ?? new MissingMappedField(""); + } else { + media_urls = new MissingMappedField(""); + } + + return { + "collected_from_url": normalize_url_encoding(node.__import_meta?.source_platform_url), + "collected_from_view": node._zs_instagram_view ?? "", + "partial_item": partial_item, + "id": node.code, + "timestamp": collected_at, + "thread_id": node.code, + "parent_id": node.code, + "url": "https://www.instagram.com/p/" + node.code, + "body": caption, + + "author_id": user?.id ?? owner?.id ?? new MissingMappedField(""), + "author": user?.username ?? owner?.username ?? new MissingMappedField(""), + "author_fullname": user?.full_name ?? owner?.full_name ?? new MissingMappedField(""), + "verified": is_verified, + "author_avatar_url": user?.profile_pic_url ?? owner?.profile_pic_url ?? new MissingMappedField(""), + + "coauthors": new MissingMappedField(""), + "coauthor_fullnames": new MissingMappedField(""), + "coauthor_ids": new MissingMappedField(""), + + "media_type": media_type, + "num_media": num_media, + "image_urls": display_urls, + "media_urls": media_urls, + + "hashtags": extractHashtags(caption), + "usertags": new MissingMappedField(""), + "play_count": node.play_count ?? 
new MissingMappedField(0), + + "likes_hidden": new MissingMappedField(""), + "num_likes": new MissingMappedField(0), + "num_comments": new MissingMappedField(0), + + "location_name": new MissingMappedField(""), + "location_id": new MissingMappedField(""), + "location_latlong": new MissingMappedField(""), + "location_city": new MissingMappedField(""), + + "unix_timestamp": unix_at, + "missing_media": missing_media + }; +} + +function parseGraphItem(node) { + let caption; + try { + caption = node.edge_media_to_caption.edges[0].node.text; + } catch (e) { + caption = new MissingMappedField(""); + } + + const num_media = node.__typename !== "GraphSidecar" ? 1 : (node.edge_sidecar_to_children?.edges?.length ?? 0); + + let media_node; + if (node.__typename === "GraphSidecar") { + media_node = node.edge_sidecar_to_children.edges[0].node; + } else { + media_node = node; + } + + let media_url; + if (media_node.__typename === "GraphVideo") { + media_url = media_node.video_url ?? ""; + } else if (media_node.__typename === "GraphImage") { + const resources = media_node.display_resources ?? media_node.thumbnail_resources; + if (resources && resources.length) { + media_url = resources[resources.length - 1].src; + } else { + media_url = media_node.display_url ?? ""; + } + } else { + media_url = media_node.display_url ?? ""; + } + + const typeMap = {"GraphSidecar": "photo", "GraphVideo": "video"}; + let media_type; + if (node.__typename !== "GraphSidecar") { + media_type = typeMap[node.__typename] ?? "unknown"; + } else { + const childTypes = new Set(node.edge_sidecar_to_children.edges.map(e => e.node.__typename)); + if (childTypes.size > 1) { + media_type = "mixed"; + } else { + const single = childTypes.values().next().value; + media_type = typeMap[single] ?? "unknown"; + } + } + + const location = {name: "", latlong: "", city: "", location_id: ""}; + if (node.location) { + location.name = node.location.name ?? ""; + location.location_id = node.location.pk ?? 
""; + location.latlong = node.location.lat != null ? `${node.location.lat},${node.location.lng}` : ""; + location.city = node.location.city ?? null; + } + + const no_likes = Boolean(node.like_and_view_counts_disabled); + const user = node.user; + const owner = node.owner; + if (user && owner) { + if (owner.id === user.id) { + // prefer user + } else if (user.username !== owner.username) { + throw new MapItemException(`Unable to parse item: different user and owner`); + } + } + + let play_count; + if (node.view_count != null) { + play_count = node.view_count; + } else if (node.play_count != null) { + play_count = node.play_count; + } else { + play_count = new MissingMappedField(0); + } + + let usertags = ""; + if (node.edge_media_to_tagged_user && Array.isArray(node.edge_media_to_tagged_user.edges)) { + usertags = node.edge_media_to_tagged_user.edges.map(e => e.node.user.username).join(","); + } + + return { + "id": node.shortcode, + "post_source_domain": node.__import_meta?.source_platform_url, + "collected_from_view": node._zs_instagram_view ?? new MissingMappedField(""), + "partial_item": node._zs_partial ?? new MissingMappedField(""), + "timestamp": formatUtcTimestamp(node.taken_at_timestamp), + "thread_id": node.shortcode, + "parent_id": node.shortcode, + "url": "https://www.instagram.com/p/" + node.shortcode, + "body": caption, + + "author": user?.username ?? owner?.username ?? new MissingMappedField(""), + "author_fullname": user?.full_name ?? owner?.full_name ?? new MissingMappedField(""), + "is_verified": Boolean(user?.is_verified), + "author_avatar_url": user?.profile_pic_url ?? owner?.profile_pic_url ?? new MissingMappedField(""), + "coauthors": new MissingMappedField(""), + "coauthor_fullnames": new MissingMappedField(""), + "coauthor_ids": new MissingMappedField(""), + + "media_type": media_type, + "num_media": num_media, + "image_urls": node.display_url ?? 
"", + "media_urls": media_url, + + "hashtags": extractHashtags(caption), + "usertags": usertags, + "play_count": play_count, + "likes_hidden": no_likes ? "yes" : "no", + "num_likes": no_likes ? new MissingMappedField(0) : (node.edge_media_preview_like?.count ?? new MissingMappedField(0)), + "num_comments": node.edge_media_preview_comment?.count ?? 0, + + "location_name": location.name, + "location_id": location.location_id, + "location_latlong": location.latlong, + "location_city": location.city, + + "unix_timestamp": node.taken_at_timestamp, + "missing_media": null + }; +} + +function parseItemlistItem(node) { + const partial_item = node._zs_partial ?? false; + const num_media = node.media_type !== MEDIA_TYPE_CAROUSEL ? 1 : (node.carousel_media?.length ?? 0); + let caption; + if (!('caption' in node)) { + caption = new MissingMappedField(""); + } else if (!node.caption) { + caption = ""; + } else { + caption = node.caption.text; + } + + const display_urls = []; + const media_urls = []; + let missing_media = null; + const typeMap = { [MEDIA_TYPE_PHOTO]: "photo", [MEDIA_TYPE_VIDEO]: "video" }; + const mediaTypesSet = new Set(); + + const media_nodes = node.media_type === MEDIA_TYPE_CAROUSEL ? 
node.carousel_media : [node]; + for (const media_node of media_nodes) { + if (media_node.media_type === MEDIA_TYPE_VIDEO) { + if (media_node.image_versions2) { + display_urls.push(media_node.image_versions2.candidates[0].url); + } else if (media_node.video_versions) { + display_urls.push(media_node.video_versions[0].url); + } else { + if (!partial_item) { + throw new MapItemException("Instagram item format change"); + } + } + if (media_node.video_versions) { + media_urls.push(media_node.video_versions[0].url); + } else { + if (!partial_item) { + throw new MapItemException("Instagram item format change"); + } + } + } else if (media_node.media_type === MEDIA_TYPE_PHOTO && media_node.image_versions2) { + const media_url = media_node.image_versions2.candidates[0].url; + display_urls.push(media_url); + media_urls.push(media_url); + } else { + missing_media = new MissingMappedField(""); + } + mediaTypesSet.add(typeMap[media_node.media_type] ?? "unknown"); + } + + const media_type = mediaTypesSet.size > 1 ? "mixed" : (mediaTypesSet.values().next().value); + + let num_comments; + if ("comment_count" in node) { + num_comments = node.comment_count; + } else if (Array.isArray(node.comments)) { + num_comments = node.comments.length; + } else { + num_comments = -1; + } + + const location = {name: "", latlong: "", city: "", location_id: ""}; + if (node.location) { + location.name = node.location.name ?? ""; + location.location_id = node.location.pk ?? ""; + location.latlong = node.location.lat != null ? `${node.location.lat},${node.location.lng}` : ""; + location.city = node.location.city ?? 
null; + } + + const user = node.user; + const owner = node.owner; + if (user && owner) { + if (owner.id === user.id) { + // prefer user + } else if (user.username !== owner.username) { + throw new MapItemException(`Unable to parse item: different user and owner`); + } + } + + const coauthorsArr = []; + const coauthorFullnamesArr = []; + const coauthorIdsArr = []; + if (Array.isArray(node.coauthor_producers)) { + for (const cp of node.coauthor_producers) { + coauthorsArr.push(cp.username ?? new MissingMappedField("")); + coauthorFullnamesArr.push(cp.full_name ?? new MissingMappedField("")); + coauthorIdsArr.push(cp.id); + } + } + const coauthors = coauthorsArr.map(v => String(v)).join(","); + const coauthor_fullnames = coauthorFullnamesArr.map(v => String(v)).join(","); + const coauthor_ids = coauthorIdsArr.join(","); + + const no_likes = Boolean(node.like_and_view_counts_disabled); + let play_count; + if (node.view_count != null) { + play_count = node.view_count; + } else if (node.play_count != null) { + play_count = node.play_count; + } else { + play_count = new MissingMappedField(0); + } + + let usertags = ""; + if (node.usertags) { + usertags = node.usertags.in?.map(u => u.user.username).join(",") ?? ""; + } + + let collected_at; + let unix_at; + if (partial_item) { + collected_at = new MissingMappedField(0); + unix_at = new MissingMappedField(0); + } else { + collected_at = formatUtcTimestamp(node.taken_at); + unix_at = node.taken_at; + } + + return { + "collected_from_url": normalize_url_encoding(node.__import_meta?.source_platform_url), + "collected_from_view": node._zs_instagram_view ?? "", + "partial_item": node._zs_partial ?? "", + "id": node.code, + "timestamp": collected_at, + "thread_id": node.code, + "parent_id": node.code, + "url": "https://www.instagram.com/p/" + node.code, + "body": caption, + + "author_id": user.id ?? owner.id ?? new MissingMappedField(""), + "author": user.username ?? owner.username ?? 
new MissingMappedField(""), + "author_fullname": user.full_name ?? owner.full_name ?? new MissingMappedField(""), + "verified": Boolean(user.is_verified), + "author_avatar_url": user.profile_pic_url ?? owner.profile_pic_url ?? new MissingMappedField(""), + "coauthors": coauthors, + "coauthor_fullnames": coauthor_fullnames, + "coauthor_ids": coauthor_ids, + + "media_type": media_type, + "num_media": num_media, + "image_urls": display_urls.join(","), + "media_urls": media_urls.join(","), + + "hashtags": extractHashtags(caption), + "usertags": usertags, + "play_count": play_count, + "likes_hidden": no_likes ? "yes" : "no", + "num_likes": no_likes ? new MissingMappedField(0) : (node.like_count ?? new MissingMappedField(0)), + "num_comments": num_comments, + + "location_name": location.name, + "location_id": location.location_id, + "location_latlong": location.latlong, + "location_city": location.city, + + "unix_timestamp": unix_at, + "missing_media": missing_media + }; +} + +export function map_item(item) { + const link = item.link ?? 
""; + if ((item.product_type === "ad") || (link && link.startsWith("https://www.facebook.com/ads/ig_redirect"))) { + throw new MapItemException("appears to be Instagram ad, check raw data to confirm and ensure Zeeschuimer is up to date."); + } + + const isPolaris = typeof item.__typename === "string" && item.__typename.toLowerCase().includes("polaris"); + const isGraph = typeof item.__typename === "string" && item.__typename !== "XDTMediaDict"; + + if (isPolaris) { + return new MappedItem(parsePolarisItem(item)); + } else if (isGraph) { + return new MappedItem(parseGraphItem(item)); + } else { + return new MappedItem(parseItemlistItem(item)); + } +} +// === end auto-generated === diff --git a/modules/linkedin.js b/modules/linkedin.js index f9b3e7a..75ed5bc 100644 --- a/modules/linkedin.js +++ b/modules/linkedin.js @@ -167,4 +167,230 @@ function recursively_enrich(object, mapped_objects) { } return object; -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/linkedin/search_linkedin.py) +function getAuthor(post) { + const author = { + username: post.actor.navigationContext.actionTarget.split("linkedin.com/").pop().split("?")[0], + name: post.actor.name.text, + description: post.actor.description?.text ?? 
"", + pronouns: "", + avatar_url: "", + is_company: "no", + url: post.actor.navigationContext.actionTarget.split("?")[0] + }; + + if (post.actor.name?.attributes && post.actor.name.attributes[0]) { + const attr0 = post.actor.name.attributes[0]; + if (attr0["*miniProfile"]) { + const profile = attr0["*miniProfile"]; + if (profile.picture) { + const artifacts = profile.picture.artifacts.slice().sort((a, b) => b.width - a.width); + author.avatar_url = profile.picture.rootUrl + artifacts[0].fileIdentifyingUrlPathSegment; + } + if (profile.customPronoun) { + author.pronouns = profile.customPronoun; + } else if (profile.standardizedPronoun) { + author.pronouns = profile.standardizedPronoun.toLowerCase(); + } + } else if (attr0["*miniCompany"]) { + const comp = attr0["*miniCompany"]; + const artifacts = comp.logo.artifacts.slice().sort((a, b) => b.width - a.width); + author.is_company = "yes"; + author.avatar_url = comp.logo.rootUrl + artifacts[0].fileIdentifyingUrlPathSegment; + } + } + + if (post.actor.name?.attributesV2 && post.actor.name.attributesV2[0]) { + const pron = post.actor.name.attributesV2[0].detailData?.["*profileFullName"]?.pronoun; + if (pron) { + if (pron.customPronoun) author.pronouns = pron.customPronoun; + else if (pron.standardizedPronoun) author.pronouns = pron.standardizedPronoun; + } + } + + const avatar = post.actor.image?.attributes?.[0]?.detailData?.nonEntityProfilePicture; + if (avatar && avatar.vectorImage) { + author.avatar_url = avatar.vectorImage.rootUrl + avatar.vectorImage.artifacts[0].fileIdentifyingUrlPathSegment; + } + + return author; +} + +function parseTimeAgo(time_ago) { + const part = time_ago.split("•")[0]; + const numbers = part.replace(/[^0-9]/g, "").trim(); + const letters = part.replace(/[0-9]/g, "").trim(); + + const periodLengths = { + s: 1, + m: 60, + h: 3600, + d: 86400, + w: 7 * 86400, + mo: 30.4375 * 86400, + mnd: 30.4375 * 86400, + yr: 365.25 * 86400, + j: 365.25 * 86400 + }; + + const num = numbers.length ? 
parseInt(numbers, 10) : 0; + const factor = periodLengths[letters] ?? 0; + return factor * num; +} + +export function map_item(item) { + if (!item.actor) { + return {}; + } + let time_collected; + if (item.__import_meta) { + time_collected = Math.floor(item.__import_meta.timestamp_collected / 1000); + } else { + time_collected = Math.floor(Date.now() / 1000); + } + const time_ago = item.actor.subDescription?.text ?? ""; + const timestamp = Math.floor(time_collected - parseTimeAgo(time_ago)); + + // images + const images = []; + if (item.content && item.content.images) { + for (const image of item.content.images) { + const image_data = image.attributes[0].vectorImage; + const artifacts = image_data.artifacts.slice().sort((a, b) => b.width - a.width); + const url = image_data.rootUrl + artifacts[0].fileIdentifyingUrlPathSegment; + images.push(url); + } + } + if (images.length === 0 && item.content && item.content.articleComponent && item.content.articleComponent.largeImage) { + const largeImg = item.content.articleComponent.largeImage; + const attr0 = largeImg.attributes[0]; + const image = attr0.detailData?.vectorImage; + if (!image && attr0.imageUrl) { + images.push(attr0.imageUrl.url); + } else if (image && image.artifacts) { + images.push(image.rootUrl + image.artifacts[0].fileIdentifyingUrlPathSegment); + } + } + + // video thumbnail + let video_thumb_url = ""; + let thumb_content = null; + if (item.content && "*videoPlayMetadata" in item.content) { + thumb_content = item.content["*videoPlayMetadata"].thumbnail; + } else if (item.content && item.content.linkedInVideoComponent && item.content.linkedInVideoComponent) { + thumb_content = item.content.linkedInVideoComponent["*videoPlayMetadata"].thumbnail; + } else if (item.content && item.content.externalVideoComponent && item.content.externalVideoComponent) { + thumb_content = item.content.externalVideoComponent["*videoPlayMetadata"].thumbnail; + } + if (thumb_content) { + video_thumb_url = thumb_content.rootUrl + 
thumb_content.artifacts[0].fileIdentifyingUrlPathSegment; + } + + const author = getAuthor(item); + + const meta_urn = (item.updateMetadata?.urn) ?? item.preDashEntityUrn; + const urn = "urn:li:activity:" + meta_urn.split("urn:li:activity:")[1].split(",")[0].split(")")[0]; + const item_id = urn.split(":").pop(); + + // hashtags + let hashtags = []; + if (item.commentary && item.commentary.text && item.commentary.text.attributes) { + hashtags = item.commentary.text.attributes + .filter(tag => tag.type === "HASHTAG") + .map(tag => tag.trackingUrn.split(":").pop()); + } else if (item.commentary && item.commentary.text && item.commentary.text.attributesV2) { + hashtags = item.commentary.text.attributesV2 + .filter(tag => tag.detailData && tag.detailData["*hashtag"]) + .map(tag => tag.detailData["*hashtag"].trackingUrn.split(":").pop()); + } + + // mentions + const author_mentions = []; + const author_name_mentions = []; + if (item.commentary && item.commentary.text && item.commentary.text.attributes) { + for (const mention of item.commentary.text.attributes) { + if (mention.type === "PROFILE_MENTION") { + const mini = mention["*miniProfile"]; + author_mentions.push(mini.publicIdentifier); + author_name_mentions.push([mini.firstName ?? "", mini.lastName ?? ""].join(" ").trim()); + } else if (mention.type === "COMPANY_NAME") { + const mini = mention["*miniCompany"]; + author_mentions.push(mini.universalName); + author_name_mentions.push(mini.name ?? 
""); + } + } + } + + // metrics + let metrics = {}; + if (item["*socialDetail"] && "*totalSocialActivityCounts" in item["*socialDetail"]) { + const counts = item["*socialDetail"]["*totalSocialActivityCounts"]; + metrics = { + comments: counts.numComments, + shares: counts.numShares, + reactions: counts.numLikes, + reaction_like: 0, + reaction_empathy: 0, + reaction_praise: 0, + reaction_entertainment: 0, + reaction_appreciation: 0, + reaction_interest: 0 + }; + if (Array.isArray(counts.reactionTypeCounts)) { + for (const rc of counts.reactionTypeCounts) { + const key = "reaction_" + rc.reactionType.toLowerCase(); + metrics[key] = rc.count; + } + } + } else { + const sd = item["*socialDetail"] ?? {}; /* this branch also runs when the social detail object is missing entirely, so fall back to an empty object and let the counts below default to 0 instead of throwing */ + metrics = { + comments: sd.comments?.paging?.total ?? 0, + shares: sd.totalShares ?? 0, + reactions: sd.likes?.paging?.total ?? 0 + }; + } + + // link url + let link_url = ""; + if (item.content && item.content.navigationContext) { + link_url = item.content.navigationContext.actionTarget ?? ""; + } else if (item.content && item.content.articleComponent && item.content.articleComponent.navigationContext) { + link_url = item.content.articleComponent.navigationContext.actionTarget ?? ""; + } + + // build result object + const result = { + collected_from_url: normalize_url_encoding(item.__import_meta?.source_platform_url ?? ""), + id: item_id, + thread_id: item_id, + body: item.commentary?.text?.text ?? "", + timestamp: formatUtcTimestamp(timestamp), + timestamp_collected: formatUtcTimestamp(time_collected), + timestamp_ago: time_ago.split("•")[0].trim(), + is_promoted: /\d/.test(time_ago) ? 
"no" : "yes", + // author fields (author_ prefix, drop trailing _username) + ...Object.fromEntries(Object.entries(author).map(([k, v]) => { + let field = "author_" + k; + field = field.replace("_username", ""); + return [field, v]; + })), + author_mentions: author_mentions.join(","), + author_name_mentions: author_name_mentions.join(","), + hashtags: hashtags.join(","), + image_urls: images.join(","), + video_thumb_url: video_thumb_url, + post_url: "https://www.linkedin.com/feed/update/" + urn, + link_url: link_url, + ...metrics, + inclusion_context: item.header?.text?.text ?? "", + unix_timestamp: timestamp, + unix_timestamp_collected: time_collected + }; + + return new MappedItem(result); +} +// === end auto-generated === diff --git a/modules/package.json b/modules/package.json new file mode 100644 index 0000000..3dbc1ca --- /dev/null +++ b/modules/package.json @@ -0,0 +1,3 @@ +{ + "type": "module" +} diff --git a/modules/pinterest.js b/modules/pinterest.js index 5f9abcc..a67a0fe 100644 --- a/modules/pinterest.js +++ b/modules/pinterest.js @@ -91,4 +91,95 @@ export function capture(response, source_platform_url, source_url) { } return pins; -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/pinterest/search_pinterest.py) +export function map_item(item) { + function map_item_from_json(post) { + // Parse timestamp, handling missing or malformed values + let timestampStr = post['created_at'] ?? 
post['createdAt']; + let unix_timestamp; + let str_timestamp; + if (timestampStr) { + let date = new Date(timestampStr); + if (!isNaN(date)) { + unix_timestamp = Math.floor(date.getTime() / 1000); + str_timestamp = formatUtcTimestamp(unix_timestamp); + } else { + unix_timestamp = new MissingMappedField(""); + str_timestamp = new MissingMappedField(""); + } + } else { + unix_timestamp = new MissingMappedField(""); + str_timestamp = new MissingMappedField(""); + } + + let post_id = post['entityId'] ?? post['id']; + + let image_url; + if (post['imageSpec_orig']) { + image_url = post['imageSpec_orig']['url']; + } else if (post['images']?.orig?.url) { + image_url = post['images']['orig']['url']; + } else { + image_url = post['images']?.url; + } + + return new MappedItem({ + collected_from_url: normalize_url_encoding(post['__import_meta']?.source_platform_url ?? ""), + id: post_id, + thread_id: post_id, + author: post['pinner']?.username, + author_fullname: post['pinner']?.fullName ?? post['pinner']?.full_name ?? "", + author_original: post['nativeCreator'] ? post['nativeCreator'].username : post['pinner']?.username, + body: (post['description'] ?? "").trim(), + subject: (post['title'] ?? "").trim(), + ai_description: post['auto_alt_text'] ?? "", + pinner_original: post['originPinner'] ? post['originPinner'].fullName : "", + pinner_via: post['viaPinner'] ? post['viaPinner'].fullName : "", + board: post['board']?.name, + board_pins: post['board']?.pinCount ?? post['board']?.pin_count ?? null, + board_url: post['board']?.url ? `https://www.pinterest.com${post['board'].url}` : null, + timestamp: str_timestamp, + idea_tags: post['pinJoin'] ? (post['pinJoin']['visualAnnotation'] ?? []).join(",") : "", + url: `https://www.pinterest.com/pin/${post_id}`, + is_video: (post['isVideo'] ?? post['videos']) ? "yes" : "no", + image_url: image_url, + dominant_colour: post['dominantColor'] ?? post['dominant_color'] ?? 
null, + unix_timestamp: unix_timestamp + }); + } + + function map_item_from_html(post) { + return new MappedItem({ + collected_from_url: normalize_url_encoding(post['__import_meta']?.source_platform_url ?? ""), + id: Number.isSafeInteger(parseInt(post['id'], 10)) ? parseInt(post['id'], 10) : post['id'], /* pin ids can exceed Number.MAX_SAFE_INTEGER; a rounded number would point at a different pin, so keep the raw id in that case */ + thread_id: Number.isSafeInteger(parseInt(post['id'], 10)) ? parseInt(post['id'], 10) : post['id'], /* same rule as id so both fields always agree */ + author: new MissingMappedField(""), + author_fullname: new MissingMappedField(""), + author_original: new MissingMappedField(""), + body: (post['body'] ?? "").trim(), + subject: (post['title'] ?? "").trim(), + ai_description: new MissingMappedField(""), + pinner_original: new MissingMappedField(""), + pinner_via: new MissingMappedField(""), + board: new MissingMappedField(""), + board_pins: new MissingMappedField(""), + board_url: new MissingMappedField(""), + timestamp: new MissingMappedField(""), + idea_tags: (post['tags'] ?? []).join(","), + url: `https://www.pinterest.com/pin/${post['id']}`, + is_video: new MissingMappedField(""), + image_url: post['image'], + dominant_colour: new MissingMappedField(""), + unix_timestamp: new MissingMappedField("") + }); + } + + if (item['_zs-origin'] === 'html') { + return map_item_from_html(item); + } + return map_item_from_json(item); +} +// === end auto-generated === diff --git a/modules/rednote-comments.js b/modules/rednote-comments.js index 47f9d79..46911a3 100644 --- a/modules/rednote-comments.js +++ b/modules/rednote-comments.js @@ -52,4 +52,36 @@ export function capture(response, source_platform_url, source_url) { // no posts, no data return []; -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/xiaohongshu_comments/search_rednote_comments.py) +export function map_item(item) { + // Convert create_time (milliseconds) to Unix timestamp (seconds) + const createTimeMs = Number(item["create_time"]); + const unix_timestamp = Math.floor(createTimeMs / 1000); + // Format as "YYYY-MM-DD HH:MM:SS" using the global helper + const timestamp = 
formatUtcTimestamp(unix_timestamp); + + // Resolve optional import metadata URL + const collected_from_url = normalize_url_encoding(item["__import_meta"]?.["source_platform_url"] ?? ""); + + // ip_location may be missing or empty – use MissingMappedField in that case + const ip_location = item["ip_location"] ? item["ip_location"] : new MissingMappedField(""); + + return new MappedItem({ + collected_from_url: collected_from_url, + id: item["id"], + thread_id: item["note_id"], + url: `https://www.xiaohongshu.com/explore/${item["note_id"]}`, + body: item["content"] ?? "", + timestamp: timestamp, + author: item["user_info"]?.["nickname"] ?? "", + author_avatar_url: item["user_info"]?.["image"] ?? "", + ip_location: ip_location, + likes: item["like_count"], + replies: item["sub_comment_count"], + unix_timestamp: unix_timestamp + }); +} +// === end auto-generated === diff --git a/modules/rednote.js b/modules/rednote.js index 7471c92..e42d04f 100644 --- a/modules/rednote.js +++ b/modules/rednote.js @@ -103,4 +103,136 @@ export function capture(response, source_platform_url, source_url) { // no posts, no data return []; -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/xiaohongshu/search_rednote.py) +function map_item_from_json_api_explore(post) { + const item = post.type !== 'video' ? post.note_card : post; + const item_id = post.id ?? 
post.note_id; + + // Images handling + let images; + if (item.image_list) { + images = []; + for (const image of item.image_list) { + if (image.url_default) { + images.push(image.url_default); + } else if (image.info_list && image.info_list.length) { + let found = false; + for (const imgInfo of image.info_list) { + if (imgInfo.image_scene === 'WB_DFT') { + images.push(imgInfo.url); + found = true; + break; + } + } + if (!found) { + images.push(image.info_list[0].url); + } + } + } + } else if (item.cover) { + images = [item.cover.url_default]; + } else { + images = new MissingMappedField(""); + } + + const xsec_bit = post.xsec_token ? `?xsec_token=${post.xsec_token}` : ""; + const video_url = item.video?.media ? item.video.media.stream.h264[0].master_url : new MissingMappedField(""); + const author = item.user.nickname ?? item.user.nick_name; + const timestamp = item.time ?? null; + const timestampStr = timestamp ? formatUtcTimestamp(timestamp / 1000) : new MissingMappedField(""); + const hashtags = item.desc ? [...item.desc.matchAll(/#([^\s!@#$%^&*()_+{}:"|<>?\[\];',.\/`~]+)/g)].map(m => m[1]).join(",") : new MissingMappedField(""); + const body = item.desc ?? new MissingMappedField(""); + const image_urls = Array.isArray(images) ? images.join(",") : images; + const likes = item.interact_info?.liked_count ?? null; + const unix_ts = timestamp ? Math.floor(timestamp / 1000) : new MissingMappedField(""); + + return new MappedItem({ + collected_from_url: normalize_url_encoding(post.__import_meta?.source_platform_url ?? ""), + id: item_id, + thread_id: item_id, + url: `https://www.xiaohongshu.com/explore/${item_id}${xsec_bit}`, /* use the resolved id so items that only carry note_id do not produce an explore/undefined link */ + title: item.display_title ?? 
"", + body: body, + hashtags: hashtags, + timestamp: timestampStr, + author: author, + author_avatar_url: item.user.avatar, + image_urls: image_urls, + video_url: video_url, + likes: likes, + unix_timestamp: unix_ts, + }); +} + +function map_item_from_json_embedded(item) { + const note = item.note; + const image = note.imageList?.[0]?.urlDefault ?? new MissingMappedField(""); + const xsec_bit = note.xsecToken ? `?xsec_token=${note.xsecToken}` : ""; /* omit the query string when no token was captured, matching the API-explore mapper, instead of emitting xsec_token=undefined */ + const timestamp = note.time ?? null; + const timestampStr = timestamp ? formatUtcTimestamp(timestamp / 1000) : new MissingMappedField(""); + const hashtags = note.desc ? [...note.desc.matchAll(/#([^\s!@#$%^&*()_+{}:"|<>?\[\];',.\/`~]+)/g)].map(m => m[1]).join(",") : new MissingMappedField(""); + const body = note.desc ?? new MissingMappedField(""); + const author = note.user.nickname ?? note.user.nick_name; + const likes = note.interactInfo?.likedCount ?? + note.interact_info?.liked_count ?? + note.likes ?? + new MissingMappedField(""); + const unix_ts = timestamp ? Math.floor(timestamp / 1000) : new MissingMappedField(""); + + return new MappedItem({ + collected_from_url: normalize_url_encoding(item.__import_meta?.source_platform_url ?? ""), + id: item.id, + thread_id: item.id, + url: `https://www.xiaohongshu.com/explore/${item.id}${xsec_bit}`, + title: note.title ?? "", + body: body, + hashtags: hashtags, + timestamp: timestampStr, + author: author, + author_avatar_url: note.user.avatar, + image_url: image, + video_url: new MissingMappedField(""), + likes: likes, + unix_timestamp: unix_ts, + }); +} + +function map_item_from_html(item) { + return new MappedItem({ + collected_from_url: normalize_url_encoding(item.__import_meta?.source_platform_url ?? 
""), + id: item.id, + thread_id: item.id, + url: `https://www.xiaohongshu.com${item.url}`, + title: item.title, + body: new MissingMappedField(""), + hashtags: new MissingMappedField(""), + timestamp: new MissingMappedField(""), + author: item.author_name, + author_avatar_url: item.author_avatar_url, + image_url: item.thumbnail_url, + video_url: new MissingMappedField(""), + likes: item.likes, + unix_timestamp: new MissingMappedField(""), + }); +} + +export function map_item(post) { + // Reject tile stub items – minimal thumbnail entries with no content + if (!post.note_card && !post.user && post['_zs-origin'] !== 'html' && !post.note) { + const source = post.__import_meta?.source_url ?? ""; + throw new MapItemException(`Xiaohongshu tile stub without post content (source: ${source || 'unknown'})`); + } + if (post['_zs-origin'] === 'html') { + return map_item_from_html(post); + } else { + if (post.note) { + return map_item_from_json_embedded(post); + } else { + return map_item_from_json_api_explore(post); + } + } +} +// === end auto-generated === diff --git a/modules/threads.js b/modules/threads.js index 98ebfa5..e906fa5 100644 --- a/modules/threads.js +++ b/modules/threads.js @@ -69,4 +69,85 @@ export function capture(response, source_platform_url, source_url) { return item; } })] -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/threads/search_threads.py) +export function map_item(item) { + const post = item; + const timestampStr = post.taken_at != null ? 
formatUtcTimestamp(post.taken_at) : ""; + let imageUrls = []; + let videoUrls = []; + + if (post.carousel_media && post.carousel_media.length) { + for (const c of post.carousel_media) { + if (c.image_versions2 && c.image_versions2.candidates && c.image_versions2.candidates.length) { + const url = c.image_versions2.candidates[0].url; + if (url) imageUrls.push(url); + } + if (c.video_versions && c.video_versions.length) { + const vurl = c.video_versions[0].url; + if (vurl) videoUrls.push(vurl); + } + } + } else { + if (post.image_versions2 && post.image_versions2.candidates && post.image_versions2.candidates.length) { + const url = post.image_versions2.candidates[0].url; + if (url) imageUrls.push(url); + } + if (post.video_versions && post.video_versions.length) { + const vurl = post.video_versions[0].url; + if (vurl) videoUrls.push(vurl); + } + } + + const audioUrl = post.audio && post.audio.audio_src ? post.audio.audio_src : ""; + + let linkedUrl = ""; + let linkThumbnail = ""; + const linkPreview = post.text_post_app_info && post.text_post_app_info.link_preview_attachment; + if (linkPreview) { + linkedUrl = linkPreview.url || ""; + try { + const parsed = new URL(linkedUrl); + const uParam = parsed.searchParams.getAll('u'); + if (uParam.length) { + linkedUrl = uParam[0]; + linkThumbnail = linkPreview.image_url ?? ""; + } else { + linkThumbnail = linkedUrl; + } + } catch (e) { + linkThumbnail = linkedUrl; + } + } + + const hashtags = post.caption && post.caption.text + ? [...post.caption.text.matchAll(/#([^\s!@#$%ˆ&*()_+{}:"|<>?\[\];',./`~']+)/g)].map(m => m[1]).join(',') + : ""; + + return new MappedItem({ + collected_from_url: normalize_url_encoding(post.__import_meta?.source_platform_url ?? ""), + id: post.code, + thread_id: post.code, + url: `https://www.threads.com/@${post.user?.username ?? ""}/post/${post.code}`, + body: post.caption?.text ?? "", + timestamp: timestampStr, + author: post.user?.username ?? "", + author_is_verified: post.user?.is_verified ? 
"yes" : "no", + author_avatar: post.user?.profile_pic_url ?? null, + image_url: imageUrls.join(","), + video_url: videoUrls.join(","), + audio_url: audioUrl, + link_url: linkedUrl, + link_thumbnail_url: linkThumbnail ?? "", + is_paid_partnership: post.is_paid_partnership ? "yes" : "no", + likes: post.like_count, + reposts: post.text_post_app_info?.repost_count ?? 0, + replies: post.text_post_app_info?.direct_reply_count ?? 0, + quotes: post.text_post_app_info?.quote_count ?? 0, + hashtags: hashtags, + unix_timestamp: post.taken_at != null ? Math.floor(post.taken_at) : null + }); +} +// === end auto-generated === diff --git a/modules/tiktok-comments.js b/modules/tiktok-comments.js index 97b68b8..e41446a 100644 --- a/modules/tiktok-comments.js +++ b/modules/tiktok-comments.js @@ -29,4 +29,36 @@ export function capture(response, source_platform_url, source_url) { } return []; -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/tiktok_comments/search_tiktok_comments.py) +export function map_item(item) { + const timestamp = formatUtcTimestamp(item.create_time); + const thread_id = item.reply_id === "0" ? item.aweme_id : item.reply_id; + const avatar_url = item.user?.avatar_thumb?.url_list?.[0] ?? null; + const collected_from_url = normalize_url_encoding(item.__import_meta?.source_platform_url ?? ""); + const post_url = item.share_info?.url?.split(".html")[0] ?? null; + return new MappedItem({ + collected_from_url: collected_from_url, + id: item.cid, + thread_id: thread_id, + author: item.user?.unique_id ?? null, + author_full: item.user?.nickname ?? null, + author_avatar_url: avatar_url, + body: item.text ?? null, + timestamp: timestamp, + unix_timestamp: item.create_time, + likes: item.digg_count, + replies: item.reply_comment_total ?? 0, + post_id: item.aweme_id, + post_url: post_url, + post_body: item.share_info?.title ?? null, + comment_url: item.share_info?.url ?? 
null, + is_liked_by_post_author: !!item.author_pin ? "yes" : "no", + is_sticky: !!item.stick_position ? "yes" : "no", + is_comment_on_comment: item.reply_id === "0" ? "no" : "yes", + language_guess: item.comment_language ?? null + }); +} +// === end auto-generated === diff --git a/modules/tiktok.js b/modules/tiktok.js index 55e6fbf..586c9e1 100644 --- a/modules/tiktok.js +++ b/modules/tiktok.js @@ -103,4 +103,98 @@ export function capture(response, source_platform_url, source_url) { } else { return []; } -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/tiktok/search_tiktok.py) +export function map_item(post) { + // Zeeschuimer metadata + const metadata = post["__import_meta"] ?? {}; + + const challenges = (post["challenges"] ?? []).map(ch => ch.title); + + const hashtags = (post["textExtra"] ?? []).filter(extra => "hashtagName" in extra && extra.hashtagName).map(extra => extra.hashtagName); + + const labels = Array.isArray(post["diversificationLabels"]) ? post["diversificationLabels"].join(",") : ""; + + let user_nickname = ""; + let user_fullname = ""; + let user_thumbnail = ""; + if (post["author"] && typeof post["author"] === "object") { + // from intercepted API response + user_nickname = post["author"]["uniqueId"] ?? ""; + user_fullname = post["author"]["nickname"] ?? ""; + user_thumbnail = post["author"]["avatarThumb"] ?? ""; + } else if (post["author"]) { + // from embedded JSON object + user_nickname = post["author"] ?? ""; + user_fullname = post["nickname"] ?? 
""; + user_thumbnail = ""; + } + + // Determine the best thumbnail URL that hasn't expired yet + const thumbnail_options = []; + if (post["video"]?.shareCover) { + const shareCover = post["video"]["shareCover"]; + if (Array.isArray(shareCover) && shareCover.length) { + thumbnail_options.push(shareCover[shareCover.length - 1]); + } else if (typeof shareCover === "string") { + thumbnail_options.push(shareCover); + } + } + if (post["video"]?.cover) { + thumbnail_options.push(post["video"]["cover"]); + } + const now = Math.floor(Date.now() / 1000); + const validThumbnails = thumbnail_options.filter(url => { + if (!url) return false; + try { + const expiresStr = new URL(url).searchParams.get("x-expires"); + const expires = expiresStr ? parseInt(expiresStr, 10) : now; + return expires >= now; + } catch (e) { + return false; + } + }); + const thumbnail_url = validThumbnails.length ? validThumbnails[validThumbnails.length - 1] : ""; + + return new MappedItem({ + "collected_from_url": metadata["source_platform_url"] ? normalize_url_encoding(metadata["source_platform_url"]) : "", + "id": post["id"], + "thread_id": post["id"], + "author": user_nickname, + "author_full": user_fullname, + "author_followers": post["authorStats"]?.followerCount ?? "", + "author_likes": post["authorStats"]?.diggCount ?? "", + "author_videos": post["authorStats"]?.videoCount ?? "", + "author_avatar": user_thumbnail, + "body": post["desc"], + "stickers": (post["stickersOnItem"] ?? []).map(s => s.stickerText.join(" ")).join("\n"), + "timestamp": formatUtcTimestamp(parseInt(post["createTime"], 10)), + "unix_timestamp": parseInt(post["createTime"], 10), + "is_duet": (post["duetInfo"]?.duetFromId && post["duetInfo"]["duetFromId"] !== "0") ? "yes" : "no", + "is_ad": post["isAd"] ? "yes" : "no", + "is_paid_partnership": post["adAuthorization"] ? "yes" : "no", + "is_sensitive": post["maskType"] === 3 ? "yes" : "no", + "is_photosensitive": post["maskType"] === 4 ? 
"yes" : "no", + "music_name": post["music"]?.title ?? "", + "music_id": post["music"]?.id ?? "", + "music_url": post["music"]?.playUrl ?? "", + "music_thumbnail": post["music"]?.coverLarge ?? "", + "music_author": post["music"]?.authorName ?? "", + "video_url": post["video"]?.downloadAddr ?? "", + "tiktok_url": `https://www.tiktok.com/@${user_nickname}/video/${post["id"]}`, + "thumbnail_url": thumbnail_url, + "likes": post["stats"]?.diggCount, + "comments": post["stats"]?.commentCount, + "shares": post["stats"]?.shareCount, + "plays": post["stats"]?.playCount, + "hashtags": hashtags.join(","), + "challenges": challenges.join(","), + "diversification_labels": labels, + "location_created": post["locationCreated"] ?? "", + "effects": (post["effectStickers"] ?? []).map(e => e.name).join(","), + "warning": (post["warnInfo"] ?? []).map(w => w.text).join(",") + }); +} +// === end auto-generated === diff --git a/modules/truth.js b/modules/truth.js index fe626cf..ec6bdb1 100644 --- a/modules/truth.js +++ b/modules/truth.js @@ -35,4 +35,95 @@ export function capture(response, source_platform_url, source_url) { } return items; -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY === +// (regenerated from datasources/truth/search_truth.py) +export function map_item(item) { + const errors = []; + const postTime = new Date(item["created_at"]); + const images = []; + const videos = []; + const videoThumbs = []; + + if (item.media_attachments) { + for (const media of item.media_attachments) { + const mtype = media.type; + if (mtype === "image") { + images.push(media.url); + } else if (mtype === "video") { + videos.push(media.url); + videoThumbs.push(media.preview_url); + } else if (mtype === "tv") { + // Truth social TV channels – only a thumbnail is provided + videoThumbs.push(media.url); + // preview_url is a smaller thumb (ignored) + } else { + errors.push(`New media type: ${mtype}`); + } + } + } + + const group = 
item.group ? item.group : {}; + + let thread_id; + if (item.quote_id != null) { + thread_id = item.quote_id; + } else if (item.in_reply_to != null) { + let reply_to = item.in_reply_to; + while (reply_to) { + if (reply_to.in_reply_to != null) { + reply_to = reply_to.in_reply_to; + } else { + thread_id = reply_to.id; + break; + } + } + } else { + thread_id = item.id; + } + + const mentions = (item.mentions ?? []).map(m => m.username); + const hashtags = (item.tags ?? []).map(t => t.name); + + // Format timestamp as "YYYY-MM-DD HH:MM:SS" in UTC + const pad = n => String(n).padStart(2, "0"); + const timestamp = `${postTime.getUTCFullYear()}-${pad(postTime.getUTCMonth() + 1)}-${pad(postTime.getUTCDate())} ${pad(postTime.getUTCHours())}:${pad(postTime.getUTCMinutes())}:${pad(postTime.getUTCSeconds())}`; + + const mapped_item = { + collected_from_url: normalize_url_encoding(item.__import_meta?.source_platform_url ?? ""), + id: item.id, + created_at: item.created_at, + body: item.content, + url: item.url ?? null, + reblogs_count: item.reblogs_count ?? 0, + replies_count: item.replies_count ?? 0, + + account_id: item.account.id, + account_username: item.account.username, + account_display_name: item.account.display_name, + account_avatar: item.account.avatar, + account_verified: item.account.verified, + account_followers: item.account.followers_count, + account_following: item.account.following_count, + + mentions: mentions.join(","), + hashtags: hashtags.join(","), + + images: images.join(","), + video_thumbs: videoThumbs.join(","), + video_urls: videos.join(","), + + group_id: group.id ?? null, + group_display_name: group.display_name ?? null, + group_avatar: group.avatar ?? null, + group_note: group.note ?? null, + group_members_count: group.members_count ?? 
0, + + thread_id: thread_id, + timestamp: timestamp + }; + + return new MappedItem(mapped_item, errors.join("; ")); +} +// === end auto-generated === diff --git a/tests/.env.example b/tests/.env.example new file mode 100644 index 0000000..137a52b --- /dev/null +++ b/tests/.env.example @@ -0,0 +1,23 @@ +# 4CAT API config for the map_item comparator (`npm run test:compare`). +# Copy this file to .env in this directory and fill in real values. +# .env is gitignored; .env.example is the committed template. + +# Base URL of the 4CAT instance to hit. No trailing slash. Default ports: +# :80 for nginx (production) +# :4000 for the Flask dev server +FOURCAT_URL=http://localhost + +# API key for that 4CAT instance. Get one from the 4CAT UI; tied to your +# user. 4CAT accepts the raw key as the Authorization header value (no +# `Bearer ` prefix). +FOURCAT_API_KEY=your-api-key-here + +# Comma-separated list of dataset keys (the 32-char ids from 4CAT dataset +# URLs) to compare. The comparator pulls inputs from /download/ and +# expected outputs from +# /api/dataset//items/?annotations=no&missing_fields=keep&stream=true +# for each. Datasource is read from each dataset's metadata. +# +# `npm run test:compare -- ` narrows a single run to one key; the key +# must still be listed here. +FOURCAT_DATASETS=key1,key2,key3 diff --git a/tests/README.md b/tests/README.md index f1188e2..f203b60 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,31 +1,42 @@ ## Tests for Zeeschuimer -This folder contains **testing** code for Zeeschuimer. +This folder contains testing code for Zeeschuimer. 
There are four suites, +each with a different purpose and a different runtime environment: -### Integration Tests (Selenium) +| Suite | Tests | Environment | When it runs | Needs | +|----------------------------------|-----------------------------------------------------------|--------------------|---------------------------------|----------------------------------------| +| Selenium integration | Page captures real items from each supported platform | Real Firefox | Reviewer-supervised, manual | Firefox profile, sometimes a human | +| Duplicate-behavior unit (Jest) | DB merge / keep / update semantics in isolation | jsdom + fake-IDB | `npm test` (every push) | None | +| Module load smoke (Jest, Tier 1) | Each `modules/*.js` parses and imports cleanly | jsdom | `npm test` (every push) | None | +| `map_item` comparator (Jest, Tier 2) | JS `map_item` output matches 4CAT's Python mapping per item | jsdom + cross-fetch | `npm run test:compare` (on demand) | Live 4CAT, API key, dataset key(s) | -The Python + Selenium tests visit pages on supported platforms -and see how many items are captured. If the amount of items captured is -unexpectedly low or high, this is flagged and may indicate that Zeeschuimer no -longer properly captures data from the platform. +Hermetic suites (no external dependencies) live in `npm test`. Anything that +requires a real browser, a 4CAT server, or a human in the loop is opt-in. -These tests are **supervised** i.e. they require monitoring by a human and +### Integration tests (Selenium) + +The Python + Selenium tests visit pages on supported platforms and see how +many items are captured. If the amount of items captured is unexpectedly +low or high, this is flagged and may indicate that Zeeschuimer no longer +properly captures data from the platform.
+ +These tests are **supervised** — they require monitoring by a human and cannot run fully autonomously, since some platforms (TikTok in particular) occasionally show CAPTCHAs that need to be completed for a test to run successfully. This is also why Selenium does not run a headless Firefox. -The amount of items returned per page is somewhat variable for most platforms, -so if the number is slightly lower or higher than expected this is not -necessarily a problem (but worth checking). +The amount of items returned per page is somewhat variable for most +platforms, so if the number is slightly lower or higher than expected this +is not necessarily a problem (but worth checking). -Additionally, most platforms require logging in before (full) access to the UI -is available. The testing script borrows a Firefox profile directory from -elsewhere on the system to do this. It will try to find one automatically but -you can also pass one with the `--profiledir` argument. The idea is that you -log in to the various sites (Instagram, etc) in your 'normal' Firefox, and the -tests then borrow that login to interface with the website. +Most platforms require logging in before (full) access to the UI is +available. The testing script borrows a Firefox profile directory from +elsewhere on the system to do this. It will try to find one automatically +but you can also pass one with the `--profiledir` argument. Log in to the +various sites (Instagram, etc) in your 'normal' Firefox, and the tests then +borrow that login. -Run `test.py` to run tests. Required non-standard libraries are in +Run `test.py` to run tests. Required non-standard libraries are in `requirements.txt`. Tests are defined in `tests.json` with the following structure: @@ -35,49 +46,152 @@ Tests are defined in `tests.json` with the following structure: "platform id as in zeeschuimer (e.g. 'tiktok.com')": { "test case (e.g. 
'Home feed')": { "url": { - "expected": 0, # amount of items expected to be captured on this page - "more-after-scroll": false, # whether scrolling is supposed to load more items (currently unsupported) - "wait": 10 # wait time before checking number of items (optional, default 5) - } # more URLS can be added per test case + "expected": 0, + "more-after-scroll": false, + "wait": 10 + } } } } ``` -### Unit Tests (Jest) - -The JavaScript unit tests verify duplicate-handling logic in isolation using -a mocked Dexie database. These tests ensure that when the duplicate behavior -setting is changed, the correct existing record is selected for updates. +### Jest suites **Prerequisites** -- Node.js (v18 or later) and npm must be installed +- Node.js (v18 or later) and npm +- `cd tests && npm install` + +**Recommended: develop the tests inside Docker.** On Windows the global +permission model can make `npm install` / `npm test` awkward to run from +an arbitrary shell, and an agentic assistant working in auto-mode will +hit deny-rules before it can do a `cross-fetch`-style dependency spike. +Any minimal `node:20`-or-newer image with this repo mounted in is +enough — install what you need, run `npm install`, run `npm test` and +`npm run test:compare`. The host's `tests/.env` is picked up via the +mount, and `FOURCAT_URL` can point at a 4CAT reachable from the +container (`host.docker.internal` on Windows/Mac, the host IP on +Linux). + +#### Duplicate-behavior unit tests + +Verify duplicate-handling logic in isolation using a mocked Dexie database. +Ensures that when the duplicate behavior setting is changed, the correct +existing record is selected for updates. 
+ +Coverage: +- Schema upgrade backfills `last_updated` from `timestamp_collected` +- Compound index correctly selects most recent item by `last_updated` +- Forward-looking behavior: "keep" → "update" targets newest record +- Forward-looking behavior: "update" → "keep" creates new records +- Merge: shallow merge preserves fields from both records +- Skip: no modifications occur when duplicate found +- Platform isolation: same `item_id` on different platforms are independent +- Tie-breaker: when `last_updated` is equal, prefer higher `id` + +#### Module load smoke (Tier 1) + +For every file under `modules/*.js`, `tests/map_item.test.js` asserts the +module parses and imports without throwing. Modules with a `map_item` +export and modules without one both pass this tier — the goal is purely to +catch a generator that emits a syntax error or an import-time throw. + +No data is run through `map_item` here; that work belongs in the +comparator. + +#### `map_item` comparator (Tier 2) + +For every 4CAT dataset key listed in `FOURCAT_DATASETS`, +`tests/map_item_compare.test.js`: + +1. sends a HEAD to the items endpoint and reads the datasource id from its + `X-4CAT-Dataset-Datasource` response header (no metadata-endpoint call) +2. translates that id to a Zeeschuimer module name via + `zeeschuimer-to-4cat.json` (used in reverse) +3. fetches `/download/<key>` (NDJSON inputs, already wrapped via + `wrap_for_map_item` by Zeeschuimer pre-upload) and + `/api/dataset/<key>/items/?annotations=no&missing_fields=keep&stream=true` + (expected outputs from 4CAT's Python `map_item`, as NDJSON — `stream=true` + avoids the JSON form's `limit=100` pagination) +4.
runs each input through the local `map_item`, pairs each mapped + result with its expected output by mapped `id` (or by index, with a + warning, if `id` is missing on either side), and field-by-field diffs + the pair (4CAT's API-only aggregate `missing_fields` key is excluded; + per-field `{__missing:true}` markers are still compared) -**Setup** +The comparator does **not** exercise `wrap_for_map_item` itself — Zeeschuimer +applies it pre-storage and `/download/` returns post-wrap items. This +is an accepted gap; see `docs/map-item-test-plan.md`. -1. Install Node.js dependencies: - ```bash - cd tests - npm install - ``` +**Configuration:** copy `tests/.env.example` to `tests/.env` and set: +- `FOURCAT_URL` — base URL of the 4CAT instance (no trailing slash) +- `FOURCAT_API_KEY` — raw API key (no `Bearer ` prefix) +- `FOURCAT_DATASETS` — comma-separated list of dataset keys -**Running tests** +The comparator hard-errors at startup if any of these are missing. + +**Optional knob:** by default the comparator halts a dataset at its first +failing item (reporting the rest as one skipped "halted" placeholder). To +compare *every* item, pass `--all`: ```bash -npm test +npm run test:compare -- --all ``` -For watch mode during development: +`FAIL_FAST=0` (or `FAIL_FAST=false`) does the same, but prefer `--all`: an +inline `FAIL_FAST=0 npm run …` does not reliably reach node when npm/node is +the Windows binary run through WSL interop, and isn't env syntax in cmd.exe. +A CLI flag crosses every shell.
+ +### Running + ```bash +# everything that's hermetic — duplicate-behavior unit + module load smoke +npm test + +# watch mode for the same npm run test:watch + +# the comparator — every dataset key in FOURCAT_DATASETS +npm run test:compare + +# the comparator narrowed to one dataset key (must still appear in +# FOURCAT_DATASETS — protects against typos) +npm run test:compare -- + +# compare every item instead of halting at the first failure +npm run test:compare -- --all ``` -**Test coverage** -- Schema upgrade backfills `last_updated` from `timestamp_collected` -- Compound index correctly selects most recent item by `last_updated` -- Forward-looking behavior: switching from "keep" to "update" targets newest record -- Forward-looking behavior: switching from "update" to "keep" creates new records -- Merge behavior: shallow merge preserves fields from both records -- Skip behavior: no modifications occur when duplicate found -- Platform isolation: same `item_id` on different platforms are independent -- Tie-breaker: when `last_updated` is equal, prefer higher `id` +### Where does a new test go? + +- **Pure data transformation, no live external state, runs anywhere.** + Duplicate-behavior unit suite (DB logic) or the Tier 1 smoke + (`map_item` static checks). +- **Field-by-field correctness against 4CAT's Python `map_item`.** Tier 2 + comparator. Add a dataset to `FOURCAT_DATASETS` that covers the case; + the comparator will pick it up. +- **End-to-end user flow in the extension.** Selenium. + +### Why the environments differ + +The two Jest tiers run in **jsdom** rather than node env. The reasoning: + +- `map_item` bodies are pure data transformation, but four of them + (`gab`, `pinterest`, `rednote`, `truth`) call `strip_tags`, which + invokes `new DOMParser()`. jsdom provides a spec-compliant native + `DOMParser`; node env doesn't. +- jsdom doesn't ship `fetch`. 
The standard workaround + (`undici`) crashes inside jsdom because it pokes at + `clearImmediate` / `markResourceTiming` / fast-now timers that jsdom + shadows. `cross-fetch` wraps `node-fetch` v2 internally and doesn't + hit those Node internals, so it works in jsdom — the comparator + imports `cross-fetch/polyfill` to assign `globalThis.fetch`. + +The tradeoff is parser parity. `cross-fetch`-via-`node-fetch` and +jsdom's `DOMParser` are not byte-equal to Firefox's Gecko `DOMParser`, +which is what runs in production. Whitespace handling around `<br>` and +block elements is the usual suspect. If the comparator emits false- +positive diffs on text fields for the four `strip_tags` modules, the +right fix is to normalise whitespace in the comparator's `deep_equal` +rather than chase parser parity. The Selenium tier sits above and +provides the real-Gecko fidelity check. diff --git a/tests/_module-info.js b/tests/_module-info.js new file mode 100644 index 0000000..e6866a3 --- /dev/null +++ b/tests/_module-info.js @@ -0,0 +1,59 @@ +/** + * Shared helper for the map_item test drivers. + * + * Pre-validates a module by: + * 1. Running `node --check` on its file (syntax check; avoids the + * worker-killing experimental-ESM crash when a syntax error reaches + * the dynamic importer). + * 2. Dynamically importing it and checking for a `map_item` export. + * + * Results are cached per module name for the lifetime of this module instance. + * Jest loads it afresh per test file, so the cache only spans one file. + * + * Returns one of four states the test driver can branch on: + * { state: 'ok', map_item: <function> } + * { state: 'no_map_item' } + * { state: 'syntax_error', error: <string> } + * { state: 'import_error', error: <Error> } + */ + +import { spawnSync } from 'node:child_process'; +import { join, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const MODULES_ROOT = join(__dirname, '..', 'modules'); + +const syntax_cache = new Map(); +const inspect_cache = new Map(); + +function check_module_syntax(module_name) { + if (syntax_cache.has(module_name)) return syntax_cache.get(module_name); + const module_path = join(MODULES_ROOT, `${module_name}.js`); + const result = spawnSync(process.execPath, ['--check', module_path], { encoding: 'utf8' }); + const out = result.status === 0 + ?
null + : (result.stderr || result.stdout || `exit code ${result.status}`).trim(); + syntax_cache.set(module_name, out); + return out; +} + +export async function inspect_module(module_name) { + if (inspect_cache.has(module_name)) return inspect_cache.get(module_name); + const syntax_error = check_module_syntax(module_name); + let result; + if (syntax_error) { + result = { state: 'syntax_error', error: syntax_error }; + } else { + try { + const mod = await import(`../modules/${module_name}.js`); + result = typeof mod.map_item === 'function' + ? { state: 'ok', map_item: mod.map_item } + : { state: 'no_map_item' }; + } catch (e) { + result = { state: 'import_error', error: e }; + } + } + inspect_cache.set(module_name, result); + return result; +} diff --git a/tests/duplicate-behavior.test.js b/tests/duplicate-behavior.test.js index 031f663..9f0662b 100644 --- a/tests/duplicate-behavior.test.js +++ b/tests/duplicate-behavior.test.js @@ -5,8 +5,9 @@ * update or merge behaviors to duplicates across navigation boundaries. */ +import 'fake-indexeddb/auto'; + let Dexie; -require('fake-indexeddb/auto'); // Mock browser extension APIs global.browser = { diff --git a/tests/jest.compare.config.cjs b/tests/jest.compare.config.cjs new file mode 100644 index 0000000..070e2ff --- /dev/null +++ b/tests/jest.compare.config.cjs @@ -0,0 +1,20 @@ +// Tier 2 — live comparator against a 4CAT instance. +// +// Runs only `map_item_compare.test.js`. Requires FOURCAT_URL, +// FOURCAT_API_KEY, and FOURCAT_DATASETS to be set in tests/.env. Hard-errors +// rather than silently skipping if env is missing. +// +// Env is jsdom so that the four modules using `strip_tags` (gab, pinterest, +// rednote, truth) have a native DOMParser. The comparator uses cross-fetch +// to provide a jsdom-friendly fetch (jsdom doesn't ship fetch and undici +// crashes inside jsdom). 
+module.exports = { + testEnvironment: 'jsdom', + testMatch: ['**/map_item_compare.test.js'], + testPathIgnorePatterns: ['/node_modules/'], + transform: {}, + moduleFileExtensions: ['js', 'json'], + setupFiles: ['<rootDir>/setup-globals.cjs'], + testTimeout: 30000, + verbose: true +}; diff --git a/tests/jest.config.cjs b/tests/jest.config.cjs new file mode 100644 index 0000000..239abbc --- /dev/null +++ b/tests/jest.config.cjs @@ -0,0 +1,12 @@ +// Default Jest config — Tier 1 only (duplicate-behavior + load-only smoke). +// The comparator is excluded; invoke it via `npm run test:compare`. +module.exports = { + testEnvironment: 'jsdom', + testMatch: ['**/*.test.js'], + testPathIgnorePatterns: ['/node_modules/', 'map_item_compare\\.test\\.js$'], + transform: {}, + moduleFileExtensions: ['js', 'json'], + collectCoverageFrom: ['*.test.js'], + setupFiles: ['<rootDir>/setup-globals.cjs'], + verbose: true +}; diff --git a/tests/jest.config.js b/tests/jest.config.js deleted file mode 100644 index 7dd5b02..0000000 --- a/tests/jest.config.js +++ /dev/null @@ -1,8 +0,0 @@ -module.exports = { - testEnvironment: 'jsdom', - testMatch: ['**/*.test.js'], - transform: {}, - moduleFileExtensions: ['js', 'json'], - collectCoverageFrom: ['duplicate-behavior.test.js'], - verbose: true -}; diff --git a/tests/map_item.test.js b/tests/map_item.test.js new file mode 100644 index 0000000..774c083 --- /dev/null +++ b/tests/map_item.test.js @@ -0,0 +1,49 @@ +/** + * Load-only smoke for every module under `modules/*.js`. + * + * For each module file, runs `inspect_module()` and asserts the module: + * - parses (no SyntaxError) + * - imports without throwing + * - either exports a `map_item` function, or doesn't (both are fine here) + * + * No data is fed through `map_item`. That work belongs in the comparator + * (Tier 2 — `npm run test:compare`), where real items pulled from a 4CAT + * dataset provide both the input and the expected output.
+ * + * Catches: parse errors, import-time throws, broken top-level statements. + * Does NOT catch: anything that requires running `map_item` on real input. + */ + +import { readdirSync } from 'node:fs'; +import { join, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { inspect_module } from './_module-info.js'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const MODULES_ROOT = join(__dirname, '..', 'modules'); + +const module_files = readdirSync(MODULES_ROOT) + .filter(f => f.endsWith('.js') && !f.startsWith('_')); + +const module_info = {}; +for (const file of module_files) { + const name = file.replace(/\.js$/, ''); + module_info[name] = await inspect_module(name); +} + +describe('module load smoke', () => { + for (const file of module_files) { + const name = file.replace(/\.js$/, ''); + test(`modules/${file} loads cleanly`, () => { + const info = module_info[name]; + if (info.state === 'syntax_error') { + throw new Error(`syntax error in modules/${file}:\n${info.error}`); + } + if (info.state === 'import_error') { + throw new Error(`import failed for modules/${file}: ${info.error.message}`); + } + // 'ok' or 'no_map_item' — both acceptable at this tier. + expect(['ok', 'no_map_item']).toContain(info.state); + }); + } +}); diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js new file mode 100644 index 0000000..43c5283 --- /dev/null +++ b/tests/map_item_compare.test.js @@ -0,0 +1,590 @@ +/** + * Compare JS map_item output against 4CAT's Python map_item via dataset keys. + * + * For each 4CAT dataset key in FOURCAT_DATASETS, this test: + * 1. HEADs the items endpoint to read the datasource id from the + * `X-4CAT-Dataset-*` response headers (no metadata-endpoint dependency) + * 2. translates that id back to a Zeeschuimer module name via + * zeeschuimer-to-4cat.json (used in reverse) + * 3. inspects the local module (must export map_item) + * 4. 
fetches in parallel, both as NDJSON: + * /download/ -> INPUTS (post-wrap) + * /api/dataset//items/?annotations=no&missing_fields=keep&stream=true + * -> mapped EXPECTED OUTPUTS + * 5. runs each input through the local map_item, then pairs by the + * resulting MAPPED `id` — which can differ from the raw input id (e.g. + * instagram maps to the post shortcode, not the numeric pk) — and + * deep-equals each mapped result against the corresponding expected + * output. + * + * The items endpoint is fetched with `stream=true` (NDJSON): its JSON-array + * form paginates at `limit=100`, silently dropping rows on larger datasets. + * `annotations=no` drops processor-added fields; `missing_fields=keep` keeps + * unmapped fields as `{ __missing: true, value: "" }` markers (matching the JS + * side) and additionally adds a comma-joined `missing_fields` summary key. + * That summary is API-only — the JS map_item never emits it — so it is + * excluded from the diff (see API_ONLY_FIELDS); the per-field markers it + * summarizes are still compared. + * + * Items from /download/ already have `wrap_for_map_item` applied by + * Zeeschuimer pre-upload, so they're fed to map_item directly without + * re-wrapping. The trade-off is that this comparator does not exercise + * `wrap_for_map_item` itself — see docs/map-item-test-plan.md for the + * accepted-gap rationale. + * + * Environment notes (fetch + DOMParser): + * - jsdom env so `strip_tags` (used by gab/pinterest/rednote/truth) has + * a native DOMParser. + * - jsdom doesn't ship `fetch`. Spiked three candidates on 2026-06-03 + * under node:20-alpine: + * * `undici` — crashes at import in jsdom (pokes at + * clearImmediate/markResourceTiming/fast-now + * timers that jsdom shadows). + * * `node-fetch` v3 — imports clean but `res.text()` throws + * `ReferenceError: TextDecoder is not defined` + * (jsdom doesn't expose TextDecoder as a global). + * * `cross-fetch/polyfill` — clean import + working round-trip. 
+ * So this file imports `cross-fetch/polyfill`, which assigns + * `globalThis.fetch` when undefined. + * + * Invocation: + * npm run test:compare # runs every key in FOURCAT_DATASETS + * npm run test:compare -- # narrows to one key (must be in + * # FOURCAT_DATASETS to avoid typos) + * + * Hard-errors at registration time if FOURCAT_URL, FOURCAT_API_KEY, or + * FOURCAT_DATASETS is missing — by Tier 2 contract these are required. + */ + +import 'cross-fetch/polyfill'; +import 'dotenv/config'; +import { readFileSync, existsSync, writeFileSync } from 'node:fs'; +import { join, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { inspect_module } from './_module-info.js'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); + +// The end-of-run roll-up is written here, then printed by run-compare.mjs +// AFTER jest exits — jest buffers in-test stdout and hoists it above the +// result tree, so writing it from here directly would never land last. Keep +// in sync with the same constant in run-compare.mjs. +const SUMMARY_PATH = join(__dirname, '.compare-summary.txt'); + +const FOURCAT_URL = process.env.FOURCAT_URL?.replace(/\/$/, ''); +const FOURCAT_API_KEY = process.env.FOURCAT_API_KEY; + +// Hard-fail if env is missing — Tier 2 contract. +function require_env(name, value, placeholder_values = []) { + if (!value || placeholder_values.includes(value)) { + throw new Error( + `${name} is not configured. Set it in tests/.env (see tests/.env.example).` + ); + } + return value; +} +require_env('FOURCAT_URL', FOURCAT_URL); +require_env('FOURCAT_API_KEY', FOURCAT_API_KEY, ['your-api-key-here']); + +const FOURCAT_DATASETS = require_env( + 'FOURCAT_DATASETS', + process.env.FOURCAT_DATASETS, + ['key1,key2,key3'], +) + .split(',') + .map(k => k.trim()) + .filter(k => k.length > 0); + +if (FOURCAT_DATASETS.length === 0) { + throw new Error('FOURCAT_DATASETS parsed as empty. 
Set a comma-separated list of dataset keys in tests/.env.'); +} + +// Optional narrowing to a single dataset key. The `npm run test:compare -- +// ` form is handled by run-compare.mjs, which sets COMPARE_DATASET; jest +// itself would mis-read a bare key as a test-path-pattern filter and silently +// run nothing. A narrowed key must still be declared in FOURCAT_DATASETS — +// erroring on an unlisted key catches typos and keeps the dataset list the +// single source of truth. +const COMPARE_DATASET = process.env.COMPARE_DATASET?.trim() || undefined; +if (COMPARE_DATASET && !FOURCAT_DATASETS.includes(COMPARE_DATASET)) { + throw new Error( + `COMPARE_DATASET=${COMPARE_DATASET} is not listed in FOURCAT_DATASETS. ` + + `Add it to tests/.env before narrowing the run to it.` + ); +} + +const DATASET_KEYS_TO_RUN = COMPARE_DATASET ? [COMPARE_DATASET] : FOURCAT_DATASETS; + +// 4CAT datasource id -> Zeeschuimer module name. The on-disk map is +// authored in the natural direction (zeeschuimer -> 4cat); flip here. +const ID_MAP_PATH = join(__dirname, 'zeeschuimer-to-4cat.json'); +const ZEESCHUIMER_TO_4CAT = existsSync(ID_MAP_PATH) + ? JSON.parse(readFileSync(ID_MAP_PATH, 'utf8')) + : {}; +const FOURCAT_TO_ZEESCHUIMER = Object.fromEntries( + Object.entries(ZEESCHUIMER_TO_4CAT) + .filter(([k]) => !k.startsWith('_')) + .map(([z, f]) => [f, z]) +); + +// When true (default), comparison of a dataset stops at its first failing +// item; the remaining items are reported as a single skipped "halted" +// placeholder rather than one failure each. Disable it with the `--all` +// launcher flag (preferred — crosses every shell) or FAIL_FAST=0. Trim +// because `set FAIL_FAST=0 && ...` in cmd.exe includes the trailing space; +// treat both '0' and 'false' (case-insensitive) as off. +const FAIL_FAST_RAW = (process.env.FAIL_FAST ?? 
'').trim().toLowerCase(); +const FAIL_FAST = FAIL_FAST_RAW !== '0' && FAIL_FAST_RAW !== 'false'; + +function auth_headers(extra = {}) { + return { + // 4CAT accepts the raw key without a `Bearer ` prefix. + 'Authorization': FOURCAT_API_KEY, + ...extra, + }; +} + +async function fetch_headers(url) { + const res = await fetch(url, { method: 'HEAD', headers: auth_headers() }); + if (!res.ok) throw new Error(`HTTP ${res.status} from HEAD ${url}`); + return res.headers; +} + +async function fetch_ndjson(url) { + const res = await fetch(url, { headers: auth_headers() }); + const text = await res.text(); + if (!res.ok) throw new Error(`HTTP ${res.status} from ${url}: ${text}`); + return text + .split('\n') + .filter(line => line.trim().length > 0) + .map((line, i) => { + try { return JSON.parse(line); } + catch (e) { throw new Error(`bad NDJSON at line ${i} of ${url}: ${e.message}`); } + }); +} + +function normalize(value) { + return JSON.parse(JSON.stringify(value)); +} + +function looks_like_url(v) { + return typeof v === 'string' && /^https?:\/\//i.test(v); +} + +// Percent-decode for encoding-insensitive URL comparison. Decode each maximal +// %XX run on its own so a malformed sequence doesn't throw and abort the rest. +function decode_url_loose(s) { + return s.replace(/(?:%[0-9A-Fa-f]{2})+/g, run => { + try { return decodeURIComponent(run); } catch { return run; } + }); +} + +function deep_equal(a, b) { + if (a === b) return true; + if (a === null || b === null) return a === b; + if (typeof a !== typeof b) return false; + if (typeof a !== 'object') { + // Treat encoding-equivalent URLs as equal. The comparator targets bad + // data, not cosmetic percent-encoding differences: `=` vs `%3D` in a + // query value (and the like) resolve to the same URL, so 4CAT emitting + // one form while the JS normalizer emits the other is not a defect. + // Applied at the leaf so it covers URLs nested in arrays/objects too. 
+ // Tradeoff: this also collapses `%2F` vs `/`, which can be semantically + // distinct — accepted, as a genuinely different URL still differs once + // decoded. + if (looks_like_url(a) && looks_like_url(b)) { + return decode_url_loose(a) === decode_url_loose(b); + } + return false; + } + if (Array.isArray(a) !== Array.isArray(b)) return false; + if (Array.isArray(a)) { + if (a.length !== b.length) return false; + return a.every((v, i) => deep_equal(v, b[i])); + } + const a_keys = Object.keys(a); + const b_keys = Object.keys(b); + if (a_keys.length !== b_keys.length) return false; + return a_keys.every(k => k in b && deep_equal(a[k], b[k])); +} + +function diff_objects(js_obj, py_obj) { + const diffs = []; + const keys = new Set([...Object.keys(js_obj ?? {}), ...Object.keys(py_obj ?? {})]); + for (const key of keys) { + const in_js = js_obj && key in js_obj; + const in_py = py_obj && key in py_obj; + if (!in_js) { + diffs.push({ key, kind: 'only_python', python: py_obj[key] }); + } else if (!in_py) { + diffs.push({ key, kind: 'only_js', js: js_obj[key] }); + } else if (!deep_equal(js_obj[key], py_obj[key])) { + diffs.push({ key, kind: 'mismatch', js: js_obj[key], python: py_obj[key] }); + } + } + return diffs; +} + +function format_diffs(diffs) { + return diffs.map(d => { + if (d.kind === 'only_js') { + return ` + only in JS: ${d.key} = ${JSON.stringify(d.js)}`; + } + if (d.kind === 'only_python') { + return ` - only in Python: ${d.key} = ${JSON.stringify(d.python)}`; + } + return ` ~ ${d.key}\n JS: ${JSON.stringify(d.js)}\n Python: ${JSON.stringify(d.python)}`; + }).join('\n'); +} + +function format_error_with_location(err) { + if (!err) return String(err); + const message = err.message || String(err); + const stack = err.stack || ''; + const module_frames = stack.split('\n') + .filter(l => l.includes('/modules/') || l.includes('\\modules\\')) + .slice(0, 3) + .map(l => l.trim()); + return module_frames.length + ? 
// Map each input through the local map_item, then pair the mapped result
// against the expected output by `id`. Pairing MUST key on the mapped id:
// some modules emit an `id` that differs from the raw input id — instagram,
// for instance, maps to the post shortcode (`node.code`), not the numeric pk
// — so pairing raw input ids against the API's already-mapped ids would match
// nothing. Falls back to index pairing (with a logged warning) if either side
// lacks a usable id. A throw inside map_item is captured per-item and surfaced
// later as that item's failure.
/**
 * @param {Array} inputs - raw captured items, in stored order.
 * @param {Array} outputs - items as already mapped by 4CAT, in stored order.
 * @param {Function} map_item - the module's local mapper; may throw per item.
 * @param {string} dataset_key - only used to label the fallback warning.
 * @returns {{mode: 'id'|'index', pairs: Array, input_count: number,
 *            output_count: number, unmatched_inputs: string[],
 *            unmatched_outputs: string[]}}
 *   `pairs` entries are `{ input, js_result, error, expected, id }`. Items
 *   that could not be keyed appear in the unmatched lists as `'(no id)'`.
 */
function map_and_pair(inputs, outputs, map_item, dataset_key) {
    // Label for anything that has no id to report. Used for thrown inputs,
    // id-less mapped results and id-less outputs alike, so the pairing report
    // never contains a blank or the literal string "undefined".
    const NO_ID = '(no id)';

    // True only for objects carrying a non-null `id`. The typeof guard matters:
    // `'id' in value` throws a TypeError when `value` is a primitive, which a
    // mistranslated map_item can return.
    const has_usable_id = value =>
        typeof value === 'object' && value !== null && value.id != null;

    // Map every input up front so pairing can key on the mapped id.
    const mapped = inputs.map(input => {
        try {
            return { input, js_result: map_item(input), error: null };
        } catch (e) {
            return {
                input,
                js_result: null,
                error: new Error(`JS map_item threw: ${format_error_with_location(e)}`),
            };
        }
    });

    // Probe one sample from each side to decide the pairing mode.
    const probe_mapped = mapped.find(m => m.js_result)?.js_result;
    const has_id_mapped = has_usable_id(probe_mapped);
    const has_id_out = has_usable_id(outputs[0]);

    if (!has_id_mapped || !has_id_out) {
        // eslint-disable-next-line no-console
        console.warn(
            `[compare] ${dataset_key}: no usable 'id' on ${!has_id_mapped ? 'map_item output' : '/items'} ` +
            `side — falling back to index pairing for this dataset.`
        );
        const n = Math.min(mapped.length, outputs.length);
        return {
            mode: 'index',
            pairs: Array.from({ length: n }, (_, i) => ({
                input: mapped[i].input,
                js_result: mapped[i].js_result,
                error: mapped[i].error,
                expected: outputs[i],
                id: i,
            })),
            input_count: inputs.length,
            output_count: outputs.length,
            unmatched_inputs: [],
            unmatched_outputs: [],
        };
    }

    // An id is NOT guaranteed unique: some datasources re-emit the same post
    // across paginated/scroll responses (e.g. imgur gallery returns a post on
    // every page it appears on), so a key can legitimately recur with a
    // different `collected_from_url` per capture. Bucket outputs into a FIFO
    // queue per id rather than a single slot — then the k-th input occurrence
    // of an id pairs with the k-th output occurrence. Both endpoints stream the
    // dataset in the same stored order, so occurrences line up. (A plain
    // last-wins Map would cross-match occurrence #0 against the surviving
    // occurrence #N, fabricating field diffs and bogus unmatched ids.)
    const by_id_out = new Map();
    // Outputs without an id can never be keyed; count them instead of
    // bucketing them under String(undefined).
    let outputs_without_id = 0;
    for (const item of outputs) {
        if (!has_usable_id(item)) {
            outputs_without_id += 1;
            continue;
        }
        const k = String(item.id);
        if (!by_id_out.has(k)) by_id_out.set(k, []);
        by_id_out.get(k).push(item);
    }

    const pairs = [];
    const unmatched_inputs = [];
    for (const m of mapped) {
        // A throw produces no mapped id to pair on. Surface it as its own
        // failing item (labelled with the raw input id) rather than burying it
        // in the unmatched-id list — otherwise an id-transforming module hides
        // the actual map_item error behind a generic "unmatched input" report.
        if (m.error) {
            const label = m.input?.id != null ? String(m.input.id) : NO_ID;
            pairs.push({ input: m.input, js_result: null, error: m.error, expected: null, id: label });
            continue;
        }
        // Key on the mapped id; a successful map whose id matches no remaining
        // output occurrence is a genuine pairing miss and goes to unmatched_inputs.
        const lookup_id = has_usable_id(m.js_result) ? String(m.js_result.id) : null;
        const expected = lookup_id === null ? undefined : by_id_out.get(lookup_id)?.shift();
        if (expected !== undefined) {
            pairs.push({ input: m.input, js_result: m.js_result, error: null, expected, id: lookup_id });
        } else {
            unmatched_inputs.push(lookup_id ?? NO_ID);
        }
    }

    // Any output occurrences left in the queues had no matching input.
    const unmatched_outputs = [];
    for (const [id, queue] of by_id_out) {
        for (let i = 0; i < queue.length; i++) unmatched_outputs.push(id);
    }
    for (let i = 0; i < outputs_without_id; i++) unmatched_outputs.push(NO_ID);

    return {
        mode: 'id',
        pairs,
        input_count: inputs.length,
        output_count: outputs.length,
        unmatched_inputs,
        unmatched_outputs,
    };
}

// Recover the datasource id from a dataset's response headers. 4CAT exposes it
// directly as `X-4CAT-Dataset-Datasource`. Older responses may only carry
// `X-4CAT-Dataset-Type` (the datasource id with a `-search`/`-import` suffix),
// so fall back to stripping that — anchored to end-of-string because
// datasource ids can themselves contain hyphens (e.g. `xiaohongshu-comments`).
// The result is translated to a Zeeschuimer module via FOURCAT_TO_ZEESCHUIMER.
/**
 * @param {{get: Function}} headers - anything with a `get(name)` lookup that
 *   yields a falsy value for an absent header.
 * @returns {string|null} the trimmed datasource id, or null when neither
 *   header is present.
 */
function datasource_id_from_headers(headers) {
    const datasource = headers.get('x-4cat-dataset-datasource');
    if (datasource) return datasource.trim();
    const type = headers.get('x-4cat-dataset-type');
    if (type) return type.trim().replace(/-(search|import)$/, '');
    return null;
}

// Fields 4CAT's API attaches to every mapped item that the JS map_item never
// produces, so they would otherwise diff as spurious "only_python" entries.
// `missing_fields` is a comma-joined summary of which fields came back as
// MissingMappedField — redundant with the per-field `{__missing:true}`
// markers, which ARE compared.
const API_ONLY_FIELDS = new Set(['missing_fields']);

/**
 * Return a shallow copy of a plain object without the API-only keys.
 * Anything that is not a non-array object (null, primitives, arrays) is
 * handed back untouched.
 */
function strip_api_fields(obj) {
    const is_plain_record = Boolean(obj) && typeof obj === 'object' && !Array.isArray(obj);
    if (!is_plain_record) return obj;

    const kept = {};
    Object.keys(obj)
        .filter(field => !API_ONLY_FIELDS.has(field))
        .forEach(field => {
            kept[field] = obj[field];
        });
    return kept;
}

/**
 * Turn each pre-mapped pair into a `{ id, ok, message }` verdict.
 *
 * A pair carrying an `error` (its map_item threw during pairing) fails with
 * that error's message; otherwise the normalized JS result is diffed against
 * the normalized 4CAT output, API-only fields removed from both. When
 * FAIL_FAST is truthy the walk stops at the first failing pair and
 * `halted_count` records how many pairs after it were never looked at.
 *
 * @returns {{results: Array, halted_count: number}}
 */
function compare_pairs(pairs) {
    const results = [];
    let halted_count = 0;

    for (const [index, pair] of pairs.entries()) {
        const { id, js_result, error, expected } = pair;

        let message = null;
        if (error) {
            message = error.message;
        } else {
            const actual = strip_api_fields(normalize(js_result));
            const wanted = strip_api_fields(normalize(expected));
            const diffs = diff_objects(actual, wanted);
            if (diffs.length > 0) {
                message = `${diffs.length} field(s) differ between JS and 4CAT:\n${format_diffs(diffs)}`;
            }
        }

        const failed = message !== null;
        results.push({ id, ok: !failed, message });

        if (failed && FAIL_FAST) {
            halted_count = pairs.length - (index + 1);
            break;
        }
    }

    return { results, halted_count };
}

// Pre-pass: for each dataset, resolve the datasource (HEAD), fetch items, and
// run the comparison up front, so tests register with knowable counts and a
// deterministic pass/fail per item. Fetch/setup failures become a single
+const dataset_state = {}; +for (const key of DATASET_KEYS_TO_RUN) { + try { + // The same items URL serves double duty: a HEAD reveals the datasource + // (via X-4CAT-Dataset-* headers) with no body; the GET pulls the mapped + // rows. `stream=true` avoids the JSON form's limit=100 pagination, which + // would silently drop rows (and break id-pairing) on larger datasets. + const items_url = `${FOURCAT_URL}/api/dataset/${key}/items/?annotations=no&missing_fields=keep&stream=true`; + const headers = await fetch_headers(items_url); + const datasource_id = datasource_id_from_headers(headers); + if (!datasource_id) { + throw new Error( + `no datasource id in response headers for ${key} ` + + `(looked for X-4CAT-Dataset-Datasource / X-4CAT-Dataset-Type)` + ); + } + const module_name = FOURCAT_TO_ZEESCHUIMER[datasource_id] ?? datasource_id; + const module_state = await inspect_module(module_name); + + if (module_state.state === 'ok') { + const [inputs, outputs] = await Promise.all([ + fetch_ndjson(`${FOURCAT_URL}/download/${key}`), + fetch_ndjson(items_url), + ]); + const pairing = map_and_pair(inputs, outputs, module_state.map_item, key); + const comparison = compare_pairs(pairing.pairs); + dataset_state[key] = { datasource_id, module_name, module_state, pairing, comparison }; + } else { + dataset_state[key] = { datasource_id, module_name, module_state }; + } + } catch (e) { + dataset_state[key] = { error: e }; + } +} + +for (const dataset_key of DATASET_KEYS_TO_RUN) { + const info = dataset_state[dataset_key]; + + if (info.error) { + describe(`map_item compare: dataset ${dataset_key}`, () => { + test('setup', () => { throw info.error; }); + }); + continue; + } + + const { datasource_id, module_name, module_state, pairing, comparison } = info; + const label = `${dataset_key} (datasource: ${datasource_id}, module: ${module_name})`; + + if (module_state.state === 'no_map_item') { + describe(`map_item compare: ${label}`, () => { + test.skip(`modules/${module_name}.js has 
no map_item — nothing to compare`, () => {}); + }); + continue; + } + if (module_state.state === 'syntax_error' || module_state.state === 'import_error') { + const msg = module_state.state === 'syntax_error' + ? `syntax error:\n${module_state.error}` + : `import failed: ${module_state.error.message}`; + describe(`map_item compare: ${label}`, () => { + test('module loads', () => { throw new Error(msg); }); + }); + continue; + } + + describe(`map_item compare: ${label}`, () => { + test('pairing', () => { + const messages = []; + if (pairing.input_count !== pairing.output_count) { + messages.push( + `input count ${pairing.input_count} != output count ${pairing.output_count}` + ); + } + if (pairing.unmatched_inputs.length) { + const shown = pairing.unmatched_inputs.slice(0, 5).join(', '); + const extra = pairing.unmatched_inputs.length > 5 + ? ` (+${pairing.unmatched_inputs.length - 5} more)` + : ''; + messages.push(`unmatched input ids: ${shown}${extra}`); + } + if (pairing.unmatched_outputs.length) { + const shown = pairing.unmatched_outputs.slice(0, 5).join(', '); + const extra = pairing.unmatched_outputs.length > 5 + ? ` (+${pairing.unmatched_outputs.length - 5} more)` + : ''; + messages.push(`unmatched output ids: ${shown}${extra}`); + } + if (pairing.mode === 'index') { + messages.push(`paired by index (no usable 'id' field) — diffs may be misaligned`); + } + if (messages.length) throw new Error(messages.join('\n')); + }); + + comparison.results.forEach(({ id, ok, message }, i) => { + test(`item ${i} (id=${id})`, () => { + if (!ok) throw new Error(message); + }); + }); + + if (comparison.halted_count > 0) { + test.skip( + `halted after first failure — ${comparison.halted_count} later item(s) not compared ` + + `(pass --all, or set FAIL_FAST=0, to compare every item)`, + () => {}, + ); + } + }); +} + +// Reduce a dataset's pre-computed state to a single verdict + one-line detail. 
// Mirrors the assertions above exactly so the summary never disagrees with the
// per-test results: PASS only when pairing is clean AND every compared item
// matched; a FAIL_FAST halt leaves items unchecked, so it cannot be a PASS.
/**
 * @param {string} key - dataset key the row is reported under.
 * @param {object} info - that dataset's entry in `dataset_state`.
 * @returns {{key: string, status: 'PASS'|'FAIL'|'SKIP', datasource: string,
 *            module: string, detail: string}}
 */
function summarize_dataset(key, info) {
    if (info.error) {
        return { key, status: 'FAIL', datasource: '?', module: '?', detail: `setup error: ${info.error.message}` };
    }

    const { datasource_id, module_name, module_state, pairing, comparison } = info;
    // Every remaining row shares the same identifying columns.
    const verdict = (status, detail) => ({
        key,
        status,
        datasource: datasource_id,
        module: module_name,
        detail,
    });

    const load_state = module_state.state;
    if (load_state === 'no_map_item') {
        return verdict('SKIP', 'no map_item — nothing to compare');
    }
    if (load_state === 'syntax_error' || load_state === 'import_error') {
        return verdict('FAIL', `module ${load_state.replace('_', ' ')}`);
    }

    // Same four pairing checks, in the same order, as the 'pairing' test.
    const pairing_problems = [
        pairing.input_count !== pairing.output_count
            && `count ${pairing.input_count}!=${pairing.output_count}`,
        pairing.unmatched_inputs.length > 0
            && `${pairing.unmatched_inputs.length} unmatched input(s)`,
        pairing.unmatched_outputs.length > 0
            && `${pairing.unmatched_outputs.length} unmatched output(s)`,
        pairing.mode === 'index' && `paired by index`,
    ].filter(Boolean);

    const compared = comparison.results.length;
    const failed_items = comparison.results.filter(r => !r.ok).length;
    const total = pairing.pairs.length;

    if (pairing_problems.length === 0 && failed_items === 0) {
        return verdict('PASS', `${total}/${total} items match`);
    }

    const parts = [];
    if (pairing_problems.length) {
        parts.push(`pairing: ${pairing_problems.join(', ')}`);
    }
    if (failed_items) {
        const halted = comparison.halted_count > 0
            ? `, halted (+${comparison.halted_count} unchecked)`
            : '';
        parts.push(`${failed_items}/${compared} item(s) differ${halted}`);
    }
    return verdict('FAIL', parts.join('; '));
}

// Build the per-datasource roll-up once the whole file has run and stash it
// for run-compare.mjs to print as the genuine final output (see SUMMARY_PATH).
afterAll(() => {
    const rows = DATASET_KEYS_TO_RUN.map(key => summarize_dataset(key, dataset_state[key]));
    const w_status = 4; // PASS/FAIL/SKIP
    const w_module = Math.max(6, ...rows.map(r => r.module.length));

    const mark_for = { PASS: '✓', SKIP: '○' }; // every other status gets ✗
    const count_of = status => rows.filter(r => r.status === status).length;

    const lines = [
        '',
        '=== map_item compare summary ===',
        ...rows.map(r => {
            const mark = mark_for[r.status] ?? '✗';
            return ` ${mark} ${r.status.padEnd(w_status)} ${r.module.padEnd(w_module)} ${r.key} — ${r.detail}`;
        }),
        `${rows.length} datasource(s): ${count_of('PASS')} passed, ${count_of('FAIL')} failed, ${count_of('SKIP')} skipped`,
    ];
    writeFileSync(SUMMARY_PATH, lines.join('\n') + '\n');
});
"sha512-uKm5PU+MHTootlWEY+mZ4vvXoCn4fLQxT9dSc1sXVMSFkINTJVN8cAQROpwcKm8bJ/c7rgZVIBWzH5T78sNZZw==", + "dev": true, + "license": "MIT", + "dependencies": { + "node-fetch": "^2.7.0" + } + }, "node_modules/cross-spawn": { "version": "7.0.6", "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz", @@ -1758,6 +1770,19 @@ "node": ">=12" } }, + "node_modules/dotenv": { + "version": "16.6.1", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.6.1.tgz", + "integrity": "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow==", + "dev": true, + "license": "BSD-2-Clause", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://dotenvx.com" + } + }, "node_modules/dunder-proto": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", @@ -3466,6 +3491,52 @@ "dev": true, "license": "MIT" }, + "node_modules/node-fetch": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", + "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", + "dev": true, + "license": "MIT", + "dependencies": { + "whatwg-url": "^5.0.0" + }, + "engines": { + "node": "4.x || >=6.0.0" + }, + "peerDependencies": { + "encoding": "^0.1.0" + }, + "peerDependenciesMeta": { + "encoding": { + "optional": true + } + } + }, + "node_modules/node-fetch/node_modules/tr46": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", + "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==", + "dev": true, + "license": "MIT" + }, + "node_modules/node-fetch/node_modules/webidl-conversions": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", + "integrity": 
"sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==", + "dev": true, + "license": "BSD-2-Clause" + }, + "node_modules/node-fetch/node_modules/whatwg-url": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", + "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", + "dev": true, + "license": "MIT", + "dependencies": { + "tr46": "~0.0.3", + "webidl-conversions": "^3.0.0" + } + }, "node_modules/node-int64": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/node-int64/-/node-int64-0.4.0.tgz", diff --git a/tests/package.json b/tests/package.json index dc3654c..763321c 100644 --- a/tests/package.json +++ b/tests/package.json @@ -1,13 +1,17 @@ { "name": "zeeschuimer-db-tests", "version": "1.0.0", - "description": "Unit tests for Zeeschuimer duplicate handling logic", + "description": "Unit tests for Zeeschuimer duplicate handling logic and map_item generator output", + "type": "module", "scripts": { - "test": "jest", - "test:watch": "jest --watch" + "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --config jest.config.cjs", + "test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --config jest.config.cjs --watch", + "test:compare": "node run-compare.mjs" }, "devDependencies": { + "cross-fetch": "^4.0.0", "dexie": "^3.2.4", + "dotenv": "^16.4.5", "fake-indexeddb": "^5.0.1", "jest": "^29.7.0", "jest-environment-jsdom": "^29.7.0" diff --git a/tests/run-compare.mjs b/tests/run-compare.mjs new file mode 100644 index 0000000..bc7e88f --- /dev/null +++ b/tests/run-compare.mjs @@ -0,0 +1,71 @@ +/** + * Launcher for the Tier 2 map_item comparator (`npm run test:compare`). 
+ * + * npm run test:compare -> compares every key in FOURCAT_DATASETS + * npm run test:compare -- -> narrows the run to a single key + * npm run test:compare -- --all -> compare every item (no fail-fast) + * npm run test:compare -- -t "id=123" -> key + forwarded jest flags + * + * Why this exists instead of invoking jest directly: jest treats any bare + * positional argument as a test-path-pattern filter. A 4CAT dataset key + * (`5daeba72a2dfbb5ed8c855f824a61570`) matches no test file path, so + * `jest ` silently discovers zero tests and exits "green" having run + * nothing. This launcher intercepts the first non-flag argument, hands it to + * the comparator through the COMPARE_DATASET env var, and forwards only the + * remaining flags to jest — so the key never reaches jest's argv. + */ + +import { spawn } from 'node:child_process'; +import { fileURLToPath } from 'node:url'; +import { dirname, join } from 'node:path'; +import { readFileSync, rmSync } from 'node:fs'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const args = process.argv.slice(2); + +// The comparator writes its roll-up here (jest buffers in-test stdout and +// hoists it above the result tree, so we print it from this launcher after +// jest exits to make it the genuine last output). Keep in sync with the same +// constant in map_item_compare.test.js. +const SUMMARY_PATH = join(__dirname, '.compare-summary.txt'); +// Drop any stale summary up front so a crashed run can't print the prior one. +rmSync(SUMMARY_PATH, { force: true }); + +// First non-flag arg (if any) is the dataset key to narrow to. +const dataset_key = args.find(a => !a.startsWith('-')); +const flags = args.filter(a => a !== dataset_key); + +// `--all` (alias `--no-fail-fast`) compares every item instead of halting at +// the first failure. 
It's offered as a flag, not only via the FAIL_FAST env +// var, because `FAIL_FAST=0 npm run ...` does not reliably reach node when +// npm/node is the Windows binary invoked through WSL interop, and isn't env +// syntax at all in cmd.exe. A CLI flag crosses every shell; the env var still +// works where it propagates. +const disable_fail_fast = flags.includes('--all') || flags.includes('--no-fail-fast'); +const jest_flags = flags.filter(f => f !== '--all' && f !== '--no-fail-fast'); + +const env = { ...process.env }; +if (dataset_key) env.COMPARE_DATASET = dataset_key; +if (disable_fail_fast) env.FAIL_FAST = '0'; + +const jest_bin = join(__dirname, 'node_modules', 'jest', 'bin', 'jest.js'); +const child = spawn( + process.execPath, + ['--experimental-vm-modules', jest_bin, '--config', 'jest.compare.config.cjs', ...jest_flags], + { stdio: 'inherit', cwd: __dirname, env }, +); + +child.on('exit', code => { + // Print the roll-up after jest's own tally so it's the last thing on screen. + try { + process.stdout.write(readFileSync(SUMMARY_PATH, 'utf8')); + rmSync(SUMMARY_PATH, { force: true }); + } catch { + // No summary file (e.g. setup threw before afterAll) — nothing to print. + } + process.exit(code ?? 1); +}); +child.on('error', err => { + console.error(`failed to launch jest: ${err.message}`); + process.exit(1); +}); diff --git a/tests/setup-globals.cjs b/tests/setup-globals.cjs new file mode 100644 index 0000000..b55e659 --- /dev/null +++ b/tests/setup-globals.cjs @@ -0,0 +1,45 @@ +/** + * Make js/lib.js's helpers available as globals inside the Jest test + * environment, mirroring how the browser sees them after the manifest + * loads lib.js as a plain script. + * + * map_item bodies reference these as free identifiers (MappedItem, + * MissingMappedField, strip_tags, normalize_url_encoding, ...). Without + * this shim they'd hit ReferenceError as soon as a test invokes map_item. 
+ * + * Names are auto-discovered from lib.js by regex-matching top-level + * `function name(...)` and `class Name ...` declarations. Adding a helper + * to lib.js makes it available to tests without touching this file. + */ + +const fs = require('node:fs'); +const path = require('node:path'); + +const lib_source = fs.readFileSync( + path.join(__dirname, '..', 'js', 'lib.js'), + 'utf8', +); + +// Match `function name(` and `class Name {` / `class Name extends` at +// column 0 of a line. lib.js is a classic script with all top-level +// declarations unindented; requiring column 0 keeps nested helpers (like +// the `_traverse_data` IIFE inside `traverse_data`) from being exposed. +const NAME_PATTERN = /^(?:function|class)\s+([A-Za-z_$][A-Za-z0-9_$]*)\b/gm; +const EXPOSED_NAMES = Array.from( + lib_source.matchAll(NAME_PATTERN), + m => m[1], +); + +if (EXPOSED_NAMES.length === 0) { + throw new Error( + 'setup-globals.cjs: no top-level function/class declarations found in js/lib.js — ' + + 'auto-discovery regex may be broken. Tests will ReferenceError if not fixed.' + ); +} + +const factory = new Function(` +${lib_source} +return { ${EXPOSED_NAMES.join(', ')} }; +`); + +Object.assign(globalThis, factory()); diff --git a/tests/translation-errors.md b/tests/translation-errors.md new file mode 100644 index 0000000..fcc160d --- /dev/null +++ b/tests/translation-errors.md @@ -0,0 +1,430 @@ +# Auto-generator translation errors + +Patterns of incorrect Python → JavaScript translation observed in +auto-generated `modules/*.js` files. Each entry has a search pattern so +this doc doubles as a checklist when reviewing a new auto-generator PR. + +When an entry is fixed at the generator level (no longer appears in +fresh output), mark it `[fixed]` and keep the entry around — useful +history when something regresses. + +## How to use + +- Found a new pattern? Add an entry below following the template. +- Reviewing a generator PR? 
`grep` each `Search pattern` against the + changed module files. Anything that hits is worth a manual look. +- Iterating on the generator prompt? The "Why" lines are the + feedback to add — they describe the exact Python-vs-JS semantic + difference the LLM keeps missing. + +## Template + +```` +### <short name of the pattern> + +**Status:** open | fixed in generator | accepted + +**Why it happens:** <the Python-vs-JS semantic difference the generator missed> + +**Wrong JS:** +```js +<incorrect generated code> +``` + +**Correct JS:** +```js +<corrected code> +``` + +**Example:** `modules/<module>.js:<line>` + +**Search pattern:** `<regex or grep command>` +```` + +--- + +## Observed patterns + +### `in` operator on strings + +**Status:** open + +**Why it happens:** In Python, `"x" in some_string` is a substring check. +In JavaScript, the `in` operator only works on **objects** and checks for +property/key existence; using it with a string on the right-hand side +throws `TypeError: cannot use 'in' operator to search for "x" in <string>`. + +**Wrong JS:** +```js +const is_polaris = '__typename' in item && 'polaris' in item.__typename.toLowerCase(); +``` + +**Correct JS:** +```js +const is_polaris = '__typename' in item && item.__typename.toLowerCase().includes('polaris'); +``` + +**Example:** `modules/instagram.js:513` + +**Search pattern:** `'[^']+' in [a-zA-Z_$][\w$]*\.` — quoted string followed +by `in` followed by a method call. Quick rough check: `grep -E "' in [a-zA-Z]" modules/` + +**Watch out for partial fixes:** seen as `'polaris' in (item.__typename ?? '').toLowerCase()` +— adding `?? ''` guards against `undefined` but the `in` operator itself +still throws on the resulting *string*. The fix is `.includes()`, not just +defaulting the operand. + +--- + +### Python f-string syntax left in single-quoted JS strings + +**Status:** open + +**Why it happens:** Python `f"... {var} ..."` interpolates. JS uses +template literals (backticks) with `${var}`. The auto-generator leaves the +`{var}` notation in a regular single- or double-quoted JS string, which is +just literal text — no interpolation happens. 
+ +**Wrong JS:** +```js +throw new MapItemException('Unable to parse item: different user {user.id} and owner {owner.id}'); +``` + +**Correct JS:** +```js +throw new MapItemException(`Unable to parse item: different user ${user.id} and owner ${owner.id}`); +``` + +**Example:** `modules/instagram.js:754` + +**Search pattern:** `'[^']*\{[a-zA-Z_$][\w$.]*\}[^']*'` or `"[^"]*\{[a-zA-Z_$][\w$.]*\}[^"]*"` +— a non-template-literal string containing `{identifier}` or `{identifier.path}`. +Quick check: `grep -nE "['\"][^'\"]*\{[a-zA-Z_][a-zA-Z0-9_.]*\}[^'\"]*['\"]" modules/` + +--- + +### `?? {}` default that defeats subsequent truthy checks + +**Status:** open + +**Why it happens:** When porting Python's `node.get('user') or {}` (which is +intended to make subsequent code safe to call), the generator emits +`node.user ?? {}`. That's a *valid* Python-equivalent, **but** any following +`if (user && owner) { ... }` guard then never short-circuits because both +`{}` references are truthy. The check ends up reading "if user and owner +*objects* exist" when the intent was "if user and owner data exist." +Subsequent property accesses then compare real ids/usernames against +`undefined` on the missing side, often throwing. + +**Wrong JS:** +```js +const user = node.user ?? {}; +const owner = node.owner ?? {}; +if (user && owner) { + if (user.id === owner.id) { /* … */ } + else if (user.username !== owner.username) { + throw new MapItemException('different user and owner'); + } +} +``` + +**Correct JS** (depending on intent — pick one): +```js +// (a) drop the defaults so truthy guard means "both present" +const user = node.user; +const owner = node.owner; +if (user && owner) { /* compare */ } +``` +```js +// (b) check for actual content, not just object identity +const user = node.user ?? {}; +const owner = node.owner ?? 
{}; +if (Object.keys(user).length && Object.keys(owner).length) { /* compare */ } +``` + +**Example:** `modules/instagram.js:748-756` + +**Search pattern:** `\?\?\s*\{\s*\}` — any `?? {}` occurrence is worth a +review of subsequent guards. Quick check: `grep -nE "\?\?\s*\{\s*\}" modules/` + +--- + +### Bare relative path as a statement (junk auto-imports section) + +**Status:** open + +**Why it happens:** The generator emits an "auto-generated imports" marker +block at the top of the module but writes the import target as a bare +relative path on its own line (`../js/lib.js`) instead of a real `import` +statement. JS parses that as `..` then `.` then `/js/lib.js` — syntax error. + +**Wrong JS:** +```js +// === auto-generated imports for map_item — DO NOT EDIT BY HAND === +../js/lib.js +// === end auto-generated imports === +``` + +**Correct JS** (one of): +```js +// === auto-generated imports — DO NOT EDIT BY HAND === +// Provided as globals by js/lib.js (loaded via manifest.json): +// MappedItem, MissingMappedField, MapItemException, traverse_data, +// strip_tags, normalize_url_encoding, formatUtcTimestamp +// === end auto-generated imports === +``` + +Or, if a real import is intended, an ESM import with named bindings: +```js +import { MappedItem, MissingMappedField } from '../js/lib.js'; +``` + +**Example:** seen historically in `modules/tiktok.js:2` + +**Search pattern:** `^\.\./` at the start of a line in module files. +Quick check: `grep -nE "^\.\." modules/*.js` + +--- + +### Key-existence check (`'X' in obj`) used where Python intended value-truthiness (`obj.get('X')`) + +**Status:** open + +**Why it happens:** Python's `if node.get('usertags'):` is a *truthy check on +the value* — returns False if the key is missing **or** if the value is +`None`/empty/falsy. The generator translates this to `if ('usertags' in +node)`, which in JS is a *key-existence check* — returns True even when +the value is `null`. 
Subsequent property accesses on the null value then +throw `Cannot read properties of null`. + +**Wrong JS:** +```js +const usertags = 'usertags' in node ? node.usertags.in.map(...).join(',') : ''; +// node.usertags can be null → .in.map blows up +``` + +**Correct JS:** +```js +const usertags = node.usertags ? node.usertags.in.map(...).join(',') : ''; +``` + +**Example:** `modules/instagram.js:777` + +**Search pattern:** `'[^']+' in [a-zA-Z_$][\w$]*\s*\?` — quoted-string `in` +identifier followed by `?` (ternary). Quick check: +`grep -nE "'[^']+' in [a-zA-Z_]+ \?" modules/` + +--- + +### Datetime serialization format mismatch + +**Status:** open + +**Why it happens:** Python's `datetime.utcfromtimestamp(t).strftime('%Y-%m-%d %H:%M:%S')` +produces `"2026-05-13 21:27:31"` — space-separated, no timezone marker. JS's +`new Date(t * 1000).toISOString()` produces `"2026-05-13T21:27:31.000Z"` — T +separator, milliseconds, Z. The generator emits the JS `.toISOString()` form +instead of using the existing `formatUtcTimestamp` helper from lib.js that +mimics Python's output exactly. + +**Wrong JS:** +```js +collected_at = new Date(node.taken_at * 1000).toISOString(); +``` + +**Correct JS:** +```js +collected_at = formatUtcTimestamp(node.taken_at); +// formatUtcTimestamp is defined in js/lib.js as: +// new Date(unixSeconds * 1000).toISOString().replace('T', ' ').slice(0, 19) +``` + +**Example:** `modules/instagram.js:782` + +**Search pattern:** `new Date\([^)]+\)\.toISOString\(\)` — any use of +`.toISOString()`. The helper should be used instead. Quick check: +`grep -nE "\.toISOString\(\)" modules/` + +--- + +### `re.findall` capture groups vs JS `.match` with /g flag + +**Status:** open + +**Why it happens:** Python's `re.findall(r'#(\w+)', s)` returns the **capture +group contents**: `['lotr', 'woodart']`. JS's `s.match(/#(\w+)/g)` (with the +global flag) returns the **full matches**: `['#lotr', '#woodart']` — capture +groups are ignored. 
The generator translates the regex literally without +adjusting for this semantic difference, so the resulting strings keep +prefixes/wrappers that Python would have stripped. + +**Wrong JS:** +```js +hashtags: caption.match(/#([^\s!@#$%^&*()_+{}:"|<>?;',./`~]+)/g)?.join(',') +// produces "#lotr,#woodart" +``` + +**Correct JS:** +```js +// Option A: strip the literal prefix from each full match +hashtags: caption.match(/#([^\s...]+)/g)?.map(h => h.slice(1)).join(',') ?? '' +// Option B: use matchAll to get capture groups properly +hashtags: [...caption.matchAll(/#([^\s...]+)/g)].map(m => m[1]).join(',') ?? '' +``` + +**Example:** `modules/instagram.js:812` (also 766, 870 — three copies) + +**Search pattern:** `\.match\(/[^/]*\([^/]*\)[^/]*/g\)` — any `.match()` with +a global-flag regex containing a capture group. Quick check: +`grep -nE "\.match\(/.*\(.*\).*\/g\)" modules/` + +--- + +### `undefined` field values get dropped from JSON, but Python's `None` becomes `null` + +**Status:** open + +**Why it happens:** When `JSON.stringify` encounters an object property whose +value is `undefined`, it **omits the key entirely** from the output. Python's +`json.dumps` serializes `None` as `null`, keeping the key. The generator +writes assignments like `location.city = node.location.city` where the +right-hand side can be `undefined`, producing missing keys in JS output +that show up as `only in Python: = null` diffs against 4CAT. + +**Wrong JS:** +```js +location.city = node.location.city; // undefined if .city missing +// JSON.stringify({location_city: undefined}) → "{}" (key omitted) + +body: caption, // null if no caption — Python returns "" here, not null +``` + +**Correct JS:** +```js +// Whichever fallback Python uses for that specific field: +location.city = node.location.city ?? null; // some fields → null +body: caption ?? 
'', // other fields → "" +``` + +**Example:** `modules/instagram.js:745, 853` (`null` flavor), +559, 648, 798 (`""` flavor for `body`) + +**Note:** Python's choice of `None` vs `""` is per-field — there's no +universal rule. When the comparator reports `~ X JS: null Python: ""` use +`?? ''`. When it reports `- only in Python: X = null` use `?? null`. The +distinction matters because the JS output should match Python's choice +exactly for that field. + +**Search pattern:** harder to grep automatically — any property assignment +where the RHS could be `undefined`/`null` and the resulting field is +expected to appear in the mapped output. Look at "only in Python: X = null" +and "~ X JS: null Python: \"\"" diffs in the comparator output to find +specific cases. + +--- + +### Object-reference inequality used as type check + +**Status:** open + +**Why it happens:** The generator emits `caption !== new MissingMappedField('')` +to mean "caption is not a missing-marker", but `new MissingMappedField('')` +creates a fresh object every time, and `!==` on objects compares references. +The expression is **always true**, so the conditional never takes the +"missing" branch. Likely originates from Python idioms like `caption != ""` +or `caption is not None`, mistranslated through the MissingMappedField +abstraction. + +**Wrong JS:** +```js +hashtags: caption !== new MissingMappedField('') ? caption.match(...) : '', +// !== between two different object references is always true +``` + +**Correct JS:** +```js +// If the intent was "if caption has content", just truthy-check it: +hashtags: caption ? caption.match(...) : '', +// If the intent was "if caption is not a MissingMappedField instance": +hashtags: !(caption instanceof MissingMappedField) ? caption.match(...) : '', +``` + +**Example:** `modules/instagram.js:812` (and two other copies) + +**Search pattern:** `!== new [A-Z]` or `=== new [A-Z]` — any equality +comparison with a freshly-constructed object. 
Quick check: +`grep -rnE "(!==|===) new [A-Z]" modules/` + +--- + +### `.method()` chain on potentially-null result + +**Status:** open + +**Why it happens:** In Python, calling a method on `None` raises +`AttributeError`, which 4CAT sometimes catches. In JS, calling a method on +`null`/`undefined` throws `TypeError: Cannot read properties of null +(reading '<method>')`. The generator emits the same dotted chain without +optional-chaining (`?.`) protection. + +**Wrong JS:** +```js +hashtags: caption !== new MissingMappedField('') + ? caption.match(/#([^\s!@#$%^&*()_+{}:"|<>?;',./`~]+)/g)?.join(',') + : '', +``` +(here `caption` is allowed to be `null`, so `caption.match(...)` blows up +on null caption) + +**Correct JS:** +```js +hashtags: caption + ? caption.match(/#([^\s!@#$%^&*()_+{}:"|<>?;',./`~]+)/g)?.join(',') ?? '' + : '', +``` + +**Example:** `modules/instagram.js:809` + +**Search pattern:** harder to grep — needs reading. Worth manual review of +any field that uses `caption.match`, `something.split`, `something.join` +without `?.` on a value that could be null/undefined. + +--- + +## Generator prompt feedback (running list) + +Concrete things to fold into the generator's prompt over time: + +1. **Python `x in y` where `y` is a string** → use `y.includes(x)` in JS, + never `x in y`. +2. **Python f-strings** → use JS template literals (backticks) with + `${...}` syntax. Never leave `{...}` in single- or double-quoted strings. +3. **`?? {}` after a `.get(...) or {}` translation** → only use this if the + following code does property-access. If the following code does a + truthy guard (`if (x && y)`), drop the default and use just `node.user`. +4. **Method chains on possibly-null values** → use `?.` (optional + chaining) instead of `.` whenever the receiver could be null/undefined. +5. **The auto-imports header block** → emit either real `import { ... }` + statements with valid relative paths, or a comment-only header. + Never emit bare paths as JS statements. +6. 
**Python `node.get('X')` truthy check** → in JS, use `node.X` (or + `node.X != null`), not `'X' in node`. The `in` operator checks key + existence, which is True even for explicit-null values. +7. **Datetime serialization** → use the `formatUtcTimestamp` helper from + lib.js (which mimics Python's `strftime('%Y-%m-%d %H:%M:%S')` format), + not `new Date(...).toISOString()` (which has a different output shape: + T separator, milliseconds, Z suffix). +8. **`re.findall` with capture groups** → in JS, `.match(/.../g)` returns + full matches, NOT capture groups. To get capture-group behavior, use + either `[...s.matchAll(/.../g)].map(m => m[1])` or post-process the + full matches with `.map(...)` to strip the literal parts. +9. **Object-reference equality (`!== new X(...)`)** → never. Creating an + object with `new` produces a fresh reference; `===`/`!==` compares + identity. Use `instanceof X` for type checks, or compare values + directly. The MissingMappedField "is this missing?" check should be + `caption instanceof MissingMappedField` or just truthy-check the value. +10. **Python `None` → JSON `null` vs JS `undefined` → omitted** — when a + field's value could be missing and Python returns `null` for it, + JS must explicitly assign `null` (not leave the value as `undefined`). + `JSON.stringify` drops `undefined` keys silently. Use `value ?? null` + when the field is expected to appear in the mapped output. diff --git a/tests/zeeschuimer-to-4cat.json b/tests/zeeschuimer-to-4cat.json new file mode 100644 index 0000000..f7de942 --- /dev/null +++ b/tests/zeeschuimer-to-4cat.json @@ -0,0 +1,7 @@ +{ + "_comment": "Maps Zeeschuimer module filenames (without .js) to 4CAT datasource ids when they differ. Default behavior is identity — only include entries where the two diverge. Discovered via http://localhost/api/datasources/.", + "9gag": "ninegag", + "truth": "truthsocial", + "rednote": "xiaohongshu", + "rednote-comments": "xiaohongshu-comments" +}