Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
491f51b
minimal changes for direct from 4CAT mapping
dale-wahl May 5, 2026
b06805f
give me some standard helper functions
dale-wahl May 6, 2026
f9a2405
fix csv export
dale-wahl May 6, 2026
2f084b9
another to CSV fix
dale-wahl May 6, 2026
d787042
revert tiktok (mistaken test result commited)
dale-wahl May 6, 2026
a9fba9a
clean up UI (make download menu button)
dale-wahl May 6, 2026
0980a56
testing is hard in JS
dale-wahl May 6, 2026
46b96c7
add fixtures folder and README.md to explain what I did
dale-wahl May 6, 2026
487b5b6
add MapItemException
dale-wahl May 7, 2026
b6f487d
make a warning pop up
dale-wahl May 7, 2026
f28e310
add MapItemException
dale-wahl May 7, 2026
7dedad7
Merge branch 'master' into map_item_testing_actual_tests
dale-wahl May 26, 2026
f8c47d7
Merge branch 'map_item_testing' into map_item_testing_actual_tests
dale-wahl May 27, 2026
5baff31
add env variables for tests (to connect to 4CAT)
dale-wahl May 27, 2026
6a8ce38
mirror 4CAT API missing value
dale-wahl May 27, 2026
0c31403
test the 4cat API endpoint
dale-wahl May 27, 2026
be2f308
update docs and packages
dale-wahl May 27, 2026
caf1c7f
some mapping for odd datasource names
dale-wahl May 27, 2026
f10fc49
update existing map_item tests and add helper
dale-wahl May 27, 2026
3633cde
comparison testing for datasources
dale-wahl May 27, 2026
7d97a0f
list common translation errors
dale-wahl May 27, 2026
6ad4c13
package.json fix
dale-wahl May 27, 2026
11ffffb
rm other test doc
dale-wahl Jun 3, 2026
6cc6100
map_item.test.js verify modules import and map_item exists only
dale-wahl Jun 3, 2026
a090675
remove old fixtures and 4cat probe
dale-wahl Jun 3, 2026
c62a7e7
update lib.js note on new endpoint
dale-wahl Jun 3, 2026
234f1ce
update tests/.env.example (comments and dataset keys)
dale-wahl Jun 3, 2026
e0d0fb8
note on _loader.js for `wrap_for_map_item`
dale-wahl Jun 3, 2026
f2341d6
fix my test environment; scripts vs libraries
dale-wahl Jun 3, 2026
e39ad42
update map_item_compare.test.js for new 4CAT endpoints
dale-wahl Jun 3, 2026
d7fcb4c
fast_fail OR --all for tests
dale-wahl Jun 3, 2026
4f9e69c
use headers for datasource
dale-wahl Jun 3, 2026
8b918d4
add the --all instead of just fail_fail
dale-wahl Jun 3, 2026
00f0369
map_item_compare.test.js: compare based on mapped `id` field not raw …
dale-wahl Jun 4, 2026
c7bb9ac
map_item_compare.test.js: still show errors on failed `id` matches
dale-wahl Jun 4, 2026
53c2b6f
chore: sync map_item for bootstrap from 4CAT 888f0a126ea70404034f265f…
dale-wahl Jun 10, 2026
dd62a4e
Merge branch 'map_item_testing_actual_tests' into auto/4cat-map-item-…
Jun 10, 2026
ce9ba39
instagram.js: fix {} is truthy, location_city null vs ""
dale-wahl Jun 11, 2026
a5d981c
douyin: "" vs null and Missing vs null
dale-wahl Jun 11, 2026
25ab435
gab: key lost if undefined in JS
dale-wahl Jun 11, 2026
a83ebe8
map_item_test: fix order issue on id comparison
dale-wahl Jun 11, 2026
3b5d157
map_item_compare.test: summarize datasources that pass/fail
dale-wahl Jun 11, 2026
8118d81
threads.js fix some regex
dale-wahl Jun 11, 2026
743af85
map_item_compare.test: loosely test URLs (not byte for byte)
dale-wahl Jun 11, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@

# Testing artefacts
.temp-profile
tests/.env
tests/.env.local
tests/.compare-summary.txt
__pycache__/
*.pyc

# logs
geckodriver.log
10 changes: 10 additions & 0 deletions js/lib.js
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,16 @@ class MissingMappedField {
toString() {
return `${this.value}`;
}

// Mirror 4CAT's API serialization so JSON.stringify produces the same
// tagged form on both sides: 4CAT's /api/dataset/<key>/items/ endpoint,
// when called with `missing_fields=keep`, emits missing values as
// `{ __missing: true, value: <fallback> }`. Matching that shape here
// lets the map_item comparator deep-equal both sides without special
// handling.
toJSON() {
return { __missing: true, value: this.value };
}
}

/**
Expand Down
68 changes: 67 additions & 1 deletion modules/9gag.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,70 @@ export function capture(response, source_platform_url, source_url) {
}

return data["data"]["posts"];
}
}

// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY ===
// (regenerated from datasources/ninegag/search_9gag.py)
/**
 * Map a raw 9GAG post object to a MappedItem.
 *
 * The input object is treated as read-only: anonymous posts get a local
 * placeholder creator instead of having one written onto `post`.
 *
 * @param {Object} post - Raw post as captured from 9GAG; `creationTs` is a
 *   Unix timestamp in seconds.
 * @returns {MappedItem} The mapped item.
 */
export function map_item(post) {
    const postTimestampSec = post.creationTs;

    // Largest pixel area first.
    const byAreaDesc = (a, b) => (b.width * b.height) - (a.width * a.height);
    const images = Object.values(post.images ?? {});

    // Entries carrying `hasAudio` are videos; everything else is a still image.
    const image = images.filter(v => !('hasAudio' in v)).sort(byAreaDesc)[0] ?? {};
    const bestVideo = images.filter(v => 'hasAudio' in v).sort(byAreaDesc)[0];

    // Preferred codec order for the highest-resolution video: AV1, H.265, VP9, VP8.
    let videoUrl = "";
    if (bestVideo) {
        videoUrl = bestVideo.av1Url || bestVideo.h265Url || bestVideo.vp9Url || bestVideo.vp8Url || "";
    }

    // Anonymous posts have no creator – they appear as the user "9GAGGER".
    // Use a local placeholder rather than mutating the caller's object.
    const creator = post.creator ? post.creator : {
        username: "9GAGGER",
        fullName: "",
        emojiStatus: "",
        isVerifiedAccount: ""
    };

    return new MappedItem({
        collected_from_url: normalize_url_encoding(post.__import_meta?.source_platform_url ?? ""),
        id: post.id,
        url: post.url,
        subject: post.title,
        body: post.description,
        timestamp: formatUtcTimestamp(postTimestampSec),
        author: creator.username ?? "",
        author_name: creator.fullName ?? "",
        author_status: creator.emojiStatus ?? "",
        author_verified: creator.isVerifiedAccount ? "yes" : "no",
        type: post.type,
        image_url: image.url ?? "",
        video_url: videoUrl,
        is_nsfw: post.nsfw === 0 ? "no" : "yes",
        is_promoted: post.promoted === 0 ? "no" : "yes",
        is_vote_masked: post.isVoteMasked === 0 ? "no" : "yes",
        is_anonymous: !post.isAnonymous ? "no" : "yes",
        source_domain: post.sourceDomain,
        source_url: post.sourceUrl,
        upvotes: post.upVoteCount,
        downvotes: post.downVoteCount,
        score: (post.upVoteCount ?? 0) - (post.downVoteCount ?? 0),
        comments: post.commentsCount,
        tags: (post.tags ?? []).map(t => t.key).join(","),
        tags_annotated: (post.annotationTags ?? []).join(","),
        unix_timestamp: postTimestampSec
    });
}
// === end auto-generated ===
5 changes: 5 additions & 0 deletions modules/_loader.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
// Load-order dependency: `wrap_for_map_item` (used below) is a free global
// defined in js/lib.js, which manifest.json loads as a plain background
// script before this module. There is no import for it here on purpose —
// MV2 background scripts share one global scope. If lib.js stops being
// loaded first, the mapper wrapper below will ReferenceError.
async function load() {
const imported_modules = [
await import("./tiktok.js"),
Expand Down
266 changes: 265 additions & 1 deletion modules/douyin.js
Original file line number Diff line number Diff line change
Expand Up @@ -339,4 +339,268 @@ export function capture(response, source_platform_url, source_url) {
} else {
// console.log("Detected expected object(s) by no usable items found")
}
}
}

// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY ===
// (regenerated from datasources/douyin/search_douyin.py)
/**
 * Convert a count that may be written with the Chinese "万" (ten thousand)
 * suffix into a number.
 *
 * @param {*} num - A number (returned unchanged), a string such as "1.5万"
 *   or "1,234", or anything else.
 * @returns {number} The parsed value; 0 for non-string/non-number input and
 *   for strings that contain no parseable digits (never NaN).
 */
function getChineseNumber(num) {
    if (typeof num === "number") {
        return num;
    }
    if (typeof num !== "string") {
        return 0;
    }

    // Keep only digits and the decimal point.
    const digits = num.replace(/[^0-9.]/g, "");

    // "万" multiplies by 10,000 and may carry a fractional part; plain
    // strings are read as integers.
    const parsed = num.includes("万")
        ? Number.parseFloat(digits) * 10000
        : Number.parseInt(digits, 10);

    // No usable digits (e.g. "万", "." or "abc") would otherwise leak NaN.
    return Number.isNaN(parsed) ? 0 : parsed;
}

/**
 * Map a raw Douyin item to a MappedItem.
 *
 * Two input layouts are handled, selected by `item["ZS_collected_from_embed"]`:
 * the embedded-HTML layout (camelCase keys) and the regular JSON layout
 * (snake_case keys). Within each layout an item is either a live stream
 * (subject "Stream") or a regular post (subject "Post").
 *
 * Values in the embedded layout may be the literal string "$undefined";
 * such placeholders are treated as absent.
 *
 * @param {Object} item - Raw captured item, optionally carrying `__import_meta`.
 * @returns {MappedItem} The mapped item.
 */
export function map_item(item) {
    // `key in obj` throws a TypeError when obj is a primitive such as the
    // "$undefined" placeholder string, so only test real objects.
    const has_key = (obj, key) => obj !== null && typeof obj === "object" && key in obj;

    const metadata = item["__import_meta"] ?? {};
    let subject = "Post";
    let stream_data = {};
    let post_timestamp;
    let video_url = "";
    let video_thumbnail = "";
    let video_description = "";
    let duration = "Unknown";
    let prevent_download = null;
    let stats = {};
    let author = {};
    let video_tags = "";
    let aweme_id_key, group_id_key, text_extra_key, hashtag_key, mention_key, author_id_key;
    let mix_info_key, mix_id_key, mix_name_key;
    let author_sec_key, avatar_thumb_key, url_list_key, is_fake_key;
    // Per-layout results, assigned in exactly one of the two branches below.
    let collect_count, comment_count, digg_count, download_count, forward_count, play_count, share_count;
    let mix_current_episode;

    if (item["ZS_collected_from_embed"]) {
        // Embedded HTML format
        if (item["cellRoom"] && item["cellRoom"] !== "$undefined") {
            stream_data = item["cellRoom"]["rawdata"] ?? {};
        }
        if (Object.keys(stream_data).length) {
            // Stream embedded
            subject = "Stream";
            // Fall back to the request time (milliseconds) when the stream has no createtime.
            const createtime = stream_data["createtime"] ?? (item["requestTime"] ? item["requestTime"] / 1000 : undefined);
            post_timestamp = new Date((createtime ?? 0) * 1000);
            video_url = stream_data["stream_url"]?.["flv_pull_url"]?.["FULL_HD1"] ?? "";
            video_thumbnail = stream_data["video"]?.["cover"] ?? null;
            video_description = stream_data["title"] ?? "";
            duration = "Unknown";
            stats = stream_data["stats"] ?? {};
            author = stream_data["owner"] ?? {};
            author_sec_key = "sec_uid";
            avatar_thumb_key = "avatar_thumb";
            url_list_key = "url_list";
            is_fake_key = "is_ad_fake";
        } else {
            // Regular post embedded
            post_timestamp = new Date(item["createTime"] * 1000);
            const videos_list = item["video"]?.["bitRateList"];
            if (videos_list) {
                // Highest bit rate first.
                const videos = [...videos_list].sort((a, b) => (b["bitRate"] ?? 0) - (a["bitRate"] ?? 0));
                video_url = "https" + (videos[0]["playApi"] ?? "");
            } else {
                video_url = "";
            }
            video_thumbnail = item["video"]?.["cover"] ?? null;
            video_description = item["desc"] ?? "";
            duration = item["duration"] ?? item["video"]?.["duration"] ?? "Unknown";
            prevent_download = item["download"]?.["prevent"] ? "yes" : "no";
            stats = item["stats"] ?? {};
            author = item["authorInfo"] ?? {};
            author_sec_key = "secUid";
            avatar_thumb_key = "avatarThumb";
            url_list_key = "urlList";
            is_fake_key = "isAdFake";
        }
        // Embedded keys (same for both branches)
        aweme_id_key = "awemeId";
        group_id_key = "groupId";
        text_extra_key = "textExtra";
        hashtag_key = "hashtagName";
        mention_key = "secUid";
        author_id_key = "authorUserId";
        mix_info_key = "mixInfo";
        mix_id_key = "mixId";
        mix_name_key = "mixName";
        // Stats: an absent counter becomes a MissingMappedField
        const stat_or_missing = (key) => stats[key] ?? new MissingMappedField("Unknown");
        collect_count = stat_or_missing("collectCount");
        comment_count = stat_or_missing("commentCount");
        digg_count = stat_or_missing("diggCount");
        download_count = stat_or_missing("downloadCount");
        forward_count = stat_or_missing("forwardCount");
        play_count = stat_or_missing("playCount");
        share_count = stat_or_missing("shareCount");
        // Video tags (guess)
        video_tags = (item["videoTag"] ?? []).filter(t => t["tagName"]).map(t => t["tagName"]).join(",");
        mix_current_episode = (item[mix_info_key] ?? {})["currentEpisode"] ?? "N/A";
    } else {
        // Non-embedded JSON format
        stream_data = item["rawdata"] ?? item["cell_room"]?.["rawdata"];
        if (stream_data) {
            // Stream (may be a JSON string)
            if (typeof stream_data === "string") {
                // Best effort: an unparseable string is kept as-is, so every
                // lookup on it below falls through to its default.
                try { stream_data = JSON.parse(stream_data); } catch (e) { /* ignore */ }
            }
            subject = "Stream";
            // Fall back to the item's create_time, then to the collection time (milliseconds).
            const create_time = stream_data["create_time"] ?? item["create_time"] ?? (metadata["timestamp_collected"] ? metadata["timestamp_collected"] / 1000 : undefined);
            post_timestamp = new Date((create_time ?? 0) * 1000);
            video_url = stream_data["stream_url"]?.["flv_pull_url"]?.["FULL_HD1"] ?? "";
            video_thumbnail = stream_data["video"]?.["cover"] ?? null;
            video_description = stream_data["title"] ?? "";
            duration = "Unknown";
            author = stream_data["owner"] ?? {};
            // NOTE(review): this value is overwritten by the `video_tag` list
            // further down in this branch, so it never reaches the output —
            // confirm which one is intended for streams.
            video_tags = stream_data["video_feed_tag"] ?? "";
            stats = stream_data["stats"] ?? {};
        } else {
            // Regular post
            post_timestamp = new Date(item["create_time"] * 1000);
            const videos_list = item["video"]?.["bit_rate"];
            if (!videos_list) {
                video_url = "";
                video_thumbnail = "";
            } else {
                // Highest bit rate first.
                const videos = [...videos_list].sort((a, b) => (b["bit_rate"] ?? 0) - (a["bit_rate"] ?? 0));
                video_url = videos[0]["play_addr"]?.["url_list"]?.[0] ?? "";
                video_thumbnail = item["video"]?.["cover"]?.["url_list"]?.[0] ?? "";
            }
            video_description = item["desc"] ?? "";
            duration = item["duration"] ?? item["video"]?.["duration"] ?? "Unknown";
            author = item["author"] ?? {};
            stats = item["statistics"] ?? {};
        }
        prevent_download = ("prevent_download" in item) ? (item["prevent_download"] ? "yes" : "no") : null;
        // Keys for non-embedded format
        aweme_id_key = "aweme_id";
        group_id_key = "group_id";
        text_extra_key = "text_extra";
        hashtag_key = "hashtag_name";
        mention_key = "sec_uid";
        author_id_key = "author_user_id";
        mix_info_key = "mix_info";
        mix_id_key = "mix_id";
        mix_name_key = "mix_name";
        author_sec_key = "sec_uid";
        avatar_thumb_key = "avatar_thumb";
        url_list_key = "url_list";
        is_fake_key = "is_ad_fake";
        // Stats: an absent counter is null; a falsy stats value yields a MissingMappedField
        const stat_or_null = (key) => stats ? (stats[key] ?? null) : new MissingMappedField("Unknown");
        collect_count = stat_or_null("collect_count");
        comment_count = stat_or_null("comment_count");
        digg_count = stat_or_null("digg_count");
        download_count = stat_or_null("download_count");
        forward_count = stat_or_null("forward_count");
        play_count = stat_or_null("play_count");
        share_count = stat_or_null("share_count");
        // Video tags list
        video_tags = (item["video_tag"] ?? []).filter(t => t["tag_name"]).map(t => t["tag_name"]).join(",");
        mix_current_episode = item[mix_info_key] ? (item[mix_info_key]["statis"]?.["current_episode"] ?? "N/A") : "N/A";
    }

    // Stream stats (common)
    const count_total_streams_viewers = stats["total_user"] ?? "N/A";
    const count_current_stream_viewers = has_key(stats, "user_count_str") ? getChineseNumber(stats["user_count_str"]) : "N/A";

    // Displayed flag for mix items: only the first video of a mix counts as displayed
    let displayed = true;
    if (item["ZS_collected_from_mix"] && !item["ZS_first_mix_vid"]) {
        displayed = false;
    }

    // Image URLs (first URL of each image, in either key style)
    const image_urls = [];
    if (Array.isArray(item["images"])) {
        for (const img of item["images"]) {
            if (Array.isArray(img["url_list"])) {
                image_urls.push(img["url_list"][0]);
            } else if (Array.isArray(img["urlList"])) {
                image_urls.push(img["urlList"][0]);
            }
        }
    }

    // Music fields
    const music_obj = item["music"];
    const has_music = Boolean(music_obj && music_obj !== "$undefined");
    const music_author = has_music ? (music_obj["author"] ?? "") : "";
    const music_title = has_music ? (music_obj["title"] ?? "") : "";
    const music_url = has_music ? (music_obj["play_url"]?.["uri"] ?? "") : "";

    // Collection / Mix handling
    if (mix_current_episode === "$undefined") mix_current_episode = "N/A";
    const collection_id_raw = item[mix_info_key]?.[mix_id_key] ?? "N/A";
    const collection_id = collection_id_raw === "$undefined" ? "N/A" : collection_id_raw;
    const collection_name_raw = item[mix_info_key]?.[mix_name_key] ?? "N/A";
    const collection_name = collection_name_raw === "$undefined" ? "N/A" : collection_name_raw;
    const part_of_collection = (has_key(item[mix_info_key], mix_id_key) && collection_id !== "N/A") ? "yes" : "no";

    const unix_timestamp = Math.floor(post_timestamp.getTime() / 1000);

    // Build the mapped item
    return new MappedItem({
        "collected_from_url": normalize_url_encoding(metadata["source_platform_url"] ?? ""),
        "id": item[aweme_id_key],
        "thread_id": item[group_id_key],
        "subject": subject,
        "body": video_description,
        "timestamp": formatUtcTimestamp(unix_timestamp),
        "post_url": subject === "Post" ? `https://www.douyin.com/video/${item[aweme_id_key]}` : `https://live.douyin.com/${author["web_rid"]}`,
        "region": item["region"] ?? "",
        "hashtags": (item[text_extra_key] ?? []).filter(t => t[hashtag_key]).map(t => t[hashtag_key]).join(","),
        "mentions": (item[text_extra_key] ?? []).filter(t => t[mention_key]).map(t => `https://www.douyin.com/user/${t[mention_key]}`).join(","),
        "video_tags": video_tags,
        "prevent_download": prevent_download,
        "video_url": video_url,
        "video_thumbnail": video_thumbnail,
        "video_duration": duration,
        "image_urls": image_urls.join(","),
        "music_author": music_author,
        "music_title": music_title,
        "music_url": music_url,
        "collect_count": collect_count,
        "comment_count": comment_count,
        "digg_count": digg_count,
        "download_count": download_count,
        "forward_count": forward_count,
        "play_count": play_count,
        "share_count": share_count,
        "count_total_streams_viewers": count_total_streams_viewers,
        "count_current_stream_viewers": count_current_stream_viewers,
        "author_user_id": item[author_id_key] ?? (author["uid"] ?? author["id"]),
        "author_nickname": author["nickname"] ?? "",
        "author_profile_url": `https://www.douyin.com/user/${author[author_sec_key]}`,
        "author_thumbnail_url": author[avatar_thumb_key]?.[url_list_key]?.[0] ?? "",
        "author_region": author["region"] ?? null,
        "author_is_ad_fake": author[is_fake_key] ?? null,
        "part_of_collection": part_of_collection,
        "4CAT_first_video_displayed": displayed ? "yes" : "no",
        "collection_id": collection_id,
        "collection_name": collection_name,
        "place_in_collection": mix_current_episode,
        "unix_timestamp": unix_timestamp
    });
}
// === end auto-generated ===
Loading