From 2182b0f7fb3f6de18d09838639aed850569fbbe7 Mon Sep 17 00:00:00 2001 From: Adam Wildavsky Date: Sat, 27 Jun 2026 18:51:04 -0500 Subject: [PATCH 1/6] Fix trump-void1 move ordering to restore v2.9 search efficiency The heuristic extraction refactor changed weight_alloc_trump_void1's first branch from `lead_suit == trump` to `suit == trump`. Since that is exhaustive with the following `else if (suit != trump)`, the three ruffing branches (using the `24 - rank + ...` formula) became dead code, and trump ruffs were scored with side-suit discard weights instead. This mis-ordered ruffs, costing alpha-beta cutoffs. The effect is small for solve but compounds heavily in calc's warm-TT iterative deepening: calc explored ~34% more nodes than v2.9. Restoring the original `lead_suit == trump` pitch branch makes the ruffing branches reachable again and cuts calc time ~25% (gap to v2.9: 1.37x -> 1.02x). Ordering-only change; double-dummy results are unchanged. Co-authored-by: Cursor --- .../heuristic_sorting/heuristic_sorting.cpp | 48 ++++--------------- 1 file changed, 8 insertions(+), 40 deletions(-) diff --git a/library/src/heuristic_sorting/heuristic_sorting.cpp b/library/src/heuristic_sorting/heuristic_sorting.cpp index 449c0edf..95018f32 100644 --- a/library/src/heuristic_sorting/heuristic_sorting.cpp +++ b/library/src/heuristic_sorting/heuristic_sorting.cpp @@ -676,49 +676,17 @@ void weight_alloc_trump_void1(HeuristicContext& ctx) unsigned short suitCount = tpos.length[curr_hand][suit]; int suitAdd; - if (suit == trump) + if (lead_suit == trump) // We pitch { - // We trump a non-trump card. - - if (tpos.length[partner_lh][lead_suit] != 0) - { - // 3rd hand will follow. - if ((tpos.rank_in_suit[rho_lh][lead_suit] > - (tpos.rank_in_suit[partner_lh][lead_suit] | - bit_map_rank[ctx.lead0_rank])) || - ((tpos.length[rho_lh][lead_suit] == 0) && - (tpos.length[rho_lh][trump] != 0))) - { - // Partner can win with a card or by ruffing. 
- suitAdd = 60 + (suitCount << 6) / 44; - } - else - { - suitAdd = -2 + (suitCount << 6) / 36; - // Don't ruff from Kx. - if ((suitCount == 2) && - (tpos.second_best[suit].hand == curr_hand)) - suitAdd += -4; - } - } - else if ((tpos.length[rho_lh][lead_suit] == 0) && - (tpos.rank_in_suit[rho_lh][trump] > - tpos.rank_in_suit[partner_lh][trump])) - { - // Partner can overruff 3rd hand. - suitAdd = 60 + (suitCount << 6) / 44; - } - else if ((tpos.length[partner_lh][trump] == 0) && - (tpos.rank_in_suit[rho_lh][lead_suit] > - bit_map_rank[ctx.lead0_rank])) - { - // 3rd hand has no trumps, and partner has suit winner. - suitAdd = 60 + (suitCount << 6) / 44; - } + if (tpos.rank_in_suit[rho_lh][lead_suit] > + (tpos.rank_in_suit[partner_lh][lead_suit] | + bit_map_rank[ctx.lead0_rank])) + // Partner can win. + suitAdd = (suitCount << 6) / 44; else { - suitAdd = -2 + (suitCount << 6) / 36; - // Don't ruff from Kx. + // Don't pitch from Kx. + suitAdd = (suitCount << 6) / 36; if ((suitCount == 2) && (tpos.second_best[suit].hand == curr_hand)) suitAdd += -4; From 8ecebc8de2add27b275718bc86273be5a977d4e4 Mon Sep 17 00:00:00 2001 From: Adam Wildavsky Date: Sun, 28 Jun 2026 01:28:03 +0100 Subject: [PATCH 2/6] Fix incorrect comment Per Copilot. Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- library/src/heuristic_sorting/heuristic_sorting.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/src/heuristic_sorting/heuristic_sorting.cpp b/library/src/heuristic_sorting/heuristic_sorting.cpp index 95018f32..88478881 100644 --- a/library/src/heuristic_sorting/heuristic_sorting.cpp +++ b/library/src/heuristic_sorting/heuristic_sorting.cpp @@ -681,7 +681,7 @@ void weight_alloc_trump_void1(HeuristicContext& ctx) if (tpos.rank_in_suit[rho_lh][lead_suit] > (tpos.rank_in_suit[partner_lh][lead_suit] | bit_map_rank[ctx.lead0_rank])) - // Partner can win. + // RHO can win. 
suitAdd = (suitCount << 6) / 44; else { From 6234595600fdcaed6ac510efbd6d2d3cc393b3dd Mon Sep 17 00:00:00 2001 From: Adam Wildavsky Date: Sun, 28 Jun 2026 07:45:52 -0500 Subject: [PATCH 3/6] Fix signed->unsigned cast bugs that corrupted move ordering and pruning The heuristic/quick-tricks refactor introduced static_cast<unsigned char> wrappers on values that v2.9 used as signed, changing search behavior: - make_3 / make_3_ctx: winner[]/second_best[] .hand and .rank were cast to unsigned char, turning the -1 "no card" sentinel into 255. This broke winner[trump].hand == -1 style checks in QuickTricks, losing cutoffs. - weight_alloc_trump_void2 / _void3: rel_rank[aggr[suit]][...] indexed through static_cast<unsigned char>(aggr[suit]), truncating the 13-bit aggregate holding to 8 bits and reading the wrong rel_rank row. - QuickTricksPartnerHand{Trump,NT}: bit_map_rank index cast the signed rank through unsigned char. With these reverted to v2.9's signed handling, the per-move-generation ordering trace now matches v2.9 exactly (0 divergences on list1), closing the residual calc gap to parity. Ordering/pruning-only change; double-dummy results are unchanged and all library tests pass. 
Co-authored-by: Cursor --- library/src/ab_search.cpp | 16 ++++++++-------- .../src/heuristic_sorting/heuristic_sorting.cpp | 15 +++------------ library/src/quick_tricks.cpp | 4 ++-- 3 files changed, 13 insertions(+), 22 deletions(-) diff --git a/library/src/ab_search.cpp b/library/src/ab_search.cpp index 762d75f0..c5b8ee96 100644 --- a/library/src/ab_search.cpp +++ b/library/src/ab_search.cpp @@ -878,10 +878,10 @@ void make_3( int aggr = posPoint->aggr[st]; - posPoint->winner[st].rank = static_cast(thrp->rel[aggr].abs_rank[1][st].rank); - posPoint->winner[st].hand = static_cast(thrp->rel[aggr].abs_rank[1][st].hand); - posPoint->second_best[st].rank = static_cast(thrp->rel[aggr].abs_rank[2][st].rank); - posPoint->second_best[st].hand = static_cast(thrp->rel[aggr].abs_rank[2][st].hand); + posPoint->winner[st].rank = thrp->rel[aggr].abs_rank[1][st].rank; + posPoint->winner[st].hand = thrp->rel[aggr].abs_rank[1][st].hand; + posPoint->second_best[st].rank = thrp->rel[aggr].abs_rank[2][st].rank; + posPoint->second_best[st].hand = thrp->rel[aggr].abs_rank[2][st].hand; } } @@ -944,10 +944,10 @@ static void make_3_ctx( int aggr = posPoint->aggr[st]; - posPoint->winner[st].rank = static_cast(thrp->rel[aggr].abs_rank[1][st].rank); - posPoint->winner[st].hand = static_cast(thrp->rel[aggr].abs_rank[1][st].hand); - posPoint->second_best[st].rank = static_cast(thrp->rel[aggr].abs_rank[2][st].rank); - posPoint->second_best[st].hand = static_cast(thrp->rel[aggr].abs_rank[2][st].hand); + posPoint->winner[st].rank = thrp->rel[aggr].abs_rank[1][st].rank; + posPoint->winner[st].hand = thrp->rel[aggr].abs_rank[1][st].hand; + posPoint->second_best[st].rank = thrp->rel[aggr].abs_rank[2][st].rank; + posPoint->second_best[st].hand = thrp->rel[aggr].abs_rank[2][st].hand; } } diff --git a/library/src/heuristic_sorting/heuristic_sorting.cpp b/library/src/heuristic_sorting/heuristic_sorting.cpp index 88478881..8ddd0a2d 100644 --- a/library/src/heuristic_sorting/heuristic_sorting.cpp +++ 
b/library/src/heuristic_sorting/heuristic_sorting.cpp @@ -1213,10 +1213,7 @@ void weight_alloc_trump_void2(HeuristicContext& ctx) mply[k].rank < ctx.move1_rank) { // Don't underruff. - unsigned char aggrSuit = static_cast(tpos.aggr[suit]); - unsigned char moveRank = static_cast(mply[k].rank); - unsigned char relRankValue = static_cast(rel_rank[aggrSuit][moveRank]); - int r_rank = static_cast(relRankValue); + int r_rank = rel_rank[tpos.aggr[suit]][mply[k].rank]; suitAdd = (suitCount << 6) / 40; mply[k].weight = -32 + r_rank + suitAdd; } @@ -1386,10 +1383,7 @@ void weight_alloc_trump_void3(HeuristicContext& ctx) { for (int k = last_num_moves; k < num_moves; k++) { - int r_rank = static_cast( - static_cast( - rel_rank[static_cast(tpos.aggr[suit])] - [static_cast(mply[k].rank)])); + int r_rank = rel_rank[tpos.aggr[suit]][mply[k].rank]; if (mply[k].rank > ctx.move2_rank) mply[k].weight = 33 + r_rank; // Overruff else @@ -1404,10 +1398,7 @@ void weight_alloc_trump_void3(HeuristicContext& ctx) { for (int k = last_num_moves; k < num_moves; k++) { - int r_rank = static_cast( - static_cast( - rel_rank[static_cast(tpos.aggr[suit])] - [static_cast(mply[k].rank)])); + int r_rank = rel_rank[tpos.aggr[suit]][mply[k].rank]; mply[k].weight = 33 + r_rank; } } diff --git a/library/src/quick_tricks.cpp b/library/src/quick_tricks.cpp index 0c161406..48f37fe5 100644 --- a/library/src/quick_tricks.cpp +++ b/library/src/quick_tricks.cpp @@ -1000,7 +1000,7 @@ int QuickTricksPartnerHandTrump( if (ctx.thread_ptr()->rel[ranks].abs_rank[3][suit].hand == partner[hand]) { tpos.win_ranks[depth][suit] |= bit_map_rank[ - static_cast(static_cast(ctx.thread_ptr()->rel[ranks].abs_rank[3][suit].rank)) ]; + static_cast(ctx.thread_ptr()->rel[ranks].abs_rank[3][suit].rank) ]; tpos.win_ranks[depth][commSuit] |= bit_map_rank[commRank]; @@ -1110,7 +1110,7 @@ int QuickTricksPartnerHandNT( if (ctx.thread_ptr()->rel[ranks].abs_rank[3][suit].hand == partner[hand]) { tpos.win_ranks[depth][suit] |= bit_map_rank[ - 
static_cast(static_cast(ctx.thread_ptr()->rel[ranks].abs_rank[3][suit].rank)) ]; + static_cast(ctx.thread_ptr()->rel[ranks].abs_rank[3][suit].rank) ]; qt++; if (qt >= cutoff) return qt; From b08925f881868d6a1f49771dc4923490c6b2a380 Mon Sep 17 00:00:00 2001 From: Adam Wildavsky Date: Sun, 28 Jun 2026 16:12:18 -0500 Subject: [PATCH 4/6] Dispatch hardest boards first to shorten parallel calc tail The parallel board loop handed boards out in index order via an atomic counter, so a hard board picked near the end left one worker running long while the others sat idle. Hand out the hardest boards first (longest- processing-time-first) so the tail consists of cheap boards. parallel_all_boards_n gains an optional dispatch-order permutation: workers still pull from the same atomic counter, but the slot is mapped through the order before becoming a board number, so only the dispatch sequence changes and result placement is unaffected. The solve path passes no order and is unchanged. calc estimates per-deal difficulty with a cheap, trump-independent structural proxy (deal_fanout, mirroring Scheduler::Fanout) and sorts board indices by descending difficulty before dispatch. calc list1000 -n18: ~11.0s -> ~9.6s wall (~13%), user CPU unchanged. Co-authored-by: Cursor --- library/src/calc_tables.cpp | 45 +++++++++++++++++++++++++- library/src/system/parallel_boards.cpp | 20 +++++++++--- library/src/system/parallel_boards.hpp | 9 +++++- 3 files changed, 67 insertions(+), 7 deletions(-) diff --git a/library/src/calc_tables.cpp b/library/src/calc_tables.cpp index b0446fe8..1c9df8b0 100644 --- a/library/src/calc_tables.cpp +++ b/library/src/calc_tables.cpp @@ -8,12 +8,15 @@ */ #include "calc_tables.hpp" +#include +#include #include #include #include #include #include +#include #include #include #include @@ -23,6 +26,33 @@ extern Memory memory; extern Scheduler scheduler; +namespace +{ +// Cheap structural difficulty estimate (cards only, trump-independent). 
Used to +// dispatch the hardest boards first so the parallel tail is short. Mirrors +// Scheduler::Fanout: per hand, sum the number of card groups per suit, with a +// bonus for voids. +auto deal_fanout(const Deal& dl) -> int +{ + int fanout = 0; + for (int h = 0; h < DDS_HANDS; h++) + { + int fanout_suit = 0; + int num_voids = 0; + for (int s = 0; s < DDS_SUITS; s++) + { + const int c = static_cast(dl.remainCards[h][s] >> 2); + fanout_suit += group_data[c].last_group_ + 1; + if (c == 0) + num_voids++; + } + fanout_suit += num_voids * fanout_suit; + fanout += fanout_suit; + } + return fanout; +} +} + // Legacy overload (creates temporary context) auto calc_all_boards_n( Boards * bop, @@ -137,11 +167,24 @@ auto calc_all_boards_n( else { std::vector contexts(static_cast(nthreads)); + + // Dispatch hardest boards first to shorten the parallel tail. + std::vector fanout(static_cast(n)); + for (int i = 0; i < n; i++) + fanout[static_cast(i)] = deal_fanout(bop->deals[i]); + std::vector order(static_cast(n)); + std::iota(order.begin(), order.end(), 0); + std::stable_sort(order.begin(), order.end(), + [&](const int a, const int b) { + return fanout[static_cast(a)] > fanout[static_cast(b)]; + }); + err = parallel_all_boards_n(n, nthreads, [&](const int worker_id, const int bno) -> int { return calc_single_common_internal( contexts[static_cast(worker_id)], *bop, *solvedp, bno); - }); + }, + &order); } END_BLOCK_TIMER; diff --git a/library/src/system/parallel_boards.cpp b/library/src/system/parallel_boards.cpp index 750041e7..ebb2a0af 100644 --- a/library/src/system/parallel_boards.cpp +++ b/library/src/system/parallel_boards.cpp @@ -34,20 +34,29 @@ auto resolve_worker_count( auto parallel_all_boards_n( const int count, const int worker_cap, - const std::function& process_board) -> int + const std::function& process_board, + const std::vector* order) -> int { if (count <= 0) { return RETURN_NO_FAULT; } + // Map a dispatch slot to the board number to process. 
With an order, hand out + // boards in that sequence (e.g. hardest first); otherwise in index order. + const bool use_order = + (order != nullptr && static_cast(order->size()) == count); + auto board_of = [&](const int slot) -> int { + return use_order ? (*order)[static_cast(slot)] : slot; + }; + const int workers = resolve_worker_count(worker_cap, count); if (workers == 1) { - for (int bno = 0; bno < count; ++bno) + for (int slot = 0; slot < count; ++slot) { - const int rc = process_board(0, bno); + const int rc = process_board(0, board_of(slot)); if (rc != RETURN_NO_FAULT) { return rc; @@ -62,11 +71,12 @@ auto parallel_all_boards_n( auto worker = [&](const int worker_id) { for (;;) { - const int bno = next.fetch_add(1, std::memory_order_relaxed); - if (bno >= count || first_error.load(std::memory_order_relaxed) != RETURN_NO_FAULT) + const int slot = next.fetch_add(1, std::memory_order_relaxed); + if (slot >= count || first_error.load(std::memory_order_relaxed) != RETURN_NO_FAULT) { break; } + const int bno = board_of(slot); const int rc = process_board(worker_id, bno); if (rc != RETURN_NO_FAULT) diff --git a/library/src/system/parallel_boards.hpp b/library/src/system/parallel_boards.hpp index 2f19b6de..01292d05 100644 --- a/library/src/system/parallel_boards.hpp +++ b/library/src/system/parallel_boards.hpp @@ -10,6 +10,7 @@ #pragma once #include +#include /** @@ -28,9 +29,15 @@ auto resolve_worker_count(int max_threads, int count) -> int; * @param worker_cap Maximum worker threads; <= 0 uses hardware concurrency. * @param process_board Called for each board; must return RETURN_NO_FAULT (1) * on success. Receives the worker's thread index and board number. + * @param order Optional dispatch order: a permutation of [0, count) giving the + * sequence in which board numbers are handed out (e.g. hardest first to + * shorten the tail). When null/empty, boards are dispatched in index + * order. 
Only the dispatch order changes; @p process_board still receives + * the real board number, so result placement is unaffected. * @return First non-success code from @p process_board, or RETURN_NO_FAULT. */ auto parallel_all_boards_n( int count, int worker_cap, - const std::function& process_board) -> int; + const std::function& process_board, + const std::vector* order = nullptr) -> int; From 2d57e8dbae36df5d3fea7d704990d29b5f884cef Mon Sep 17 00:00:00 2001 From: Adam Wildavsky Date: Sun, 28 Jun 2026 17:39:43 -0500 Subject: [PATCH 5/6] Skip hardest-first dispatch for single-deal calc CalcDDtableN builds one board per strain for a single deal. deal_fanout is trump-independent, so all boards share one fanout and the difficulty sort is a pure no-op there. Gate the sort behind a difficulty_sort flag (default on for batch CalcAllTablesN) and disable it for the single-deal path. Co-authored-by: Cursor --- library/src/calc_tables.cpp | 41 +++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/library/src/calc_tables.cpp b/library/src/calc_tables.cpp index 1c9df8b0..01b2a79a 100644 --- a/library/src/calc_tables.cpp +++ b/library/src/calc_tables.cpp @@ -53,11 +53,14 @@ auto deal_fanout(const Deal& dl) -> int } } -// Legacy overload (creates temporary context) +// Legacy overload (creates temporary context). difficulty_sort dispatches the +// hardest boards first; it only helps across distinct deals (batch calc), so it +// is skipped for a single deal (all boards share one deal / one fanout). 
auto calc_all_boards_n( Boards * bop, SolvedBoards * solvedp, - int max_threads = 0) -> int; + int max_threads = 0, + bool difficulty_sort = true) -> int; auto calc_single_common_internal( @@ -140,7 +143,8 @@ auto calc_all_boards_n( auto calc_all_boards_n( Boards * bop, SolvedBoards * solvedp, - int max_threads) -> int + int max_threads, + bool difficulty_sort) -> int { const int n = bop->no_of_boards; if (n > MAXNOOFBOARDS) @@ -168,23 +172,29 @@ auto calc_all_boards_n( { std::vector contexts(static_cast(nthreads)); - // Dispatch hardest boards first to shorten the parallel tail. - std::vector fanout(static_cast(n)); - for (int i = 0; i < n; i++) - fanout[static_cast(i)] = deal_fanout(bop->deals[i]); - std::vector order(static_cast(n)); - std::iota(order.begin(), order.end(), 0); - std::stable_sort(order.begin(), order.end(), - [&](const int a, const int b) { - return fanout[static_cast(a)] > fanout[static_cast(b)]; - }); + // Dispatch hardest boards first to shorten the parallel tail. This only + // helps across distinct deals (batch calc); for a single deal every board + // shares one fanout, so the sort is skipped (it would be a no-op anyway). + std::vector order; + if (difficulty_sort) + { + std::vector fanout(static_cast(n)); + for (int i = 0; i < n; i++) + fanout[static_cast(i)] = deal_fanout(bop->deals[i]); + order.resize(static_cast(n)); + std::iota(order.begin(), order.end(), 0); + std::stable_sort(order.begin(), order.end(), + [&](const int a, const int b) { + return fanout[static_cast(a)] > fanout[static_cast(b)]; + }); + } err = parallel_all_boards_n(n, nthreads, [&](const int worker_id, const int bno) -> int { return calc_single_common_internal( contexts[static_cast(worker_id)], *bop, *solvedp, bno); }, - &order); + order.empty() ? 
nullptr : &order); } END_BLOCK_TIMER; @@ -235,7 +245,8 @@ int STDCALL CalcDDtableN( ind++; } - int res = calc_all_boards_n(&bo, &solved, maxThreads); + // Single deal: all boards share one deal, so hardest-first sorting is a no-op. + int res = calc_all_boards_n(&bo, &solved, maxThreads, /*difficulty_sort=*/false); if (res != 1) return res; From f4ed912bf19ef7c8e27322058bec28e184b1b1d5 Mon Sep 17 00:00:00 2001 From: Adam Wildavsky Date: Sun, 28 Jun 2026 22:28:09 -0500 Subject: [PATCH 6/6] Validate order is a permutation in parallel_all_boards_n Only honor the optional dispatch order when it is a valid permutation of [0, count): each element in range and unique. A malformed order (duplicates or out-of-range values) now falls back to index order, preventing invalid board indices from reaching process_board. Co-authored-by: Cursor --- library/src/system/parallel_boards.cpp | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/library/src/system/parallel_boards.cpp b/library/src/system/parallel_boards.cpp index ebb2a0af..31a014de 100644 --- a/library/src/system/parallel_boards.cpp +++ b/library/src/system/parallel_boards.cpp @@ -31,6 +31,21 @@ auto resolve_worker_count( } +static auto is_permutation_of_range( + const std::vector& order, + const int count) -> bool +{ + std::vector seen(static_cast(count), 0); + for (const int v : order) + { + if (v < 0 || v >= count || seen[static_cast(v)]) + return false; + seen[static_cast(v)] = 1; + } + return true; +} + + auto parallel_all_boards_n( const int count, const int worker_cap, @@ -43,9 +58,13 @@ auto parallel_all_boards_n( } // Map a dispatch slot to the board number to process. With an order, hand out - // boards in that sequence (e.g. hardest first); otherwise in index order. + // boards in that sequence (e.g. hardest first); otherwise in index order. 
The + // order is only honored when it is a valid permutation of [0, count); a + // malformed order falls back to index order to avoid invalid board indices. const bool use_order = - (order != nullptr && static_cast(order->size()) == count); + (order != nullptr && + static_cast(order->size()) == count && + is_permutation_of_range(*order, count)); auto board_of = [&](const int slot) -> int { return use_order ? (*order)[static_cast(slot)] : slot; };