diff --git a/library/src/ab_search.cpp b/library/src/ab_search.cpp
index 762d75f0..c5b8ee96 100644
--- a/library/src/ab_search.cpp
+++ b/library/src/ab_search.cpp
@@ -878,10 +878,10 @@ void make_3(
 
       int aggr = posPoint->aggr[st];
 
-  posPoint->winner[st].rank = static_cast<unsigned char>(thrp->rel[aggr].abs_rank[1][st].rank);
-  posPoint->winner[st].hand = static_cast<unsigned char>(thrp->rel[aggr].abs_rank[1][st].hand);
-  posPoint->second_best[st].rank = static_cast<unsigned char>(thrp->rel[aggr].abs_rank[2][st].rank);
-  posPoint->second_best[st].hand = static_cast<unsigned char>(thrp->rel[aggr].abs_rank[2][st].hand);
+  posPoint->winner[st].rank = thrp->rel[aggr].abs_rank[1][st].rank;
+  posPoint->winner[st].hand = thrp->rel[aggr].abs_rank[1][st].hand;
+  posPoint->second_best[st].rank = thrp->rel[aggr].abs_rank[2][st].rank;
+  posPoint->second_best[st].hand = thrp->rel[aggr].abs_rank[2][st].hand;
 
     }
   }
@@ -944,10 +944,10 @@ static void make_3_ctx(
 
       int aggr = posPoint->aggr[st];
 
-      posPoint->winner[st].rank = static_cast<unsigned char>(thrp->rel[aggr].abs_rank[1][st].rank);
-      posPoint->winner[st].hand = static_cast<unsigned char>(thrp->rel[aggr].abs_rank[1][st].hand);
-      posPoint->second_best[st].rank = static_cast<unsigned char>(thrp->rel[aggr].abs_rank[2][st].rank);
-      posPoint->second_best[st].hand = static_cast<unsigned char>(thrp->rel[aggr].abs_rank[2][st].hand);
+      posPoint->winner[st].rank = thrp->rel[aggr].abs_rank[1][st].rank;
+      posPoint->winner[st].hand = thrp->rel[aggr].abs_rank[1][st].hand;
+      posPoint->second_best[st].rank = thrp->rel[aggr].abs_rank[2][st].rank;
+      posPoint->second_best[st].hand = thrp->rel[aggr].abs_rank[2][st].hand;
 
     }
   }
diff --git a/library/src/calc_tables.cpp b/library/src/calc_tables.cpp
index b0446fe8..01b2a79a 100644
--- a/library/src/calc_tables.cpp
+++ b/library/src/calc_tables.cpp
@@ -8,12 +8,15 @@
 */
 
 #include "calc_tables.hpp"
+#include <algorithm>
+#include <numeric>
 #include <vector>
 
 #include <pbn.hpp>
 #include <solve_board.hpp>
 #include <api/solve_board.hpp>
 #include <solver_if.hpp>
+#include <lookup_tables/lookup_tables.hpp>
 #include <system/memory.hpp>
 #include <system/parallel_boards.hpp>
 #include <system/scheduler.hpp>
@@ -23,11 +26,41 @@
 extern Memory memory;
 extern Scheduler scheduler;
 
-// Legacy overload (creates temporary context)
+namespace
+{
+// Cheap structural difficulty estimate (cards only, trump-independent), used to
+// dispatch the hardest boards first so the parallel tail is short. Mirrors
+// Scheduler::Fanout: per hand, sum (last_group_ + 1) over the suits, then scale
+// that sum by (1 + number of void suits). Returns the total over all hands.
+auto deal_fanout(const Deal& dl) -> int
+{
+  int fanout = 0;
+  for (int h = 0; h < DDS_HANDS; h++)
+  {
+    int fanout_suit = 0;
+    int num_voids = 0;
+    for (int s = 0; s < DDS_SUITS; s++)
+    {
+      // Mask to 13 bits (assumed group_data range) so a bad holding stays in bounds.
+      const int c = static_cast<int>((dl.remainCards[h][s] >> 2) & 0x1fffu);
+      fanout_suit += group_data[c].last_group_ + 1;
+      if (c == 0)
+        num_voids++;
+    }
+    fanout += (num_voids + 1) * fanout_suit; // void bonus: one extra copy per void
+  }
+  return fanout;
+}
+}
+
+// Legacy overload (creates temporary context). difficulty_sort dispatches the
+// hardest boards first; it only helps across distinct deals (batch calc), so it
+// is skipped for a single deal (all boards share one deal / one fanout).
 auto calc_all_boards_n(
   Boards * bop,
   SolvedBoards * solvedp,
-  int max_threads = 0) -> int;
+  int max_threads = 0,
+  bool difficulty_sort = true) -> int;
 
 
 auto calc_single_common_internal(
@@ -110,7 +143,8 @@ auto calc_all_boards_n(
 auto calc_all_boards_n(
   Boards * bop,
   SolvedBoards * solvedp,
-  int max_threads) -> int
+  int max_threads,
+  bool difficulty_sort) -> int
 {
   const int n = bop->no_of_boards;
   if (n > MAXNOOFBOARDS)
@@ -137,11 +171,30 @@ auto calc_all_boards_n(
   else
   {
     std::vector<SolverContext> contexts(static_cast<unsigned>(nthreads));
+
+    // Dispatch hardest boards first to shorten the parallel tail. This only
+    // helps across distinct deals (batch calc); for a single deal every board
+    // shares one fanout, so the sort is skipped (it would be a no-op anyway).
+    std::vector<int> order;
+    if (difficulty_sort)
+    {
+      std::vector<int> fanout(static_cast<unsigned>(n));
+      for (int i = 0; i < n; i++)
+        fanout[static_cast<unsigned>(i)] = deal_fanout(bop->deals[i]);
+      order.resize(static_cast<unsigned>(n));
+      std::iota(order.begin(), order.end(), 0);
+      std::stable_sort(order.begin(), order.end(),
+        [&](const int a, const int b) {
+          return fanout[static_cast<unsigned>(a)] > fanout[static_cast<unsigned>(b)];
+        });
+    }
+
     err = parallel_all_boards_n(n, nthreads,
       [&](const int worker_id, const int bno) -> int {
         return calc_single_common_internal(
           contexts[static_cast<unsigned>(worker_id)], *bop, *solvedp, bno);
-      });
+      },
+      order.empty() ? nullptr : &order);
   }
 
   END_BLOCK_TIMER;
@@ -192,7 +245,8 @@ int STDCALL CalcDDtableN(
     ind++;
   }
 
-  int res = calc_all_boards_n(&bo, &solved, maxThreads);
+  // Single deal: all boards share one deal, so hardest-first sorting is a no-op.
+  int res = calc_all_boards_n(&bo, &solved, maxThreads, /*difficulty_sort=*/false);
   if (res != 1)
     return res;
 
diff --git a/library/src/heuristic_sorting/heuristic_sorting.cpp b/library/src/heuristic_sorting/heuristic_sorting.cpp
index 449c0edf..8ddd0a2d 100644
--- a/library/src/heuristic_sorting/heuristic_sorting.cpp
+++ b/library/src/heuristic_sorting/heuristic_sorting.cpp
@@ -676,49 +676,17 @@ void weight_alloc_trump_void1(HeuristicContext& ctx)
   unsigned short suitCount = tpos.length[curr_hand][suit];
   int suitAdd;
 
-  if (suit == trump)
+  if (lead_suit == trump) // We pitch
   {
-    // We trump a non-trump card.
-    
-    if (tpos.length[partner_lh][lead_suit] != 0)
-    {
-      // 3rd hand will follow.
-  if ((tpos.rank_in_suit[rho_lh][lead_suit] >
-       (tpos.rank_in_suit[partner_lh][lead_suit] |
-    bit_map_rank[ctx.lead0_rank])) ||
-          ((tpos.length[rho_lh][lead_suit] == 0) &&
-           (tpos.length[rho_lh][trump] != 0)))
-      {
-        // Partner can win with a card or by ruffing.
-        suitAdd = 60 + (suitCount << 6) / 44;
-      }
-      else
-      {
-        suitAdd = -2 + (suitCount << 6) / 36;
-        // Don't ruff from Kx.
-        if ((suitCount == 2) &&
-            (tpos.second_best[suit].hand == curr_hand))
-          suitAdd += -4;
-      }
-    }
-    else if ((tpos.length[rho_lh][lead_suit] == 0) &&
-             (tpos.rank_in_suit[rho_lh][trump] >
-              tpos.rank_in_suit[partner_lh][trump]))
-    {
-      // Partner can overruff 3rd hand.
-      suitAdd = 60 + (suitCount << 6) / 44;
-    }
-  else if ((tpos.length[partner_lh][trump] == 0) &&
-       (tpos.rank_in_suit[rho_lh][lead_suit] >
-        bit_map_rank[ctx.lead0_rank]))
-    {
-      // 3rd hand has no trumps, and partner has suit winner.
-      suitAdd = 60 + (suitCount << 6) / 44;
-    }
+    if (tpos.rank_in_suit[rho_lh][lead_suit] >
+        (tpos.rank_in_suit[partner_lh][lead_suit] |
+         bit_map_rank[ctx.lead0_rank]))
+      // RHO can win.
+      suitAdd = (suitCount << 6) / 44;
     else
     {
-      suitAdd = -2 + (suitCount << 6) / 36;
-      // Don't ruff from Kx.
+      // Don't pitch from Kx.
+      suitAdd = (suitCount << 6) / 36;
       if ((suitCount == 2) &&
           (tpos.second_best[suit].hand == curr_hand))
         suitAdd += -4;
@@ -1245,10 +1213,7 @@ void weight_alloc_trump_void2(HeuristicContext& ctx)
     mply[k].rank < ctx.move1_rank)
     {
       // Don't underruff.
-    unsigned char aggrSuit = static_cast<unsigned char>(tpos.aggr[suit]);
-    unsigned char moveRank = static_cast<unsigned char>(mply[k].rank);
-  unsigned char relRankValue = static_cast<unsigned char>(rel_rank[aggrSuit][moveRank]);
-    int r_rank = static_cast<int>(relRankValue);
+    int r_rank = rel_rank[tpos.aggr[suit]][mply[k].rank];
       suitAdd = (suitCount << 6) / 40;
       mply[k].weight = -32 + r_rank + suitAdd;
     }
@@ -1418,10 +1383,7 @@ void weight_alloc_trump_void3(HeuristicContext& ctx)
     {
       for (int k = last_num_moves; k < num_moves; k++)
       {
-    int r_rank = static_cast<int>(
-      static_cast<unsigned char>(
-  rel_rank[static_cast<unsigned char>(tpos.aggr[suit])]
-             [static_cast<unsigned char>(mply[k].rank)]));
+    int r_rank = rel_rank[tpos.aggr[suit]][mply[k].rank];
         if (mply[k].rank > ctx.move2_rank)
           mply[k].weight = 33 + r_rank; // Overruff
         else
@@ -1436,10 +1398,7 @@ void weight_alloc_trump_void3(HeuristicContext& ctx)
   {
     for (int k = last_num_moves; k < num_moves; k++)
     {
-    int r_rank = static_cast<int>(
-      static_cast<unsigned char>(
-  rel_rank[static_cast<unsigned char>(tpos.aggr[suit])]
-           [static_cast<unsigned char>(mply[k].rank)]));
+    int r_rank = rel_rank[tpos.aggr[suit]][mply[k].rank];
       mply[k].weight = 33 + r_rank;
     }
   }
diff --git a/library/src/quick_tricks.cpp b/library/src/quick_tricks.cpp
index 0c161406..48f37fe5 100644
--- a/library/src/quick_tricks.cpp
+++ b/library/src/quick_tricks.cpp
@@ -1000,7 +1000,7 @@ int QuickTricksPartnerHandTrump(
     if (ctx.thread_ptr()->rel[ranks].abs_rank[3][suit].hand == partner[hand])
     {
       tpos.win_ranks[depth][suit] |= bit_map_rank[
-        static_cast<int>(static_cast<unsigned char>(ctx.thread_ptr()->rel[ranks].abs_rank[3][suit].rank)) ];
+        static_cast<int>(ctx.thread_ptr()->rel[ranks].abs_rank[3][suit].rank) ];
 
       tpos.win_ranks[depth][commSuit] |= bit_map_rank[commRank];
 
@@ -1110,7 +1110,7 @@ int QuickTricksPartnerHandNT(
     if (ctx.thread_ptr()->rel[ranks].abs_rank[3][suit].hand == partner[hand])
     {
       tpos.win_ranks[depth][suit] |= bit_map_rank[
-        static_cast<int>(static_cast<unsigned char>(ctx.thread_ptr()->rel[ranks].abs_rank[3][suit].rank)) ];
+        static_cast<int>(ctx.thread_ptr()->rel[ranks].abs_rank[3][suit].rank) ];
       qt++;
       if (qt >= cutoff)
         return qt;
diff --git a/library/src/system/parallel_boards.cpp b/library/src/system/parallel_boards.cpp
index 750041e7..31a014de 100644
--- a/library/src/system/parallel_boards.cpp
+++ b/library/src/system/parallel_boards.cpp
@@ -31,23 +31,51 @@ auto resolve_worker_count(
 }
 
 
+// True iff `order` holds each value of [0, count) exactly once (size included).
+static auto is_permutation_of_range(
+  const std::vector<int>& order,
+  const int count) -> bool
+{
+  if (count < 0 || order.size() != static_cast<unsigned>(count))
+    return false; // wrong length, or a count that would wrap the cast below
+  std::vector<char> seen(static_cast<unsigned>(count), 0);
+  for (const int v : order)
+    if (v < 0 || v >= count || seen[static_cast<unsigned>(v)]++)
+      return false; // out of range, or already seen (post-increment marks v)
+  return true;
+}
+
+
 auto parallel_all_boards_n(
   const int count,
   const int worker_cap,
-  const std::function<int(int worker_id, int bno)>& process_board) -> int
+  const std::function<int(int worker_id, int bno)>& process_board,
+  const std::vector<int>* order) -> int
 {
   if (count <= 0)
   {
     return RETURN_NO_FAULT;
   }
 
+  // Map a dispatch slot to the board number to process. With an order, hand out
+  // boards in that sequence (e.g. hardest first); otherwise in index order. The
+  // order is only honored when it is a valid permutation of [0, count); a
+  // malformed order falls back to index order to avoid invalid board indices.
+  const bool use_order =
+    (order != nullptr &&
+     static_cast<int>(order->size()) == count &&
+     is_permutation_of_range(*order, count));
+  auto board_of = [&](const int slot) -> int {
+    return use_order ? (*order)[static_cast<unsigned>(slot)] : slot;
+  };
+
   const int workers = resolve_worker_count(worker_cap, count);
 
   if (workers == 1)
   {
-    for (int bno = 0; bno < count; ++bno)
+    for (int slot = 0; slot < count; ++slot)
     {
-      const int rc = process_board(0, bno);
+      const int rc = process_board(0, board_of(slot));
       if (rc != RETURN_NO_FAULT)
       {
         return rc;
@@ -62,11 +90,12 @@ auto parallel_all_boards_n(
   auto worker = [&](const int worker_id) {
     for (;;)
     {
-      const int bno = next.fetch_add(1, std::memory_order_relaxed);
-      if (bno >= count || first_error.load(std::memory_order_relaxed) != RETURN_NO_FAULT)
+      const int slot = next.fetch_add(1, std::memory_order_relaxed);
+      if (slot >= count || first_error.load(std::memory_order_relaxed) != RETURN_NO_FAULT)
       {
         break;
       }
+      const int bno = board_of(slot);
 
       const int rc = process_board(worker_id, bno);
       if (rc != RETURN_NO_FAULT)
diff --git a/library/src/system/parallel_boards.hpp b/library/src/system/parallel_boards.hpp
index 2f19b6de..01292d05 100644
--- a/library/src/system/parallel_boards.hpp
+++ b/library/src/system/parallel_boards.hpp
@@ -10,6 +10,7 @@
 #pragma once
 
 #include <functional>
+#include <vector>
 
 
 /**
@@ -28,9 +29,15 @@ auto resolve_worker_count(int max_threads, int count) -> int;
  * @param worker_cap Maximum worker threads; <= 0 uses hardware concurrency.
  * @param process_board Called for each board; must return RETURN_NO_FAULT (1)
  *        on success. Receives the worker's thread index and board number.
+ * @param order Optional dispatch order: a permutation of [0, count) giving the
+ *        sequence in which board numbers are handed out (e.g. hardest first to
+ *        shorten the tail). When null, empty, or not a valid permutation of
+ *        [0, count), boards are dispatched in index order. @p process_board
+ *        always gets the real board number, so result placement is unaffected.
  * @return First non-success code from @p process_board, or RETURN_NO_FAULT.
  */
 auto parallel_all_boards_n(
   int count,
   int worker_cap,
-  const std::function<int(int worker_id, int bno)>& process_board) -> int;
+  const std::function<int(int worker_id, int bno)>& process_board,
+  const std::vector<int>* order = nullptr) -> int;