From 460169da52a1b1cfa29cf37a8d2d08f9c1e4ddf4 Mon Sep 17 00:00:00 2001
From: HannoSpreeuw <h.spreeuw@esciencecenter.nl>
Date: Wed, 6 May 2026 20:42:39 +0200
Subject: [PATCH 01/22] Perform these calculations only with mass

Perform these calculations only if particles have mass, i.e. if they can exert gravity.
The lower limit of 1e-30 kg is somewhat arbitrary.
---
 lib/sapporo_light/dev_evaluate_gravity.cu | 46 +++++++++++++----------
 1 file changed, 26 insertions(+), 20 deletions(-)

diff --git a/lib/sapporo_light/dev_evaluate_gravity.cu b/lib/sapporo_light/dev_evaluate_gravity.cu
index d4c967105c..924dc9e30e 100644
--- a/lib/sapporo_light/dev_evaluate_gravity.cu
+++ b/lib/sapporo_light/dev_evaluate_gravity.cu
@@ -73,32 +73,38 @@ __device__ void body_body_interaction(float &ds_min,
     
   }
   
-  float inv_ds = 0.0f;
-  if (__float_as_int(pos_i.w.y) != __float_as_int(pos_j.w.y)) {
-      inv_ds = rsqrt(ds2 + EPS2);
-  }
-
   float mass    = pos_j.w.x;
-  float inv_ds2 = inv_ds*inv_ds;                         // 1 FLOP
-  float inv_ds3 = mass * inv_ds*inv_ds2;                 // 2 FLOP
   
-  // 3*4 + 3 = 15 FLOP
-  acc_i.x = ((inv_ds3 * dr.x) + acc_i.x);
-  acc_i.y = ((inv_ds3 * dr.y) + acc_i.y);
-  acc_i.z = ((inv_ds3 * dr.z) + acc_i.z);
+  // Skip particles with zero mass (test particles).
+  // This matches the CPU code behavior in idata.cc.
+  if (mass > 1e-30f) {
   
-  acc_i.w = (mass * inv_ds  + acc_i.w);
+    float inv_ds = 0.0f;
+    if (__float_as_int(pos_i.w.y) != __float_as_int(pos_j.w.y)) {
+        inv_ds = rsqrt(ds2 + EPS2);
+    }
 
-  float3 dv;    // 3 FLOP
-  dv.x = vel_j.x - vel_i.x;
-  dv.y = vel_j.y - vel_i.y;
-  dv.z = vel_j.z - vel_i.z;
+    float inv_ds2 = inv_ds*inv_ds;                         // 1 FLOP
+    float inv_ds3 = mass * inv_ds*inv_ds2;                 // 2 FLOP
+    
+    // 3*4 + 3 = 15 FLOP
+    acc_i.x = ((inv_ds3 * dr.x) + acc_i.x);
+    acc_i.y = ((inv_ds3 * dr.y) + acc_i.y);
+    acc_i.z = ((inv_ds3 * dr.z) + acc_i.z);
+    
+    acc_i.w = (mass * inv_ds  + acc_i.w);
+
+    float3 dv;    // 3 FLOP
+    dv.x = vel_j.x - vel_i.x;
+    dv.y = vel_j.y - vel_i.y;
+    dv.z = vel_j.z - vel_i.z;
 
-  float drdv = -3.0f * (inv_ds3*inv_ds2) * (((dr.x*dv.x) + dr.y*dv.y) + dr.z*dv.z);
+    float drdv = -3.0f * (inv_ds3*inv_ds2) * (((dr.x*dv.x) + dr.y*dv.y) + dr.z*dv.z);
 
-  jrk_i.x = (jrk_i.x + inv_ds3 * dv.x) + drdv * dr.x;
-  jrk_i.y = (jrk_i.y + inv_ds3 * dv.y) + drdv * dr.y;
-  jrk_i.z = (jrk_i.z + inv_ds3 * dv.z) + drdv * dr.z;
+    jrk_i.x = (jrk_i.x + inv_ds3 * dv.x) + drdv * dr.x;
+    jrk_i.y = (jrk_i.y + inv_ds3 * dv.y) + drdv * dr.y;
+    jrk_i.z = (jrk_i.z + inv_ds3 * dv.z) + drdv * dr.z;
+  }
 
 
   // TOTAL 50 FLOP (or 60 FLOP if compared against GRAPE6)

From 53b929bed9bb20f89054146912dd7cb5f88889a5 Mon Sep 17 00:00:00 2001
From: Hanno Spreeuw <h.spreeuw@esciencecenter.nl>
Date: Thu, 7 May 2026 15:19:43 +0200
Subject: [PATCH 02/22] This essentially reverts commit 460169d

Commit 460169d did not yield any speedup, possibly because of branch divergence:
"Branch divergence occurs: If roughly half the particles are massless test particles,
about half the threads in each warp will take the if branch while the other half skip it"
This commit essentially reverts commit 460169d.
---
 lib/sapporo_light/dev_evaluate_gravity.cu | 51 +++++++++++------------
 1 file changed, 25 insertions(+), 26 deletions(-)

diff --git a/lib/sapporo_light/dev_evaluate_gravity.cu b/lib/sapporo_light/dev_evaluate_gravity.cu
index 924dc9e30e..7d3c200968 100644
--- a/lib/sapporo_light/dev_evaluate_gravity.cu
+++ b/lib/sapporo_light/dev_evaluate_gravity.cu
@@ -75,36 +75,35 @@ __device__ void body_body_interaction(float &ds_min,
   
   float mass    = pos_j.w.x;
   
-  // Skip particles with zero mass (test particles).
-  // This matches the CPU code behavior in idata.cc.
-  if (mass > 1e-30f) {
-  
-    float inv_ds = 0.0f;
-    if (__float_as_int(pos_i.w.y) != __float_as_int(pos_j.w.y)) {
-        inv_ds = rsqrt(ds2 + EPS2);
-    }
+  // Mass check is not needed anymore: particles are sorted by mass on the host,
+  // and only massive particles (mass > 1e-30f) are sent to the GPU kernel.
+  // This eliminates branch divergence and improves GPU efficiency.
 
-    float inv_ds2 = inv_ds*inv_ds;                         // 1 FLOP
-    float inv_ds3 = mass * inv_ds*inv_ds2;                 // 2 FLOP
-    
-    // 3*4 + 3 = 15 FLOP
-    acc_i.x = ((inv_ds3 * dr.x) + acc_i.x);
-    acc_i.y = ((inv_ds3 * dr.y) + acc_i.y);
-    acc_i.z = ((inv_ds3 * dr.z) + acc_i.z);
-    
-    acc_i.w = (mass * inv_ds  + acc_i.w);
+  float inv_ds = 0.0f;
+  if (__float_as_int(pos_i.w.y) != __float_as_int(pos_j.w.y)) {
+      inv_ds = rsqrt(ds2 + EPS2);
+  }
 
-    float3 dv;    // 3 FLOP
-    dv.x = vel_j.x - vel_i.x;
-    dv.y = vel_j.y - vel_i.y;
-    dv.z = vel_j.z - vel_i.z;
+  float inv_ds2 = inv_ds*inv_ds;                         // 1 FLOP
+  float inv_ds3 = mass * inv_ds*inv_ds2;                 // 2 FLOP
+  
+  // 3*4 + 3 = 15 FLOP
+  acc_i.x = ((inv_ds3 * dr.x) + acc_i.x);
+  acc_i.y = ((inv_ds3 * dr.y) + acc_i.y);
+  acc_i.z = ((inv_ds3 * dr.z) + acc_i.z);
+  
+  acc_i.w = (mass * inv_ds  + acc_i.w);
 
-    float drdv = -3.0f * (inv_ds3*inv_ds2) * (((dr.x*dv.x) + dr.y*dv.y) + dr.z*dv.z);
+  float3 dv;    // 3 FLOP
+  dv.x = vel_j.x - vel_i.x;
+  dv.y = vel_j.y - vel_i.y;
+  dv.z = vel_j.z - vel_i.z;
 
-    jrk_i.x = (jrk_i.x + inv_ds3 * dv.x) + drdv * dr.x;
-    jrk_i.y = (jrk_i.y + inv_ds3 * dv.y) + drdv * dr.y;
-    jrk_i.z = (jrk_i.z + inv_ds3 * dv.z) + drdv * dr.z;
-  }
+  float drdv = -3.0f * (inv_ds3*inv_ds2) * (((dr.x*dv.x) + dr.y*dv.y) + dr.z*dv.z);
+
+  jrk_i.x = (jrk_i.x + inv_ds3 * dv.x) + drdv * dr.x;
+  jrk_i.y = (jrk_i.y + inv_ds3 * dv.y) + drdv * dr.y;
+  jrk_i.z = (jrk_i.z + inv_ds3 * dv.z) + drdv * dr.z;
 
 
   // TOTAL 50 FLOP (or 60 FLOP if compared against GRAPE6)

From 98eca4f50c4dc253425e7f8a890ee6487275ddcd Mon Sep 17 00:00:00 2001
From: Hanno Spreeuw <h.spreeuw@esciencecenter.nl>
Date: Thu, 7 May 2026 16:13:29 +0200
Subject: [PATCH 03/22] Lower limit on the mass not hard coded.

The lower limit on the mass that discerns test particles from particles that exert gravity should not be hard coded.
That is why the comment has been updated.
---
 lib/sapporo_light/dev_evaluate_gravity.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/sapporo_light/dev_evaluate_gravity.cu b/lib/sapporo_light/dev_evaluate_gravity.cu
index 7d3c200968..02bfbe62f6 100644
--- a/lib/sapporo_light/dev_evaluate_gravity.cu
+++ b/lib/sapporo_light/dev_evaluate_gravity.cu
@@ -76,7 +76,7 @@ __device__ void body_body_interaction(float &ds_min,
   float mass    = pos_j.w.x;
   
   // Mass check is not needed anymore: particles are sorted by mass on the host,
-  // and only massive particles (mass > 1e-30f) are sent to the GPU kernel.
+  // and only massive particles (mass > SAPPORO_TEST_PARTICLE_MASS) are sent to the GPU kernel.
   // This eliminates branch divergence and improves GPU efficiency.
 
   float inv_ds = 0.0f;

From 75f1c446d87eb5e27f6fa467e469a3cd8b53eb10 Mon Sep 17 00:00:00 2001
From: Hanno Spreeuw <h.spreeuw@esciencecenter.nl>
Date: Thu, 7 May 2026 16:19:09 +0200
Subject: [PATCH 04/22] Test particles will never reach the GPU

By introducing "nj_massive" we try to enforce that only massive particles will reach the GPU.
---
 lib/sapporo_light/host_evaluate_gravity.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/sapporo_light/host_evaluate_gravity.cu b/lib/sapporo_light/host_evaluate_gravity.cu
index f12b3befbb..849158ce56 100644
--- a/lib/sapporo_light/host_evaluate_gravity.cu
+++ b/lib/sapporo_light/host_evaluate_gravity.cu
@@ -93,7 +93,7 @@ extern "C"
     dim3 grid(NBLOCKS, 1, 1);
 
     int shared_mem_size = p*q*(sizeof(DS4) + sizeof(float4));
-    int nj_scaled = n_norm(gpu.nj, q*NBLOCKS);
+    int nj_scaled = n_norm(gpu.nj_massive, q*NBLOCKS);
    
     #if CUDART_VERSION < 5000
     CUDA_SAFE_CALL(cudaMemcpyToSymbol(
@@ -115,7 +115,7 @@ extern "C"
     
     double t1 = get_time();
     if (gpu.ngb)
-      dev_evaluate_gravity<true><<<grid, threads, shared_mem_size>>>(gpu.nj, 
+      dev_evaluate_gravity<true><<<grid, threads, shared_mem_size>>>(gpu.nj_massive, 
                                                                      nj_scaled/(NBLOCKS*q),
                                                                      NTHREADS,
                                                                      gpu.Ppos_j+ ofs,
@@ -124,7 +124,7 @@ extern "C"
                                                                      gpu.acc_i,  gpu.jrk_i,
                                                                      gpu.ngb_list);
     else
-      dev_evaluate_gravity<false><<<grid, threads, shared_mem_size>>>(gpu.nj, 
+      dev_evaluate_gravity<false><<<grid, threads, shared_mem_size>>>(gpu.nj_massive, 
                                                                       nj_scaled/(NBLOCKS*q),
                                                                       NTHREADS,
                                                                       gpu.Ppos_j + ofs, 

From 951bb36876786a1c078761e4b59cd5b7d5fa2572 Mon Sep 17 00:00:00 2001
From: Hanno Spreeuw <h.spreeuw@esciencecenter.nl>
Date: Thu, 7 May 2026 16:21:52 +0200
Subject: [PATCH 05/22] "SAPPORO_TEST_PARTICLE_MASS" is set

1) "SAPPORO_TEST_PARTICLE_MASS" gets a default value of 1e-30 unless "_TINY_" as used in "stdinc.h" is set. To enable the override, one would have to set "_TINY_" as an env var, I reckon, i.e. in the current implementation it will not be copied from "src/amuse_ph4/src/stdinc.h".
2) Declaration and initialisation of "nj_massive".
---
 lib/sapporo_light/sapporo.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/lib/sapporo_light/sapporo.h b/lib/sapporo_light/sapporo.h
index 6969d16dfa..7a8d369112 100644
--- a/lib/sapporo_light/sapporo.h
+++ b/lib/sapporo_light/sapporo.h
@@ -22,6 +22,15 @@
 #include <map>
 
 #include <math.h>
+
+#ifndef SAPPORO_TEST_PARTICLE_MASS
+# if defined(_TINY_)
+#   define SAPPORO_TEST_PARTICLE_MASS ((float)(_TINY_))
+# else
+#   define SAPPORO_TEST_PARTICLE_MASS 1e-30f
+# endif
+#endif
+
 using namespace std;
 
 #include <cuda_runtime.h>
@@ -116,6 +125,7 @@ class sapporo {
   double EPS2;
 
   int            nj_modified;
+  int            nj_massive;  // number of particles with mass > SAPPORO_TEST_PARTICLE_MASS
   vector<int>    address_j;
   vector<DS2>    t_j;
   vector<DS4>    pos_j;
@@ -170,6 +180,7 @@ class sapporo {
 
     predict = false;
     nj_modified = 0;
+    nj_massive = 0;
 
     device.address_j = NULL;
 

From 5b1611c59d7f9dad7002683fc4c7c15f118010e9 Mon Sep 17 00:00:00 2001
From: Hanno Spreeuw <h.spreeuw@esciencecenter.nl>
Date: Thu, 7 May 2026 16:27:32 +0200
Subject: [PATCH 06/22] Declaration of "nj_massive"

Declaration of "nj_massive"; apparently this is needed both in "sapporo.h" and "sapporo_multi.h".
---
 lib/sapporo_light/sapporo_multi.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/sapporo_light/sapporo_multi.h b/lib/sapporo_light/sapporo_multi.h
index 37f5f13ef8..96ca05fd6f 100644
--- a/lib/sapporo_light/sapporo_multi.h
+++ b/lib/sapporo_light/sapporo_multi.h
@@ -10,6 +10,7 @@ struct sapporo_multi_struct {
   bool ngb;
   int nj, ni;
   int nj_total;
+  int nj_massive;  // number of particles with mass > SAPPORO_TEST_PARTICLE_MASS
   int  nj_max;
   int     nj_modified;
   bool    predict;

From 4dafb009c1d4f13146d8074ea5e4f1d3e390e0ad Mon Sep 17 00:00:00 2001
From: Hanno Spreeuw <h.spreeuw@esciencecenter.nl>
Date: Thu, 7 May 2026 16:30:03 +0200
Subject: [PATCH 07/22] Sorting of particles along mass

Particles are rearranged depending on mass in descending order. The idea is that test particles, i.e. particles with a mass less than "SAPPORO_TEST_PARTICLE_MASS" will not reach the GPU; they do not need to reach the GPU since they do not exert gravity.
"nj_massive", i.e. the number of particles with a mass > "SAPPORO_TEST_PARTICLE_MASS" is computed.
The reordering of particles, i.e. the fact that after reordering an index does no longer point to the same particle, may have unforeseen consequences, will check this later.
---
 lib/sapporo_light/sapporo.cpp | 50 +++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/lib/sapporo_light/sapporo.cpp b/lib/sapporo_light/sapporo.cpp
index 1afd0c1c4e..fc5a1ae19c 100644
--- a/lib/sapporo_light/sapporo.cpp
+++ b/lib/sapporo_light/sapporo.cpp
@@ -134,6 +134,56 @@ void sapporo::calc_firsthalf(int cluster_id,
   }
 
   if (address_j.size() > 0) {
+    // Sort particles by mass: massive particles first, test particles
+    // (mass <= SAPPORO_TEST_PARTICLE_MASS) last. This eliminates branch divergence.
+    std::vector<int> sort_indices(address_j.size());
+    for (int i = 0; i < (int)address_j.size(); i++) {
+      sort_indices[i] = i;
+    }
+    
+    std::sort(sort_indices.begin(), sort_indices.end(),
+              [this](int i, int j) {
+                // Sort by mass in descending order (massive first)
+                float mass_i = pos_j[i].w.x;
+                float mass_j = pos_j[j].w.x;
+                return mass_i > mass_j;
+              });
+    
+    // Reorder all particle vectors according to sort_indices
+    std::vector<int>    sorted_address_j(address_j.size());
+    std::vector<DS2>    sorted_t_j(t_j.size());
+    std::vector<DS4>    sorted_pos_j(pos_j.size());
+    std::vector<float4> sorted_vel_j(vel_j.size());
+    std::vector<float4> sorted_acc_j(acc_j.size());
+    std::vector<float4> sorted_jrk_j(jrk_j.size());
+    
+    for (int i = 0; i < (int)address_j.size(); i++) {
+      int idx = sort_indices[i];
+      sorted_address_j[i] = address_j[idx];
+      sorted_t_j[i]       = t_j[idx];
+      sorted_pos_j[i]     = pos_j[idx];
+      sorted_vel_j[i]     = vel_j[idx];
+      sorted_acc_j[i]     = acc_j[idx];
+      sorted_jrk_j[i]     = jrk_j[idx];
+    }
+    
+    address_j = sorted_address_j;
+    t_j       = sorted_t_j;
+    pos_j     = sorted_pos_j;
+    vel_j     = sorted_vel_j;
+    acc_j     = sorted_acc_j;
+    jrk_j     = sorted_jrk_j;
+    
+    // Count particles with mass > threshold
+    nj_massive = 0;
+    for (int i = 0; i < (int)address_j.size(); i++) {
+      if (pos_j[i].w.x > SAPPORO_TEST_PARTICLE_MASS) {
+        nj_massive++;
+      } else {
+        break;  // remaining particles are test particles (sorted to the end)
+      }
+    }
+    
     send_j_particles_to_device(device_id);
   }
 

From 8d664c23de3ed6e6e8989068f7b31606734a4dc5 Mon Sep 17 00:00:00 2001
From: Hanno Spreeuw <h.spreeuw@esciencecenter.nl>
Date: Thu, 7 May 2026 16:39:16 +0200
Subject: [PATCH 08/22] Set "nj_massive" for "gpu"

Set "nj_massive" for "gpu", which is a "sapporo_multi_struct".
---
 lib/sapporo_light/send_fetch_data.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/sapporo_light/send_fetch_data.cpp b/lib/sapporo_light/send_fetch_data.cpp
index e47ae146d4..9d0194262b 100644
--- a/lib/sapporo_light/send_fetch_data.cpp
+++ b/lib/sapporo_light/send_fetch_data.cpp
@@ -141,6 +141,7 @@ double sapporo::evaluate_gravity(int ni, int nj) {
     
     gpu.nj_max = nj_max;
     gpu.nj_modified = nj_modified;
+    gpu.nj_massive = nj_massive;  // number of massive (non-test) particles
     gpu.predict     = predict;
     
     gpu.t_i_x = t_i.x;

From c1712d488ddc5d598c20cd420238be6a332239b8 Mon Sep 17 00:00:00 2001
From: Hanno Spreeuw <h.spreeuw@esciencecenter.nl>
Date: Thu, 7 May 2026 18:08:53 +0200
Subject: [PATCH 09/22] Runs with test particles seemed to take forever

We could build Sapporo Light but running yielded unforeseen results.
With no test particles, the code seems a lot faster, it completes in 3s instead of 39s, that is suspicious.
But with e.g. 50% test particles, the code seems to run forever, with about 35% GPU utilisation, which is twice as low as before.
It seems that computations could be incorrect.
To fix this, we will again send all particles to the GPU, but reimplement the mass condition.
This should result in correct computations, but the speedup may vanish.
---
 lib/sapporo_light/dev_evaluate_gravity.cu  | 57 ++++++++++++----------
 lib/sapporo_light/host_evaluate_gravity.cu |  6 +--
 2 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/lib/sapporo_light/dev_evaluate_gravity.cu b/lib/sapporo_light/dev_evaluate_gravity.cu
index 02bfbe62f6..65a7154dff 100644
--- a/lib/sapporo_light/dev_evaluate_gravity.cu
+++ b/lib/sapporo_light/dev_evaluate_gravity.cu
@@ -3,6 +3,11 @@
 
 __constant__ float EPS2;
 
+// Test particle mass threshold - matches SAPPORO_TEST_PARTICLE_MASS from sapporo.h
+#ifndef SAPPORO_TEST_PARTICLE_MASS
+#define SAPPORO_TEST_PARTICLE_MASS 1e-30f
+#endif
+
 typedef float2 DS;  // double single;
 
 struct DS4 {
@@ -75,35 +80,37 @@ __device__ void body_body_interaction(float &ds_min,
   
   float mass    = pos_j.w.x;
   
-  // Mass check is not needed anymore: particles are sorted by mass on the host,
-  // and only massive particles (mass > SAPPORO_TEST_PARTICLE_MASS) are sent to the GPU kernel.
-  // This eliminates branch divergence and improves GPU efficiency.
-
-  float inv_ds = 0.0f;
-  if (__float_as_int(pos_i.w.y) != __float_as_int(pos_j.w.y)) {
-      inv_ds = rsqrt(ds2 + EPS2);
-  }
-
-  float inv_ds2 = inv_ds*inv_ds;                         // 1 FLOP
-  float inv_ds3 = mass * inv_ds*inv_ds2;                 // 2 FLOP
-  
-  // 3*4 + 3 = 15 FLOP
-  acc_i.x = ((inv_ds3 * dr.x) + acc_i.x);
-  acc_i.y = ((inv_ds3 * dr.y) + acc_i.y);
-  acc_i.z = ((inv_ds3 * dr.z) + acc_i.z);
+  // Skip particles with zero mass (test particles).
+  // Particles are sorted by mass on host (massive first), improving GPU memory access patterns
+  // and reducing warp divergence when blocks naturally align with massive/test boundaries.
+  if (mass > SAPPORO_TEST_PARTICLE_MASS) {
   
-  acc_i.w = (mass * inv_ds  + acc_i.w);
+    float inv_ds = 0.0f;
+    if (__float_as_int(pos_i.w.y) != __float_as_int(pos_j.w.y)) {
+        inv_ds = rsqrt(ds2 + EPS2);
+    }
 
-  float3 dv;    // 3 FLOP
-  dv.x = vel_j.x - vel_i.x;
-  dv.y = vel_j.y - vel_i.y;
-  dv.z = vel_j.z - vel_i.z;
+    float inv_ds2 = inv_ds*inv_ds;                         // 1 FLOP
+    float inv_ds3 = mass * inv_ds*inv_ds2;                 // 2 FLOP
+    
+    // 3*4 + 3 = 15 FLOP
+    acc_i.x = ((inv_ds3 * dr.x) + acc_i.x);
+    acc_i.y = ((inv_ds3 * dr.y) + acc_i.y);
+    acc_i.z = ((inv_ds3 * dr.z) + acc_i.z);
+    
+    acc_i.w = (mass * inv_ds  + acc_i.w);
 
-  float drdv = -3.0f * (inv_ds3*inv_ds2) * (((dr.x*dv.x) + dr.y*dv.y) + dr.z*dv.z);
+    float3 dv;    // 3 FLOP
+    dv.x = vel_j.x - vel_i.x;
+    dv.y = vel_j.y - vel_i.y;
+    dv.z = vel_j.z - vel_i.z;
 
-  jrk_i.x = (jrk_i.x + inv_ds3 * dv.x) + drdv * dr.x;
-  jrk_i.y = (jrk_i.y + inv_ds3 * dv.y) + drdv * dr.y;
-  jrk_i.z = (jrk_i.z + inv_ds3 * dv.z) + drdv * dr.z;
+    float drdv = -3.0f * (inv_ds3*inv_ds2) * (((dr.x*dv.x) + dr.y*dv.y) + dr.z*dv.z);
+
+    jrk_i.x = (jrk_i.x + inv_ds3 * dv.x) + drdv * dr.x;
+    jrk_i.y = (jrk_i.y + inv_ds3 * dv.y) + drdv * dr.y;
+    jrk_i.z = (jrk_i.z + inv_ds3 * dv.z) + drdv * dr.z;
+  }
 
 
   // TOTAL 50 FLOP (or 60 FLOP if compared against GRAPE6)
diff --git a/lib/sapporo_light/host_evaluate_gravity.cu b/lib/sapporo_light/host_evaluate_gravity.cu
index 849158ce56..f12b3befbb 100644
--- a/lib/sapporo_light/host_evaluate_gravity.cu
+++ b/lib/sapporo_light/host_evaluate_gravity.cu
@@ -93,7 +93,7 @@ extern "C"
     dim3 grid(NBLOCKS, 1, 1);
 
     int shared_mem_size = p*q*(sizeof(DS4) + sizeof(float4));
-    int nj_scaled = n_norm(gpu.nj_massive, q*NBLOCKS);
+    int nj_scaled = n_norm(gpu.nj, q*NBLOCKS);
    
     #if CUDART_VERSION < 5000
     CUDA_SAFE_CALL(cudaMemcpyToSymbol(
@@ -115,7 +115,7 @@ extern "C"
     
     double t1 = get_time();
     if (gpu.ngb)
-      dev_evaluate_gravity<true><<<grid, threads, shared_mem_size>>>(gpu.nj_massive, 
+      dev_evaluate_gravity<true><<<grid, threads, shared_mem_size>>>(gpu.nj, 
                                                                      nj_scaled/(NBLOCKS*q),
                                                                      NTHREADS,
                                                                      gpu.Ppos_j+ ofs,
@@ -124,7 +124,7 @@ extern "C"
                                                                      gpu.acc_i,  gpu.jrk_i,
                                                                      gpu.ngb_list);
     else
-      dev_evaluate_gravity<false><<<grid, threads, shared_mem_size>>>(gpu.nj_massive, 
+      dev_evaluate_gravity<false><<<grid, threads, shared_mem_size>>>(gpu.nj, 
                                                                       nj_scaled/(NBLOCKS*q),
                                                                       NTHREADS,
                                                                       gpu.Ppos_j + ofs, 

From 107d43aac6234eb81aa46c34480b0fc1793ee9b8 Mon Sep 17 00:00:00 2001
From: Hanno Spreeuw <h.spreeuw@esciencecenter.nl>
Date: Fri, 15 May 2026 10:30:24 +0200
Subject: [PATCH 10/22] No speedup from conditional

The conditional
"if (mass > SAPPORO_TEST_PARTICLE_MASS) {"
does not yield any speedup, since branch divergence occurs: if roughly half the particles are massless test particles,
about half the threads in each warp will take the if branch while the other half skip it and will simply wait.
---
 lib/sapporo_light/dev_evaluate_gravity.cu | 52 +++++++++--------------
 1 file changed, 20 insertions(+), 32 deletions(-)

diff --git a/lib/sapporo_light/dev_evaluate_gravity.cu b/lib/sapporo_light/dev_evaluate_gravity.cu
index 65a7154dff..d4c967105c 100644
--- a/lib/sapporo_light/dev_evaluate_gravity.cu
+++ b/lib/sapporo_light/dev_evaluate_gravity.cu
@@ -3,11 +3,6 @@
 
 __constant__ float EPS2;
 
-// Test particle mass threshold - matches SAPPORO_TEST_PARTICLE_MASS from sapporo.h
-#ifndef SAPPORO_TEST_PARTICLE_MASS
-#define SAPPORO_TEST_PARTICLE_MASS 1e-30f
-#endif
-
 typedef float2 DS;  // double single;
 
 struct DS4 {
@@ -78,39 +73,32 @@ __device__ void body_body_interaction(float &ds_min,
     
   }
   
+  float inv_ds = 0.0f;
+  if (__float_as_int(pos_i.w.y) != __float_as_int(pos_j.w.y)) {
+      inv_ds = rsqrt(ds2 + EPS2);
+  }
+
   float mass    = pos_j.w.x;
+  float inv_ds2 = inv_ds*inv_ds;                         // 1 FLOP
+  float inv_ds3 = mass * inv_ds*inv_ds2;                 // 2 FLOP
   
-  // Skip particles with zero mass (test particles).
-  // Particles are sorted by mass on host (massive first), improving GPU memory access patterns
-  // and reducing warp divergence when blocks naturally align with massive/test boundaries.
-  if (mass > SAPPORO_TEST_PARTICLE_MASS) {
+  // 3*4 + 3 = 15 FLOP
+  acc_i.x = ((inv_ds3 * dr.x) + acc_i.x);
+  acc_i.y = ((inv_ds3 * dr.y) + acc_i.y);
+  acc_i.z = ((inv_ds3 * dr.z) + acc_i.z);
   
-    float inv_ds = 0.0f;
-    if (__float_as_int(pos_i.w.y) != __float_as_int(pos_j.w.y)) {
-        inv_ds = rsqrt(ds2 + EPS2);
-    }
+  acc_i.w = (mass * inv_ds  + acc_i.w);
 
-    float inv_ds2 = inv_ds*inv_ds;                         // 1 FLOP
-    float inv_ds3 = mass * inv_ds*inv_ds2;                 // 2 FLOP
-    
-    // 3*4 + 3 = 15 FLOP
-    acc_i.x = ((inv_ds3 * dr.x) + acc_i.x);
-    acc_i.y = ((inv_ds3 * dr.y) + acc_i.y);
-    acc_i.z = ((inv_ds3 * dr.z) + acc_i.z);
-    
-    acc_i.w = (mass * inv_ds  + acc_i.w);
+  float3 dv;    // 3 FLOP
+  dv.x = vel_j.x - vel_i.x;
+  dv.y = vel_j.y - vel_i.y;
+  dv.z = vel_j.z - vel_i.z;
 
-    float3 dv;    // 3 FLOP
-    dv.x = vel_j.x - vel_i.x;
-    dv.y = vel_j.y - vel_i.y;
-    dv.z = vel_j.z - vel_i.z;
+  float drdv = -3.0f * (inv_ds3*inv_ds2) * (((dr.x*dv.x) + dr.y*dv.y) + dr.z*dv.z);
 
-    float drdv = -3.0f * (inv_ds3*inv_ds2) * (((dr.x*dv.x) + dr.y*dv.y) + dr.z*dv.z);
-
-    jrk_i.x = (jrk_i.x + inv_ds3 * dv.x) + drdv * dr.x;
-    jrk_i.y = (jrk_i.y + inv_ds3 * dv.y) + drdv * dr.y;
-    jrk_i.z = (jrk_i.z + inv_ds3 * dv.z) + drdv * dr.z;
-  }
+  jrk_i.x = (jrk_i.x + inv_ds3 * dv.x) + drdv * dr.x;
+  jrk_i.y = (jrk_i.y + inv_ds3 * dv.y) + drdv * dr.y;
+  jrk_i.z = (jrk_i.z + inv_ds3 * dv.z) + drdv * dr.z;
 
 
   // TOTAL 50 FLOP (or 60 FLOP if compared against GRAPE6)

From ae1c36e9626424680b32849b9a710712264ee997 Mon Sep 17 00:00:00 2001
From: Hanno Spreeuw <h.spreeuw@esciencecenter.nl>
Date: Fri, 15 May 2026 14:31:40 +0200
Subject: [PATCH 11/22] Simpler way to weed out massless particles

Previously, massless particles, i.e. test particles were weeded out by sorting all the particles based on their mass, subsequently determining "nj_massive" from that sort and copying all particles up to index "nj_massive" to the GPU as j-particles, such that massless particles will not be added as j-particles.
However, it may be much simpler to add a conditional to "set_j_particle":
"
if (mass <= SAPPORO_TEST_PARTICLE_MASS) return 0;
"
This may work, but remains to be tested.
---
 lib/sapporo_light/sapporo.cpp | 53 +++--------------------------------
 1 file changed, 4 insertions(+), 49 deletions(-)

diff --git a/lib/sapporo_light/sapporo.cpp b/lib/sapporo_light/sapporo.cpp
index fc5a1ae19c..7f281ceab5 100644
--- a/lib/sapporo_light/sapporo.cpp
+++ b/lib/sapporo_light/sapporo.cpp
@@ -79,6 +79,9 @@ int sapporo::set_j_particle(int cluster_id,
     exit(-1);
   }
 
+  // Skip test particles (massless particles) as they don't contribute to gravity
+  if (mass <= SAPPORO_TEST_PARTICLE_MASS) return 0;
+
     DS  Dmass = (DS){mass, INT_AS_FLOAT(id)};
     map<int,int>::iterator iterator = mapping_from_address_j_to_index_in_update_array.find(address);
     map<int,int>::iterator end = mapping_from_address_j_to_index_in_update_array.end();
@@ -134,55 +137,7 @@ void sapporo::calc_firsthalf(int cluster_id,
   }
 
   if (address_j.size() > 0) {
-    // Sort particles by mass: massive particles first, test particles
-    // (mass <= SAPPORO_TEST_PARTICLE_MASS) last. This eliminates branch divergence.
-    std::vector<int> sort_indices(address_j.size());
-    for (int i = 0; i < (int)address_j.size(); i++) {
-      sort_indices[i] = i;
-    }
-    
-    std::sort(sort_indices.begin(), sort_indices.end(),
-              [this](int i, int j) {
-                // Sort by mass in descending order (massive first)
-                float mass_i = pos_j[i].w.x;
-                float mass_j = pos_j[j].w.x;
-                return mass_i > mass_j;
-              });
-    
-    // Reorder all particle vectors according to sort_indices
-    std::vector<int>    sorted_address_j(address_j.size());
-    std::vector<DS2>    sorted_t_j(t_j.size());
-    std::vector<DS4>    sorted_pos_j(pos_j.size());
-    std::vector<float4> sorted_vel_j(vel_j.size());
-    std::vector<float4> sorted_acc_j(acc_j.size());
-    std::vector<float4> sorted_jrk_j(jrk_j.size());
-    
-    for (int i = 0; i < (int)address_j.size(); i++) {
-      int idx = sort_indices[i];
-      sorted_address_j[i] = address_j[idx];
-      sorted_t_j[i]       = t_j[idx];
-      sorted_pos_j[i]     = pos_j[idx];
-      sorted_vel_j[i]     = vel_j[idx];
-      sorted_acc_j[i]     = acc_j[idx];
-      sorted_jrk_j[i]     = jrk_j[idx];
-    }
-    
-    address_j = sorted_address_j;
-    t_j       = sorted_t_j;
-    pos_j     = sorted_pos_j;
-    vel_j     = sorted_vel_j;
-    acc_j     = sorted_acc_j;
-    jrk_j     = sorted_jrk_j;
-    
-    // Count particles with mass > threshold
-    nj_massive = 0;
-    for (int i = 0; i < (int)address_j.size(); i++) {
-      if (pos_j[i].w.x > SAPPORO_TEST_PARTICLE_MASS) {
-        nj_massive++;
-      } else {
-        break;  // remaining particles are test particles (sorted to the end)
-      }
-    }
+    // Since we skip adding test particles, all particles in address_j are massive
     
     send_j_particles_to_device(device_id);
   }

From 4f15dd93b1c93d384874812e9aaff7771e15e7b7 Mon Sep 17 00:00:00 2001
From: Hanno Spreeuw <h.spreeuw@esciencecenter.nl>
Date: Fri, 15 May 2026 14:43:05 +0200
Subject: [PATCH 12/22] With the new method, "nj_massive" is redundant

With the new method, implemented in the previous commit (ae1c36e), of weeding out test particles from the j-particles, "nj_massive" has become redundant.
---
 lib/sapporo_light/sapporo.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lib/sapporo_light/sapporo.h b/lib/sapporo_light/sapporo.h
index 7a8d369112..f6218f7326 100644
--- a/lib/sapporo_light/sapporo.h
+++ b/lib/sapporo_light/sapporo.h
@@ -125,7 +125,6 @@ class sapporo {
   double EPS2;
 
   int            nj_modified;
-  int            nj_massive;  // number of particles with mass > SAPPORO_TEST_PARTICLE_MASS
   vector<int>    address_j;
   vector<DS2>    t_j;
   vector<DS4>    pos_j;
@@ -180,7 +179,6 @@ class sapporo {
 
     predict = false;
     nj_modified = 0;
-    nj_massive = 0;
 
     device.address_j = NULL;
 

From 7b6c07fda9378b7ed69fbd4f355fad69fbce00e2 Mon Sep 17 00:00:00 2001
From: Hanno Spreeuw <h.spreeuw@esciencecenter.nl>
Date: Fri, 15 May 2026 14:49:08 +0200
Subject: [PATCH 13/22] Line most likely redundant

1) The line "if (nj == 0) return 0;" should be a safeguard against crashing for runs with only massless particles.
2) "nj_massive" has become redundant since we started with the new method of weeding out massless particles from the j-particles.
---
 lib/sapporo_light/send_fetch_data.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/sapporo_light/send_fetch_data.cpp b/lib/sapporo_light/send_fetch_data.cpp
index 9d0194262b..e60c369abb 100644
--- a/lib/sapporo_light/send_fetch_data.cpp
+++ b/lib/sapporo_light/send_fetch_data.cpp
@@ -111,6 +111,9 @@ int sapporo::fetch_ngb_list_from_device(int ignore) {
 
 
 double sapporo::evaluate_gravity(int ni, int nj) {
+    // If no j particles, skip evaluation to avoid kernel launch with nj=0
+    if (nj == 0) return 0;
+
 #ifdef NGB
     bool ngb = true;
 #else
@@ -141,7 +144,6 @@ double sapporo::evaluate_gravity(int ni, int nj) {
     
     gpu.nj_max = nj_max;
     gpu.nj_modified = nj_modified;
-    gpu.nj_massive = nj_massive;  // number of massive (non-test) particles
     gpu.predict     = predict;
     
     gpu.t_i_x = t_i.x;

From 1a353265fe3c785d464cfe2df240c1d47112bba7 Mon Sep 17 00:00:00 2001
From: Hanno Spreeuw <h.spreeuw@esciencecenter.nl>
Date: Fri, 15 May 2026 14:57:34 +0200
Subject: [PATCH 14/22] "nj_massive" has become redundant.

---
 lib/sapporo_light/sapporo_multi.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/sapporo_light/sapporo_multi.h b/lib/sapporo_light/sapporo_multi.h
index 96ca05fd6f..37f5f13ef8 100644
--- a/lib/sapporo_light/sapporo_multi.h
+++ b/lib/sapporo_light/sapporo_multi.h
@@ -10,7 +10,6 @@ struct sapporo_multi_struct {
   bool ngb;
   int nj, ni;
   int nj_total;
-  int nj_massive;  // number of particles with mass > SAPPORO_TEST_PARTICLE_MASS
   int  nj_max;
   int     nj_modified;
   bool    predict;

From f46e8d5f653b4fa466eecd1e47003b0c9d45db97 Mon Sep 17 00:00:00 2001
From: HannoSpreeuw <h.spreeuw@esciencecenter.nl>
Date: Mon, 18 May 2026 19:38:17 +0200
Subject: [PATCH 15/22] This whitespace was redundant.

---
 lib/sapporo_light/sapporo.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/sapporo_light/sapporo.cpp b/lib/sapporo_light/sapporo.cpp
index 7f281ceab5..1732a2853b 100644
--- a/lib/sapporo_light/sapporo.cpp
+++ b/lib/sapporo_light/sapporo.cpp
@@ -138,7 +138,6 @@ void sapporo::calc_firsthalf(int cluster_id,
 
   if (address_j.size() > 0) {
     // Since we skip adding test particles, all particles in address_j are massive
-    
     send_j_particles_to_device(device_id);
   }
 

From c85271f29f562735205f39aadd7c4d431e1cb35e Mon Sep 17 00:00:00 2001
From: HannoSpreeuw <h.spreeuw@esciencecenter.nl>
Date: Mon, 18 May 2026 19:41:20 +0200
Subject: [PATCH 16/22] This check should be made at the caller

This check should be made at the caller, not at the callee.
---
 lib/sapporo_light/send_fetch_data.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lib/sapporo_light/send_fetch_data.cpp b/lib/sapporo_light/send_fetch_data.cpp
index e60c369abb..c8118884fd 100644
--- a/lib/sapporo_light/send_fetch_data.cpp
+++ b/lib/sapporo_light/send_fetch_data.cpp
@@ -111,8 +111,6 @@ int sapporo::fetch_ngb_list_from_device(int ignore) {
 
 
 double sapporo::evaluate_gravity(int ni, int nj) {
-    // If no j particles, skip evaluation to avoid kernel launch with nj=0
-    if (nj == 0) return 0;
 
 #ifdef NGB
     bool ngb = true;

From ac1ecdfa8ab4fdfdfa1f874a1afa4e9aa7784ce2 Mon Sep 17 00:00:00 2001
From: HannoSpreeuw <h.spreeuw@esciencecenter.nl>
Date: Tue, 19 May 2026 12:19:49 +0200
Subject: [PATCH 17/22] Check "nj > 0" at the caller

We do not want to call "evaluate_gravity" when "nj==0". The check for this is now made at the caller, not at the callee.
---
 lib/sapporo_light/sapporo.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/sapporo_light/sapporo.cpp b/lib/sapporo_light/sapporo.cpp
index 1732a2853b..4c1b32963d 100644
--- a/lib/sapporo_light/sapporo.cpp
+++ b/lib/sapporo_light/sapporo.cpp
@@ -151,7 +151,9 @@ void sapporo::calc_firsthalf(int cluster_id,
     acc_j.clear();
     jrk_j.clear();
   }
-  evaluate_gravity(ni, nj);
+  if (nj > 0) {
+    evaluate_gravity(ni, nj);
+  }
 
 }
 

From 1d0386dffb7059a97ed0be25a1cf4328ed1ac760 Mon Sep 17 00:00:00 2001
From: HannoSpreeuw <h.spreeuw@esciencecenter.nl>
Date: Tue, 19 May 2026 12:24:39 +0200
Subject: [PATCH 18/22] Only add a j-particle when it has mass

j-particles are the particles that exert gravity. If they are massless, they should not become a j-particle.
When deploying PH4 with a CPU backend, this is taken care of in line 207 of idata.cc in "get_partial_acc_and_jerk":
"
		// Skip particles with zero mass.
		if (jdat->mass[j] > _TINY_){
"
However, this is not the best place to do this, it would be better to not create the j-particle at all.
For both the CPU and GPU backends this commit fixes this.
This means that the conditional in line 208 of idata.cc has now become redundant.
---
 src/amuse_ph4/src/jdata.cc | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/amuse_ph4/src/jdata.cc b/src/amuse_ph4/src/jdata.cc
index 9acef22d21..30ddd9ad8c 100644
--- a/src/amuse_ph4/src/jdata.cc
+++ b/src/amuse_ph4/src/jdata.cc
@@ -139,6 +139,14 @@ int jdata::add_particle(real pmass, real pradius,
 			real dt)		// default = -1
 {
     const char *in_function = "jdata::add_particle";
+
+    if (pmass <= _TINY_) {
+        if (mpi_rank == 0) {
+            cerr << "Warning: ignoring particle with mass="
+                 << pmass << endl;
+        }
+        return -1;
+    }
     if (DEBUG > 2 && mpi_rank == 0) PRL(in_function);
 
     if (nj >= njbuf) {

From 3a2a91331a68a7f4e6583fab8e8022a8d6e03e3b Mon Sep 17 00:00:00 2001
From: HannoSpreeuw <h.spreeuw@esciencecenter.nl>
Date: Tue, 19 May 2026 16:55:40 +0200
Subject: [PATCH 19/22] This print statement is redundant

This print statement is redundant, addressing comment https://github.com/amusecode/amuse/pull/1266#discussion_r3266435966 by @LourensVeen.
---
 src/amuse_ph4/src/jdata.cc | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/amuse_ph4/src/jdata.cc b/src/amuse_ph4/src/jdata.cc
index 30ddd9ad8c..a52ede13c2 100644
--- a/src/amuse_ph4/src/jdata.cc
+++ b/src/amuse_ph4/src/jdata.cc
@@ -141,10 +141,6 @@ int jdata::add_particle(real pmass, real pradius,
     const char *in_function = "jdata::add_particle";
 
     if (pmass <= _TINY_) {
-        if (mpi_rank == 0) {
-            cerr << "Warning: ignoring particle with mass="
-                 << pmass << endl;
-        }
         return -1;
     }
     if (DEBUG > 2 && mpi_rank == 0) PRL(in_function);

From 3f3f787a081b4a820d588d1f55234bbd01d5b2d9 Mon Sep 17 00:00:00 2001
From: HannoSpreeuw <h.spreeuw@esciencecenter.nl>
Date: Tue, 19 May 2026 17:24:07 +0200
Subject: [PATCH 20/22] This conditional has become redundant

The
"
if (jdat->mass[j] > _TINY_){
"
conditional has become redundant since commit 1d0386dff, i.e. since particles are only added as j-particles if they have mass.
---
 src/amuse_ph4/src/idata.cc | 47 ++++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 25 deletions(-)

diff --git a/src/amuse_ph4/src/idata.cc b/src/amuse_ph4/src/idata.cc
index 4f97d4f376..6a48ff172e 100644
--- a/src/amuse_ph4/src/idata.cc
+++ b/src/amuse_ph4/src/idata.cc
@@ -204,33 +204,30 @@ void idata::get_partial_acc_and_jerk()
 	ldnn[i] = _INFINITY_;
 	for (int k = 0; k < 3; k++) lacc[i][k] = ljerk[i][k] = 0;
 	for (int j = j_start; j < j_end; j++) {
-		// Skip particles with zero mass.
-		if (jdat->mass[j] > _TINY_){
-			r2 = xv = 0;
-			for (int k = 0; k < 3; k++) {
-				dx[k] = jdat->pred_pos[j][k] - ipos[i][k];
-				dv[k] = jdat->pred_vel[j][k] - ivel[i][k];
-				r2 += dx[k]*dx[k];
-				xv += dx[k]*dv[k];
-			}
-			r2i = 1/(r2+eps2+_TINY_);
-			ri = sqrt(r2i);
-			mri = jdat->mass[j]*ri;
-			mr3i = mri*r2i;
-			a3 = -3*xv*r2i;
-			// PRC(jdat->mpi_rank); PRC(ri); PRL(mri);
-			if (r2 > _TINY_) {
-				lpot[i] -= mri;
-				if (r2 < ldnn[i]) {
-					ldnn[i] = r2;
-					lnn[i] = j;
-				}
-			}
-			for (int k = 0; k < 3; k++) {
-				lacc[i][k] += mr3i*dx[k];
-				ljerk[i][k] += mr3i*(dv[k]+a3*dx[k]);
+		r2 = xv = 0;
+		for (int k = 0; k < 3; k++) {
+			dx[k] = jdat->pred_pos[j][k] - ipos[i][k];
+			dv[k] = jdat->pred_vel[j][k] - ivel[i][k];
+			r2 += dx[k]*dx[k];
+			xv += dx[k]*dv[k];
+		}
+		r2i = 1/(r2+eps2+_TINY_);
+		ri = sqrt(r2i);
+		mri = jdat->mass[j]*ri;
+		mr3i = mri*r2i;
+		a3 = -3*xv*r2i;
+		// PRC(jdat->mpi_rank); PRC(ri); PRL(mri);
+		if (r2 > _TINY_) {
+			lpot[i] -= mri;
+			if (r2 < ldnn[i]) {
+				ldnn[i] = r2;
+				lnn[i] = j;
 			}
 		}
+		for (int k = 0; k < 3; k++) {
+			lacc[i][k] += mr3i*dx[k];
+			ljerk[i][k] += mr3i*(dv[k]+a3*dx[k]);
+		}
 	}
 	ldnn[i] = sqrt(ldnn[i]);
     }

From b726e4feb5397a8076318c574adb0dd3d4d2ce2e Mon Sep 17 00:00:00 2001
From: Hanno Spreeuw <h.spreeuw@esciencecenter.nl>
Date: Tue, 19 May 2026 12:48:22 +0200
Subject: [PATCH 21/22] We should not add a particle when massless

We should not add a particle as a j-particle when massless, since j-particles are the particles that exert gravity.
---
 src/amuse_ph4/src/jdata.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/amuse_ph4/src/jdata.cc b/src/amuse_ph4/src/jdata.cc
index a52ede13c2..cf227ef39b 100644
--- a/src/amuse_ph4/src/jdata.cc
+++ b/src/amuse_ph4/src/jdata.cc
@@ -143,6 +143,7 @@ int jdata::add_particle(real pmass, real pradius,
     if (pmass <= _TINY_) {
         return -1;
     }
+    
     if (DEBUG > 2 && mpi_rank == 0) PRL(in_function);
 
     if (nj >= njbuf) {

From 1f708c9329cb853ca488571a1874d2539238b9ff Mon Sep 17 00:00:00 2001
From: Hanno Spreeuw <h.spreeuw@esciencecenter.nl>
Date: Thu, 21 May 2026 15:13:36 +0200
Subject: [PATCH 22/22] This increases achieved occupancy

This increases achieved occupancy by almost a factor four.

For the "dev_evaluate_gravity" kernel, the occupancy is increased from 16.66% to 61.65%. The grid size was 16, which yielded the following recommendation from profiling
"
The grid for this launch is configured to execute only 16 blocks, which is less than the GPU's 64 multiprocessors. This can underutilize some multiprocessors."

Now it is 256.
The improvement in occupancy was investigated using ncu:
"
ncu --kernel-name dev_evaluate_gravity -c 100 --print-summary per-kernel -o dev_evaluate_gravity.prof python scripts/Erwan_1207_imp.py
ncu -i dev_evaluate_gravity.prof.ncu-rep | less
"
---
 lib/sapporo_light/sapporo_defs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/sapporo_light/sapporo_defs.h b/lib/sapporo_light/sapporo_defs.h
index 0ae0876fe4..217721f313 100644
--- a/lib/sapporo_light/sapporo_defs.h
+++ b/lib/sapporo_light/sapporo_defs.h
@@ -3,7 +3,7 @@
 
 #define MAXCUDADEVICES 4
 #define NBODIES_MAX 524288
-#define NBLOCKS 16        /* number of block which can be run simultaneously */
+#define NBLOCKS 256        /* number of block which can be run simultaneously */
 
 #ifdef NGB
 #define NTHREADS 256   /* max number of threads which can run per block */