dipy · 36000 · Mar 17, 2026 · Mar 17, 2026 · Mar 17, 2026 · Mar 17, 2026
diff --git a/cuslines/cuda_c/generate_streamlines_cuda.cu b/cuslines/cuda_c/generate_streamlines_cuda.cu
@@ -36,6 +36,7 @@
 #include "utils.cu"
 #include "tracking_helpers.cu"
 #include "boot.cu"
+#include "ptt_init.cu"
 #include "ptt.cu"
 
 #define MAX_NUM_DIR (128)
@@ -292,6 +293,7 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
 template<int BDIM_X,
          int BDIM_Y,
          ModelType MODEL_T,
+         typename DATA_T,
          typename REAL_T,
          typename REAL3_T>
 __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
@@ -308,7 +310,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
                          const int dimy,
                          const int dimz,
                          const int dimt,
-                         const REAL_T *__restrict__ dataf,
+                         DATA_T dataf,
                          const REAL_T *__restrict__ metric_map,
 		         const int samplm_nr,
                          const REAL3_T *__restrict__ sphere_vertices,
@@ -511,6 +513,7 @@ __global__ void getNumStreamlinesProb_k(const REAL_T max_angle,
 template<int BDIM_X,
          int BDIM_Y,
          ModelType MODEL_T,
+         typename DATA_T,
          typename REAL_T,
          typename REAL3_T>
 __global__ void genStreamlinesMergeProb_k(
@@ -527,7 +530,7 @@ __global__ void genStreamlinesMergeProb_k(
                                       const int dimy,
                                       const int dimz,
                                       const int dimt,
-                                      const REAL_T *__restrict__ dataf,
+                                      DATA_T dataf,
                                       const REAL_T *__restrict__ metric_map,
 				      const int samplm_nr,
                                       const REAL3_T *__restrict__ sphere_vertices,
@@ -617,7 +620,8 @@ __global__ void genStreamlinesMergeProb_k(
                 int stepsB;
                 const int tissue_classB = tracker_d<BDIM_X,
                                                     BDIM_Y,
-                                                    MODEL_T>(&st,
+                                                    MODEL_T,
+                                                    DATA_T>(&st,
 		                		             max_angle,
 			        			     tc_threshold,
 	                        			     step_size,
@@ -651,7 +655,8 @@ __global__ void genStreamlinesMergeProb_k(
                 int stepsF;
                 const int tissue_classF = tracker_d<BDIM_X,
                                                     BDIM_Y,
-                                                    MODEL_T>(&st,
+                                                    MODEL_T,
+                                                    DATA_T>(&st,
      	                    			             max_angle,
 		        				     tc_threshold,
 	                				     step_size,

diff --git a/cuslines/cuda_c/ptt.cu b/cuslines/cuda_c/ptt.cu
@@ -22,7 +22,8 @@ __device__ __forceinline__ void crossnorm3_d(REAL_T *dest, const REAL_T *src1, c
 }
 
 template<int BDIM_X, typename REAL_T, typename REAL3_T>
-__device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame, const REAL_T *__restrict__ pmf,
+__device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame,
+                            const cudaTextureObject_t *__restrict__ pmf,
                             const int dimx, const int dimy, const int dimz, const int dimt,
                             const REAL3_T *__restrict__ odf_sphere_vertices) {
     const int tidx = threadIdx.x;
@@ -64,14 +65,11 @@ __device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame, const REAL_T
     }
 #endif
 
-    // TODO: maybe this should be texture memory, I am not so sure
-    const int rv = trilinear_interp_d<THR_X_SL>(dimx, dimy, dimz, dimt, closest_odf_idx, pmf, pos, &__max_cos);
-
-    if (rv != 0) {
-        return 0;  // No support
-    } else {
-        return __max_cos;
-    }
+    // REAL_T z_query = pos.z + (REAL_T)(closest_odf_idx * dimz);
+    // return tex3D<REAL_T>(*pmf, pos.x, pos.y, z_query);
+
+    REAL_T x_query = (REAL_T)(closest_odf_idx*dimx) + pos.x;
+    return tex3D<REAL_T>(*pmf, x_query, pos.y, pos.z);
 }
 
 template<typename REAL_T>
@@ -191,7 +189,7 @@ __device__ void propagate_frame_d(REAL_T* propagator, REAL_T* frame, REAL_T* dir
 
 template<int BDIM_X, typename REAL_T, typename REAL3_T>
 __device__ REAL_T calculate_data_support_d(REAL_T support,
-                                           const REAL3_T pos, const REAL_T *__restrict__ pmf,
+                                           const REAL3_T pos, const cudaTextureObject_t *__restrict__ pmf,
                                            const int dimx, const int dimy, const int dimz, const int dimt,
                                            const REAL_T probe_step_size,
                                            const REAL_T absolpmf_thresh,
@@ -249,7 +247,7 @@ template<int BDIM_X,
          typename REAL3_T>
 __device__ int get_direction_ptt_d(
     curandStatePhilox4_32_10_t *st,
-    const REAL_T *__restrict__ pmf,
+    const cudaTextureObject_t *__restrict__ pmf,
     const REAL_T max_angle,
     const REAL_T step_size,
     REAL3_T dir,
@@ -294,9 +292,9 @@ __device__ int get_direction_ptt_d(
     REAL_T *__direc_sh = direc_sh + tidy*3;
     REAL3_T *__probing_pos_sh = probing_pos_sh + tidy;
 
-    const REAL_T probe_step_size = ((step_size / PROBE_FRAC) / (PROBE_QUALITY - 1));
+    const REAL_T probe_step_size = ((step_size / PROBE_FRAC) / (PROBE_QUALITY - 1));  // TODO: is this -1 necessary?
     const REAL_T max_curvature = 2.0 * SIN(max_angle / 2.0) / (step_size / PROBE_FRAC); // This seems to work well
-    const REAL_T absolpmf_thresh = PMF_THRESHOLD_P * max_d<BDIM_X>(dimt, pmf, REAL_MIN);
+    const REAL_T absolpmf_thresh = 0; // PMF_THRESHOLD_P * max_d<BDIM_X>(dimt, pmf, REAL_MIN); TODO: try 2.84 for max; i mean, this is completely broken
 
 #if 0
         printf("absolpmf_thresh: %f, max_curvature: %f, probe_step_size: %f\n", absolpmf_thresh, max_curvature, probe_step_size);
@@ -493,7 +491,7 @@ template<int BDIM_X,
          typename REAL3_T>
 __device__ bool init_frame_ptt_d(
     curandStatePhilox4_32_10_t *st,
-    const REAL_T *__restrict__ pmf,
+    const cudaTextureObject_t *__restrict__ pmf,
     const REAL_T max_angle,
     const REAL_T step_size,
     REAL3_T first_step,

diff --git a/cuslines/cuda_c/ptt_init.cu b/cuslines/cuda_c/ptt_init.cu
@@ -0,0 +1,86 @@
+template<int BDIM_X,
+         int BDIM_Y,
+         typename REAL_T,
+         typename REAL3_T>
+__global__ void getNumStreamlinesPtt_k(
+                                        const REAL_T max_angle,
+				        const REAL_T relative_peak_thres,
+				        const REAL_T min_separation_angle,
+				        const long long rndSeed,
+                                        const int nseed,
+                                        const REAL3_T *__restrict__ seeds,
+                                        const int dimx,
+                                        const int dimy,
+                                        const int dimz,
+                                        const int dimt,
+                                        const cudaTextureObject_t *__restrict__ pmf,
+                                        const REAL3_T *__restrict__ sphere_vertices,
+                                        const int2 *__restrict__ sphere_edges,
+                                        const int num_edges,
+                                        REAL3_T *__restrict__ shDir0,
+                                        int *slineOutOff) {
+
+        const int tidx = threadIdx.x;
+        const int tidy = threadIdx.y;
+
+        const int slid = blockIdx.x*blockDim.y + threadIdx.y;
+        const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x;
+
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+        const int n32dimt = ((dimt+31)/32)*32;
+
+        if (slid >= nseed) {
+                return;
+        }
+
+        REAL3_T *__restrict__ __shDir = shDir0+slid*dimt;
+        curandStatePhilox4_32_10_t st;
+        curand_init(rndSeed, gid, 0, &st);
+
+        extern __shared__ REAL_T __sh[];
+        REAL_T *__pmf_data_sh = __sh + tidy*n32dimt;
+
+        REAL3_T point = seeds[slid];
+
+        #pragma unroll
+        for (int i = tidx; i < dimt; i += BDIM_X) {
+                // REAL_T z_query = point.z + (REAL_T)(i * dimz);
+                // __pmf_data_sh[i] = tex3D<REAL_T>(*pmf, point.x, point.y, z_query);
+
+                REAL_T x_query = (REAL_T)(i * dimx) + point.x;
+                __pmf_data_sh[i] = tex3D<REAL_T>(*pmf, x_query, point.y, point.z);
+        }
+        __syncwarp(WMASK);
+
+        const REAL_T absolpmf_thresh = PMF_THRESHOLD_P * max_d<BDIM_X>(dimt, __pmf_data_sh, REAL_MIN);
+        __syncwarp(WMASK);
+
+        #pragma unroll
+        for(int i = tidx; i < dimt; i += BDIM_X) {
+                if (__pmf_data_sh[i] < absolpmf_thresh) {
+                        __pmf_data_sh[i] = 0.0;
+                }
+        }
+        __syncwarp(WMASK);
+
+        int *__shInd = reinterpret_cast<int *>(__sh + BDIM_Y*n32dimt) + tidy*n32dimt;
+        int ndir = peak_directions_d<
+            BDIM_X,
+            BDIM_Y>(__pmf_data_sh,
+                    __shDir,
+                    sphere_vertices,
+                    sphere_edges,
+                    num_edges,
+                    dimt,
+                    __shInd,
+                    relative_peak_thres,
+                    min_separation_angle);
+
+        if (tidx == 0) {
+                slineOutOff[slid] = ndir;
+        }
+
+        return;
+}
diff --git a/cuslines/cuda_python/cu_direction_getters.py b/cuslines/cuda_python/cu_direction_getters.py
@@ -322,7 +322,7 @@ class ProbDirectionGetter(GPUDirectionGetter):
     def __init__(self):
         checkCudaErrors(driver.cuInit(0))
         self.getnum_kernel_name = f"getNumStreamlinesProb_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>"
-        self.genstreamlines_kernel_name = f"genStreamlinesMergeProb_k<{THR_X_SL},{BLOCK_Y},PROB,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>"
+        self.genstreamlines_kernel_name = f"genStreamlinesMergeProb_k<{THR_X_SL},{BLOCK_Y},PROB,const REAL_T *__restrict__,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>"
         self.compile_program()
 
     def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp):
@@ -333,6 +333,11 @@ def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp):
         )
         config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory)
 
+        if isinstance(sp.gpu_tracker.dataf_d[n], runtime.cudaTextureObject_t):
+            dataf_d_n = sp.gpu_tracker.dataf_d[n].getPtr()
+        else:
+            dataf_d_n = sp.gpu_tracker.dataf_d[n]
+
         launch(
             sp.gpu_tracker.streams[n],
             config,
@@ -347,7 +352,7 @@ def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp):
             sp.gpu_tracker.dimy,
             sp.gpu_tracker.dimz,
             sp.gpu_tracker.dimt,
-            sp.gpu_tracker.dataf_d[n],
+            dataf_d_n,
             sp.gpu_tracker.sphere_vertices_d[n],
             sp.gpu_tracker.sphere_edges_d[n],
             sp.gpu_tracker.nedges,
@@ -363,6 +368,11 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp):
         shared_memory = self._shared_mem_bytes(sp)
         config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory)
 
+        if isinstance(sp.gpu_tracker.dataf_d[n], runtime.cudaTextureObject_t):
+            dataf_d_n = sp.gpu_tracker.dataf_d[n].getPtr()
+        else:
+            dataf_d_n = sp.gpu_tracker.dataf_d[n]
+
         launch(
             sp.gpu_tracker.streams[n],
             config,
@@ -380,7 +390,7 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp):
             sp.gpu_tracker.dimy,
             sp.gpu_tracker.dimz,
             sp.gpu_tracker.dimt,
-            sp.gpu_tracker.dataf_d[n],
+            dataf_d_n,
             sp.gpu_tracker.metric_map_d[n],
             sp.gpu_tracker.samplm_nr,
             sp.gpu_tracker.sphere_vertices_d[n],
@@ -397,8 +407,8 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp):
 class PttDirectionGetter(ProbDirectionGetter):
     def __init__(self):
         checkCudaErrors(driver.cuInit(0))
-        self.getnum_kernel_name = f"getNumStreamlinesProb_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>"
-        self.genstreamlines_kernel_name = f"genStreamlinesMergeProb_k<{THR_X_SL},{BLOCK_Y},PTT,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>"
+        self.getnum_kernel_name = f"getNumStreamlinesPtt_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>"
+        self.genstreamlines_kernel_name = f"genStreamlinesMergeProb_k<{THR_X_SL},{BLOCK_Y},PTT,const cudaTextureObject_t *__restrict__,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>"
         self.compile_program()
 
     def _shared_mem_bytes(self, sp):

diff --git a/cuslines/cuda_python/cu_propagate_seeds.py b/cuslines/cuda_python/cu_propagate_seeds.py
@@ -204,10 +204,6 @@ def _cleanup(self):
         self.nSlines_old = self.nSlines
         self.gpu_tracker.rng_offset += self.nseeds
 
-    # TODO: performance: better queuing/batching of seeds,
-    # if more performance needed,
-    # given exponential nature of streamlines
-    # May be better to do in cuda code directly
     def propagate(self, seeds):
         self.nseeds = len(seeds)
         self.nseeds_per_gpu = (