Skip to content
Open
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions cuslines/cuda_c/generate_streamlines_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include "utils.cu"
#include "tracking_helpers.cu"
#include "boot.cu"
#include "ptt_init.cu"
#include "ptt.cu"

#define MAX_NUM_DIR (128)
Expand Down Expand Up @@ -292,6 +293,7 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
template<int BDIM_X,
int BDIM_Y,
ModelType MODEL_T,
typename DATA_T,
typename REAL_T,
typename REAL3_T>
__device__ int tracker_d(curandStatePhilox4_32_10_t *st,
Expand All @@ -308,7 +310,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
const int dimy,
const int dimz,
const int dimt,
const REAL_T *__restrict__ dataf,
DATA_T dataf,
const REAL_T *__restrict__ metric_map,
const int samplm_nr,
const REAL3_T *__restrict__ sphere_vertices,
Expand Down Expand Up @@ -511,6 +513,7 @@ __global__ void getNumStreamlinesProb_k(const REAL_T max_angle,
template<int BDIM_X,
int BDIM_Y,
ModelType MODEL_T,
typename DATA_T,
typename REAL_T,
typename REAL3_T>
__global__ void genStreamlinesMergeProb_k(
Expand All @@ -527,7 +530,7 @@ __global__ void genStreamlinesMergeProb_k(
const int dimy,
const int dimz,
const int dimt,
const REAL_T *__restrict__ dataf,
DATA_T dataf,
const REAL_T *__restrict__ metric_map,
const int samplm_nr,
const REAL3_T *__restrict__ sphere_vertices,
Expand Down Expand Up @@ -617,7 +620,8 @@ __global__ void genStreamlinesMergeProb_k(
int stepsB;
const int tissue_classB = tracker_d<BDIM_X,
BDIM_Y,
MODEL_T>(&st,
MODEL_T,
DATA_T>(&st,
max_angle,
tc_threshold,
step_size,
Expand Down Expand Up @@ -651,7 +655,8 @@ __global__ void genStreamlinesMergeProb_k(
int stepsF;
const int tissue_classF = tracker_d<BDIM_X,
BDIM_Y,
MODEL_T>(&st,
MODEL_T,
DATA_T>(&st,
max_angle,
tc_threshold,
step_size,
Expand Down
26 changes: 12 additions & 14 deletions cuslines/cuda_c/ptt.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ __device__ __forceinline__ void crossnorm3_d(REAL_T *dest, const REAL_T *src1, c
}

template<int BDIM_X, typename REAL_T, typename REAL3_T>
__device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame, const REAL_T *__restrict__ pmf,
__device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame,
const cudaTextureObject_t *__restrict__ pmf,
const int dimx, const int dimy, const int dimz, const int dimt,
const REAL3_T *__restrict__ odf_sphere_vertices) {
const int tidx = threadIdx.x;
Expand Down Expand Up @@ -64,14 +65,11 @@ __device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame, const REAL_T
}
#endif

// TODO: maybe this should be texture memory, I am not so sure
const int rv = trilinear_interp_d<THR_X_SL>(dimx, dimy, dimz, dimt, closest_odf_idx, pmf, pos, &__max_cos);

if (rv != 0) {
return 0; // No support
} else {
return __max_cos;
}
// REAL_T z_query = pos.z + (REAL_T)(closest_odf_idx * dimz);
// return tex3D<REAL_T>(*pmf, pos.x, pos.y, z_query);

REAL_T x_query = (REAL_T)(closest_odf_idx*dimx) + pos.x;
return tex3D<REAL_T>(*pmf, x_query, pos.y, pos.z);
}

template<typename REAL_T>
Expand Down Expand Up @@ -191,7 +189,7 @@ __device__ void propagate_frame_d(REAL_T* propagator, REAL_T* frame, REAL_T* dir

template<int BDIM_X, typename REAL_T, typename REAL3_T>
__device__ REAL_T calculate_data_support_d(REAL_T support,
const REAL3_T pos, const REAL_T *__restrict__ pmf,
const REAL3_T pos, const cudaTextureObject_t *__restrict__ pmf,
const int dimx, const int dimy, const int dimz, const int dimt,
const REAL_T probe_step_size,
const REAL_T absolpmf_thresh,
Expand Down Expand Up @@ -249,7 +247,7 @@ template<int BDIM_X,
typename REAL3_T>
__device__ int get_direction_ptt_d(
curandStatePhilox4_32_10_t *st,
const REAL_T *__restrict__ pmf,
const cudaTextureObject_t *__restrict__ pmf,
const REAL_T max_angle,
const REAL_T step_size,
REAL3_T dir,
Expand Down Expand Up @@ -294,9 +292,9 @@ __device__ int get_direction_ptt_d(
REAL_T *__direc_sh = direc_sh + tidy*3;
REAL3_T *__probing_pos_sh = probing_pos_sh + tidy;

const REAL_T probe_step_size = ((step_size / PROBE_FRAC) / (PROBE_QUALITY - 1));
const REAL_T probe_step_size = ((step_size / PROBE_FRAC) / (PROBE_QUALITY - 1)); // TODO: is this -1 necessary?
const REAL_T max_curvature = 2.0 * SIN(max_angle / 2.0) / (step_size / PROBE_FRAC); // This seems to work well
const REAL_T absolpmf_thresh = PMF_THRESHOLD_P * max_d<BDIM_X>(dimt, pmf, REAL_MIN);
const REAL_T absolpmf_thresh = 0; // PMF_THRESHOLD_P * max_d<BDIM_X>(dimt, pmf, REAL_MIN); TODO: try 2.84 for max; i mean, this is completely broken

#if 0
printf("absolpmf_thresh: %f, max_curvature: %f, probe_step_size: %f\n", absolpmf_thresh, max_curvature, probe_step_size);
Expand Down Expand Up @@ -493,7 +491,7 @@ template<int BDIM_X,
typename REAL3_T>
__device__ bool init_frame_ptt_d(
curandStatePhilox4_32_10_t *st,
const REAL_T *__restrict__ pmf,
const cudaTextureObject_t *__restrict__ pmf,
const REAL_T max_angle,
const REAL_T step_size,
REAL3_T first_step,
Expand Down
86 changes: 86 additions & 0 deletions cuslines/cuda_c/ptt_init.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
template<int BDIM_X,
int BDIM_Y,
typename REAL_T,
typename REAL3_T>
__global__ void getNumStreamlinesPtt_k(
const REAL_T max_angle,
const REAL_T relative_peak_thres,
const REAL_T min_separation_angle,
const long long rndSeed,
const int nseed,
const REAL3_T *__restrict__ seeds,
const int dimx,
const int dimy,
const int dimz,
const int dimt,
const cudaTextureObject_t *__restrict__ pmf,
const REAL3_T *__restrict__ sphere_vertices,
const int2 *__restrict__ sphere_edges,
const int num_edges,
REAL3_T *__restrict__ shDir0,
int *slineOutOff) {

const int tidx = threadIdx.x;
const int tidy = threadIdx.y;

const int slid = blockIdx.x*blockDim.y + threadIdx.y;
const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x;

const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));

const int n32dimt = ((dimt+31)/32)*32;

if (slid >= nseed) {
return;
}

REAL3_T *__restrict__ __shDir = shDir0+slid*dimt;
curandStatePhilox4_32_10_t st;
curand_init(rndSeed, gid, 0, &st);

extern __shared__ REAL_T __sh[];
REAL_T *__pmf_data_sh = __sh + tidy*n32dimt;

REAL3_T point = seeds[slid];

#pragma unroll
for (int i = tidx; i < dimt; i += BDIM_X) {
// REAL_T z_query = point.z + (REAL_T)(i * dimz);
// __pmf_data_sh[i] = tex3D<REAL_T>(*pmf, point.x, point.y, z_query);

REAL_T x_query = (REAL_T)(i * dimx) + point.x;
__pmf_data_sh[i] = tex3D<REAL_T>(*pmf, x_query, point.y, point.z);
}
__syncwarp(WMASK);

const REAL_T absolpmf_thresh = PMF_THRESHOLD_P * max_d<BDIM_X>(dimt, __pmf_data_sh, REAL_MIN);
__syncwarp(WMASK);

#pragma unroll
for(int i = tidx; i < dimt; i += BDIM_X) {
if (__pmf_data_sh[i] < absolpmf_thresh) {
__pmf_data_sh[i] = 0.0;
}
}
__syncwarp(WMASK);

int *__shInd = reinterpret_cast<int *>(__sh + BDIM_Y*n32dimt) + tidy*n32dimt;
int ndir = peak_directions_d<
BDIM_X,
BDIM_Y>(__pmf_data_sh,
__shDir,
sphere_vertices,
sphere_edges,
num_edges,
dimt,
__shInd,
relative_peak_thres,
min_separation_angle);

if (tidx == 0) {
slineOutOff[slid] = ndir;
}

return;
}
20 changes: 15 additions & 5 deletions cuslines/cuda_python/cu_direction_getters.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ class ProbDirectionGetter(GPUDirectionGetter):
def __init__(self):
checkCudaErrors(driver.cuInit(0))
self.getnum_kernel_name = f"getNumStreamlinesProb_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>"
self.genstreamlines_kernel_name = f"genStreamlinesMergeProb_k<{THR_X_SL},{BLOCK_Y},PROB,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>"
self.genstreamlines_kernel_name = f"genStreamlinesMergeProb_k<{THR_X_SL},{BLOCK_Y},PROB,const REAL_T *__restrict__,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>"
self.compile_program()

def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp):
Expand All @@ -333,6 +333,11 @@ def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp):
)
config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory)

if isinstance(sp.gpu_tracker.dataf_d[n], runtime.cudaTextureObject_t):
dataf_d_n = sp.gpu_tracker.dataf_d[n].getPtr()
else:
dataf_d_n = sp.gpu_tracker.dataf_d[n]

launch(
sp.gpu_tracker.streams[n],
config,
Expand All @@ -347,7 +352,7 @@ def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp):
sp.gpu_tracker.dimy,
sp.gpu_tracker.dimz,
sp.gpu_tracker.dimt,
sp.gpu_tracker.dataf_d[n],
dataf_d_n,
sp.gpu_tracker.sphere_vertices_d[n],
sp.gpu_tracker.sphere_edges_d[n],
sp.gpu_tracker.nedges,
Expand All @@ -363,6 +368,11 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp):
shared_memory = self._shared_mem_bytes(sp)
config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory)

if isinstance(sp.gpu_tracker.dataf_d[n], runtime.cudaTextureObject_t):
dataf_d_n = sp.gpu_tracker.dataf_d[n].getPtr()
else:
dataf_d_n = sp.gpu_tracker.dataf_d[n]

launch(
sp.gpu_tracker.streams[n],
config,
Expand All @@ -380,7 +390,7 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp):
sp.gpu_tracker.dimy,
sp.gpu_tracker.dimz,
sp.gpu_tracker.dimt,
sp.gpu_tracker.dataf_d[n],
dataf_d_n,
sp.gpu_tracker.metric_map_d[n],
sp.gpu_tracker.samplm_nr,
sp.gpu_tracker.sphere_vertices_d[n],
Expand All @@ -397,8 +407,8 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp):
class PttDirectionGetter(ProbDirectionGetter):
def __init__(self):
checkCudaErrors(driver.cuInit(0))
self.getnum_kernel_name = f"getNumStreamlinesProb_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>"
self.genstreamlines_kernel_name = f"genStreamlinesMergeProb_k<{THR_X_SL},{BLOCK_Y},PTT,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>"
self.getnum_kernel_name = f"getNumStreamlinesPtt_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>"
self.genstreamlines_kernel_name = f"genStreamlinesMergeProb_k<{THR_X_SL},{BLOCK_Y},PTT,const cudaTextureObject_t *__restrict__,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>"
self.compile_program()

def _shared_mem_bytes(self, sp):
Expand Down
4 changes: 0 additions & 4 deletions cuslines/cuda_python/cu_propagate_seeds.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,10 +204,6 @@ def _cleanup(self):
self.nSlines_old = self.nSlines
self.gpu_tracker.rng_offset += self.nseeds

# TODO: performance: better queuing/batching of seeds,
# if more performance needed,
# given exponential nature of streamlines
# May be better to do in cuda code directly
def propagate(self, seeds):
self.nseeds = len(seeds)
self.nseeds_per_gpu = (
Expand Down
Loading
Loading