diff --git a/test/smoke-limbo/xteamr/test_xteamr.cpp b/test/smoke-limbo/xteamr/test_xteamr.cpp index f1de0eb84..5cd2075c5 100644 --- a/test/smoke-limbo/xteamr/test_xteamr.cpp +++ b/test/smoke-limbo/xteamr/test_xteamr.cpp @@ -55,15 +55,8 @@ unsigned int ignore_times = #define _XTEAM_NUM_TEAMS 80 #endif -#if (_XTEAM_NUM_THREADS <= 1024) && \ - ((_XTEAM_NUM_THREADS & (_XTEAM_NUM_THREADS - 1)) == 0) -#define _SUM_OVERLOAD_64_FCT _overload_to_extern_sum_16x64 -#define _SUM_OVERLOAD_32_FCT _overload_to_extern_sum_32x32 -#define _MAX_OVERLOAD_64_FCT _overload_to_extern_max_16x64 -#define _MAX_OVERLOAD_32_FCT _overload_to_extern_max_32x32 -#define _MIN_OVERLOAD_64_FCT _overload_to_extern_min_16x64 -#define _MIN_OVERLOAD_32_FCT _overload_to_extern_min_32x32 -#else +#if (_XTEAM_NUM_THREADS > 1024) || \ + ((_XTEAM_NUM_THREADS & (_XTEAM_NUM_THREADS - 1)) != 0) #error Invalid value for _XTEAM_NUM_THREADS. Expected upper limit: 1024 and a power of 2. #endif @@ -258,8 +251,8 @@ template T sim_dot(T *a, T *b, int warp_size) { T val0 = lc0.rnv; _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc0.size, lc0.stride, lc0.offset) val0 += a[i] * b[i]; - _SUM_OVERLOAD_64_FCT(val0, &sum, lc0.team_vals, lc0.td_ptr, lc0.rnv, k, - _XTEAM_NUM_TEAMS); + _overload_to_extern_sum(val0, &sum, lc0.team_vals, lc0.td_ptr, lc0.rnv, k, + _XTEAM_NUM_TEAMS); } } else { #pragma omp target teams distribute parallel for num_teams(_XTEAM_NUM_TEAMS) \ @@ -271,8 +264,8 @@ template T sim_dot(T *a, T *b, int warp_size) { _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc0.size, lc0.stride, lc0.offset) val0 += a[i] * b[i]; _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(_XTEAM_NUM_TEAMS) - _SUM_OVERLOAD_32_FCT(val0, &sum, lc0.team_vals, lc0.td_ptr, lc0.rnv, k, - _XTEAM_NUM_TEAMS); + _overload_to_extern_sum(val0, &sum, lc0.team_vals, lc0.td_ptr, lc0.rnv, k, + _XTEAM_NUM_TEAMS); } } return sum; @@ -311,7 +304,7 @@ template T sim_max(T *c, int warp_size) { T val1 = lc1.rnv; _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc1.size, lc1.stride, lc1.offset) val1 = (c[i] > val1) ? c[i] : val1; - _MAX_OVERLOAD_64_FCT(val1, &retval, lc1.team_vals, lc1.td_ptr, lc1.rnv, k, + _overload_to_extern_max(val1, &retval, lc1.team_vals, lc1.td_ptr, lc1.rnv, k, _XTEAM_NUM_TEAMS); } } else { @@ -324,7 +317,7 @@ template T sim_max(T *c, int warp_size) { _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc1.size, lc1.stride, lc1.offset) val1 = (c[i] > val1) ? c[i] : val1; _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(_XTEAM_NUM_TEAMS) - _MAX_OVERLOAD_32_FCT(val1, &retval, lc1.team_vals, lc1.td_ptr, lc1.rnv, k, + _overload_to_extern_max(val1, &retval, lc1.team_vals, lc1.td_ptr, lc1.rnv, k, _XTEAM_NUM_TEAMS); } } @@ -364,7 +357,7 @@ template T sim_min(T *c, int warp_size) { T val2 = lc2.rnv; _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc2.size, lc2.stride, lc2.offset) val2 = (c[i] < val2) ? c[i] : val2; - _MIN_OVERLOAD_64_FCT(val2, &retval, lc2.team_vals, lc2.td_ptr, lc2.rnv, k, + _overload_to_extern_min(val2, &retval, lc2.team_vals, lc2.td_ptr, lc2.rnv, k, _XTEAM_NUM_TEAMS); } } else { @@ -377,7 +370,7 @@ template T sim_min(T *c, int warp_size) { _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc2.size, lc2.stride, lc2.offset) val2 = (c[i] < val2) ? c[i] : val2; _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(_XTEAM_NUM_TEAMS) - _MIN_OVERLOAD_32_FCT(val2, &retval, lc2.team_vals, lc2.td_ptr, lc2.rnv, k, + _overload_to_extern_min(val2, &retval, lc2.team_vals, lc2.td_ptr, lc2.rnv, k, _XTEAM_NUM_TEAMS); } } @@ -625,8 +618,8 @@ template T sim_dot_complex(T *a, T *b, int warp_size) { T val3 = lc3.rnv; _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc3.size, lc3.stride, lc3.offset) val3 += a[i] * b[i]; - _SUM_OVERLOAD_64_FCT(val3, &sum, lc3.team_vals, lc3.td_ptr, lc3.rnv, k, - _XTEAM_NUM_TEAMS); + _overload_to_extern_sum(val3, &sum, lc3.team_vals, lc3.td_ptr, lc3.rnv, k, + _XTEAM_NUM_TEAMS); } } else { #pragma omp target teams distribute parallel for num_teams(_XTEAM_NUM_TEAMS) \ @@ -638,8 +631,8 @@ template T sim_dot_complex(T *a, T *b, int warp_size) { _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc3.size, lc3.stride, lc3.offset) val3 += a[i] * b[i]; _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(_XTEAM_NUM_TEAMS) - _SUM_OVERLOAD_32_FCT(val3, &sum, lc3.team_vals, lc3.td_ptr, lc3.rnv, k, - _XTEAM_NUM_TEAMS); + _overload_to_extern_sum(val3, &sum, lc3.team_vals, lc3.td_ptr, lc3.rnv, k, + _XTEAM_NUM_TEAMS); } } return sum; diff --git a/test/smoke-limbo/xteamr/test_xteamr.h b/test/smoke-limbo/xteamr/test_xteamr.h index 93243eed5..843cea4ef 100644 --- a/test/smoke-limbo/xteamr/test_xteamr.h +++ b/test/smoke-limbo/xteamr/test_xteamr.h @@ -18,67 +18,35 @@ #if defined(__AMDGCN__) || defined(__NVPTX__) extern "C" { #define _RF_LDS volatile __attribute__((address_space(3))) -void _INLINE_ATTR_ __kmpc_xteamr_d_16x64 +void _INLINE_ATTR_ __kmpc_xteamr_d (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_f_16x64 +void _INLINE_ATTR_ __kmpc_xteamr_f (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_cd_16x64 +void _INLINE_ATTR_ __kmpc_xteamr_cd (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_cf_16x64 +void _INLINE_ATTR_ __kmpc_xteamr_cf (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_i_16x64 +void _INLINE_ATTR_ __kmpc_xteamr_i (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_ui_16x64 +void _INLINE_ATTR_ __kmpc_xteamr_ui (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_l_16x64 +void _INLINE_ATTR_ __kmpc_xteamr_l (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_ul_16x64 - (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), - void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_d_32x32 - (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), - void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_f_32x32 - (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), - void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_cd_32x32 - (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), - void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_cf_32x32 - (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), - void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_i_32x32 - (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), - void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_ui_32x32 - (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), - void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_l_32x32 - (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), - void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_ul_32x32 +void _INLINE_ATTR_ __kmpc_xteamr_ul (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); @@ -133,67 +101,35 @@ int __kmpc_get_warp_size(); extern "C" { #undef _RF_LDS #define _RF_LDS -void __kmpc_xteamr_d_16x64 - (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), - void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_f_16x64 - (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), - void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_cd_16x64 - (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), - void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_cf_16x64 - (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), - void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_i_16x64 - (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), - void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_ui_16x64 - (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), - void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_l_16x64 - (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), - void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_ul_16x64 - (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), - void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_d_32x32 +void __kmpc_xteamr_d (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_f_32x32 +void __kmpc_xteamr_f (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_cd_32x32 +void __kmpc_xteamr_cd (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_cf_32x32 +void __kmpc_xteamr_cf (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_i_32x32 +void __kmpc_xteamr_i (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_ui_32x32 +void __kmpc_xteamr_ui (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_l_32x32 +void __kmpc_xteamr_l (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_ul_32x32 +void __kmpc_xteamr_ul (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; @@ -248,165 +184,85 @@ int __kmpc_get_warp_size(){ // These overloaded function definitions are for this test framework // (xteamr.cpp) to invoke the extern DexviceRTL helper functions. -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (double val, double *rv, double *tvs, uint32_t *td, const double iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_d_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (float val, float *rv, float *tvs, uint32_t *td, const float iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_f_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_f, __kmpc_rfun_sum_lds_f, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (_CD val, _CD *rv, _CD *tvs, uint32_t *td, const _CD iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_cd_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_cd, __kmpc_rfun_sum_lds_cd, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (_CF val, _CF *rv, _CF *tvs, uint32_t *td, const _CF iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_cf_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_cf, __kmpc_rfun_sum_lds_cf, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (int val, int *rv, int *tvs, uint32_t *td, const int iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_i_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_i, __kmpc_rfun_sum_lds_i, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ui_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_ui, __kmpc_rfun_sum_lds_ui, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (long val, long *rv, long *tvs, uint32_t *td, const long iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_l_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_l, __kmpc_rfun_sum_lds_l, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ul_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_ul, __kmpc_rfun_sum_lds_ul, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (double val, double *rv, double *tvs, uint32_t *td, const double iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_d_32x32(val, rv, tvs, td, + { __kmpc_xteamr_d(val, rv, tvs, td, __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (float val, float *rv, float *tvs, uint32_t *td, const float iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_f_32x32(val, rv, tvs, td, + { __kmpc_xteamr_f(val, rv, tvs, td, __kmpc_rfun_sum_f, __kmpc_rfun_sum_lds_f, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (_CD val, _CD *rv, _CD *tvs, uint32_t *td, const _CD iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_cd_32x32(val, rv, tvs, td, + { __kmpc_xteamr_cd(val, rv, tvs, td, __kmpc_rfun_sum_cd, __kmpc_rfun_sum_lds_cd, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (_CF val, _CF *rv, _CF *tvs, uint32_t *td, const _CF iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_cf_32x32(val, rv, tvs, td, + { __kmpc_xteamr_cf(val, rv, tvs, td, __kmpc_rfun_sum_cf, __kmpc_rfun_sum_lds_cf, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (int val, int *rv, int *tvs, uint32_t *td, const int iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_i_32x32(val, rv, tvs, td, + { __kmpc_xteamr_i(val, rv, tvs, td, __kmpc_rfun_sum_i, __kmpc_rfun_sum_lds_i, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ui_32x32(val, rv, tvs, td, + { __kmpc_xteamr_ui(val, rv, tvs, td, __kmpc_rfun_sum_ui, __kmpc_rfun_sum_lds_ui, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (long val, long *rv, long *tvs, uint32_t *td, const long iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_l_32x32(val, rv, tvs, td, + { __kmpc_xteamr_l(val, rv, tvs, td, __kmpc_rfun_sum_l, __kmpc_rfun_sum_lds_l, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ul_32x32(val, rv, tvs, td, + { __kmpc_xteamr_ul(val, rv, tvs, td, __kmpc_rfun_sum_ul, __kmpc_rfun_sum_lds_ul, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_16x64 - (double val, double *rv, double *tvs, uint32_t *td, const double iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_d_16x64(val, rv, tvs, td, - __kmpc_rfun_max_d, __kmpc_rfun_max_lds_d, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_16x64 - (float val, float *rv, float *tvs, uint32_t *td, const float iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_f_16x64(val, rv, tvs, td, - __kmpc_rfun_max_f, __kmpc_rfun_max_lds_f, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_16x64 - (int val, int *rv, int *tvs, uint32_t *td, const int iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_i_16x64(val, rv, tvs, td, - __kmpc_rfun_max_i, __kmpc_rfun_max_lds_i, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_16x64 - (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ui_16x64(val, rv, tvs, td, - __kmpc_rfun_max_ui, __kmpc_rfun_max_lds_ui, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_16x64 - (long val, long *rv, long *tvs, uint32_t *td, const long iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_l_16x64(val, rv, tvs, td, - __kmpc_rfun_max_l, __kmpc_rfun_max_lds_l, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_16x64 - (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ul_16x64(val, rv, tvs, td, - __kmpc_rfun_max_ul, __kmpc_rfun_max_lds_ul, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_32x32 +void _INLINE_ATTR_ _overload_to_extern_max (double val, double *rv, double *tvs, uint32_t *td, const double iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_d_32x32(val, rv, tvs, td, + { __kmpc_xteamr_d(val, rv, tvs, td, __kmpc_rfun_max_d, __kmpc_rfun_max_lds_d, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_32x32 +void _INLINE_ATTR_ _overload_to_extern_max (float val, float *rv, float *tvs, uint32_t *td, const float iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_f_32x32(val, rv, tvs, td, + { __kmpc_xteamr_f(val, rv, tvs, td, __kmpc_rfun_max_f, __kmpc_rfun_max_lds_f, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_32x32 +void _INLINE_ATTR_ _overload_to_extern_max (int val, int *rv, int *tvs, uint32_t *td, const int iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_i_32x32(val, rv, tvs, td, + { __kmpc_xteamr_i(val, rv, tvs, td, __kmpc_rfun_max_i, __kmpc_rfun_max_lds_i, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_32x32 +void _INLINE_ATTR_ _overload_to_extern_max (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ui_32x32(val, rv, tvs, td, + { __kmpc_xteamr_ui(val, rv, tvs, td, __kmpc_rfun_max_ui, __kmpc_rfun_max_lds_ui, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_32x32 +void _INLINE_ATTR_ _overload_to_extern_max (long val, long *rv, long *tvs, uint32_t *td, const long iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_l_32x32(val, rv, tvs, td, + { __kmpc_xteamr_l(val, rv, tvs, td, __kmpc_rfun_max_l, __kmpc_rfun_max_lds_l, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_32x32 +void _INLINE_ATTR_ _overload_to_extern_max (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ul_32x32(val, rv, tvs, td, + { __kmpc_xteamr_ul(val, rv, tvs, td, __kmpc_rfun_max_ul, __kmpc_rfun_max_lds_ul, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_16x64 - (double val, double *rv, double *tvs, uint32_t *td, const double iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_d_16x64(val, rv, tvs, td, - __kmpc_rfun_min_d, __kmpc_rfun_min_lds_d, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_16x64 - (float val, float *rv, float *tvs, uint32_t *td, const float iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_f_16x64(val, rv, tvs, td, - __kmpc_rfun_min_f, __kmpc_rfun_min_lds_f, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_16x64 - (int val, int *rv, int *tvs, uint32_t *td, const int iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_i_16x64(val, rv, tvs, td, - __kmpc_rfun_min_i, __kmpc_rfun_min_lds_i, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_16x64 - (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ui_16x64(val, rv, tvs, td, - __kmpc_rfun_min_ui, __kmpc_rfun_min_lds_ui, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_16x64 - (long val, long *rv, long *tvs, uint32_t *td, const long iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_l_16x64(val, rv, tvs, td, - __kmpc_rfun_min_l, __kmpc_rfun_min_lds_l, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_16x64 - (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ul_16x64(val, rv, tvs, td, - __kmpc_rfun_min_ul, __kmpc_rfun_min_lds_ul, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_32x32 +void _INLINE_ATTR_ _overload_to_extern_min (double val, double *rv, double *tvs, uint32_t *td, const double iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_d_32x32(val, rv, tvs, td, + { __kmpc_xteamr_d(val, rv, tvs, td, __kmpc_rfun_min_d, __kmpc_rfun_min_lds_d, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_32x32 +void _INLINE_ATTR_ _overload_to_extern_min (float val, float *rv, float *tvs, uint32_t *td, const float iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_f_32x32(val, rv, tvs, td, + { __kmpc_xteamr_f(val, rv, tvs, td, __kmpc_rfun_min_f, __kmpc_rfun_min_lds_f, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_32x32 +void _INLINE_ATTR_ _overload_to_extern_min (int val, int *rv, int *tvs, uint32_t *td, const int iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_i_32x32(val, rv, tvs, td, + { __kmpc_xteamr_i(val, rv, tvs, td, __kmpc_rfun_min_i, __kmpc_rfun_min_lds_i, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_32x32 +void _INLINE_ATTR_ _overload_to_extern_min (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ui_32x32(val, rv, tvs, td, + { __kmpc_xteamr_ui(val, rv, tvs, td, __kmpc_rfun_min_ui, __kmpc_rfun_min_lds_ui, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_32x32 +void _INLINE_ATTR_ _overload_to_extern_min (long val, long *rv, long *tvs, uint32_t *td, const long iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_l_32x32(val, rv, tvs, td, + { __kmpc_xteamr_l(val, rv, tvs, td, __kmpc_rfun_min_l, __kmpc_rfun_min_lds_l, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_32x32 +void _INLINE_ATTR_ _overload_to_extern_min (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ul_32x32(val, rv, tvs, td, + { __kmpc_xteamr_ul(val, rv, tvs, td, __kmpc_rfun_min_ul, __kmpc_rfun_min_lds_ul, iv, k, numteams);} #undef _CD #undef _CF diff --git a/test/smoke-limbo/xteamr_builtins/test_xteamr.cpp b/test/smoke-limbo/xteamr_builtins/test_xteamr.cpp index 08babd0a6..a0b3d6a0e 100644 --- a/test/smoke-limbo/xteamr_builtins/test_xteamr.cpp +++ b/test/smoke-limbo/xteamr_builtins/test_xteamr.cpp @@ -56,15 +56,8 @@ unsigned int ignore_times = #define _XTEAM_NUM_TEAMS 100 #endif -#if (_XTEAM_NUM_THREADS <= 1024) && \ - ((_XTEAM_NUM_THREADS & (_XTEAM_NUM_THREADS - 1)) == 0) -#define _SUM_OVERLOAD_64_FCT _overload_to_extern_sum_16x64 -#define _SUM_OVERLOAD_32_FCT _overload_to_extern_sum_32x32 -#define _MAX_OVERLOAD_64_FCT _overload_to_extern_max_16x64 -#define _MAX_OVERLOAD_32_FCT _overload_to_extern_max_32x32 -#define _MIN_OVERLOAD_64_FCT _overload_to_extern_min_16x64 -#define _MIN_OVERLOAD_32_FCT _overload_to_extern_min_32x32 -#else +#if (_XTEAM_NUM_THREADS > 1024) || \ + ((_XTEAM_NUM_THREADS & (_XTEAM_NUM_THREADS - 1)) != 0) #error Invalid value for _XTEAM_NUM_THREADS. Expected upper limit: 1024 and a power of 2. #endif @@ -261,7 +254,7 @@ template T sim_dot(T *a, T *b, int warp_size) { T val0 = lc0.rnv; _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc0.size, lc0.stride, lc0.offset) val0 += a[i] * b[i]; - _SUM_OVERLOAD_64_FCT(val0, &sum, lc0.team_vals, lc0.td_ptr, lc0.rnv, k, + _overload_to_extern_sum(val0, &sum, lc0.team_vals, lc0.td_ptr, lc0.rnv, k, _XTEAM_NUM_TEAMS); } } else { @@ -274,7 +267,7 @@ template T sim_dot(T *a, T *b, int warp_size) { _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc0.size, lc0.stride, lc0.offset) val0 += a[i] * b[i]; _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(_XTEAM_NUM_TEAMS) - _SUM_OVERLOAD_32_FCT(val0, &sum, lc0.team_vals, lc0.td_ptr, lc0.rnv, k, + _overload_to_extern_sum(val0, &sum, lc0.team_vals, lc0.td_ptr, lc0.rnv, k, _XTEAM_NUM_TEAMS); } } @@ -314,8 +307,8 @@ template T sim_max(T *c, int warp_size) { T val1 = lc1.rnv; _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc1.size, lc1.stride, lc1.offset) val1 = (c[i] > val1) ? c[i] : val1; - _MAX_OVERLOAD_64_FCT(val1, &retval, lc1.team_vals, lc1.td_ptr, lc1.rnv, k, - _XTEAM_NUM_TEAMS); + _overload_to_extern_max(val1, &retval, lc1.team_vals, lc1.td_ptr, lc1.rnv, k, + _XTEAM_NUM_TEAMS); } } else { #pragma omp target teams distribute parallel for num_teams(_XTEAM_NUM_TEAMS) \ @@ -327,8 +320,8 @@ template T sim_max(T *c, int warp_size) { _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc1.size, lc1.stride, lc1.offset) val1 = (c[i] > val1) ? c[i] : val1; _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(_XTEAM_NUM_TEAMS) - _MAX_OVERLOAD_32_FCT(val1, &retval, lc1.team_vals, lc1.td_ptr, lc1.rnv, k, - _XTEAM_NUM_TEAMS); + _overload_to_extern_max(val1, &retval, lc1.team_vals, lc1.td_ptr, lc1.rnv, k, + _XTEAM_NUM_TEAMS); } } return retval; @@ -367,7 +360,7 @@ template T sim_min(T *c, int warp_size) { T val2 = lc2.rnv; _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc2.size, lc2.stride, lc2.offset) val2 = (c[i] < val2) ? c[i] : val2; - _MIN_OVERLOAD_64_FCT(val2, &retval, lc2.team_vals, lc2.td_ptr, lc2.rnv, k, + _overload_to_extern_min(val2, &retval, lc2.team_vals, lc2.td_ptr, lc2.rnv, k, _XTEAM_NUM_TEAMS); } } else { @@ -380,7 +373,7 @@ template T sim_min(T *c, int warp_size) { _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc2.size, lc2.stride, lc2.offset) val2 = (c[i] < val2) ? c[i] : val2; _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(_XTEAM_NUM_TEAMS) - _MIN_OVERLOAD_32_FCT(val2, &retval, lc2.team_vals, lc2.td_ptr, lc2.rnv, k, + _overload_to_extern_min(val2, &retval, lc2.team_vals, lc2.td_ptr, lc2.rnv, k, _XTEAM_NUM_TEAMS); } } @@ -628,7 +621,7 @@ template T sim_dot_complex(T *a, T *b, int warp_size) { T val3 = lc3.rnv; _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc3.size, lc3.stride, lc3.offset) val3 += a[i] * b[i]; - _SUM_OVERLOAD_64_FCT(val3, &sum, lc3.team_vals, lc3.td_ptr, lc3.rnv, k, + _overload_to_extern_sum(val3, &sum, lc3.team_vals, lc3.td_ptr, lc3.rnv, k, _XTEAM_NUM_TEAMS); } } else { @@ -641,7 +634,7 @@ template T sim_dot_complex(T *a, T *b, int warp_size) { _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc3.size, lc3.stride, lc3.offset) val3 += a[i] * b[i]; _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(_XTEAM_NUM_TEAMS) - _SUM_OVERLOAD_32_FCT(val3, &sum, lc3.team_vals, lc3.td_ptr, lc3.rnv, k, + _overload_to_extern_sum(val3, &sum, lc3.team_vals, lc3.td_ptr, lc3.rnv, k, _XTEAM_NUM_TEAMS); } } diff --git a/test/smoke-limbo/xteamr_builtins/test_xteamr.h b/test/smoke-limbo/xteamr_builtins/test_xteamr.h index 7ee2edf95..c08f6e043 100644 --- a/test/smoke-limbo/xteamr_builtins/test_xteamr.h +++ b/test/smoke-limbo/xteamr_builtins/test_xteamr.h @@ -15,67 +15,35 @@ #if defined(__AMDGCN__) || defined(__NVPTX__) extern "C" { #define _RF_LDS volatile __attribute__((address_space(3))) -void _INLINE_ATTR_ __kmpc_xteamr_d_16x64 +void _INLINE_ATTR_ __kmpc_xteamr_d (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv, const uint64_t k, const uint32_t numteams); -void _INLINE_ATTR_ __kmpc_xteamr_f_16x64 +void _INLINE_ATTR_ __kmpc_xteamr_f (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv, const uint64_t k, const uint32_t numteams); -void _INLINE_ATTR_ __kmpc_xteamr_cd_16x64 +void _INLINE_ATTR_ __kmpc_xteamr_cd (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv, const uint64_t k, const uint32_t numteams); -void _INLINE_ATTR_ __kmpc_xteamr_cf_16x64 +void _INLINE_ATTR_ __kmpc_xteamr_cf (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv, const uint64_t k, const uint32_t numteams); -void _INLINE_ATTR_ __kmpc_xteamr_i_16x64 +void _INLINE_ATTR_ __kmpc_xteamr_i (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv, const uint64_t k, const uint32_t numteams); -void _INLINE_ATTR_ __kmpc_xteamr_ui_16x64 +void _INLINE_ATTR_ __kmpc_xteamr_ui (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv, const uint64_t k, const uint32_t numteams); -void _INLINE_ATTR_ __kmpc_xteamr_l_16x64 +void _INLINE_ATTR_ __kmpc_xteamr_l (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv, const uint64_t k, const uint32_t numteams); -void _INLINE_ATTR_ __kmpc_xteamr_ul_16x64 - (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), - void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv, - const uint64_t k, const uint32_t numteams); -void _INLINE_ATTR_ __kmpc_xteamr_d_32x32 - (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), - void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv, - const uint64_t k, const uint32_t numteams); -void _INLINE_ATTR_ __kmpc_xteamr_f_32x32 - (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), - void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv, - const uint64_t k, const uint32_t numteams); -void _INLINE_ATTR_ __kmpc_xteamr_cd_32x32 - (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), - void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv, - const uint64_t k, const uint32_t numteams); -void _INLINE_ATTR_ __kmpc_xteamr_cf_32x32 - (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), - void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv, - const uint64_t k, const uint32_t numteams); -void _INLINE_ATTR_ __kmpc_xteamr_i_32x32 - (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), - void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv, - const uint64_t k, const uint32_t numteams); -void _INLINE_ATTR_ __kmpc_xteamr_ui_32x32 - (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), - void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv, - const uint64_t k, const uint32_t numteams); -void _INLINE_ATTR_ __kmpc_xteamr_l_32x32 - (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), - void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv, - const uint64_t k, const uint32_t numteams); -void _INLINE_ATTR_ __kmpc_xteamr_ul_32x32 +void _INLINE_ATTR_ __kmpc_xteamr_ul (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv, const uint64_t k, const uint32_t numteams); @@ -130,67 +98,35 @@ int __kmpc_get_warp_size(); extern "C" { #undef _RF_LDS #define _RF_LDS -void __kmpc_xteamr_d_16x64 - (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), - void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv, - const uint64_t k, const uint32_t numteams){}; -void __kmpc_xteamr_f_16x64 - (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), - void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv, - const uint64_t k, const uint32_t numteams){}; -void __kmpc_xteamr_cd_16x64 - (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), - void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv, - const uint64_t k, const uint32_t numteams){}; -void __kmpc_xteamr_cf_16x64 - (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), - void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv, - const uint64_t k, const uint32_t numteams){}; -void __kmpc_xteamr_i_16x64 - (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), - void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv, - const uint64_t k, const uint32_t numteams){}; -void __kmpc_xteamr_ui_16x64 - (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), - void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv, - const uint64_t k, const uint32_t numteams){}; -void __kmpc_xteamr_l_16x64 - (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), - void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv, - const uint64_t k, const uint32_t numteams){}; -void __kmpc_xteamr_ul_16x64 - (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), - void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv, - const uint64_t k, const uint32_t numteams){}; -void __kmpc_xteamr_d_32x32 +void __kmpc_xteamr_d (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv, const uint64_t k, const uint32_t numteams){}; -void __kmpc_xteamr_f_32x32 +void __kmpc_xteamr_f (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv, const uint64_t k, const uint32_t numteams){}; -void __kmpc_xteamr_cd_32x32 +void __kmpc_xteamr_cd (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv, const uint64_t k, const uint32_t numteams){}; -void __kmpc_xteamr_cf_32x32 +void __kmpc_xteamr_cf (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv, const uint64_t k, const uint32_t numteams){}; -void __kmpc_xteamr_i_32x32 +void __kmpc_xteamr_i (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv, const uint64_t k, const uint32_t numteams){}; -void __kmpc_xteamr_ui_32x32 +void __kmpc_xteamr_ui (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv, const uint64_t k, const uint32_t numteams){}; -void __kmpc_xteamr_l_32x32 +void __kmpc_xteamr_l (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv, const uint64_t k, const uint32_t numteams){}; -void __kmpc_xteamr_ul_32x32 +void __kmpc_xteamr_ul (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv, const uint64_t k, const uint32_t numteams){}; @@ -245,165 +181,85 @@ int __kmpc_get_warp_size(){ // These overloaded function definitions are for this test framework // (xteamr.cpp) to invoke the extern DexviceRTL helper functions. -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (double val, double *rv, double *tvs, uint32_t *td, const double iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_d_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (float val, float *rv, float *tvs, uint32_t *td, const float iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_f_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_f, __kmpc_rfun_sum_lds_f, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (_CD val, _CD *rv, _CD *tvs, uint32_t *td, const _CD iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_cd_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_cd, __kmpc_rfun_sum_lds_cd, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (_CF val, _CF *rv, _CF *tvs, uint32_t *td, const _CF iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_cf_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_cf, __kmpc_rfun_sum_lds_cf, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (int val, int *rv, int *tvs, uint32_t *td, const int iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_i_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_i, __kmpc_rfun_sum_lds_i, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ui_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_ui, __kmpc_rfun_sum_lds_ui, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (long val, long *rv, long *tvs, uint32_t *td, const long iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_l_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_l, __kmpc_rfun_sum_lds_l, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ul_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_ul, __kmpc_rfun_sum_lds_ul, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (double val, double *rv, double *tvs, uint32_t *td, const double iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_d_32x32(val, rv, tvs, td, + { __kmpc_xteamr_d(val, rv, tvs, td, __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (float val, float *rv, float *tvs, uint32_t *td, const float iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_f_32x32(val, rv, tvs, td, + { __kmpc_xteamr_f(val, rv, tvs, td, __kmpc_rfun_sum_f, __kmpc_rfun_sum_lds_f, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (_CD val, _CD *rv, _CD *tvs, uint32_t *td, const _CD iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_cd_32x32(val, rv, tvs, td, + { __kmpc_xteamr_cd(val, rv, tvs, td, __kmpc_rfun_sum_cd, __kmpc_rfun_sum_lds_cd, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (_CF val, _CF *rv, _CF *tvs, uint32_t *td, const _CF iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_cf_32x32(val, rv, tvs, td, + { __kmpc_xteamr_cf(val, rv, tvs, td, __kmpc_rfun_sum_cf, __kmpc_rfun_sum_lds_cf, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (int val, int *rv, int *tvs, uint32_t *td, const int iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_i_32x32(val, rv, tvs, td, + { __kmpc_xteamr_i(val, rv, tvs, td, __kmpc_rfun_sum_i, __kmpc_rfun_sum_lds_i, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ui_32x32(val, rv, tvs, td, + { __kmpc_xteamr_ui(val, rv, tvs, td, __kmpc_rfun_sum_ui, __kmpc_rfun_sum_lds_ui, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (long val, long *rv, long *tvs, uint32_t *td, const long iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_l_32x32(val, rv, tvs, td, + { __kmpc_xteamr_l(val, rv, tvs, td, __kmpc_rfun_sum_l, __kmpc_rfun_sum_lds_l, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ul_32x32(val, rv, tvs, td, + { __kmpc_xteamr_ul(val, rv, tvs, td, __kmpc_rfun_sum_ul, __kmpc_rfun_sum_lds_ul, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_16x64 - (double val, double *rv, double *tvs, uint32_t *td, const double iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_d_16x64(val, rv, tvs, td, - __kmpc_rfun_max_d, __kmpc_rfun_max_lds_d, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_16x64 - (float val, float *rv, float *tvs, uint32_t *td, const float iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_f_16x64(val, rv, tvs, td, - __kmpc_rfun_max_f, __kmpc_rfun_max_lds_f, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_16x64 - (int val, int *rv, int *tvs, uint32_t *td, const int iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_i_16x64(val, rv, tvs, td, - __kmpc_rfun_max_i, __kmpc_rfun_max_lds_i, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_16x64 - (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ui_16x64(val, rv, tvs, td, - __kmpc_rfun_max_ui, __kmpc_rfun_max_lds_ui, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_16x64 - (long val, long *rv, long *tvs, uint32_t *td, const long iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_l_16x64(val, rv, tvs, td, - __kmpc_rfun_max_l, __kmpc_rfun_max_lds_l, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_16x64 - (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ul_16x64(val, rv, tvs, td, - __kmpc_rfun_max_ul, __kmpc_rfun_max_lds_ul, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_32x32 +void _INLINE_ATTR_ _overload_to_extern_max (double val, double *rv, double *tvs, uint32_t *td, const double iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_d_32x32(val, rv, tvs, td, + { __kmpc_xteamr_d(val, rv, tvs, td, __kmpc_rfun_max_d, __kmpc_rfun_max_lds_d, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_32x32 +void _INLINE_ATTR_ _overload_to_extern_max (float val, float *rv, float *tvs, uint32_t *td, const float iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_f_32x32(val, rv, tvs, td, + { __kmpc_xteamr_f(val, rv, tvs, td, __kmpc_rfun_max_f, __kmpc_rfun_max_lds_f, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_32x32 +void _INLINE_ATTR_ _overload_to_extern_max (int val, int *rv, int *tvs, uint32_t *td, const int iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_i_32x32(val, rv, tvs, td, + { __kmpc_xteamr_i(val, rv, tvs, td, __kmpc_rfun_max_i, __kmpc_rfun_max_lds_i, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_32x32 +void _INLINE_ATTR_ _overload_to_extern_max (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ui_32x32(val, rv, tvs, td, + { __kmpc_xteamr_ui(val, rv, tvs, td, __kmpc_rfun_max_ui, __kmpc_rfun_max_lds_ui, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_32x32 +void _INLINE_ATTR_ _overload_to_extern_max (long val, long *rv, long *tvs, uint32_t *td, const long iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_l_32x32(val, rv, tvs, td, + { __kmpc_xteamr_l(val, rv, tvs, td, __kmpc_rfun_max_l, __kmpc_rfun_max_lds_l, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_32x32 +void _INLINE_ATTR_ _overload_to_extern_max (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ul_32x32(val, rv, tvs, td, + { __kmpc_xteamr_ul(val, rv, tvs, td, __kmpc_rfun_max_ul, __kmpc_rfun_max_lds_ul, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_16x64 - (double val, double *rv, double *tvs, uint32_t *td, const double iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_d_16x64(val, rv, tvs, td, - __kmpc_rfun_min_d, __kmpc_rfun_min_lds_d, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_16x64 - (float val, float *rv, float *tvs, uint32_t *td, const float iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_f_16x64(val, rv, tvs, td, - __kmpc_rfun_min_f, __kmpc_rfun_min_lds_f, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_16x64 - (int val, int *rv, int *tvs, uint32_t *td, const int iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_i_16x64(val, rv, tvs, td, - __kmpc_rfun_min_i, __kmpc_rfun_min_lds_i, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_16x64 - (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ui_16x64(val, rv, tvs, td, - __kmpc_rfun_min_ui, __kmpc_rfun_min_lds_ui, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_16x64 - (long val, long *rv, long *tvs, uint32_t *td, const long iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_l_16x64(val, rv, tvs, td, - __kmpc_rfun_min_l, __kmpc_rfun_min_lds_l, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_16x64 - (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ul_16x64(val, rv, tvs, td, - __kmpc_rfun_min_ul, __kmpc_rfun_min_lds_ul, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_32x32 +void _INLINE_ATTR_ _overload_to_extern_min (double val, double *rv, double *tvs, uint32_t *td, const double iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_d_32x32(val, rv, tvs, td, + { __kmpc_xteamr_d(val, rv, tvs, td, __kmpc_rfun_min_d, __kmpc_rfun_min_lds_d, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_32x32 +void _INLINE_ATTR_ _overload_to_extern_min (float val, float *rv, float *tvs, uint32_t *td, const float iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_f_32x32(val, rv, tvs, td, + { __kmpc_xteamr_f(val, rv, tvs, td, __kmpc_rfun_min_f, __kmpc_rfun_min_lds_f, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_32x32 +void _INLINE_ATTR_ _overload_to_extern_min (int val, int *rv, int *tvs, uint32_t *td, const int iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_i_32x32(val, rv, tvs, td, + { __kmpc_xteamr_i(val, rv, tvs, td, __kmpc_rfun_min_i, __kmpc_rfun_min_lds_i, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_32x32 +void _INLINE_ATTR_ _overload_to_extern_min (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ui_32x32(val, rv, tvs, td, + { __kmpc_xteamr_ui(val, rv, tvs, td, __kmpc_rfun_min_ui, __kmpc_rfun_min_lds_ui, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_32x32 +void _INLINE_ATTR_ _overload_to_extern_min (long val, long *rv, long *tvs, uint32_t *td, const long iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_l_32x32(val, rv, tvs, td, + { __kmpc_xteamr_l(val, rv, tvs, td, __kmpc_rfun_min_l, __kmpc_rfun_min_lds_l, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_32x32 +void _INLINE_ATTR_ _overload_to_extern_min (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ul_32x32(val, rv, tvs, td, + { __kmpc_xteamr_ul(val, rv, tvs, td, __kmpc_rfun_min_ul, __kmpc_rfun_min_lds_ul, iv, k, numteams);} #undef _CD #undef _CF diff --git a/test/smoke-limbo/xteamr_extended/test_xteamr.cpp b/test/smoke-limbo/xteamr_extended/test_xteamr.cpp index 43268da8b..47ddaf9ac 100644 --- a/test/smoke-limbo/xteamr_extended/test_xteamr.cpp +++ b/test/smoke-limbo/xteamr_extended/test_xteamr.cpp @@ -58,15 +58,8 @@ unsigned int ignore_times = 2; // ignore this many timings first #define _XTEAM_NUM_TEAMS 104 #endif -#if (_XTEAM_NUM_THREADS <= 1024) && \ - ((_XTEAM_NUM_THREADS & (_XTEAM_NUM_THREADS - 1)) == 0) -#define _SUM_OVERLOAD_64_FCT _overload_to_extern_sum_16x64 -#define _SUM_OVERLOAD_32_FCT _overload_to_extern_sum_32x32 -#define _MAX_OVERLOAD_64_FCT _overload_to_extern_max_16x64 -#define _MAX_OVERLOAD_32_FCT _overload_to_extern_max_32x32 -#define _MIN_OVERLOAD_64_FCT _overload_to_extern_min_16x64 -#define _MIN_OVERLOAD_32_FCT _overload_to_extern_min_32x32 -#else +#if (_XTEAM_NUM_THREADS > 1024) || \ + ((_XTEAM_NUM_THREADS & (_XTEAM_NUM_THREADS - 1)) != 0) #error Invalid value for _XTEAM_NUM_THREADS. Expected upper limit: 1024 and a power of 2. #endif @@ -411,8 +404,8 @@ T sim_dot(T *a, T *b, int warp_size) { T val0 = lc0.rnv; _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc0.size, lc0.stride, lc0.offset) val0 += a[i] * b[i]; - _SUM_OVERLOAD_64_FCT(val0, &sum, lc0.team_vals, lc0.td_ptr, lc0.rnv, k, - _XTEAM_NUM_TEAMS); + _overload_to_extern_sum(val0, &sum, lc0.team_vals, lc0.td_ptr, lc0.rnv, k, + _XTEAM_NUM_TEAMS); } } else { #pragma omp target teams distribute parallel for num_teams(_XTEAM_NUM_TEAMS) \ @@ -424,8 +417,8 @@ T sim_dot(T *a, T *b, int warp_size) { _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc0.size, lc0.stride, lc0.offset) val0 += a[i] * b[i]; _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(_XTEAM_NUM_TEAMS) - _SUM_OVERLOAD_32_FCT(val0, &sum, lc0.team_vals, lc0.td_ptr, lc0.rnv, k, - _XTEAM_NUM_TEAMS); + _overload_to_extern_sum(val0, &sum, lc0.team_vals, lc0.td_ptr, lc0.rnv, k, + _XTEAM_NUM_TEAMS); } } return sum; @@ -468,8 +461,8 @@ T sim_dot_extended(T *a, T *b, int warp_size) { EXT_T valb = b[i]; val0 += vala * valb; } - _SUM_OVERLOAD_64_FCT(val0, &sum, lc0.team_vals, lc0.td_ptr, lc0.rnv, k, - _XTEAM_NUM_TEAMS); + _overload_to_extern_sum(val0, &sum, lc0.team_vals, lc0.td_ptr, lc0.rnv, k, + _XTEAM_NUM_TEAMS); } } else { #pragma omp target teams distribute parallel for num_teams(_XTEAM_NUM_TEAMS) \ @@ -479,8 +472,8 @@ T sim_dot_extended(T *a, T *b, int warp_size) { _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc0.size, lc0.stride, lc0.offset) val0 += a[i] * b[i]; _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(_XTEAM_NUM_TEAMS) - _SUM_OVERLOAD_32_FCT(val0, &sum, lc0.team_vals, lc0.td_ptr, lc0.rnv, k, - _XTEAM_NUM_TEAMS); + _overload_to_extern_sum(val0, &sum, lc0.team_vals, lc0.td_ptr, lc0.rnv, k, + _XTEAM_NUM_TEAMS); } } return (T) sum; @@ -521,7 +514,7 @@ T sim_max(T *c, int warp_size) { T val1 = lc1.rnv; _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc1.size, lc1.stride, lc1.offset) val1 = (c[i] > val1) ? c[i] : val1; - _MAX_OVERLOAD_64_FCT(val1, &retval, lc1.team_vals, lc1.td_ptr, lc1.rnv, k, + _overload_to_extern_max(val1, &retval, lc1.team_vals, lc1.td_ptr, lc1.rnv, k, _XTEAM_NUM_TEAMS); } } else { @@ -534,7 +527,7 @@ T sim_max(T *c, int warp_size) { _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc1.size, lc1.stride, lc1.offset) val1 = (c[i] > val1) ? c[i] : val1; _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(_XTEAM_NUM_TEAMS) - _MAX_OVERLOAD_32_FCT(val1, &retval, lc1.team_vals, lc1.td_ptr, lc1.rnv, k, + _overload_to_extern_max(val1, &retval, lc1.team_vals, lc1.td_ptr, lc1.rnv, k, _XTEAM_NUM_TEAMS); } } @@ -576,7 +569,7 @@ T sim_max_extended(T *c, int warp_size) { EXT_T val1 = lc1.rnv; _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc1.size, lc1.stride, lc1.offset) val1 = ((EXT_T) c[i] > val1) ? c[i] : val1; - _MAX_OVERLOAD_64_FCT(val1, &retval, lc1.team_vals, lc1.td_ptr, lc1.rnv, k, + _overload_to_extern_max(val1, &retval, lc1.team_vals, lc1.td_ptr, lc1.rnv, k, _XTEAM_NUM_TEAMS); } } else { @@ -589,7 +582,7 @@ T sim_max_extended(T *c, int warp_size) { _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc1.size, lc1.stride, lc1.offset) val1 = ((EXT_T) c[i] > val1) ? c[i] : val1; _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(_XTEAM_NUM_TEAMS) - _MAX_OVERLOAD_32_FCT(val1, &retval, lc1.team_vals, lc1.td_ptr, lc1.rnv, k, + _overload_to_extern_max(val1, &retval, lc1.team_vals, lc1.td_ptr, lc1.rnv, k, _XTEAM_NUM_TEAMS); } } @@ -631,7 +624,7 @@ T sim_min(T *c, int warp_size) { T val2 = lc2.rnv; _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc2.size, lc2.stride, lc2.offset) val2 = (c[i] < val2) ? c[i] : val2; - _MIN_OVERLOAD_64_FCT(val2, &retval, lc2.team_vals, lc2.td_ptr, lc2.rnv, k, + _overload_to_extern_min(val2, &retval, lc2.team_vals, lc2.td_ptr, lc2.rnv, k, _XTEAM_NUM_TEAMS); } } else { @@ -644,7 +637,7 @@ T sim_min(T *c, int warp_size) { _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc2.size, lc2.stride, lc2.offset) val2 = (c[i] < val2) ? c[i] : val2; _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(_XTEAM_NUM_TEAMS) - _MIN_OVERLOAD_32_FCT(val2, &retval, lc2.team_vals, lc2.td_ptr, lc2.rnv, k, + _overload_to_extern_min(val2, &retval, lc2.team_vals, lc2.td_ptr, lc2.rnv, k, _XTEAM_NUM_TEAMS); } } @@ -685,7 +678,7 @@ T sim_min_extended(T *c, int warp_size) { EXT_T val2 = lc2.rnv; _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc2.size, lc2.stride, lc2.offset) val2 = ((EXT_T) c[i] < val2) ? c[i] : val2; - _MIN_OVERLOAD_64_FCT(val2, &retval, lc2.team_vals, lc2.td_ptr, lc2.rnv, k, + _overload_to_extern_min(val2, &retval, lc2.team_vals, lc2.td_ptr, lc2.rnv, k, _XTEAM_NUM_TEAMS); } } else { @@ -696,7 +689,7 @@ T sim_min_extended(T *c, int warp_size) { _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc2.size, lc2.stride, lc2.offset) val2 = ((EXT_T)c[i] < val2) ? c[i] : val2; _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(_XTEAM_NUM_TEAMS) - _MIN_OVERLOAD_32_FCT(val2, &retval, lc2.team_vals, lc2.td_ptr, lc2.rnv, k, + _overload_to_extern_min(val2, &retval, lc2.team_vals, lc2.td_ptr, lc2.rnv, k, _XTEAM_NUM_TEAMS); } } @@ -1169,8 +1162,8 @@ template T sim_dot_complex(T *a, T *b, int warp_size) { T val3 = lc3.rnv; _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc3.size, lc3.stride, lc3.offset) val3 += a[i] * b[i]; - _SUM_OVERLOAD_64_FCT(val3, &sum, lc3.team_vals, lc3.td_ptr, lc3.rnv, k, - _XTEAM_NUM_TEAMS); + _overload_to_extern_sum(val3, &sum, lc3.team_vals, lc3.td_ptr, lc3.rnv, k, + _XTEAM_NUM_TEAMS); } } else { #pragma omp target teams distribute parallel for num_teams(_XTEAM_NUM_TEAMS) \ @@ -1182,8 +1175,8 @@ template T sim_dot_complex(T *a, T *b, int warp_size) { _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc3.size, lc3.stride, lc3.offset) val3 += a[i] * b[i]; _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(_XTEAM_NUM_TEAMS) - _SUM_OVERLOAD_32_FCT(val3, &sum, lc3.team_vals, lc3.td_ptr, lc3.rnv, k, - _XTEAM_NUM_TEAMS); + _overload_to_extern_sum(val3, &sum, lc3.team_vals, lc3.td_ptr, lc3.rnv, k, + _XTEAM_NUM_TEAMS); } } return sum; diff --git a/test/smoke-limbo/xteamr_extended/test_xteamr.h b/test/smoke-limbo/xteamr_extended/test_xteamr.h index c14606a79..b02af5e59 100644 --- a/test/smoke-limbo/xteamr_extended/test_xteamr.h +++ b/test/smoke-limbo/xteamr_extended/test_xteamr.h @@ -15,67 +15,35 @@ #if defined(__AMDGCN__) || defined(__NVPTX__) extern "C" { #define _RF_LDS volatile __attribute__((address_space(3))) -void _INLINE_ATTR_ __kmpc_xteamr_d_16x64 +void _INLINE_ATTR_ __kmpc_xteamr_d (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_f_16x64 +void _INLINE_ATTR_ __kmpc_xteamr_f (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_cd_16x64 +void _INLINE_ATTR_ __kmpc_xteamr_cd (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_cf_16x64 +void _INLINE_ATTR_ __kmpc_xteamr_cf (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_i_16x64 +void _INLINE_ATTR_ __kmpc_xteamr_i (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_ui_16x64 +void _INLINE_ATTR_ __kmpc_xteamr_ui (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_l_16x64 +void _INLINE_ATTR_ __kmpc_xteamr_l (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_ul_16x64 - (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), - void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_d_32x32 - (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), - void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_f_32x32 - (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), - void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_cd_32x32 - (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), - void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_cf_32x32 - (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), - void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_i_32x32 - (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), - void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_ui_32x32 - (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), - void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_l_32x32 - (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), - void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); -void _INLINE_ATTR_ __kmpc_xteamr_ul_32x32 +void _INLINE_ATTR_ __kmpc_xteamr_ul (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */); @@ -130,67 +98,35 @@ int __kmpc_get_warp_size(); extern "C" { #undef _RF_LDS #define _RF_LDS -void __kmpc_xteamr_d_16x64 - (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), - void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_f_16x64 - (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), - void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_cd_16x64 - (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), - void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_cf_16x64 - (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), - void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_i_16x64 - (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), - void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_ui_16x64 - (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), - void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_l_16x64 - (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), - void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_ul_16x64 - (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), - void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv, - const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_d_32x32 +void __kmpc_xteamr_d (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_f_32x32 +void __kmpc_xteamr_f (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_cd_32x32 +void __kmpc_xteamr_cd (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_cf_32x32 +void __kmpc_xteamr_cf (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_i_32x32 +void __kmpc_xteamr_i (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_ui_32x32 +void __kmpc_xteamr_ui (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_l_32x32 +void __kmpc_xteamr_l (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; -void __kmpc_xteamr_ul_32x32 +void __kmpc_xteamr_ul (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv, const uint64_t k, const uint32_t numteams, int32_t Scope = 1 /* device */){}; @@ -245,165 +181,85 @@ int __kmpc_get_warp_size(){ // These overloaded function definitions are for this test framework // (xteamr.cpp) to invoke the extern DexviceRTL helper functions. -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (double val, double *rv, double *tvs, uint32_t *td, const double iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_d_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (float val, float *rv, float *tvs, uint32_t *td, const float iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_f_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_f, __kmpc_rfun_sum_lds_f, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (_CD val, _CD *rv, _CD *tvs, uint32_t *td, const _CD iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_cd_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_cd, __kmpc_rfun_sum_lds_cd, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (_CF val, _CF *rv, _CF *tvs, uint32_t *td, const _CF iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_cf_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_cf, __kmpc_rfun_sum_lds_cf, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (int val, int *rv, int *tvs, uint32_t *td, const int iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_i_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_i, __kmpc_rfun_sum_lds_i, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ui_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_ui, __kmpc_rfun_sum_lds_ui, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (long val, long *rv, long *tvs, uint32_t *td, const long iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_l_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_l, __kmpc_rfun_sum_lds_l, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_16x64 - (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ul_16x64(val, rv, tvs, td, - __kmpc_rfun_sum_ul, __kmpc_rfun_sum_lds_ul, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (double val, double *rv, double *tvs, uint32_t *td, const double iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_d_32x32(val, rv, tvs, td, + { __kmpc_xteamr_d(val, rv, tvs, td, __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (float val, float *rv, float *tvs, uint32_t *td, const float iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_f_32x32(val, rv, tvs, td, + { __kmpc_xteamr_f(val, rv, tvs, td, __kmpc_rfun_sum_f, __kmpc_rfun_sum_lds_f, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (_CD val, _CD *rv, _CD *tvs, uint32_t *td, const _CD iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_cd_32x32(val, rv, tvs, td, + { __kmpc_xteamr_cd(val, rv, tvs, td, __kmpc_rfun_sum_cd, __kmpc_rfun_sum_lds_cd, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (_CF val, _CF *rv, _CF *tvs, uint32_t *td, const _CF iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_cf_32x32(val, rv, tvs, td, + { __kmpc_xteamr_cf(val, rv, tvs, td, __kmpc_rfun_sum_cf, __kmpc_rfun_sum_lds_cf, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (int val, int *rv, int *tvs, uint32_t *td, const int iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_i_32x32(val, rv, tvs, td, + { __kmpc_xteamr_i(val, rv, tvs, td, __kmpc_rfun_sum_i, __kmpc_rfun_sum_lds_i, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ui_32x32(val, rv, tvs, td, + { __kmpc_xteamr_ui(val, rv, tvs, td, __kmpc_rfun_sum_ui, __kmpc_rfun_sum_lds_ui, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (long val, long *rv, long *tvs, uint32_t *td, const long iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_l_32x32(val, rv, tvs, td, + { __kmpc_xteamr_l(val, rv, tvs, td, __kmpc_rfun_sum_l, __kmpc_rfun_sum_lds_l, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_sum_32x32 +void _INLINE_ATTR_ _overload_to_extern_sum (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ul_32x32(val, rv, tvs, td, + { __kmpc_xteamr_ul(val, rv, tvs, td, __kmpc_rfun_sum_ul, __kmpc_rfun_sum_lds_ul, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_16x64 - (double val, double *rv, double *tvs, uint32_t *td, const double iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_d_16x64(val, rv, tvs, td, - __kmpc_rfun_max_d, __kmpc_rfun_max_lds_d, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_16x64 - (float val, float *rv, float *tvs, uint32_t *td, const float iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_f_16x64(val, rv, tvs, td, - __kmpc_rfun_max_f, __kmpc_rfun_max_lds_f, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_16x64 - (int val, int *rv, int *tvs, uint32_t *td, const int iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_i_16x64(val, rv, tvs, td, - __kmpc_rfun_max_i, __kmpc_rfun_max_lds_i, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_16x64 - (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ui_16x64(val, rv, tvs, td, - __kmpc_rfun_max_ui, __kmpc_rfun_max_lds_ui, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_16x64 - (long val, long *rv, long *tvs, uint32_t *td, const long iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_l_16x64(val, rv, tvs, td, - __kmpc_rfun_max_l, __kmpc_rfun_max_lds_l, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_16x64 - (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ul_16x64(val, rv, tvs, td, - __kmpc_rfun_max_ul, __kmpc_rfun_max_lds_ul, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_32x32 +void _INLINE_ATTR_ _overload_to_extern_max (double val, double *rv, double *tvs, uint32_t *td, const double iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_d_32x32(val, rv, tvs, td, + { __kmpc_xteamr_d(val, rv, tvs, td, __kmpc_rfun_max_d, __kmpc_rfun_max_lds_d, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_32x32 +void _INLINE_ATTR_ _overload_to_extern_max (float val, float *rv, float *tvs, uint32_t *td, const float iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_f_32x32(val, rv, tvs, td, + { __kmpc_xteamr_f(val, rv, tvs, td, __kmpc_rfun_max_f, __kmpc_rfun_max_lds_f, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_32x32 +void _INLINE_ATTR_ _overload_to_extern_max (int val, int *rv, int *tvs, uint32_t *td, const int iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_i_32x32(val, rv, tvs, td, + { __kmpc_xteamr_i(val, rv, tvs, td, __kmpc_rfun_max_i, __kmpc_rfun_max_lds_i, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_32x32 +void _INLINE_ATTR_ _overload_to_extern_max (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ui_32x32(val, rv, tvs, td, + { __kmpc_xteamr_ui(val, rv, tvs, td, __kmpc_rfun_max_ui, __kmpc_rfun_max_lds_ui, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_32x32 +void _INLINE_ATTR_ _overload_to_extern_max (long val, long *rv, long *tvs, uint32_t *td, const long iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_l_32x32(val, rv, tvs, td, + { __kmpc_xteamr_l(val, rv, tvs, td, __kmpc_rfun_max_l, __kmpc_rfun_max_lds_l, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_max_32x32 +void _INLINE_ATTR_ _overload_to_extern_max (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ul_32x32(val, rv, tvs, td, + { __kmpc_xteamr_ul(val, rv, tvs, td, __kmpc_rfun_max_ul, __kmpc_rfun_max_lds_ul, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_16x64 - (double val, double *rv, double *tvs, uint32_t *td, const double iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_d_16x64(val, rv, tvs, td, - __kmpc_rfun_min_d, __kmpc_rfun_min_lds_d, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_16x64 - (float val, float *rv, float *tvs, uint32_t *td, const float iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_f_16x64(val, rv, tvs, td, - __kmpc_rfun_min_f, __kmpc_rfun_min_lds_f, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_16x64 - (int val, int *rv, int *tvs, uint32_t *td, const int iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_i_16x64(val, rv, tvs, td, - __kmpc_rfun_min_i, __kmpc_rfun_min_lds_i, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_16x64 - (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ui_16x64(val, rv, tvs, td, - __kmpc_rfun_min_ui, __kmpc_rfun_min_lds_ui, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_16x64 - (long val, long *rv, long *tvs, uint32_t *td, const long iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_l_16x64(val, rv, tvs, td, - __kmpc_rfun_min_l, __kmpc_rfun_min_lds_l, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_16x64 - (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ul_16x64(val, rv, tvs, td, - __kmpc_rfun_min_ul, __kmpc_rfun_min_lds_ul, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_32x32 +void _INLINE_ATTR_ _overload_to_extern_min (double val, double *rv, double *tvs, uint32_t *td, const double iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_d_32x32(val, rv, tvs, td, + { __kmpc_xteamr_d(val, rv, tvs, td, __kmpc_rfun_min_d, __kmpc_rfun_min_lds_d, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_32x32 +void _INLINE_ATTR_ _overload_to_extern_min (float val, float *rv, float *tvs, uint32_t *td, const float iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_f_32x32(val, rv, tvs, td, + { __kmpc_xteamr_f(val, rv, tvs, td, __kmpc_rfun_min_f, __kmpc_rfun_min_lds_f, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_32x32 +void _INLINE_ATTR_ _overload_to_extern_min (int val, int *rv, int *tvs, uint32_t *td, const int iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_i_32x32(val, rv, tvs, td, + { __kmpc_xteamr_i(val, rv, tvs, td, __kmpc_rfun_min_i, __kmpc_rfun_min_lds_i, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_32x32 +void _INLINE_ATTR_ _overload_to_extern_min (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ui_32x32(val, rv, tvs, td, + { __kmpc_xteamr_ui(val, rv, tvs, td, __kmpc_rfun_min_ui, __kmpc_rfun_min_lds_ui, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_32x32 +void _INLINE_ATTR_ _overload_to_extern_min (long val, long *rv, long *tvs, uint32_t *td, const long iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_l_32x32(val, rv, tvs, td, + { __kmpc_xteamr_l(val, rv, tvs, td, __kmpc_rfun_min_l, __kmpc_rfun_min_lds_l, iv, k, numteams);} -void _INLINE_ATTR_ _overload_to_extern_min_32x32 +void _INLINE_ATTR_ _overload_to_extern_min (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv, const uint64_t k, const uint32_t numteams) - { __kmpc_xteamr_ul_32x32(val, rv, tvs, td, + { __kmpc_xteamr_ul(val, rv, tvs, td, __kmpc_rfun_min_ul, __kmpc_rfun_min_lds_ul, iv, k, numteams);} #undef _CD #undef _CF