diff --git a/include/dolphin/ppc_math.h b/include/dolphin/ppc_math.h new file mode 100644 index 0000000..db12db5 --- /dev/null +++ b/include/dolphin/ppc_math.h @@ -0,0 +1,136 @@ +#ifndef DOLPHIN_PPC_MATH_H +#define DOLPHIN_PPC_MATH_H + +#include +#include + +// frsqrte matching courtesy of Geotale, with reference to https://achurch.org/cpu-tests/ppc750cl.s + +struct BaseAndDec32 { + uint32_t base; + int32_t dec; +}; + +struct BaseAndDec64 { + uint64_t base; + int64_t dec; +}; + +union c32 { + uint32_t u; + float f; +}; + +union c64 { + uint64_t u; + double f; +}; + +#define EXPONENT_SHIFT_F64 ((uint64_t)52) +#define MANTISSA_MASK_F64 ((uint64_t)0x000fffffffffffffULL) +#define EXPONENT_MASK_F64 ((uint64_t)0x7ff0000000000000ULL) +#define SIGN_MASK_F64 ((uint64_t)0x8000000000000000ULL) + +static const struct BaseAndDec64 RSQRTE_TABLE[32] = { + {0x69fa000000000ULL, -0x15a0000000LL}, + {0x5f2e000000000ULL, -0x13cc000000LL}, + {0x554a000000000ULL, -0x1234000000LL}, + {0x4c30000000000ULL, -0x10d4000000LL}, + {0x43c8000000000ULL, -0x0f9c000000LL}, + {0x3bfc000000000ULL, -0x0e88000000LL}, + {0x34b8000000000ULL, -0x0d94000000LL}, + {0x2df0000000000ULL, -0x0cb8000000LL}, + {0x2794000000000ULL, -0x0bf0000000LL}, + {0x219c000000000ULL, -0x0b40000000LL}, + {0x1bfc000000000ULL, -0x0aa0000000LL}, + {0x16ae000000000ULL, -0x0a0c000000LL}, + {0x11a8000000000ULL, -0x0984000000LL}, + {0x0ce6000000000ULL, -0x090c000000LL}, + {0x0862000000000ULL, -0x0898000000LL}, + {0x0416000000000ULL, -0x082c000000LL}, + {0xffe8000000000ULL, -0x1e90000000LL}, + {0xf0a4000000000ULL, -0x1c00000000LL}, + {0xe2a8000000000ULL, -0x19c0000000LL}, + {0xd5c8000000000ULL, -0x17c8000000LL}, + {0xc9e4000000000ULL, -0x1610000000LL}, + {0xbedc000000000ULL, -0x1490000000LL}, + {0xb498000000000ULL, -0x1330000000LL}, + {0xab00000000000ULL, -0x11f8000000LL}, + {0xa204000000000ULL, -0x10e8000000LL}, + {0x9994000000000ULL, -0x0fe8000000LL}, + {0x91a0000000000ULL, -0x0f08000000LL}, + {0x8a1c000000000ULL, -0x0e38000000LL}, + {0x8304000000000ULL, -0x0d78000000LL}, + {0x7c48000000000ULL, -0x0cc8000000LL}, + {0x75e4000000000ULL, -0x0c28000000LL}, + {0x6fd0000000000ULL, -0x0b98000000LL}, +}; + +#ifdef _MSC_VER +#include +static inline uint32_t ppc_clz64(uint64_t x) { + unsigned long idx; + _BitScanReverse64(&idx, x); + return 63u - (uint32_t)idx; +} +#else +static inline uint32_t ppc_clz64(uint64_t x) { + return (uint32_t)__builtin_clzll(x); +} +#endif + +static inline double frsqrte(double val) { + union c64 bits; + uint64_t mantissa; + int64_t exponent; + int sign; + uint32_t key; + uint64_t new_exp; + const struct BaseAndDec64 *entry; + union c64 result; + + bits.f = val; + mantissa = bits.u & MANTISSA_MASK_F64; + exponent = (int64_t)(bits.u & EXPONENT_MASK_F64); + sign = (bits.u & SIGN_MASK_F64) != 0; + + if (mantissa == 0 && exponent == 0) { + return copysign(INFINITY, bits.f); + } + + if ((uint64_t)exponent == EXPONENT_MASK_F64) { + if (mantissa == 0) { + return sign ? NAN : 0.0; + } + return val; + } + + if (sign) { + return NAN; + } + + if (exponent == 0) { + uint32_t shift = ppc_clz64(mantissa) - (uint32_t)(63 - EXPONENT_SHIFT_F64); + mantissa <<= shift; + mantissa &= MANTISSA_MASK_F64; + exponent -= (int64_t)(shift - 1) << EXPONENT_SHIFT_F64; + } + + key = (uint32_t)(((uint64_t)exponent | mantissa) >> 37); + new_exp = ((uint64_t)((0xbfcLL << EXPONENT_SHIFT_F64) - exponent) >> 1) & EXPONENT_MASK_F64; + + entry = &RSQRTE_TABLE[0x1fu & (key >> 11)]; + result.u = new_exp | (uint64_t)(entry->base + entry->dec * (int64_t)(key & 0x7ffu)); + return result.f; +} + +// One Newton-Raphson step +static inline float ppc_rsqrte(float x) { + double rsqrt_d = frsqrte((double)x); + float nwork0 = (float)(rsqrt_d * rsqrt_d); + float nwork1 = (float)(rsqrt_d * 0.5); + nwork0 = fmaf(-nwork0, x, 3.0f); + return nwork0 * nwork1; +} + +#endif // DOLPHIN_PPC_MATH_H diff --git a/lib/dolphin/mtx/vec.c b/lib/dolphin/mtx/vec.c index 9651ca3..7100d7a 100644 --- a/lib/dolphin/mtx/vec.c +++ b/lib/dolphin/mtx/vec.c @@ -2,6 +2,10 @@ #include #include +#if EMULATE_PS_MATH +#include +#endif + void C_VECAdd(const Vec* a, const Vec* b, Vec* ab) { assert(a && "VECAdd(): NULL VecPtr 'a' "); assert(b && "VECAdd(): NULL VecPtr 'b' "); @@ -29,31 +33,49 @@ void C_VECScale(const Vec* src, Vec* dst, f32 scale) { } void C_VECNormalize(const Vec* src, Vec* unit) { - f32 mag; + f32 sqsum; + f32 rsqrt; assert(src && "VECNormalize(): NULL VecPtr 'src' "); assert(unit && "VECNormalize(): NULL VecPtr 'unit' "); - mag = (src->z * src->z) + ((src->x * src->x) + (src->y * src->y)); - assert(0.0f != mag && "VECNormalize(): zero magnitude vector "); - - mag = 1.0f/ sqrtf(mag); - unit->x = src->x * mag; - unit->y = src->y * mag; - unit->z = src->z * mag; +#if EMULATE_PS_MATH + sqsum = (src->z * src->z + src->x * src->x) + src->y * src->y; +#else + sqsum = (src->z * src->z) + ((src->x * src->x) + (src->y * src->y)); +#endif + assert(0.0f != sqsum && "VECNormalize(): zero magnitude vector "); + +#if EMULATE_PS_MATH + rsqrt = ppc_rsqrte(sqsum); +#else + rsqrt = 1.0f / sqrtf(sqsum); +#endif + unit->x = src->x * rsqrt; + unit->y = src->y * rsqrt; + unit->z = src->z * rsqrt; } f32 C_VECSquareMag(const Vec* v) { - f32 sqmag; - assert(v && "VECMag(): NULL VecPtr 'v' "); - - sqmag = v->z * v->z + ((v->x * v->x) + (v->y * v->y)); - return sqmag; +#if EMULATE_PS_MATH + return (v->z * v->z + v->x * v->x) + v->y * v->y; +#else + return v->z * v->z + ((v->x * v->x) + (v->y * v->y)); +#endif } f32 C_VECMag(const Vec* v) { +#if EMULATE_PS_MATH + f32 sqmag; + sqmag = C_VECSquareMag(v); + if (sqmag == 0.0f) { + return 0.0f; + } + return sqmag * ppc_rsqrte(sqmag); +#else return sqrtf(C_VECSquareMag(v)); +#endif } f32 C_VECDotProduct(const Vec* a, const Vec* b) { @@ -131,14 +153,25 @@ void C_VECReflect(const Vec* src, const Vec* normal, Vec* dst) { } f32 C_VECSquareDistance(const Vec* a, const Vec* b) { - Vec diff; - - diff.x = a->x - b->x; - diff.y = a->y - b->y; - diff.z = a->z - b->z; - return (diff.z * diff.z) + ((diff.x * diff.x) + (diff.y * diff.y)); + f32 dx = a->x - b->x; + f32 dy = a->y - b->y; + f32 dz = a->z - b->z; +#if EMULATE_PS_MATH + return (dx * dx + dy * dy) + dz * dz; +#else + return (dz * dz) + ((dx * dx) + (dy * dy)); +#endif } f32 C_VECDistance(const Vec* a, const Vec* b) { +#if EMULATE_PS_MATH + f32 sqdist; + sqdist = C_VECSquareDistance(a, b); + if (sqdist == 0.0f) { + return 0.0f; + } + return sqdist * ppc_rsqrte(sqdist); +#else return sqrtf(C_VECSquareDistance(a, b)); +#endif }