963 lines
35 KiB
C
963 lines
35 KiB
C
/*
|
|
File: std_math_pow.c
|
|
Author: Taylor Robbins
|
|
Date: 08\28\2025
|
|
Description:
|
|
** Holds implementation for power related functions like pow(), log(), exp(), ldexp(), etc.
|
|
*/
|
|
|
|
float powf(float base, float exponent)
|
|
{
|
|
uint32_t signBias = 0;
|
|
uint32_t baseInt, exponentInt;
|
|
|
|
baseInt = asuint(base);
|
|
exponentInt = asuint(exponent);
|
|
if (predict_false(baseInt - 0x00800000 >= 0x7F800000 - 0x00800000 || zeroinfnan32(exponentInt)))
|
|
{
|
|
// Either (base < 0x1p-126 or inf or nan) or (exponent is 0 or inf or nan).
|
|
if (predict_false(zeroinfnan32(exponentInt)))
|
|
{
|
|
if (2 * exponentInt == 0) { return 1.0f; }
|
|
if (baseInt == 0x3F800000) { return 1.0f; }
|
|
if (2 * baseInt > 2u * 0x7F800000 || 2 * exponentInt > 2u * 0x7F800000) { return base + exponent; }
|
|
if (2 * baseInt == 2 * 0x3F800000) { return 1.0f; }
|
|
if ((2 * baseInt < 2 * 0x3F800000) == !(exponentInt & 0x80000000)) { return 0.0f; } // |base|<1 && exponent==inf or |base|>1 && exponent==-inf.
|
|
return exponent * exponent;
|
|
}
|
|
if (predict_false(zeroinfnan32(baseInt)))
|
|
{
|
|
float_t baseSquared = base * base;
|
|
if (baseInt & 0x80000000 && checkint32(exponentInt) == 1) { baseSquared = -baseSquared; }
|
|
// Without the barrier some versions of clang hoist the 1/baseSquared and
|
|
// thus division by zero exception can be signaled spuriously.
|
|
return exponentInt & 0x80000000 ? fp_barrierf(1 / baseSquared) : baseSquared;
|
|
}
|
|
// base and exponent are non-zero finite.
|
|
if (baseInt & 0x80000000)
|
|
{
|
|
// Finite base < 0.
|
|
int exponentType = checkint32(exponentInt);
|
|
if (exponentType == 0) { return __math_invalidf(base); }
|
|
if (exponentType == 1) { signBias = exp2inline_SIGN_BIAS; }
|
|
baseInt &= 0x7FFFFFFF;
|
|
}
|
|
if (baseInt < 0x00800000)
|
|
{
|
|
// Normalize subnormal base so exponent becomes negative.
|
|
baseInt = asuint(base * 0x1p23F);
|
|
baseInt &= 0x7FFFFFFF;
|
|
baseInt -= 23 << 23;
|
|
}
|
|
}
|
|
double_t logBase = log2_inline(baseInt);
|
|
double_t exponentLogBase = exponent * logBase; // cannot overflow, exponent is single prec.
|
|
if (predict_false((asuint64(exponentLogBase) >> 47 & 0xFFFF) >= asuint64(126.0 * POWF_SCALE) >> 47))
|
|
{
|
|
// |exponent*log(base)| >= 126.
|
|
if (exponentLogBase > 0x1.FFFFFFFD1D571p+6 * POWF_SCALE) { return __math_oflowf(signBias); }
|
|
if (exponentLogBase <= -150.0 * POWF_SCALE) { return __math_uflowf(signBias); }
|
|
}
|
|
return exp2_inline(exponentLogBase, signBias);
|
|
}
|
|
double pow(double base, double exponent)
|
|
{
|
|
uint32_t signBias = 0;
|
|
uint64_t baseInt, exponentInt;
|
|
uint32_t baseTop12, exponentTop12;
|
|
|
|
baseInt = asuint64(base);
|
|
exponentInt = asuint64(exponent);
|
|
baseTop12 = top12(base);
|
|
exponentTop12 = top12(exponent);
|
|
if (predict_false(baseTop12 - 0x001 >= 0x7FF - 0x001 || (exponentTop12 & 0x7FF) - 0x3BE >= 0x43E - 0x3BE))
|
|
{
|
|
// Note: if |exponent| > 1075 * ln2 * 2^53 ~= 0x1.749p62 then pow(base,exponent) = inf/0
|
|
// and if |exponent| < 2^-54 / 1075 ~= 0x1.e7b6p-65 then pow(base,exponent) = +-1.
|
|
// Special cases: (base < 0x1p-126 or inf or nan) or
|
|
// (|exponent| < 0x1p-65 or |exponent| >= 0x1p63 or nan).
|
|
if (predict_false(zeroinfnan64(exponentInt)))
|
|
{
|
|
if (2 * exponentInt == 0) { return 1.0; }
|
|
if (baseInt == asuint64(1.0)) { return 1.0; }
|
|
if (2 * baseInt > 2 * asuint64(INFINITY) || 2 * exponentInt > 2 * asuint64(INFINITY)) { return base + exponent; }
|
|
if (2 * baseInt == 2 * asuint64(1.0)) { return 1.0; }
|
|
if ((2 * baseInt < 2 * asuint64(1.0)) == !(exponentInt >> 63)) { return 0.0; } // |base|<1 && exponent==inf or |base|>1 && exponent==-inf.
|
|
return exponent * exponent;
|
|
}
|
|
if (predict_false(zeroinfnan64(baseInt)))
|
|
{
|
|
double_t xSquared = base * base;
|
|
if (baseInt >> 63 && checkint64(exponentInt) == 1) { xSquared = -xSquared; }
|
|
// Without the barrier some versions of clang hoist the 1/xSquared and
|
|
// thus division by zero exception can be signaled spuriously.
|
|
return (exponentInt >> 63) ? fp_barrier(1 / xSquared) : xSquared;
|
|
}
|
|
// Here base and exponent are non-zero finite.
|
|
if (baseInt >> 63)
|
|
{
|
|
// Finite base < 0.
|
|
int exponentType = checkint64(exponentInt);
|
|
if (exponentType == 0) { return __math_invalid(base); }
|
|
if (exponentType == 1) { signBias = expinline_SIGN_BIAS; }
|
|
baseInt &= 0x7FFFFFFFFFFFFFFF;
|
|
baseTop12 &= 0x7FF;
|
|
}
|
|
if ((exponentTop12 & 0X7FF) - 0X3BE >= 0X43E - 0X3BE)
|
|
{
|
|
// Note: signBias == 0 here because exponent is not odd.
|
|
if (baseInt == asuint64(1.0)) { return 1.0; }
|
|
if ((exponentTop12 & 0x7FF) < 0x3BE)
|
|
{
|
|
// |exponent| < 2^-65, base^exponent ~= 1 + exponent*log(base).
|
|
return 1.0;
|
|
}
|
|
return (baseInt > asuint64(1.0)) == (exponentTop12 < 0x800)
|
|
? __math_oflow(0)
|
|
: __math_uflow(0);
|
|
}
|
|
if (baseTop12 == 0)
|
|
{
|
|
// Normalize subnormal base so exponent becomes negative.
|
|
baseInt = asuint64(base * 0x1p52);
|
|
baseInt &= 0x7FFFFFFFFFFFFFFF;
|
|
baseInt -= (52ULL << 52);
|
|
}
|
|
}
|
|
|
|
double_t baseLow1;
|
|
double_t baseHigh1 = log_inline(baseInt, &baseLow1);
|
|
double_t exponentHigh1, exponentLow1;
|
|
double_t exponentHigh2 = asdouble(exponentInt & (-1ULL << 27));
|
|
double_t exponentLow2 = exponent - exponentHigh2;
|
|
double_t baseHigh2 = asdouble(asuint64(baseHigh1) & (-1ULL << 27));
|
|
double_t baseLow2 = baseHigh1 - baseHigh2 + baseLow1;
|
|
exponentHigh1 = exponentHigh2 * baseHigh2;
|
|
exponentLow1 = exponentLow2 * baseHigh2 + exponent * baseLow2; // |exponentLow1| < |exponentHigh1| * 2^-25.
|
|
return exp_inline(exponentHigh1, exponentLow1, signBias);
|
|
}
|
|
|
|
float logf(float value)
|
|
{
|
|
double_t zVar, rVar, rVarSquared, result, yVar, cInverse, logc;
|
|
uint32_t valueInt, zVarInt, temp;
|
|
int vVar, index;
|
|
|
|
valueInt = asuint(value);
|
|
// Fix sign of zero with downward rounding when value==1.
|
|
if (predict_false(valueInt == 0x3F800000)) { return 0; }
|
|
if (predict_false(valueInt - 0x00800000 >= 0x7F800000 - 0x00800000))
|
|
{
|
|
// value < 0x1p-126 or inf or nan.
|
|
if (valueInt * 2 == 0) { return __math_divzerof(1); }
|
|
if (valueInt == 0x7F800000) { return value; } // log(inf) == inf.
|
|
if ((valueInt & 0x80000000) || valueInt * 2 >= 0xFF000000) { return __math_invalidf(value); }
|
|
// value is subnormal, normalize it.
|
|
valueInt = asuint(value * 0x1p23F);
|
|
valueInt -= 23 << 23;
|
|
}
|
|
|
|
// value = 2^vVar zVar; where zVar is in range [OFF,2*OFF] and exact.
|
|
// The range is split into N subintervals.
|
|
// The ith subinterval contains zVar and c is near its center.
|
|
temp = valueInt - logf_OFF;
|
|
index = (temp >> (23 - LOGF_TABLE_BITS)) % logf_N;
|
|
vVar = ((int32_t)temp >> 23); // arithmetic shift
|
|
zVarInt = valueInt - (temp & 0x1FF << 23);
|
|
cInverse = logf_T[index].invc;
|
|
logc = logf_T[index].logc;
|
|
zVar = (double_t)asfloat(zVarInt);
|
|
|
|
// log(value) = log1p(zVar/c-1) + log(c) + vVar*Ln2
|
|
rVar = (zVar * cInverse) - 1;
|
|
yVar = logc + ((double_t)vVar * logf_Ln2);
|
|
|
|
// Pipelined polynomial evaluation to approximate log1p(rVar).
|
|
rVarSquared = (rVar * rVar);
|
|
result = (logf_A[1] * rVar) + logf_A[2];
|
|
result = (logf_A[0] * rVarSquared) + result;
|
|
result = (result * rVarSquared) + (yVar + rVar);
|
|
return eval_as_float(result);
|
|
}
|
|
double log(double value)
|
|
{
|
|
double_t wVar, zVar, rVar, rVarSquared, rVarCubed, result, cInverse, logc, vVarDelta, resultHigh, resultLow;
|
|
uint64_t valueInt, zVarInt, tmp;
|
|
uint32_t top;
|
|
int vVar, index;
|
|
|
|
valueInt = asuint64(value);
|
|
top = top16(value);
|
|
if (predict_false(valueInt - asuint64(1.0 - 0x1p-4) < asuint64(1.0 + 0x1.09p-4) - asuint64(1.0 - 0x1p-4)))
|
|
{
|
|
// Handle close to 1.0 inputs separately.
|
|
// Fix sign of zero with downward rounding when value==1.
|
|
if (predict_false(valueInt == asuint64(1.0))) { return 0; }
|
|
rVar = value - 1.0;
|
|
rVarSquared = rVar * rVar;
|
|
rVarCubed = rVar * rVarSquared;
|
|
result = rVarCubed *
|
|
(log_B[1] + rVar * log_B[2] + rVarSquared * log_B[3] +
|
|
rVarCubed * (log_B[4] + rVar * log_B[5] + rVarSquared * log_B[6] +
|
|
rVarCubed * (log_B[7] + rVar * log_B[8] + rVarSquared * log_B[9] + rVarCubed * log_B[10])));
|
|
// Worst-case error is around 0.507 ULP.
|
|
wVar = rVar * 0x1p27;
|
|
double_t rhi = rVar + wVar - wVar;
|
|
double_t rlo = rVar - rhi;
|
|
wVar = rhi * rhi * log_B[0]; // log_B[0] == -0.5.
|
|
resultHigh = rVar + wVar;
|
|
resultLow = rVar - resultHigh + wVar;
|
|
resultLow += log_B[0] * rlo * (rhi + rVar);
|
|
result += resultLow;
|
|
result += resultHigh;
|
|
return eval_as_double(result);
|
|
}
|
|
if (predict_false(top - 0x0010 >= 0x7FF0 - 0x0010))
|
|
{
|
|
// value < 0x1p-1022 or inf or nan.
|
|
if (valueInt * 2 == 0) { return __math_divzero(1); }
|
|
if (valueInt == asuint64(INFINITY)) { return value; } // log(inf) == inf.
|
|
if ((top & 0x8000) || (top & 0x7FF0) == 0x7FF0) { return __math_invalid(value); }
|
|
// value is subnormal, normalize it.
|
|
valueInt = asuint64(value * 0x1p52);
|
|
valueInt -= 52ULL << 52;
|
|
}
|
|
|
|
// value = 2^vVar zVar; where zVar is in range [OFF,2*OFF) and exact.
|
|
// The range is split into N subintervals.
|
|
// The ith subinterval contains zVar and c is near its center.
|
|
tmp = valueInt - log_OFF;
|
|
index = (tmp >> (52 - LOG_TABLE_BITS)) % log_N;
|
|
vVar = (int64_t)tmp >> 52; /* arithmetic shift */
|
|
zVarInt = valueInt - (tmp & 0xfffULL << 52);
|
|
cInverse = log_T[index].invc;
|
|
logc = log_T[index].logc;
|
|
zVar = asdouble(zVarInt);
|
|
|
|
// log(value) = log1p(zVar/c-1) + log(c) + vVar*Ln2.
|
|
// rVar ~= zVar/c - 1, |rVar| < 1/(2*N).
|
|
// rounding error: 0x1p-55/N + 0x1p-66.
|
|
rVar = (zVar - log_T2[index].chi - log_T2[index].clo) * cInverse;
|
|
vVarDelta = (double_t)vVar;
|
|
|
|
// resultHigh + resultLow = rVar + log(c) + vVar*Ln2.
|
|
wVar = vVarDelta * log_Ln2hi + logc;
|
|
resultHigh = wVar + rVar;
|
|
resultLow = wVar - resultHigh + rVar + vVarDelta * log_Ln2lo;
|
|
|
|
// log(value) = resultLow + (log1p(rVar) - rVar) + resultHigh.
|
|
rVarSquared = rVar * rVar; // rounding error: 0x1p-54/N^2.
|
|
// Worst case error if |result| > 0x1p-5:
|
|
// 0.5 + 4.13/N + abs-poly-error*2^57 ULP (+ 0.002 ULP without fma)
|
|
// Worst case error if |result| > 0x1p-4:
|
|
// 0.5 + 2.06/N + abs-poly-error*2^56 ULP (+ 0.001 ULP without fma).
|
|
result = resultLow + (rVarSquared * log_A[0]) +
|
|
rVar * rVarSquared * (log_A[1] + rVar * log_A[2] + rVarSquared * (log_A[3] + rVar * log_A[4])) + resultHigh;
|
|
return eval_as_double(result);
|
|
}
|
|
|
|
float log2f(float value)
|
|
{
|
|
double_t zVar, rVar, rVarSquared, pVar, result, yVar, vInverse, logc;
|
|
uint32_t valueInt, zVarInt, top, tmp;
|
|
int k, i;
|
|
|
|
valueInt = asuint(value);
|
|
// Fix sign of zero with downward rounding when value==1.
|
|
if (predict_false(valueInt == 0x3F800000)) { return 0; }
|
|
if (predict_false(valueInt - 0x00800000 >= 0x7F800000 - 0x00800000))
|
|
{
|
|
// value < 0x1p-126 or inf or nan.
|
|
if (valueInt * 2 == 0) { return __math_divzerof(1); }
|
|
if (valueInt == 0x7F800000) { return value; } // log2(inf) == inf.
|
|
if ((valueInt & 0x80000000) || valueInt * 2 >= 0xFF000000) { return __math_invalidf(value); }
|
|
// value is subnormal, normalize it.
|
|
valueInt = asuint(value * 0x1p23f);
|
|
valueInt -= (23 << 23);
|
|
}
|
|
|
|
// value = 2^k zVar; where zVar is in range [OFF,2*OFF] and exact.
|
|
// The range is split into N subintervals.
|
|
// The ith subinterval contains zVar and c is near its center.
|
|
tmp = valueInt - log2f_OFF;
|
|
i = (tmp >> (23 - LOG2F_TABLE_BITS)) % log2f_N;
|
|
top = (tmp & 0xFF800000);
|
|
zVarInt = valueInt - top;
|
|
k = (int32_t)tmp >> 23; // arithmetic shift
|
|
vInverse = log2f_T[i].invc;
|
|
logc = log2f_T[i].logc;
|
|
zVar = (double_t)asfloat(zVarInt);
|
|
|
|
// log2(value) = log1p(z/c-1)/ln2 + log2(c) + k
|
|
rVar = zVar * vInverse - 1;
|
|
yVar = logc + (double_t)k;
|
|
|
|
// Pipelined polynomial evaluation to approximate log1p(rVar)/ln2.
|
|
rVarSquared = rVar * rVar;
|
|
result = log2f_A[1] * rVar + log2f_A[2];
|
|
result = log2f_A[0] * rVarSquared + result;
|
|
pVar = log2f_A[3] * rVar + yVar;
|
|
result = result * rVarSquared + pVar;
|
|
return eval_as_float(result);
|
|
}
|
|
double log2(double value)
|
|
{
|
|
double_t zVar, rVar, rVarSquared, rVarQuad, result, cInverse, logc, tempDouble, resultHigh, resultLow, tVar1, tVar2, tVar3, polyValue;
|
|
uint64_t valueInt, iz, tmp;
|
|
uint32_t top;
|
|
int temp, index;
|
|
|
|
valueInt = asuint64(value);
|
|
top = top16(value);
|
|
if (predict_false(valueInt - asuint64(1.0 - 0x1.5B51p-5) < asuint64(1.0 + 0x1.6AB2p-5) - asuint64(1.0 - 0x1.5B51p-5)))
|
|
{
|
|
// Handle close to 1.0 inputs separately.
|
|
// Fix sign of zero with downward rounding when value==1.
|
|
if (predict_false(valueInt == asuint64(1.0))) { return 0; }
|
|
rVar = value - 1.0;
|
|
double_t rVarHigh, rVarLow;
|
|
rVarHigh = asdouble(asuint64(rVar) & -1ULL << 32);
|
|
rVarLow = rVar - rVarHigh;
|
|
resultHigh = (rVarHigh * log2_InvLn2hi);
|
|
resultLow = (rVarLow * log2_InvLn2hi) + (rVar * log2_InvLn2lo);
|
|
rVarSquared = rVar * rVar; // rounding error: 0x1p-62.
|
|
rVarQuad = rVarSquared * rVarSquared;
|
|
// Worst-case error is less than 0.54 ULP (0.55 ULP without fma).
|
|
polyValue = rVarSquared * (log2_B[0] + (rVar * log2_B[1]));
|
|
result = resultHigh + polyValue;
|
|
resultLow += resultHigh - result + polyValue;
|
|
resultLow += rVarQuad * (log2_B[2] + rVar * log2_B[3] + rVarSquared * (log2_B[4] + rVar * log2_B[5]) +
|
|
rVarQuad * (log2_B[6] + rVar * log2_B[7] + rVarSquared * (log2_B[8] + rVar * log2_B[9])));
|
|
result += resultLow;
|
|
return eval_as_double(result);
|
|
}
|
|
|
|
if (predict_false(top - 0x0010 >= 0x7FF0 - 0x0010))
|
|
{
|
|
// value < 0x1p-1022 or inf or nan.
|
|
if (valueInt * 2 == 0) { return __math_divzero(1); }
|
|
if (valueInt == asuint64(INFINITY)) { return value; } // log(inf) == inf.
|
|
if ((top & 0x8000) || (top & 0x7FF0) == 0x7FF0) { return __math_invalid(value); }
|
|
// value is subnormal, normalize it.
|
|
valueInt = asuint64(value * 0x1p52);
|
|
valueInt -= 52ULL << 52;
|
|
}
|
|
|
|
// value = 2^temp zVar; where zVar is in range [OFF,2*OFF) and exact.
|
|
// The range is split into N subintervals.
|
|
// The ith subinterval contains zVar and c is near its center.
|
|
tmp = valueInt - log2_OFF;
|
|
index = (tmp >> (52 - LOG2_TABLE_BITS)) % log2_N;
|
|
temp = (int64_t)tmp >> 52; // arithmetic shift
|
|
iz = valueInt - (tmp & 0xfffULL << 52);
|
|
cInverse = log2_T[index].invc;
|
|
logc = log2_T[index].logc;
|
|
zVar = asdouble(iz);
|
|
tempDouble = (double_t)temp;
|
|
|
|
// log2(value) = log2(zVar/c) + log2(c) + temp.
|
|
// rVar ~= zVar/c - 1, |rVar| < 1/(2*N).
|
|
double_t rVarHigh, rVarLow;
|
|
// rounding error: 0x1p-55/N + 0x1p-65.
|
|
rVar = (zVar - log2_T2[index].chi - log2_T2[index].clo) * cInverse;
|
|
rVarHigh = asdouble(asuint64(rVar) & -1ULL << 32);
|
|
rVarLow = rVar - rVarHigh;
|
|
tVar1 = rVarHigh * log2_InvLn2hi;
|
|
tVar2 = rVarLow * log2_InvLn2hi + rVar * log2_InvLn2lo;
|
|
|
|
// resultHigh + resultLow = rVar/ln2 + log2(c) + temp.
|
|
tVar3 = tempDouble + logc;
|
|
resultHigh = tVar3 + tVar1;
|
|
resultLow = tVar3 - resultHigh + tVar1 + tVar2;
|
|
|
|
// log2(rVar+1) = rVar/ln2 + rVar^2*poly(rVar).
|
|
// Evaluation is optimized assuming superscalar pipelined execution.
|
|
rVarSquared = rVar * rVar; // rounding error: 0x1p-54/N^2.
|
|
rVarQuad = rVarSquared * rVarSquared;
|
|
// Worst-case error if |result| > 0x1p-4: 0.547 ULP (0.550 ULP without fma).
|
|
// ~ 0.5 + 2/N/ln2 + abs-poly-error*0x1p56 ULP (+ 0.003 ULP without fma).
|
|
polyValue = log2_A[0] + rVar * log2_A[1] + rVarSquared * (log2_A[2] + rVar * log2_A[3]) + rVarQuad * (log2_A[4] + rVar * log2_A[5]);
|
|
result = resultLow + (rVarSquared * polyValue) + resultHigh;
|
|
return eval_as_double(result);
|
|
}
|
|
|
|
float log10f(float value)
|
|
{
|
|
union { float value; uint32_t integer; } valueUnion = { value };
|
|
float_t hfsq, valueSubOne, sVar, sVarSquared, sVarQuad, tVar1, tVar2, tSum, vVarOriginal, highFloat, lowFloat;
|
|
uint32_t valueInt;
|
|
int vVar;
|
|
|
|
valueInt = valueUnion.integer;
|
|
vVar = 0;
|
|
if (valueInt < 0x00800000 || valueInt>>31) // value < 2**-126
|
|
{
|
|
if (valueInt<<1 == 0) { return -1 / (value * value); } // log(+-0)=-inf
|
|
if (valueInt>>31) { return (value - value) / 0.0f; } // log(-#) = NaN
|
|
// subnormal number, scale up value
|
|
vVar -= 25;
|
|
value *= 0x1p25F;
|
|
valueUnion.value = value;
|
|
valueInt = valueUnion.integer;
|
|
}
|
|
else if (valueInt >= 0x7F800000) { return value; }
|
|
else if (valueInt == 0x3F800000) { return 0; }
|
|
|
|
// reduce value into [sqrt(2)/2, sqrt(2)]
|
|
valueInt += 0x3F800000 - 0x3F3504F3;
|
|
vVar += (int)(valueInt>>23) - 0x7F;
|
|
valueInt = (valueInt & 0x007FFFFF) + 0x3F3504F3;
|
|
valueUnion.integer = valueInt;
|
|
value = valueUnion.value;
|
|
|
|
valueSubOne = value - 1.0f;
|
|
sVar = valueSubOne / (2.0f + valueSubOne);
|
|
sVarSquared = sVar * sVar;
|
|
sVarQuad = sVarSquared * sVarSquared;
|
|
tVar1= sVarQuad * (Lg2 + (sVarQuad * Lg4));
|
|
tVar2= sVarSquared * (Lg1 + (sVarQuad * Lg3));
|
|
tSum = tVar2 + tVar1;
|
|
hfsq = 0.5f * valueSubOne * valueSubOne;
|
|
|
|
highFloat = valueSubOne - hfsq;
|
|
valueUnion.value = highFloat;
|
|
valueUnion.integer &= 0xFFFFF000;
|
|
highFloat = valueUnion.value;
|
|
lowFloat = valueSubOne - highFloat - hfsq + (sVar * (hfsq + tSum));
|
|
vVarOriginal = vVar;
|
|
return (vVarOriginal * log10_2lo) + ((lowFloat + highFloat) * ivln10lo) + (lowFloat * ivln10hi) + (highFloat * ivln10hi) + (vVarOriginal * log10_2hi);
|
|
}
|
|
double log10(double value)
|
|
{
|
|
union { double value; uint64_t integer; } valueUnion = { value };
|
|
double_t hfsq, valueSubOne, sVar, sVarSquared, sVarQuad, tVar1, tVar2, tSum, vVarOriginal, result, highDouble, lowDouble, resultHigh, resultLow;
|
|
uint32_t valueUpperWord;
|
|
int vVar;
|
|
|
|
valueUpperWord = (valueUnion.integer >> 32);
|
|
vVar = 0;
|
|
if (valueUpperWord < 0x00100000 || (valueUpperWord >> 31))
|
|
{
|
|
if ((valueUnion.integer << 1) == 0) { return -1 / (value * value); } // log(+-0)=-inf
|
|
if (valueUpperWord >> 31) { return (value - value) / 0.0; } // log(-#) = NaN
|
|
// subnormal number, scale value up
|
|
vVar -= 54;
|
|
value *= 0x1p54;
|
|
valueUnion.value = value;
|
|
valueUpperWord = (valueUnion.integer >> 32);
|
|
}
|
|
else if (valueUpperWord >= 0x7FF00000) { return value; }
|
|
else if (valueUpperWord == 0x3FF00000 && (valueUnion.integer << 32) == 0) { return 0; }
|
|
|
|
// reduce value into [sqrt(2)/2, sqrt(2)]
|
|
valueUpperWord += 0x3FF00000 - 0x3FE6A09E;
|
|
vVar += (int)(valueUpperWord >> 20) - 0x3FF;
|
|
valueUpperWord = (valueUpperWord & 0x000FFFFF) + 0x3FE6A09E;
|
|
valueUnion.integer = (uint64_t)valueUpperWord << 32 | (valueUnion.integer & 0xFFFFFFFF);
|
|
value = valueUnion.value;
|
|
|
|
valueSubOne = value - 1.0;
|
|
hfsq = 0.5 * valueSubOne * valueSubOne;
|
|
sVar = valueSubOne / (2.0 + valueSubOne);
|
|
sVarSquared = sVar * sVar;
|
|
sVarQuad = sVarSquared * sVarSquared;
|
|
tVar1 = sVarQuad * (Lg2d + sVarQuad * (Lg4d + sVarQuad * Lg6d));
|
|
tVar2 = sVarSquared * (Lg1d + sVarQuad * (Lg3d + sVarQuad * (Lg5d + sVarQuad * Lg7d)));
|
|
tSum = tVar2 + tVar1;
|
|
|
|
// See log2.c for details.
|
|
// highDouble+lowDouble = valueSubOne - hfsq + sVar*(hfsq+tSum) ~ log(1+valueSubOne)
|
|
highDouble = valueSubOne - hfsq;
|
|
valueUnion.value = highDouble;
|
|
valueUnion.integer &= (uint64_t)-1 << 32;
|
|
highDouble = valueUnion.value;
|
|
lowDouble = valueSubOne - highDouble - hfsq + (sVar * (hfsq + tSum));
|
|
|
|
// resultHigh+resultLow ~ log10(1+valueSubOne) + vVar*log10(2)
|
|
resultHigh = highDouble * ivln10hid;
|
|
vVarOriginal = vVar;
|
|
result = vVarOriginal * log10_2hid;
|
|
resultLow = vVarOriginal * log10_2lod + (lowDouble + highDouble) * ivln10lod + lowDouble * ivln10hid;
|
|
|
|
// Extra precision in for adding result is not strictly needed
|
|
// since there is no very large cancellation near value = sqrt(2) or
|
|
// value = 1/sqrt(2), but we do it anyway since it costs little on CPUs
|
|
// with some parallelism and it reduces the error for many args.
|
|
sVarQuad = result + resultHigh;
|
|
resultLow += (result - sVarQuad) + resultHigh;
|
|
resultHigh = sVarQuad;
|
|
|
|
return resultLow + resultHigh;
|
|
}
|
|
|
|
float _scalbnf(float value, int power)
|
|
{
|
|
union { float value; uint32_t integer; } valueUnion;
|
|
float_t result = value;
|
|
|
|
if (power > 127)
|
|
{
|
|
result *= 0x1p127F;
|
|
power -= 127;
|
|
if (power > 127)
|
|
{
|
|
result *= 0x1p127F;
|
|
power -= 127;
|
|
if (power > 127) { power = 127; }
|
|
}
|
|
}
|
|
else if (power < -126)
|
|
{
|
|
result *= 0x1p-126F * 0x1p24F;
|
|
power += 126 - 24;
|
|
if (power < -126)
|
|
{
|
|
result *= 0x1p-126F * 0x1p24F;
|
|
power += 126 - 24;
|
|
if (power < -126) { power = -126; }
|
|
}
|
|
}
|
|
valueUnion.integer = ((uint32_t)(0x7F + power) << 23);
|
|
result = result * valueUnion.value;
|
|
return result;
|
|
}
|
|
double _scalbn(double value, int power)
|
|
{
|
|
union { double value; uint64_t integer; } valueUnion;
|
|
double_t result = value;
|
|
|
|
if (power > 1023)
|
|
{
|
|
result *= 0x1p1023;
|
|
power -= 1023;
|
|
if (power > 1023)
|
|
{
|
|
result *= 0x1p1023;
|
|
power -= 1023;
|
|
if (power > 1023) { power = 1023; }
|
|
}
|
|
}
|
|
else if (power < -1022)
|
|
{
|
|
// make sure final power < -53 to avoid double
|
|
// rounding in the subnormal range
|
|
result *= 0x1p-1022 * 0x1p53;
|
|
power += 1022 - 53;
|
|
if (power < -1022)
|
|
{
|
|
result *= 0x1p-1022 * 0x1p53;
|
|
power += 1022 - 53;
|
|
if (power < -1022) { power = -1022; }
|
|
}
|
|
}
|
|
valueUnion.integer = ((uint64_t)(0x3FF + power) << 52);
|
|
result = result * valueUnion.value;
|
|
return result;
|
|
}
|
|
|
|
float ldexpf(float value, int exponent)
|
|
{
|
|
return scalbnf(value, exponent);
|
|
}
|
|
double ldexp(double value, int exponent)
|
|
{
|
|
return scalbn(value, exponent);
|
|
}
|
|
|
|
float expf(float value)
|
|
{
|
|
uint32_t abstop;
|
|
uint64_t ki, t;
|
|
double_t kd, xd, z, r, r2, y, s;
|
|
|
|
xd = (double_t)value;
|
|
abstop = top12(value) & 0x7FF;
|
|
if (predict_false(abstop >= top12(88.0f)))
|
|
{
|
|
// |value| >= 88 or value is nan.
|
|
if (asuint(value) == asuint(-INFINITY)) { return 0.0f; }
|
|
if (abstop >= top12(INFINITY)) { return value + value; }
|
|
if (value > 0x1.62E42Ep6F) { return __math_oflowf(0); } // value > log(0x1p128) ~= 88.72
|
|
if (value < -0x1.9FE368p6F) { return __math_uflowf(0); } // value < log(0x1p-150) ~= -103.97
|
|
}
|
|
|
|
// value*N/Ln2 = k + r with r in [-1/2, 1/2] and int k.
|
|
z = exp2f_InvLn2N * xd;
|
|
|
|
// Round and convert z to int, the result is in [-150*N, 128*N] and
|
|
// ideally ties-to-even rule is used, otherwise the magnitude of r
|
|
// can be bigger which gives larger approximation error.
|
|
kd = eval_as_double(z + exp2f_SHIFT);
|
|
ki = asuint64(kd);
|
|
kd -= exp2f_SHIFT;
|
|
r = z - kd;
|
|
|
|
// exp(value) = 2^(k/N) * 2^(r/N) ~= s * (C0*r^3 + C1*r^2 + C2*r + 1)
|
|
t = exp2f_T[ki % EXP2F_N];
|
|
t += ki << (52 - EXP2F_TABLE_BITS);
|
|
s = asdouble(t);
|
|
z = exp2f_C[0] * r + exp2f_C[1];
|
|
r2 = r * r;
|
|
y = exp2f_C[2] * r + 1;
|
|
y = z * r2 + y;
|
|
y = y * s;
|
|
return eval_as_float(y);
|
|
}
|
|
double exp(double value)
|
|
{
|
|
uint32_t abstop;
|
|
uint64_t ki, idx, top, sbits;
|
|
double_t kd, z, r, r2, scale, tail, tmp;
|
|
|
|
abstop = top12(value) & 0x7ff;
|
|
if (predict_false(abstop - top12(0x1p-54) >= top12(512.0) - top12(0x1p-54))) {
|
|
if (abstop - top12(0x1p-54) >= 0x80000000)
|
|
// Avoid spurious underflow for tiny value.
|
|
// Note: 0 is common input.
|
|
return 1.0 + value;
|
|
if (abstop >= top12(1024.0)) {
|
|
if (asuint64(value) == asuint64(-INFINITY))
|
|
return 0.0;
|
|
if (abstop >= top12(INFINITY))
|
|
return 1.0 + value;
|
|
if (asuint64(value) >> 63)
|
|
return __math_uflow(0);
|
|
else
|
|
return __math_oflow(0);
|
|
}
|
|
// Large value is special cased below.
|
|
abstop = 0;
|
|
}
|
|
|
|
// exp(value) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)].
|
|
// value = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N].
|
|
z = exp_InvLn2N * value;
|
|
// z - kd is in [-1, 1] in non-nearest rounding modes.
|
|
kd = eval_as_double(z + exp_Shift);
|
|
ki = asuint64(kd);
|
|
kd -= exp_Shift;
|
|
r = value + kd * exp_NegLn2hiN + kd * exp_NegLn2loN;
|
|
// 2^(k/N) ~= scale * (1 + tail).
|
|
idx = 2 * (ki % exp_N);
|
|
top = ki << (52 - EXP_TABLE_BITS);
|
|
tail = asdouble(exp_T[idx]);
|
|
// This is only a valid scale when -1023*N < k < 1024*N.
|
|
sbits = exp_T[idx + 1] + top;
|
|
// exp(value) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1).
|
|
// Evaluation is optimized assuming superscalar pipelined execution.
|
|
r2 = r * r;
|
|
// Without fma the worst case error is 0.25/N ulp larger.
|
|
// Worst case error is less than 0.5+1.11/N+(abs poly error * 2^53) ulp.
|
|
tmp = tail + r + r2 * (exp_C2 + r * exp_C3) + r2 * r2 * (exp_C4 + r * exp_C5);
|
|
if (predict_false(abstop == 0)) { return exp_specialcase(tmp, sbits, ki); }
|
|
scale = asdouble(sbits);
|
|
/* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
|
|
is no spurious underflow here even without fma. */
|
|
return eval_as_double(scale + scale * tmp);
|
|
}
|
|
|
|
//NOTE: sqrt is one of the few functions we can use the __builtin version of
|
|
#if 1
|
|
float sqrtf(float value) { return __builtin_sqrtf(value); }
|
|
double sqrt(double value) { return __builtin_sqrt(value); }
|
|
#else
|
|
float sqrtf(float value)
|
|
{
|
|
uint32_t valueInt, mVar, mVar1, mVar0, even, ey;
|
|
|
|
jsPrintString("Called sqrtf!");
|
|
valueInt = asuint(value);
|
|
if (predict_false(valueInt - 0x00800000 >= 0x7F800000 - 0x00800000))
|
|
{
|
|
// value < 0x1p-126 or inf or nan.
|
|
if (valueInt * 2 == 0) { return value; }
|
|
if (valueInt == 0x7F800000) { return value; }
|
|
if (valueInt > 0x7F800000) { return __math_invalidf(value); }
|
|
// value is subnormal, normalize it.
|
|
valueInt = asuint(value * 0x1p23f);
|
|
valueInt -= (23 << 23);
|
|
}
|
|
|
|
// value = 4^e mVar; with int e and mVar in [1, 4).
|
|
even = (valueInt & 0x00800000);
|
|
mVar1 = ((valueInt << 8) | 0x80000000);
|
|
mVar0 = ((valueInt << 7) & 0x7FFFFFFF);
|
|
mVar = (even ? mVar0 : mVar1);
|
|
|
|
// 2^e is the exponent part of the return value.
|
|
ey = (valueInt >> 1);
|
|
ey += (0x3F800000 >> 1);
|
|
ey &= 0x7F800000;
|
|
|
|
// compute rVar ~ 1/sqrt(mVar), sVar ~ sqrt(mVar) with 2 goldschmidt iterations.
|
|
static const uint32_t three = 0xC0000000;
|
|
uint32_t rVar, sVar, dVar, uVar, iVar;
|
|
iVar = (valueInt >> 17) % 128;
|
|
rVar = (uint32_t)__rsqrt_table[iVar] << 16;
|
|
// |rVar*sqrt(mVar) - 1| < 0x1p-8
|
|
sVar = MultiplyU32Overflow(mVar, rVar);
|
|
// |sVar/sqrt(mVar) - 1| < 0x1p-8
|
|
dVar = MultiplyU32Overflow(sVar, rVar);
|
|
uVar = three - dVar;
|
|
rVar = MultiplyU32Overflow(rVar, uVar) << 1;
|
|
// |rVar*sqrt(mVar) - 1| < 0x1.7bp-16
|
|
sVar = MultiplyU32Overflow(sVar, uVar) << 1;
|
|
// |sVar/sqrt(mVar) - 1| < 0x1.7bp-16
|
|
dVar = MultiplyU32Overflow(sVar, rVar);
|
|
uVar = three - dVar;
|
|
sVar = MultiplyU32Overflow(sVar, uVar);
|
|
// -0x1.03p-28 < sVar/sqrt(mVar) - 1 < 0x1.fp-31
|
|
sVar = (sVar - 1)>>6;
|
|
// sVar < sqrt(mVar) < sVar + 0x1.08p-23
|
|
|
|
// compute nearest rounded result.
|
|
uint32_t dVar0, dVar1, dVar2;
|
|
float result, tVar;
|
|
dVar0 = (mVar << 16) - (sVar * sVar);
|
|
dVar1 = sVar - dVar0;
|
|
dVar2 = dVar1 + sVar + 1;
|
|
sVar += (dVar1 >> 31);
|
|
sVar &= 0x007FFFFF;
|
|
sVar |= ey;
|
|
result = asfloat(sVar);
|
|
// handle rounding and inexact exception.
|
|
uint32_t tiny = (predict_false(dVar2 == 0) ? 0 : 0x01000000);
|
|
tiny |= ((dVar1 ^ dVar2) & 0x80000000);
|
|
tVar = asfloat(tiny);
|
|
result = eval_as_float(result + tVar);
|
|
return result;
|
|
}
|
|
double sqrt(double value)
|
|
{
|
|
uint64_t valueInt, top, mVar;
|
|
|
|
// special case handling.
|
|
valueInt = asuint64(value);
|
|
top = (valueInt >> 52);
|
|
if (predict_false((top - 0x001) >= (0x7FF - 0x001)))
|
|
{
|
|
// value < 0x1p-1022 or inf or nan.
|
|
if (valueInt * 2 == 0) { return value; }
|
|
if (valueInt == 0x7FF0000000000000) { return value; }
|
|
if (valueInt > 0x7FF0000000000000) { return __math_invalid(value); }
|
|
// value is subnormal, normalize it.
|
|
valueInt = asuint64(value * 0x1p52);
|
|
top = valueInt >> 52;
|
|
top -= 52;
|
|
}
|
|
|
|
// argument reduction:
|
|
// value = 4^e mVar; with integer e, and mVar in [1, 4)
|
|
// mVar: fixed point representation [2.62]
|
|
// 2^e is the exponent part of the result.
|
|
int even = (top & 1);
|
|
mVar = ((valueInt << 11) | 0x8000000000000000);
|
|
if (even) { mVar >>= 1; }
|
|
top = ((top + 0x3FF) >> 1);
|
|
|
|
/* approximate rVar ~ 1/sqrt(mVar) and sVar ~ sqrt(mVar) when mVar in [1,4)
|
|
|
|
initial estimate:
|
|
7bit table lookup (1bit exponent and 6bit significand).
|
|
|
|
iterative approximation:
|
|
using 2 goldschmidt iterations with 32bit int arithmetics
|
|
and a final iteration with 64bit int arithmetics.
|
|
|
|
details:
|
|
|
|
the relative error (e = r0 sqrt(mVar)-1) of a linear estimate
|
|
(r0 = a mVar + b) is |e| < 0.085955 ~ 0x1.6p-4 at best,
|
|
a table lookup is faster and needs one less iteration
|
|
6 bit lookup table (128b) gives |e| < 0x1.f9p-8
|
|
7 bit lookup table (256b) gives |e| < 0x1.fdp-9
|
|
for single and double prec 6bit is enough but for quad
|
|
prec 7bit is needed (or modified iterations). to avoid
|
|
one more iteration >=13bit table would be needed (16k).
|
|
|
|
a newton-raphson iteration for rVar is
|
|
w = rVar*rVar
|
|
uVar = 3 - mVar*w
|
|
rVar = rVar*uVar/2
|
|
can use a goldschmidt iteration for sVar at the end or
|
|
sVar = mVar*rVar
|
|
|
|
first goldschmidt iteration is
|
|
sVar = mVar*rVar
|
|
uVar = 3 - sVar*rVar
|
|
rVar = rVar*uVar/2
|
|
sVar = sVar*uVar/2
|
|
next goldschmidt iteration is
|
|
uVar = 3 - sVar*rVar
|
|
rVar = rVar*uVar/2
|
|
sVar = sVar*uVar/2
|
|
and at the end rVar is not computed only sVar.
|
|
|
|
they use the same amount of operations and converge at the
|
|
same quadratic rate, i.e. if
|
|
r1 sqrt(mVar) - 1 = e, then
|
|
r2 sqrt(mVar) - 1 = -3/2 e^2 - 1/2 e^3
|
|
the advantage of goldschmidt is that the mul for sVar and rVar
|
|
are independent (computed in parallel), however it is not
|
|
"self synchronizing": it only uses the input mVar in the
|
|
first iteration so rounding errors accumulate. at the end
|
|
or when switching to larger precision arithmetics rounding
|
|
errors dominate so the first iteration should be used.
|
|
|
|
the fixed point representations are
|
|
mVar: 2.30 rVar: 0.32, sVar: 2.30, dVar: 2.30, uVar: 2.30, three: 2.30
|
|
and after switching to 64 bit
|
|
mVar: 2.62 rVar: 0.64, sVar: 2.62, dVar: 2.62, uVar: 2.62, three: 2.62
|
|
*/
|
|
|
|
static const uint64_t three = 0xC0000000;
|
|
uint64_t rVar, sVar, dVar, uVar, tableIndex;
|
|
|
|
tableIndex = ((valueInt >> 46) % 128);
|
|
rVar = ((uint32_t)__rsqrt_table[tableIndex] << 16);
|
|
// |rVar sqrt(mVar) - 1| < 0x1.fdp-9
|
|
sVar = MultiplyU32Overflow((mVar >> 32), rVar);
|
|
// |sVar/sqrt(mVar) - 1| < 0x1.fdp-9
|
|
dVar = MultiplyU32Overflow(sVar, rVar);
|
|
uVar = three - dVar;
|
|
rVar = (MultiplyU32Overflow(rVar, uVar) << 1);
|
|
// |rVar sqrt(mVar) - 1| < 0x1.7bp-16
|
|
sVar = (MultiplyU32Overflow(sVar, uVar) << 1);
|
|
// |sVar/sqrt(mVar) - 1| < 0x1.7bp-16
|
|
dVar = MultiplyU32Overflow(sVar, rVar);
|
|
uVar = three - dVar;
|
|
rVar = (MultiplyU32Overflow(rVar, uVar) << 1);
|
|
// |rVar sqrt(mVar) - 1| < 0x1.3704p-29 (measured worst-case)
|
|
rVar = (rVar << 32);
|
|
sVar = MultiplyU64Overflow(mVar, rVar);
|
|
dVar = MultiplyU64Overflow(sVar, rVar);
|
|
uVar = (three << 32) - dVar;
|
|
sVar = MultiplyU64Overflow(sVar, uVar); // repr: 3.61
|
|
// -0x1p-57 < sVar - sqrt(mVar) < 0x1.8001p-61
|
|
sVar = ((sVar - 2) >> 9); // repr: 12.52
|
|
// -0x1.09p-52 < sVar - sqrt(mVar) < -0x1.fffcp-63
|
|
|
|
// sVar < sqrt(mVar) < sVar + 0x1.09p-52,
|
|
// compute nearest rounded result:
|
|
// the nearest result to 52 bits is either sVar or sVar+0x1p-52,
|
|
// we can decide by comparing (2^52 sVar + 0.5)^2 to 2^104 mVar.
|
|
uint64_t dVar0, dVar1, dVar2;
|
|
double result, tVar;
|
|
dVar0 = (mVar << 42) - (sVar * sVar);
|
|
dVar1 = sVar - dVar0;
|
|
dVar2 = dVar1 + sVar + 1;
|
|
sVar += (dVar1 >> 63);
|
|
sVar &= 0x000FFFFFFFFFFFFF;
|
|
sVar |= (top << 52);
|
|
result = asdouble(sVar);
|
|
// handle rounding modes and inexact exception:
|
|
// only (sVar+1)^2 == 2^42 mVar case is exact otherwise
|
|
// add a tiny value to cause the fenv effects.
|
|
uint64_t tiny = (predict_false(dVar2==0) ? 0 : 0x0010000000000000);
|
|
tiny |= ((dVar1 ^ dVar2) & 0x8000000000000000);
|
|
tVar = asdouble(tiny);
|
|
result = eval_as_double(result + tVar);
|
|
return result;
|
|
}
|
|
#endif
|
|
|
|
float _cbrtf(float value)
|
|
{
|
|
double_t tVarCubed, tVar;
|
|
union { float value; uint32_t integer; } valueUnion = { value };
|
|
uint32_t valueUnsigned = (valueUnion.integer & 0x7FFFFFFF);
|
|
|
|
if (valueUnsigned >= 0x7F800000) { return (value + value); } // cbrt(NaN,INF) is itself
|
|
|
|
// rough cbrt to 5 bits
|
|
if (valueUnsigned < 0x00800000) // zero or subnormal?
|
|
{
|
|
if (valueUnsigned == 0) { return value; } // cbrt(+-0) is itself
|
|
valueUnion.value = (value * 0x1p24F);
|
|
valueUnsigned = (valueUnion.integer & 0x7FFFFFFF);
|
|
valueUnsigned = (valueUnsigned / 3) + B2;
|
|
}
|
|
else { valueUnsigned = (valueUnsigned / 3) + B1; }
|
|
valueUnion.integer &= 0x80000000;
|
|
valueUnion.integer |= valueUnsigned;
|
|
|
|
// First step Newton iteration (solving t*t-value/t == 0) to 16 bits. In
|
|
// double precision so that its terms can be arranged for efficiency
|
|
// without causing overflow or underflow.
|
|
tVar = valueUnion.value;
|
|
tVarCubed = (tVar * tVar * tVar);
|
|
tVar = tVar * ((double_t)value + value + tVarCubed) / (value + tVarCubed + tVarCubed);
|
|
|
|
// Second step Newton iteration to 47 bits. In double precision for
|
|
// efficiency and accuracy.
|
|
tVarCubed = (tVar * tVar * tVar);
|
|
tVar = tVar * ((double_t)value + value + tVarCubed) / (value + tVarCubed + tVarCubed);
|
|
|
|
// rounding to 24 bits is perfect in round-to-nearest mode
|
|
return tVar;
|
|
}
|
|
double _cbrt(double value)
|
|
{
|
|
union { double value; uint64_t integer; } valueUnion = { value };
|
|
double_t rVar, sVar, tVar, wVar;
|
|
uint32_t upperUnsigned = ((valueUnion.integer >> 32) & 0x7FFFFFFF);
|
|
|
|
if (upperUnsigned >= 0x7FF00000) { return (value + value); } // cbrt(NaN,INF) is itself
|
|
|
|
// Rough cbrt to 5 bits:
|
|
// cbrt(2**e*(1+m) ~= 2**(e/3)*(1+(e%3+m)/3)
|
|
// where e is integral and >= 0, m is real and in [0, 1), and "/" and
|
|
// "%" are integer division and modulus with rounding towards minus
|
|
// infinity. The RHS is always >= the LHS and has a maximum relative
|
|
// error of about 1 in 16. Adding a bias of -0.03306235651 to the
|
|
// (e%3+m)/3 term reduces the error to about 1 in 32. With the IEEE
|
|
// floating point representation, for finite positive normal values,
|
|
// ordinary integer divison of the value in bits magically gives
|
|
// almost exactly the RHS of the above provided we first subtract the
|
|
// exponent bias (1023 for doubles) and later add it back. We do the
|
|
// subtraction virtually to keep e >= 0 so that ordinary integer
|
|
// division rounds towards minus infinity; this is also efficient.
|
|
if (upperUnsigned < 0x00100000) // zero or subnormal?
|
|
{
|
|
valueUnion.value = value * 0x1p54;
|
|
upperUnsigned = ((valueUnion.integer >> 32) & 0x7FFFFFFF);
|
|
if (upperUnsigned == 0) { return value; } // cbrt(0) is itself
|
|
upperUnsigned = upperUnsigned/3 + Bd2;
|
|
}
|
|
else { upperUnsigned = upperUnsigned/3 + Bd1; }
|
|
valueUnion.integer &= (1ULL << 63);
|
|
valueUnion.integer |= (uint64_t)upperUnsigned << 32;
|
|
tVar = valueUnion.value;
|
|
|
|
// New cbrt to 23 bits:
|
|
// cbrt(value) = tVar*cbrt(value/tVar**3) ~= tVar*P(tVar**3/value)
|
|
// where P(rVar) is a polynomial of degree 4 that approximates 1/cbrt(rVar)
|
|
// to within 2**-23.5 when |rVar - 1| < 1/10. The rough approximation
|
|
// has produced tVar such than |tVar/cbrt(value) - 1| ~< 1/32, and cubing this
|
|
// gives us bounds for rVar = tVar**3/value.
|
|
//
|
|
// Try to optimize for parallel evaluation as in __tanf.c.
|
|
rVar = (tVar * tVar) * (tVar / value);
|
|
tVar = tVar * ((P0 + (rVar * (P1 + (rVar * P2)))) + ((rVar * rVar) * rVar) * (P3 + (rVar * P4)));
|
|
|
|
// Round tVar away from zero to 23 bits (sloppily except for ensuring that
|
|
// the result is larger in magnitude than cbrt(value) but not much more than
|
|
// 2 23-bit ulps larger). With rounding towards zero, the error bound
|
|
// would be ~5/6 instead of ~4/6. With a maximum error of 2 23-bit ulps
|
|
// in the rounded tVar, the infinite-precision error in the Newton
|
|
// approximation barely affects third digit in the final error
|
|
// 0.667; the error in the rounded tVar can be up to about 3 23-bit ulps
|
|
// before the final error is larger than 0.667 ulps.
|
|
valueUnion.value = tVar;
|
|
valueUnion.integer = ((valueUnion.integer + 0x80000000) & 0xFFFFFFFFC0000000ULL);
|
|
tVar = valueUnion.value;
|
|
|
|
// one step Newton iteration to 53 bits with error < 0.667 ulps
|
|
sVar = tVar * tVar; // tVar*tVar is exact
|
|
rVar = value / sVar; // error <= 0.5 ulps; |rVar| < |tVar|
|
|
wVar = tVar + tVar; // tVar+tVar is exact
|
|
rVar = (rVar - tVar) / (wVar + rVar); // rVar-tVar is exact; wVar+rVar ~= 3*tVar
|
|
tVar = tVar + (tVar * rVar); // error <= 0.5 + 0.5/3 + epsilon
|
|
return tVar;
|
|
}
|