/*
** $Id: fastmath.c,v 1.67 2005/08/26 17:36:32 roberto Exp $
** Approximations
** See Copyright Notice in agena.h
*/

/*
import fastmath, stats
s := seq();
for x from -100 to 100 by 0.001 do
   insert fastmath.dexp10(x) |- math.exp10(x) into s
od;
stats.median(stats.sorted(s)):
stats.amean(s):

import fastmath

gcc := fastmath.sunpow
fn := fastmath.dpow

watch();
for i from -10 to 10 by 0.001 do
   x := fn(i)
od;
watch():

watch();
for i from -10 to 10 by 0.001 do
   y := gcc(i)
od;
watch():
*/

#include <stdlib.h>
#include <stdint.h>  /* for UINT32_MAX */
#include <math.h>

#define fastmath_c
#define LUA_LIB

#include "agena.h"
#include "agnxlib.h"
#include "agenalib.h"
#include "agnconf.h"
#include "agncmpt.h"  /* for fmaf, trunc and isfinite, and FLT_EVAL_METHOD constant, and off64* types */
#include "sunpro.h"
#include "prepdefs.h"  /* FORCE_INLINE */

#define AGENA_LIBVERSION	"fastmath 1.1.3 for Agena as of July 04, 2023\n"

#if !(defined(LUA_DOS) || defined(__OS2__) || defined(LUA_ANSI))
#define AGENA_FASTMATHLIBNAME "fastmath"
LUALIB_API int (luaopen_fastmath) (lua_State *L);
#endif

/* Approximates the inverse root 1/root(x, degree) using the Quake III method. x is the radicand, degree the
   degree of the root, which by default is 2. n is the number of iterations to be conducted and by default is 2^degree.
   xhalf is the internal equivalent of x, 0.5*x by default. The greater the degree, the less accurate the result.

   See: https://stackoverflow.com/questions/11644441/fast-inverse-square-root-on-x64,
   https://www.xaymar.com/2016/12/17/what-is-the-fastest-way-to-get-an-inverse-square-root-part-2
   Modified. 2.10.4
   Must use a union instead of *(<type> *)&y, and the proposed int64_t type given in the web pages
   listed above is wrong, anyway. */
static int fastmath_invroot (lua_State *L) {
  /* Arguments: x [, degree [, n [, xhalf [, magic]]]].
     Pushes one number: 0 for x = 0, AGN_NAN for x < 0, otherwise the result of n refinement
     steps applied to a bit-trick first guess. The optional arguments are only inspected
     for x > 0. */
  lua_Number x, approx, xhalf;
  double_cast bits;
  int step, rounds, degree;
  x = agn_checknumber(L, 1);
  if (x == 0) {
    lua_pushnumber(L, (lua_Number)0);
    return 1;
  }
  if (x < 0) {
    lua_pushnumber(L, AGN_NAN);
    return 1;
  }
  degree = agnL_optposint(L, 2, 2);
  rounds = agnL_optposint(L, 3, luai_numipow(2, degree));
  xhalf = agnL_optnumber(L, 4, 0.5*x);  /* internal equivalent for x, 0.5*x by default */
  /* reinterpret the bits of x through a union; pointer casts such as *(int64_t *)&x
     do not work with GCC 6.3.0 */
  bits.f = x;
  /* first guess: magic - (bits >> 1). The magic number 0x5fe6eb50c7b537a9 is taken from
     https://cs.uwaterloo.ca/~m32rober/rsqrt.pdf and
     https://www.xaymar.com/2016/12/17/what-is-the-fastest-way-to-get-an-inverse-square-root-part-2/
     NOTE(review): the constant is routed through agnL_optnumber, presumably as a lua_Number,
     so its low-order bits may be rounded - confirm whether this is intended. */
  bits.i = agnL_optnumber(L, 5, 0x5fe6eb50c7b537a9LL) - (bits.i >> 1);
  approx = bits.f;
  /* `rounds` Newton-style iterations; 1.5 = `threehalfs` */
  for (step = 0; step < rounds; step++)
    approx = approx*(1.5 - (xhalf*luai_numipow(approx, degree)));
  /* to compute the square root, one could now execute: y = ((1/y) + (x*y))*0.5; (1 additional Heron
     iteration has shown to maximise the precision), see:
     http://www.reactos.org/pipermail/ros-diffs/2015-May/057628.html */
  lua_pushnumber(L, approx);
  return 1;
}


/* Approximates 1/sqrt(x) using Quakes Fast Inverse Square Root method.
   5 % faster than the inverse of GCC's builtin sqrt.
   In the range -1000 .. 1000 with step size 0.1, mean error is 5.96994e-5, median error is 5.008699e-5.
   See: https://betterexplained.com/articles/understanding-quakes-fast-inverse-square-root */
static int fastmath_invsqrt (lua_State *L) {  /* new in 2.14.6 */
  /* Single-precision bit-trick estimate of 1/sqrt(x) followed by one Newton step;
     pushes the estimate as a number. No check is made on the sign of the argument. */
  ieee_float_shape_signedint bits;
  float arg, half, guess;
  arg = agn_checknumber(L, 1);
  half = 0.5f * arg;
  bits.f = arg;                         /* view the float's bit pattern as an integer */
  bits.i = 0x5f3759df - (bits.i >> 1);  /* initial guess for Newton's method */
  guess = bits.f;                       /* back to a float */
  guess = guess*(1.5f - half*guess*guess);  /* one round of Newton's method */
  lua_pushnumber(L, guess);
  return 1;
}


/* Approximates the reciprocal of its argument, a number. With x < 0 returns infinity.
   The return is a number. The function is purely experimental.
   54 % slower than GCC's `1/x`.
   In the range 0 .. 1000 with step size 0.001, mean error is 0.0006720364, median error is 0.00010009.
   See: https://stackoverflow.com/questions/9939322/fast-1-x-division-reciprocal/39714493 */
static int fastmath_reciprocal (lua_State *L) {
  /* Manipulates the bit pattern of the argument with a magic constant and pushes the
     square of the resulting double. */
  double_cast bits;
  lua_Number root;
  bits.f = agn_checknumber(L, 1);
  bits.i = (0xbfcdd6a18f6a6f52ULL - bits.i) >> 1;
  root = bits.f;
  lua_pushnumber(L, root*root);
  return 1;
}


/* Computes the reciprocal 1/sqrt(x), taken from https://github.com/nickzman/hyperspace/blob/master/frsqrt.hh#L399
   24 % slower than GCC's 1/sqrt(x).
   In the range 0 .. 1000 with step size 0.001, mean error is 2.455737e-018, median error is 0. */
static int fastmath_rsqrt (lua_State *L) {
  /* Bit-trick first guess for 1/sqrt(x) refined by four Newton steps; pushes one number.
     The sign of the argument is not checked.
     The bits are reinterpreted through a union: the former pointer casts
     *(long long *)&x and *(double *)&i break the strict-aliasing rule and may be
     miscompiled by optimising compilers. */
  union { double f; long long i; } u;
  double y, xhalf;
  int k;
  u.f = agn_checknumber(L, 1);
  xhalf = 0.5*u.f;
  u.i = 0x5fe6ec85e7de30daLL - (u.i >> 1);  /* initial guess */
  y = u.f;
  for (k = 0; k < 4; k++)  /* four Newton iterations */
    y = y*(1.5 - xhalf*y*y);
  lua_pushnumber(L, y);
  return 1;
}


/* Taken from `branchless scalar math routines`, by Phillip Trudeau.
   see: https://raw.githubusercontent.com/pmttavara/pt_math/master/pt_math.h

   Copyright (c) Phillip Trudeau-Tavara, 2017-2019. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   The float version is not faster. 17 % slower than GCC's sqrt. */
/* Approximates sqrt(x): integer bit-trick first guess followed by two Heron iterations.
   Returns AGN_NAN for x < 0. */
static FORCE_INLINE double pt_sqrt (double x) {
  /* The union reinterprets the bits of the double as a 64-bit integer and back. The previous
     byte-copy loops were bounded by CHAR_BIT (bits per byte), which only matched
     sizeof(double) by coincidence. Assumes long long and double have equal size. */
  union { double f; long long i; } u;
  double z;
  if (x < 0) return AGN_NAN;
  u.f = x;
  u.i = ((u.i - 0x0010000000000000LL) >> 1) + 0x2000000000000000LL;  /* first guess */
  z = u.f;
  z = 0.5*(x/z + z);     /* first Heron step */
  return 0.5*(x/z + z);  /* second Heron step */
}

/* Approximates sqrt(x), 2.32.1.
   17 % slower than GCC's sqrt.
   In the range 0 .. 1000 with step size 0.1, median error is 0.000000089006936 and mean error is 0.000031388319. */
static int fastmath_dsqrt (lua_State *L) {
  /* Pushes pt_sqrt applied to the first argument. */
  double arg, root;
  arg = agn_checknumber(L, 1);
  root = pt_sqrt(arg);
  lua_pushnumber(L, root);
  return 1;
}


static int fastmath_drsqrt (lua_State *L) {  /* 2.32.1 */
  /* Approximates 1/sqrt(x): bit-trick first guess plus two Newton iterations.
     Pushes `undefined` for x <= 0, otherwise one number.
     The bits are reinterpreted through a union; the previous byte-copy loops were bounded by
     CHAR_BIT (bits per byte) rather than sizeof(double). */
  union { double f; long long i; } u;
  double x, z;
  x = agn_checknumber(L, 1);
  if (x <= 0) {
    lua_pushundefined(L);
    return 1;
  }
  z = 0.5*x;
  u.f = x;
  u.i = 0x5fe6eb50c7b537a9LL - (u.i >> 1);  /* initial guess */
  x = u.f;
  x *= 1.5 - z*x*x;                    /* first Newton step */
  lua_pushnumber(L, x*(1.5 - z*x*x));  /* second Newton step */
  return 1;
}


/* Approximates the square root using two numeric methods. Returns two numbers: the guesses
   computed using C doubles and floats, in this order. If x < 0, returns `undefined`.
   The function is purely experimental.
   See: http://bits.stephan-brumme.com/squareRoot.html
   11 % slower than GCC's sqrt.
   In the range 0 .. 1000 with step size 0.001, median error is 0.476598445 and mean error is 0.48653319. */
static int fastmath_sqroot (lua_State *L) {
  /* Pushes two square-root guesses for the argument: first the double-based one, then the
     float-based one. For a negative argument a single `undefined` is pushed instead. */
  ieee_float_shape_type single;
  double_cast dbl;
  double arg = agn_checknumber(L, 1);
  if (arg < 0) {  /* 2.12.0 RC 3 change */
    lua_pushundefined(L);
    return 1;
  }
  /* double guess: magic-constant estimate, inverted on return;
     see: http://www.reactos.org/pipermail/ros-diffs/2015-May/057628.html */
  dbl.f = arg;
  dbl.i = 0x5fe6eb50c7b537a9LL - (dbl.i >> 1);
  /* float guess: adjust the bias, then halve the bit pattern */
  single.value = arg;
  single.word += (127 << 23);
  single.word >>= 1;
  lua_pushnumber(L, 1/dbl.f);       /* double approximation */
  lua_pushnumber(L, single.value);  /* float approximation */
  return 2;
}

/*
http://jsperf.com/mysin-vs-sin/9
https://www.desmos.com/calculator/8nkxlrmp7a
http://devmaster.net/posts

See: https://web.archive.org/web/20161223122133/http://http.developer.nvidia.com:80/Cg/asin.html
*/
static int fastmath_sincosfast (lua_State *L) {  /* 2.11.0, 15 percent faster than calling Agena's sin and cos operators */
  /* Pushes two numbers: approximations of sin(x) and cos(x), in this order.
     With exactly one argument the work is delegated to tools_sincosfast; if further arguments
     are present, the parabola method taken from
     http://lab.polygonal.de/2007/07/18/fast-and-accurate-sinecosine-approximation/ is used. */
  float x, s, c;
  x = agn_checknumber(L, 1);
  if (lua_gettop(L) == 1)
    tools_sincosfast(x, &s, &c);
  else {
    /* always wrap input angle to -PI .. PI */
    x = tools_reducerange(x, -PI, PI);
    /* compute sine. The quadratic term must carry the sign of x; the previous code multiplied
       by x itself, which produced a cubic term and wrong sine values. The cosine below already
       used tools_signum. */
    s = INVPIO4 * x - tools_signum(x)*INVPISQO4*x*x;
    s += 0.225 * (tools_signum(s)*s*s - s);           /* 2.15.0 change, 2.17.1 optimisation */
    /* compute cosine: sin(x + PI/2) = cos(x) */
    x += PIO2;
    if (x > PI) x -= PI2;
    c = INVPIO4 * x - tools_signum(x)*INVPISQO4*x*x;  /* 2.15.0 change, 2.17.1 optimisation */
    c += 0.225 * (tools_signum(c)*c*c - c);           /* 2.15.0 change, 2.17.1 optimisation */
  }
  lua_pushnumber(L, s);
  lua_pushnumber(L, c);
  return 2;
}


/* Taken from `branchless scalar math routines`, by Phillip Trudeau.
   see: https://raw.githubusercontent.com/pmttavara/pt_math/master/pt_math.h */
/* Rounds x to an integral value by adding and subtracting the constant 6755399441055744.0
   (1.5 * 2^52). The volatile temporary keeps the compiler from folding the two operations
   away. No range check is done on x. */
static double pt_round_unchecked (double x) {
  volatile double t = x;
  t += 6755399441055744.0;
  t -= 6755399441055744.0;
  return t;
}

/* Approximates sin(x), x in radians.
   Formerly a GNU statement-expression macro that silently modified its argument; now an
   ordinary function, so the caller's variable is left untouched and any expression may be
   passed. */
static double pt_sin (double x) {
  double mag;
  x *= 0.15915494309189534;        /* scale by 1/(2*Pi): one full turn maps to 1.0 */
  x -= pt_round_unchecked(x);      /* drop whole turns */
  x *= 0.5 - (x < 0 ? -x : x);
  mag = (x < 0 ? -x : x);
  return x*(57.3460872862336*mag + 12.4158695446104);
}

/* 30 % faster than sun_sin, 2.32.1
   In the range -1000 .. 1000 with step size 0.001, median error is 0.000574565 and mean error is 0.0005261111. */
static int fastmath_dsin (lua_State *L) {
  /* Pushes pt_sin of the first argument. */
  double angle, sine;
  angle = agn_checknumber(L, 1);
  sine = pt_sin(angle);
  lua_pushnumber(L, sine);
  return 1;
}


/* 28 % faster than sun_cos, 2.32.1
   In the range -1000 .. 1000 with step size 0.001, median error is 0.000574439 and mean error is 0.000526013. */
static int fastmath_dcos (lua_State *L) {
  /* Pushes pt_sin of the first argument shifted by PIO2, i.e. uses cos(x) = sin(x + Pi/2). */
  double shifted, cosine;
  shifted = agn_checknumber(L, 1) + PIO2;
  cosine = pt_sin(shifted);
  lua_pushnumber(L, cosine);
  return 1;
}


/* 37 % faster than sun_tan, 2.32.2
   In the range -1000 .. 1000 with step size 0.001, median error is 0.00068536 and mean error is 0.136447. */
static int fastmath_dtan (lua_State *L) {
  /* Pushes pt_sin(x)/pt_sin(x + PIO2); pushes AGN_NAN if the cosine estimate is exactly zero. */
  double forsine, forcosine, sine, cosine;
  forsine = forcosine = agn_checknumber(L, 1);
  forcosine += PIO2;
  sine = pt_sin(forsine);
  cosine = pt_sin(forcosine);
  if (cosine != 0)
    lua_pushnumber(L, sine/cosine);
  else
    lua_pushnumber(L, AGN_NAN);
  return 1;
}


/* 17 % faster than gcc's asin
   In the range -1 .. 1 with step size 0.000001, median error is 0.008815365 and mean error is 0.007884382. */
static int fastmath_dasin (lua_State *L) {  /* 2.32.1 */
  /* Pushes an arcsine estimate built from d = pt_sqrt(1 + x) - pt_sqrt(1 - x);
     pushes `undefined` if |x| > 1. */
  double arg, d, mag;
  arg = agn_checknumber(L, 1);
  if (fabs(arg) > 1) {
    lua_pushundefined(L);
    return 1;
  }
  d = pt_sqrt(1 + arg) - pt_sqrt(1 - arg);
  mag = (d < 0 ? -d : d);
  lua_pushnumber(L, d * (0.131754508171 * mag + 0.924391722181));
  return 1;
}


/* 26 % faster than gcc's acos
   In the range -1 .. 1 with step size 0.000001, median error is 0.00881536 and mean error is 0.00788438. */
static int fastmath_dacos (lua_State *L) {  /* 2.32.1 */
  /* Pushes PIO2 minus the same polynomial estimate that fastmath_dasin uses;
     pushes `undefined` if |x| > 1. */
  double arg, d, mag;
  arg = agn_checknumber(L, 1);
  if (fabs(arg) > 1) {
    lua_pushundefined(L);
    return 1;
  }
  d = pt_sqrt(1 + arg) - pt_sqrt(1 - arg);
  mag = (d < 0 ? -d : d);
  lua_pushnumber(L, PIO2 - d*(0.131754508171*mag + 0.924391722181));
  return 1;
}


/* 39 % faster than gcc's atan
   In the range -1 .. 1 with step size 0.000001, median error is 0.0042227 and mean error is 0.00382986. */
static int fastmath_datan (lua_State *L) {  /* 2.32.1 */
  /* Maps the argument to t = x/(|x| + 1) and pushes a polynomial in t and |t|. */
  double t, mag;
  t = agn_checknumber(L, 1);
  t /= (t < 0 ? -t : t) + 1;
  mag = (t < 0 ? -t : t);
  lua_pushnumber(L, t*(mag*(-1.45667498914*mag + 2.18501248371) + 0.842458832225));
  return 1;
}


/* Returns an approximation of the sine of the given number x, in radians; 2.14.6
   7 % slower than Sun's sun_sin
   In the range -1000 .. 1000 with step size 0.001, median error is 0.000474045 and mean error is 0.00044432. */
static int fastmath_sinfast (lua_State *L) {
  /* Pushes tools_sinfast of the first argument. */
  lua_Number angle, sine;
  angle = agn_checknumber(L, 1);
  sine = tools_sinfast(angle);
  lua_pushnumber(L, sine);
  return 1;
}


/* Returns an approximation of the cosine of the given number x, in radians. Depending on the CPU used, it is up to 40 percent faster
   than calling the `cos` operator. 2.14.6
   8 % slower than Sun's sun_cos
   In the range -1000 .. 1000 with step size 0.001, median error is 0.000473941 and mean error is 0.000444221. */
static int fastmath_cosfast (lua_State *L) {
  /* Pushes tools_cosfast of the first argument. */
  lua_Number angle, cosine;
  angle = agn_checknumber(L, 1);
  cosine = tools_cosfast(angle);
  lua_pushnumber(L, cosine);
  return 1;
}


/* Returns an approximation of the tangent of the given number x, in radians. 2.14.6
   11 % slower than Sun's sun_tan
   In the range -1000 .. 1000 with step size 0.001, median error is 0.000467639 and mean error is 0.1244864. */
static int fastmath_tanfast (lua_State *L) {
  /* Pushes tools_sinfast(x)/tools_cosfast(x), or AGN_NAN if the cosine estimate is exactly zero. */
  lua_Number angle, sine, cosine;
  angle = agn_checknumber(L, 1);
  sine = tools_sinfast(angle);
  cosine = tools_cosfast(angle);
  if (cosine == 0)  /* 2.32.2 patch */
    lua_pushnumber(L, AGN_NAN);
  else
    lua_pushnumber(L, sine/cosine);
  return 1;
}


/* Approximates sqrt(x) but is much slower than the sqrt operator. If x < 0, returns `undefined`.
   In the range 0 .. 1000 with step size 0.1, median error is 0.00791 and mean error is 0.00829. */
static int fastmath_sqrtfast (lua_State *L) {  /* 2.11.0 */
  /* Pushes tools_sqrtfast of the first argument. */
  lua_Number arg, root;
  arg = agn_checknumber(L, 1);
  root = tools_sqrtfast(arg);
  lua_pushnumber(L, root);
  return 1;
}


/* The function approximates sqrt(x).
   1 % slower than GCC's sqrt.
   In the range 0 .. 1000 with step size 0.1, median error is 0.2600843 and mean error is 0.3900767. */
static int fastmath_sqrtapx (lua_State *L) {  /* 2.15.1 */
  /* Pushes tools_sqrtapx of the first argument. */
  lua_Number arg, root;
  arg = agn_checknumber(L, 1);
  root = tools_sqrtapx(arg);
  lua_pushnumber(L, root);
  return 1;
}


/* The function approximates log2(x) for number x, and returns a number. If x <= 0, the result is wrong.
   18 % faster than GCC's log2.
   In the range 0 .. 1000 with step size 0.1, median error is 0.00008242754665 and mean error is 0.0000764132614. */
static int fastmath_lbfast (lua_State *L) {  /* new in 2.14.6 */
  /* Pushes tools_lbfast of the first argument. */
  lua_Number arg, lb;
  arg = agn_checknumber(L, 1);
  lb = tools_lbfast(arg);
  lua_pushnumber(L, lb);
  return 1;
}


/* The function approximates ln(x) for number x, and returns a number. If x <= 0, the result is wrong.
   18 % faster than GCC's log2.
   In the range 0 .. 1000 with step size 0.001, median error is 0.000057118 and mean error is 0.000052952. */
static int fastmath_lnfast (lua_State *L) {  /* 2.14.6 */
  /* Pushes tools_lbfast of the first argument scaled by LN2 (cast to float), i.e. converts
     the base-2 estimate to a natural logarithm. */
  lua_Number arg, ln;
  arg = agn_checknumber(L, 1);
  ln = (float)LN2 * tools_lbfast(arg);
  lua_pushnumber(L, ln);
  return 1;
}


/* Returns the hypotenuse of the two numbers x and y; the return is a number. The function is sixty percent faster than `hypot`,
   but prone to round-off errors and overflow. 60 percent faster than `hypot`, 27 percent faster than the Agena implementation
   sqrt(x**2 + y**2). */
static int fastmath_hypotfast (lua_State *L) {  /* 2.14.4 */
  /* Pushes sqrt(a*a + b*b) for the two arguments, without any scaling of the operands. */
  lua_Number a, b, sumsq;
  a = agn_checknumber(L, 1);
  b = agn_checknumber(L, 2);
  sumsq = a*a + b*b;
  lua_pushnumber(L, sqrt(sumsq));
  return 1;
}


/* Approximates 2^x, 2.14.6.
   9 % faster than tools_exp2.
   In the range -1000 .. 10 with step size 0.001, median error is 1.1755178e-038 and mean error is 0.00003296572. */
static int fastmath_exp2fast (lua_State *L) {
  /* Pushes tools_pow2fast of the first argument. */
  lua_Number arg, power;
  arg = agn_checknumber(L, 1);
  power = tools_pow2fast(arg);
  lua_pushnumber(L, power);
  return 1;
}


/* Approximates e^x with enough precision.
   43% _faster_ than GCC's exp.
   In the range -1000 .. 10 with step size 0.001, median error is 1.1755178-038 and mean error is 0.0005315205.
   For x > 10 accuracy becomes quite bad. 2.14.6 */
static int fastmath_expfast (lua_State *L) {
  /* Pushes tools_pow2fast(x * INVLN2), with both factors cast to float before multiplying. */
  float scaled;
  scaled = (float)agn_checknumber(L, 1) * (float)INVLN2;
  lua_pushnumber(L, tools_pow2fast(scaled));
  return 1;
}


/* Taken from `branchless scalar math routines`,
   by Phillip Trudeau
   see: https://raw.githubusercontent.com/pmttavara/pt_math/master/pt_math.h */
/* Approximates 2^x.
   Returns 0 for x <= -1022, HUGE_VAL for x > 1024 and x itself if x is not a number.
   Otherwise the integral part of x is written directly into the exponent field of a double and
   the fractional part is handled by a quadratic polynomial. */
static double pt_exp2 (double x) {
  /* The union reinterprets the 64-bit integer as a double. The previous byte-copy loop was
     bounded by CHAR_BIT (bits per byte), which only matched sizeof(double) by coincidence.
     Assumes long long and double have equal size. */
  union { double f; long long i; } scale;
  long long exponent;
  if (x != x) return x;  /* NaN: converting it to an integer below would be undefined */
  if (x <= -1022) return 0;
  if (x > 1024) return HUGE_VAL;
  exponent = (long long)(x + 1023);  /* biased exponent */
  x += 1023 - exponent;              /* remaining fractional part */
  scale.i = exponent << 52;          /* move into the exponent field */
  x *= x*0.339766027260413688582 + 0.660233972739586311418;
  return scale.f*(x + 1);
}

/* Approximates E^x, 2.32.1
   43 % faster than Sun's sun_exp.
   In the range -1000 .. 10 with step size 0.001, median error is 1.446234407e-218 and mean error is 0.0384171. */
static int fastmath_dexp (lua_State *L) {
  /* Pushes pt_exp2(INVLN2 * x), i.e. rewrites the exponential in terms of the base-2 helper. */
  double scaled, power;
  scaled = INVLN2*agn_checknumber(L, 1);
  power = pt_exp2(scaled);
  lua_pushnumber(L, power);
  return 1;
}

/* Approximates 2^x, 2.32.1
   5 % faster than tools_exp2.
   In the range -1000 .. 10 with step size 0.001, median error is 1.32809424e-152 and mean error is 0.00254373. */
static int fastmath_dexp2 (lua_State *L) {
  /* Pushes pt_exp2 of the first argument. */
  double arg, power;
  arg = agn_checknumber(L, 1);
  power = pt_exp2(arg);
  lua_pushnumber(L, power);
  return 1;
}

/* Approximates 10^x, 2.32.2
   79 % faster than tools_exp2.
   In the range -1000 .. 5 with step size 0.001, median error is 0 and mean error is 0.0717718. */
static int fastmath_dexp10 (lua_State *L) {
  /* Pushes pt_exp2(LOG2_10 * x), i.e. rewrites the power of ten in terms of the base-2 helper. */
  double scaled, power;
  scaled = LOG2_10*agn_checknumber(L, 1);
  power = pt_exp2(scaled);
  lua_pushnumber(L, power);
  return 1;
}

/* Approximates E^x, 2.32.1
   43 % faster than Sun's sun_exp.
   In the range -1000 .. 10 with step size 0.001, median error is 1.0563240394147e-215 and mean error is 0.85082267873818.
   Taken from https://codingforspeed.com/using-faster-exponential-approximation/ 2.41.3 UNDOC */
static int fastmath_exp1024 (lua_State *L) {
  /* Pushes (1 + x/1024)^1024, obtained by squaring ten times since 2^10 = 1024. */
  lua_Number acc;
  int k;
  acc = 1.0 + agn_checknumber(L, 1)/1024;
  for (k = 0; k < 10; k++)
    acc *= acc;
  lua_pushnumber(L, acc);
  return 1;
}


/* Taken from `branchless scalar math routines`, by Phillip Trudeau.
   see: https://raw.githubusercontent.com/pmttavara/pt_math/master/pt_math.h */
/* Approximates log2(x).
   Returns -HUGE_VAL for x = 0 and AGN_NAN for x < 0. Otherwise the exponent field of the double
   supplies the integral part and a quadratic polynomial in the mantissa the fractional part. */
static FORCE_INLINE double pt_log2 (double x) {
  /* The union reinterprets the bits of the double as a 64-bit integer and back. The previous
     byte-copy loops were bounded by CHAR_BIT (bits per byte), which only matched
     sizeof(double) by coincidence. Assumes long long and double have equal size. */
  union { double f; long long i; } u;
  double result;
  if (x == 0) return -HUGE_VAL;
  if (x < 0) return AGN_NAN;
  u.f = x;
  result = (double)(u.i >> 52);  /* biased exponent */
  u.i = (u.i & 0x000fffffffffffffLL) | 0x3ff0000000000000LL;  /* mantissa with the exponent of 1.0 */
  x = u.f;
  result += -1024 + x*(-0.33333333333333333*x + 2) - 0.66666666666666666;
  return result;
}

/* Approximates the logarithm for base exp(1).
   12 % faster than sun_log; 2.32.1
   In the range 0 .. 1000 with step size 0.001, median error is 0.003392382 and mean error is 0.003499538.*/
static int fastmath_dlog (lua_State *L) {
  /* Pushes LN2 * pt_log2(x), i.e. converts the base-2 estimate to a natural logarithm. */
  double arg, ln;
  arg = agn_checknumber(L, 1);
  ln = LN2*pt_log2(arg);
  lua_pushnumber(L, ln);
  return 1;
}

/* Approximates the logarithm for base 2.
   18 % faster than sun_log2; 2.32.1
   In the range 0 .. 1000 with step size 0.001, median error is 0.00489417 and mean error is 0.0050487668. */
static int fastmath_dlog2 (lua_State *L) {
  /* Pushes pt_log2 of the first argument. */
  double arg, lb;
  arg = agn_checknumber(L, 1);
  lb = pt_log2(arg);
  lua_pushnumber(L, lb);
  return 1;
}

/* Approximates the logarithm for base 10.
   18 % faster than sun_log10; 2.32.1
   In the range 0 .. 1000 with step size 0.001, median error is 0.0014732928 and mean error is 0.00151983. */
static int fastmath_dlog10 (lua_State *L) {
  /* Pushes INVLOG2_10 * pt_log2(x), i.e. converts the base-2 estimate to base 10. */
  double arg, lg;
  arg = agn_checknumber(L, 1);
  lg = INVLOG2_10*pt_log2(arg);
  lua_pushnumber(L, lg);
  return 1;
}


/* Compute natural logarithm, maximum error 0.85089 ulps, 2.41.3, UNDOC
   Taken from:
   https://stackoverflow.com/questions/39821367/very-fast-approximate-logarithm-natural-log-function-in-c
   extended version is 30 % slower than sun_log; slim version is 11 % slower.
   In the range 0 .. 1000 with step size 0.001, median error is 6.8821514913608e-006 (extended version: 2.1783863601854e-008)
   and mean error is 6.8429470521791e-006 (extended version: 2.4696775892209e-008).
   2 % faster than sun_log */
static float njuffa_logf (double a) {
  float i, m, r, s, t;
  int e;
  m = frexpf(a, &e);
  if (m < 0.666666667f) {
    m += m;
    e -= 1;
  }
  i = (float)e;
  /* m in [2/3, 4/3] */
  m -= 1.0f;
  s = m*m;
  /* Compute log1p(m) for m in [-1/3, 1/3], this extended version is slow:
  r =             -0.130310059f;  // -0x1.0ae000p-3
  t =              0.140869141f;  //  0x1.208000p-3
  r = fmaf(r, s, -0.121483512f);  // -0x1.f198b2p-4
  t = fmaf(t, s,  0.139814854f);  //  0x1.1e5740p-3
  r = fmaf(r, s, -0.166846126f);  // -0x1.55b36cp-3
  t = fmaf(t, s,  0.200120345f);  //  0x1.99d8b2p-3
  r = fmaf(r, s, -0.249996200f);  // -0x1.fffe02p-3
  r = fmaf(t, m, r);
  r = fmaf(r, m,  0.333331972f);  //  0x1.5554fap-2
  r = fmaf(r, m, -0.500000000f);  // -0x1.000000p-1
  r = fmaf(r, s, m);
  r = fmaf(i,  0.693147182f, r);  //  0x1.62e430p-1 = log(2)
  if (!((a > 0.0f) && (a < INFINITY))) {
    r = a + a;
    if (a <= 0.0f) r = AGN_NAN;
    // if (a == 0.0f) r = -INFINITY;
  } */
  /* Compute log1p(f) for f in [-1/3, 1/3] */
  r = fmaf(0.230836749f, m, -0.279208571f); /* 0x1.d8c0f0p-3, -0x1.1de8dap-2 */
  t = fmaf(0.331826031f, m, -0.498910338f); /* 0x1.53ca34p-2, -0x1.fee25ap-2 */
  r = fmaf(r, s, t);
  r = fmaf(r, s, m);
  return fmaf(i, 0.693147182f, r); /* 0x1.62e430p-1 = log(2) */
}

static int fastmath_lnjuffa (lua_State *L) {
  /* Pushes njuffa_logf of the first argument. */
  lua_Number arg, ln;
  arg = agn_checknumber(L, 1);
  ln = njuffa_logf(arg);
  lua_pushnumber(L, ln);
  return 1;
}


/* Taken from: https://stackoverflow.com/questions/9799041/efficient-implementation-of-natural-logarithm-ln-and-exponentiation

Maple 7, play with Digits and polynomial degree
with(numapprox):
Digits := 20:
dgree := 8
minimax(ln(x), x=1 ..2, dgree, 1, 'maxerror');
maxerror;
# dgree = 6, Digits = 20, err = 0.127933636397776e-5, median = 0.0017500645963096,  amean = 0.0053161078061399, 10 % faster than sun_log
# dgree = 8, Digits = 20, err = .2933024089109e-7,    median = 0.00033780734217714, amean = 0.0024357567041372, 5 % faster than sun_log
*/

/* Approximates ln(y) with a degree-6 minimax polynomial fitted on [1, 2].
   Returns AGN_NAN if y is not positive (including not-a-number) and y itself if y is
   positive infinity. */
static lua_Number ln_minimax (lua_Number y) {
  int e;
  lua_Number x, r;
  if (!(y > 0)) return AGN_NAN;
  if (y == HUGE_VAL) return y;
  /* Range reduction: frexp yields y = m * 2^e with m in [0.5, 1), hence y = x * 2^(e - 1) with
     x = 2*m in [1, 2). The former reduction took the most significant bit of (int)y, which is
     0 for 0 < y < 1, overflows for large y and could shift 1 out of range. */
  x = 2*frexp(y, &e);
  e--;
  /* r = -1.7417939 + (2.8212026 + (-1.4699568 + (0.44717955 - 0.056570851 * x) * x) * x) * x; */
  r = -2.1071953597654539175+(4.2371832233645652631+(-3.7027848670359588896+(2.2780482522696610898+(-.87816086992668570128+(.19071860762757627481-.17807707203174740806e-1*x)*x)*x)*x)*x)*x;
/*  r = -2.3743015759612313746+(5.6524795489805186515+(-6.9364178209958099741+(6.4365537244488959574+(-4.1681941318200071907+(1.8290588132187118371+(-.51872188567312728968+(.85843308898376262131e-1-.62999517662832451896e-2*x)*x)*x)*x)*x)*x)*x)*x; */
  return r + ((lua_Number)e)*LN2;  /* ln(2) = 0.69314718... */
}

static int fastmath_lnmaple (lua_State *L) {
  /* Pushes ln_minimax of the first argument. */
  lua_Number arg, ln;
  arg = agn_checknumber(L, 1);
  ln = ln_minimax(arg);
  lua_pushnumber(L, ln);
  return 1;
}


/* Taken from `branchless scalar math routines`, by Phillip Trudeau.
   see: https://raw.githubusercontent.com/pmttavara/pt_math/master/pt_math.h */
static double pt_pow (double a, double b) {
  /* Approximates a^b. The exponent is split into an integral and a fractional part: the
     fractional power comes from pt_exp2/pt_log2, the integral power from repeated squaring.
     Returns AGN_NAN for 0 raised to a negative power, for 0^0 and for a negative base with a
     non-integral exponent. */
  unsigned long long whole;
  double frac, result;
  double sign = 1;
  if (b < 0) {
    if (a == 0) return AGN_NAN;  /* added to avoid infinity */
    /* a^(-b) = (1/a)^b; done in place instead of calling pt_pow recursively */
    a = 1/a;
    b = -b;
  }
  whole = (unsigned long long)b;
  frac = b - whole;
  if (a < 0) {
    if (frac) return AGN_NAN;
    /* negative base with integral exponent: work on |a|, negate the result for odd exponents */
    a = -a;
    if (whole & 1) sign = -1;
  }
  if (a == 0 && frac == 0)  /* added so that 0^b, b > 0, is 0 and not undefined */
    return (frac + whole == 0) ? AGN_NAN : 0;
  result = pt_exp2(frac*pt_log2(a));
  while (whole) {  /* exponentiation by squaring for the integral part */
    if (whole & 1) result *= a;
    a *= a;
    whole >>= 1;
  }
  return sign*result;
}

/* 48 % faster than Sun's sun_pow.
   In the range -10 .. 10 with step size 0.01, median error is 0.00151075 and mean error is 1782116753241 [sic !]. */
static int fastmath_dpow (lua_State *L) {
  /* Pushes pt_pow(base, exponent) for the first and second argument.
     The arguments are fetched in separate statements: as operands of one call expression their
     evaluation order is unspecified in C, so which argument gets checked first could vary
     between compilers. */
  lua_Number base, exponent;
  base = agn_checknumber(L, 1);
  exponent = agn_checknumber(L, 2);
  lua_pushnumber(L, pt_pow(base, exponent));
  return 1;
}


/* Truncates x towards zero via a conversion to long long. Arguments whose magnitude exceeds
   4503599627370495.5, and not-a-number, are returned unchanged. */
static double pt_trunc (double x) {
  const double limit = 4503599627370495.5;
  double magnitude = (x < 0) ? -x : x;
  if (magnitude <= limit)
    return (double)(long long)x;
  return x;  /* also taken for NaN, since the comparison above is then false */
}

/* Returns the fractional part of x, i.e. x minus pt_trunc(x). */
static FORCE_INLINE double pt_frac (double x) {
  double whole = pt_trunc(x);
  return x - whole;
}

/* 1 % slower than GCC's trunc.
   In the range -1000 .. 1000 with step size 0.000001, median error is 0 and mean error is 0. */
static int fastmath_dtrunc (lua_State *L) {
  /* Pushes pt_trunc of the first argument. */
  double arg, whole;
  arg = agn_checknumber(L, 1);
  whole = pt_trunc(arg);
  lua_pushnumber(L, whole);
  return 1;
}

/* As fast as Sun's sun_frac.
   In the range -1000 .. 1000 with step size 0.000001, median error is 0 and mean error is 0. */
static int fastmath_dfrac (lua_State *L) {
  /* Pushes pt_frac of the first argument. */
  double arg, fraction;
  arg = agn_checknumber(L, 1);
  fraction = pt_frac(arg);
  lua_pushnumber(L, fraction);
  return 1;
}


/* Computed with Maple (20 % faster than GCC's sin function):
#Sine Cheby Coeffs
Digits := 20:  # twenty !  15 does nor produce better running times
with(numapprox): with(orthopoly, T):
chebyshev(sin(1/2*Pi*x), x=-1..1):
convert(", polynom):
convert(", horner, x): subs(x^2=z, ");

Calculating the cosine this way does not work for some quadrants, so we use PSIN and the ZX Spectrum's
way of preparing the call to sine w/i cosine, in zxchecbysin/cos/tan */

/* Evaluates the Chebyshev-derived polynomial for sin(Pi/2 * x) in Horner form on z = x^2.
   The argument is evaluated exactly once and stored in a local, so any expression may be
   passed; the locals carry a psin_ prefix so that they cannot capture an identifier used in
   the argument. */
#define PSIN(x) ({ \
  const lua_Number psin_x = (x); \
  const lua_Number psin_z = psin_x*psin_x; \
  ((1.5707963267948966192 + \
  (-0.64596409750624625330 + \
  (0.79692626246167037175e-1 + \
  (-0.46817541353186093624e-2 + \
  (0.16044118478693090188e-3 + \
  (-0.35988432338073450945e-5 + \
  (0.56921726335299650366e-7 + \
  (-0.66879976947136401180e-9 + \
  (0.60639571856145550333e-11 - \
   0.42440321259280037967e-13*psin_z)*psin_z)*psin_z)*psin_z)*psin_z)*psin_z)*psin_z)*psin_z)*psin_z)*psin_x); \
})

static int fastmath_zxchebysin (lua_State *L) {  /* 3.2.1 */
  /* Reduces the angle to quarter-turn units, folds it into -1 .. 1 and pushes PSIN of it. */
  lua_Number turns, sine;
  turns = agn_checknumber(L, 1);
  turns = INVPI2*turns;
  turns -= sun_floor(turns + 0.5);  /* strip whole turns */
  turns = 4.0*turns;                /* quarter-turn units */
  /* fold into the interval the polynomial was fitted on */
  if (turns > 1)
    turns = 2 - turns;
  else if (turns < -1)
    turns = -turns - 2;
  sine = PSIN(turns);
  lua_pushnumber(L, sine);
  return 1;
}


static int fastmath_zxchebycos (lua_State *L) {  /* 3.2.1 */
  /* Reduces the angle to quarter-turn units, maps it to the matching sine argument, folds it
     into -1 .. 1 and pushes PSIN of it. */
  lua_Number turns, cosine;
  turns = agn_checknumber(L, 1);
  turns = INVPI2*turns;
  turns -= sun_floor(turns + 0.5);  /* strip whole turns */
  turns = 4.0*turns;                /* quarter-turn units */
  /* this is how the ZX Spectrum 48 ROM assembler routine at offset 20 works: preparation to call
     sine. Both branches of the former conditional (1 - x and -x + 1) yield the same value. */
  turns = 1 - turns;
  /* argument transformed, now fold exactly as in zxchebysin */
  if (turns > 1)
    turns = 2 - turns;
  else if (turns < -1)
    turns = -turns - 2;
  cosine = PSIN(turns);
  lua_pushnumber(L, cosine);
  return 1;
}


static int fastmath_zxchebytan (lua_State *L) {  /* 3.2.1 */
  /* Pushes the quotient of the sine and cosine estimates obtained as in zxchebysin and
     zxchebycos; pushes `undefined` if the cosine estimate is exactly zero. */
  lua_Number turns, shifted, sine, cosine;
  turns = agn_checknumber(L, 1);
  turns = INVPI2*turns;
  turns -= sun_floor(turns + 0.5);  /* strip whole turns */
  turns = 4.0*turns;                /* quarter-turn units */
  /* cosine first: this is how the ZX Spectrum 48 ROM assembler routine at offset 20 prepares the
     call to sine. Both branches of the former conditional (1 - x and -x + 1) yield the same value. */
  shifted = 1 - turns;
  if (shifted > 1)
    shifted = 2 - shifted;
  else if (shifted < -1)
    shifted = -shifted - 2;
  cosine = PSIN(shifted);
  if (cosine == 0) {
    lua_pushundefined(L);
    return 1;
  }
  /* now sine */
  if (turns > 1)
    turns = 2 - turns;
  else if (turns < -1)
    turns = -turns - 2;
  sine = PSIN(turns);
  lua_pushnumber(L, sine/cosine);
  return 1;
}


/* fast `floor` function: rounds down to the nearest integer, 3.2.2 */
static int fastmath_floor (lua_State *L) {
  /* Pushes the argument rounded down to the nearest integer.
     The truncating conversion is only done while the magnitude is at most 4503599627370495.5.
     Converting a larger value or not-a-number to an integer type is undefined behaviour (the
     former (int) cast was already undefined beyond the range of int), and doubles of that
     magnitude are already integral, so they are passed through unchanged. */
  lua_Number x = agn_checknumber(L, 1);
  if ((x < 0 ? -x : x) <= 4503599627370495.5) {
    long long xi = (long long)x;  /* truncates towards zero */
    x = (x < xi) ? (lua_Number)(xi - 1) : (lua_Number)xi;  /* step down for negative non-integers */
  }
  lua_pushnumber(L, x);
  return 1;
}


/* Factorial, see https://en.wikipedia.org/wiki/Stirling%27s_approximation & https://dlmf.nist.gov/5.11#i.p2, 3.5.3,
   UNDOC for the results are not that good. */
static int fastmath_factfast (lua_State *L) {
  /* Pushes the Stirling approximation sqrt(2*Pi*n) * (n/e)^n multiplied by the correction series
     1 + 1/(12n) + 1/(288n^2) - 139/(51840n^3) - 571/(2488320n^4) + 163879/(209018880n^5)
       + 5246819/(75246796800n^6).
     The n^4 term used to be written 571.0/2488320.0*n4, which multiplies by n^4 instead of
     dividing by it; the denominator is now parenthesised like the other terms. */
  lua_Number n, n2, n3, n4, series;
  n = agn_checknumber(L, 1);
  n2 = n*n;
  n3 = n2*n;
  n4 = n2*n2;
  /* APPLE GCC for whatever reason would compute with integers instead of doubles if constants
     had no trailing .0 fraction, 3.7.6 */
  series = 1.0 + 1.0/(12.0*n) + 1.0/(288.0*n2) - 139.0/(51840.0*n3) - 571.0/(2488320.0*n4)
         + 163879.0/(209018880.0*n4*n) + 5246819.0/(75246796800.0*n2*n4);
  lua_pushnumber(L, sqrt(2*PI*n)*sun_pow(n/EXP1, n, 0) * series);
  return 1;
}


/* Registration table of the package: maps each function name visible to scripts to its C
   implementation in this file. `pow2fast` is an alias for `exp2fast` and is kept for backward
   compatibility. The {NULL, NULL} entry terminates the list. */
static const luaL_Reg fastmathlib[] = {
  {"expfast", fastmath_expfast},              /* added on January 21, 2019 */
  {"exp1024", fastmath_exp1024},              /* added on July 02, 2023 */
  {"dexp", fastmath_dexp},                    /* added on September 27, 2022 */
  {"dexp2", fastmath_dexp2},                  /* added on September 27, 2022 */
  {"dexp10", fastmath_dexp10},                /* added on September 30, 2022 */
  {"dpow", fastmath_dpow},                    /* added on September 28, 2022 */
  {"factfast", fastmath_factfast},            /* added on December 05, 2023 */
  {"floor", fastmath_floor},                  /* added on August 02, 2023 */
  {"hypotfast", fastmath_hypotfast},          /* added on January 01, 2019 */
  {"invroot", fastmath_invroot},              /* added on August 01, 2017 */
  {"invsqrt", fastmath_invsqrt},              /* added on January 21, 2019 */
  {"rsqrt", fastmath_rsqrt},                  /* added on September 28, 2022 */
  {"lbfast", fastmath_lbfast},                /* added on June 16, 2018 */
  {"lnfast", fastmath_lnfast},                /* added on January 21, 2019 */
  {"lnjuffa", fastmath_lnjuffa},              /* added on July 02, 2023 */
  {"lnmaple", fastmath_lnmaple},              /* added on July 03, 2023 */
  {"dlog", fastmath_dlog},                    /* added on September 27, 2022 */
  {"dlog2", fastmath_dlog2},                  /* added on September 27, 2022 */
  {"dlog10", fastmath_dlog10},                /* added on September 30, 2022 */
  {"exp2fast", fastmath_exp2fast},            /* added on January 22, 2019 */
  {"pow2fast", fastmath_exp2fast},            /* BACKWARD compatibility; added on January 22, 2019 */
  {"reciprocal", fastmath_reciprocal},        /* added on August 01, 2017 */
  {"cosfast", fastmath_cosfast},              /* added on January 18, 2019 */
  {"sinfast", fastmath_sinfast},              /* added on January 18, 2019 */
  {"sincosfast", fastmath_sincosfast},        /* added on October 17, 2017 */
  {"dasin", fastmath_dasin},                  /* added on September 27, 2022 */
  {"dacos", fastmath_dacos},                  /* added on September 27, 2022 */
  {"datan", fastmath_datan},                  /* added on September 27, 2022 */
  {"dsin", fastmath_dsin},                    /* added on September 27, 2022 */
  {"dcos", fastmath_dcos},                    /* added on September 27, 2022 */
  {"dtan", fastmath_dtan},                    /* added on September 27, 2022 */
  {"sqroot", fastmath_sqroot},                /* added on August 07, 2017 */
  {"sqrtfast", fastmath_sqrtfast},            /* added on October 17, 2017 */
  {"dsqrt", fastmath_dsqrt},                  /* added on September 27, 2022 */
  {"drsqrt", fastmath_drsqrt},                /* added on September 27, 2022 */
  {"sqrtapx", fastmath_sqrtapx},              /* added on May 23, 2019 */
  {"tanfast", fastmath_tanfast},              /* added on January 18, 2019 */
  {"dtrunc", fastmath_dtrunc},                /* added on September 30, 2022 */
  {"dfrac", fastmath_dfrac},                  /* added on September 30, 2022 */
  {"zxchebycos", fastmath_zxchebycos},        /* added on July 31, 2023 */
  {"zxchebysin", fastmath_zxchebysin},        /* added on July 31, 2023 */
  {"zxchebytan", fastmath_zxchebytan},        /* added on July 31, 2023 */
  {NULL, NULL}
};


/*
** Open fastmath library
*/
LUALIB_API int luaopen_fastmath (lua_State *L) {
  /* register all functions listed in fastmathlib under the name AGENA_FASTMATHLIBNAME */
  luaL_register(L, AGENA_FASTMATHLIBNAME, fastmathlib);
  return 1;  /* one result; presumably the package table left on the stack by luaL_register - confirm */
}

