/*
* Fast exp(x) computation (with SSE2 optimizations).
*
* Copyright (c) 2010, Naoaki Okazaki
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the names of the authors nor the names of its contributors
* may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
* OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
To compile this code using gcc:
gcc -o fastexp -O3 -fomit-frame-pointer -msse2 -mfpmath=sse -ffast-math -lm fastexp.c
To compile this code using Microsoft Visual C++ 2008 (or later):
Simply create a console project, and add this file to the project.
*/
#include <math.h>
#include <memory.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <emmintrin.h>
/*
Useful macro definitions for memory alignment:
http://homepage1.nifty.com/herumi/prog/gcc-and-vc.html#MIE_ALIGN
*/
#ifdef _MSC_VER
#define MIE_ALIGN(x) __declspec(align(x))
#else
#define MIE_ALIGN(x) __attribute__((aligned(x)))
#endif
#ifdef _MSC_VER
#include <malloc.h>
#else
#include <stdlib.h>
static inline void *_aligned_malloc(size_t size, size_t alignment)
{
void *p;
int ret = posix_memalign(&p, alignment, size);
return (ret == 0) ? p : 0;
}
#endif
#define CONST_128D(var, val) \
MIE_ALIGN(16) static const double var[2] = {(val), (val)}
#define CONST_128I(var, v1, v2, v3, v4) \
MIE_ALIGN(16) static const int var[4] = {(v1), (v2), (v3), (v4)}
typedef struct {
const char *name;
void (*func)(double *values, int n);
double error_peak;
double error_rms;
clock_t elapsed_time;
double *values;
} performance_t;
typedef union {
double d;
unsigned short s[4];
} ieee754;
static const double MAXLOG = 7.08396418532264106224E2; /* log 2**1022 */
static const double MINLOG = -7.08396418532264106224E2; /* log 2**-1022 */
static const double LOG2E = 1.4426950408889634073599; /* 1/log(2) */
static const double INFINITY = 1.79769313486231570815E308;
static const double C1 = 6.93145751953125E-1;
static const double C2 = 1.42860682030941723212E-6;
void remez5_0_log2_sse(double *values, int num)
{
int i;
CONST_128D(one, 1.);
CONST_128D(log2e, 1.4426950408889634073599);
CONST_128D(maxlog, 7.09782712893383996843e2); // log(2**1024)
CONST_128D(minlog, -7.08396418532264106224e2); // log(2**-1022)
CONST_128D(c1, 6.93145751953125E-1);
CONST_128D(c2, 1.42860682030941723212E-6);
CONST_128D(w5, 1.185268231308989403584147407056378360798378534739e-2);
CONST_128D(w4, 3.87412011356070379615759057344100690905653320886699e-2);
CONST_128D(w3, 0.16775408658617866431779970932853611481292418818223);
CONST_128D(w2, 0.49981934577169208735732248650232562589934399402426);
CONST_128D(w1, 1.00001092396453942157124178508842412412025643386873);
CONST_128D(w0, 0.99999989311082729779536722205742989232069120354073);
const __m128i offset = _mm_setr_epi32(1023, 1023, 0, 0);
for (i = 0;i < num;i += 4) {
__m128i k1, k2;
__m128d p1, p2;
__m128d a1, a2;
__m128d xmm0, xmm1;
__m128d x1, x2;
/* Load four double values. */
xmm0 = _mm_load_pd(maxlog);
xmm1 = _mm_load_pd(minlog);
x1 = _mm_load_pd(values+i);
x2 = _mm_load_pd(values+i+2);
x1 = _mm_min_pd(x1, xmm0);
x2 = _mm_min_pd(x2, xmm0);
x1 = _mm_max_pd(x1, xmm1);
x2 = _mm_max_pd(x2, xmm1);
/* a = x / log2; */
xmm0 = _mm_load_pd(log2e);
xmm1 = _mm_setzero_pd();
a1 = _mm_mul_pd(x1, xmm0);
a2 = _mm_mul_pd(x2, xmm0);
/* k = (int)floor(a); p = (float)k; */
p1 = _mm_cmplt_pd(a1, xmm1);
p2 = _mm_cmplt_pd(a2, xmm1);
xmm0 = _mm_load_pd(one);
p1 = _mm_and_pd(p1, xmm0);
p2 = _mm_and_pd(p2, xmm0);
a1 = _mm_sub_pd(a1, p1);
a2 = _mm_sub_pd(a2, p2);
k1 = _mm_cvttpd_epi32(a1);
k2 = _mm_cvttpd_epi32(a2);
p1 = _mm_cvtepi32_pd(k1);
p2 = _mm_cvtepi32_pd(k2);
/* x -= p * log2; */
xmm0 = _mm_load_pd(c1);
xmm1 = _mm_load_pd(c2);
a1 = _mm_mul_pd(p1, xmm0);
a2 = _mm_mul_pd(p2, xmm0);
x1 = _mm_sub_pd(x1, a1);
x2 = _mm_sub_pd(x2, a2);
a1 = _mm_mul_pd(p1, xmm1);
a2 = _mm_mul_pd(p2, xmm1);
x1 = _mm_sub_pd(x1, a1);
x2 = _mm_sub_pd(x2, a2);
/* Compute e^x using a polynomial approximation. */
xmm0 = _mm_load_pd(w5);
xmm1 = _mm_load_pd(w4);
a1 = _mm_mul_pd(x1, xmm0);
a2 = _mm_mul_pd(x2, xmm0);
a1 = _mm_add_pd(a1, xmm1);
a2 = _mm_add_pd(a2, xmm1);
xmm0 = _mm_load_pd(w3);
xmm1 = _mm_load_pd(w2);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm0);
a2 = _mm_add_pd(a2, xmm0);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm1);
a2 = _mm_add_pd(a2, xmm1);
xmm0 = _mm_load_pd(w1);
xmm1 = _mm_load_pd(w0);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm0);
a2 = _mm_add_pd(a2, xmm0);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm1);
a2 = _mm_add_pd(a2, xmm1);
/* p = 2^k; */
k1 = _mm_add_epi32(k1, offset);
k2 = _mm_add_epi32(k2, offset);
k1 = _mm_slli_epi32(k1, 20);
k2 = _mm_slli_epi32(k2, 20);
k1 = _mm_shuffle_epi32(k1, _MM_SHUFFLE(1,3,0,2));
k2 = _mm_shuffle_epi32(k2, _MM_SHUFFLE(1,3,0,2));
p1 = _mm_castsi128_pd(k1);
p2 = _mm_castsi128_pd(k2);
/* a *= 2^k. */
a1 = _mm_mul_pd(a1, p1);
a2 = _mm_mul_pd(a2, p2);
/* Store the results. */
_mm_store_pd(values+i, a1);
_mm_store_pd(values+i+2, a2);
}
}
void remez7_0_log2_sse(double *values, int num)
{
int i;
CONST_128D(one, 1.);
CONST_128D(log2e, 1.4426950408889634073599);
CONST_128D(maxlog, 7.09782712893383996843e2); // log(2**1024)
CONST_128D(minlog, -7.08396418532264106224e2); // log(2**-1022)
CONST_128D(c1, 6.93145751953125E-1);
CONST_128D(c2, 1.42860682030941723212E-6);
CONST_128D(w7, 2.8177033701883768252768416240498659720165776534637e-4);
CONST_128D(w6, 1.2890480764261415880540047007217296074165060445915e-3);
CONST_128D(w5, 8.3937617380265463210782576208636760245107957044397e-3);
CONST_128D(w4, 4.16466393513638680290956986766301695821268933286258e-2);
CONST_128D(w3, 0.16667025104576204064704670601843171167692234960251);
CONST_128D(w2, 0.49999968588211828322575537372426392301459912262932);
CONST_128D(w1, 1.00000001047169007093329328214028329840435313172085);
CONST_128D(w0, 0.99999999994275232965232540108706952362552348759169);
const __m128i offset = _mm_setr_epi32(1023, 1023, 0, 0);
for (i = 0;i < num;i += 4) {
__m128i k1, k2;
__m128d p1, p2;
__m128d a1, a2;
__m128d xmm0, xmm1;
__m128d x1, x2;
/* Load four double values. */
xmm0 = _mm_load_pd(maxlog);
xmm1 = _mm_load_pd(minlog);
x1 = _mm_load_pd(values+i);
x2 = _mm_load_pd(values+i+2);
x1 = _mm_min_pd(x1, xmm0);
x2 = _mm_min_pd(x2, xmm0);
x1 = _mm_max_pd(x1, xmm1);
x2 = _mm_max_pd(x2, xmm1);
/* a = x / log2; */
xmm0 = _mm_load_pd(log2e);
xmm1 = _mm_setzero_pd();
a1 = _mm_mul_pd(x1, xmm0);
a2 = _mm_mul_pd(x2, xmm0);
/* k = (int)floor(a); p = (float)k; */
p1 = _mm_cmplt_pd(a1, xmm1);
p2 = _mm_cmplt_pd(a2, xmm1);
xmm0 = _mm_load_pd(one);
p1 = _mm_and_pd(p1, xmm0);
p2 = _mm_and_pd(p2, xmm0);
a1 = _mm_sub_pd(a1, p1);
a2 = _mm_sub_pd(a2, p2);
k1 = _mm_cvttpd_epi32(a1);
k2 = _mm_cvttpd_epi32(a2);
p1 = _mm_cvtepi32_pd(k1);
p2 = _mm_cvtepi32_pd(k2);
/* x -= p * log2; */
xmm0 = _mm_load_pd(c1);
xmm1 = _mm_load_pd(c2);
a1 = _mm_mul_pd(p1, xmm0);
a2 = _mm_mul_pd(p2, xmm0);
x1 = _mm_sub_pd(x1, a1);
x2 = _mm_sub_pd(x2, a2);
a1 = _mm_mul_pd(p1, xmm1);
a2 = _mm_mul_pd(p2, xmm1);
x1 = _mm_sub_pd(x1, a1);
x2 = _mm_sub_pd(x2, a2);
/* Compute e^x using a polynomial approximation. */
xmm0 = _mm_load_pd(w7);
xmm1 = _mm_load_pd(w6);
a1 = _mm_mul_pd(x1, xmm0);
a2 = _mm_mul_pd(x2, xmm0);
a1 = _mm_add_pd(a1, xmm1);
a2 = _mm_add_pd(a2, xmm1);
xmm0 = _mm_load_pd(w5);
xmm1 = _mm_load_pd(w4);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm0);
a2 = _mm_add_pd(a2, xmm0);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm1);
a2 = _mm_add_pd(a2, xmm1);
xmm0 = _mm_load_pd(w3);
xmm1 = _mm_load_pd(w2);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm0);
a2 = _mm_add_pd(a2, xmm0);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm1);
a2 = _mm_add_pd(a2, xmm1);
xmm0 = _mm_load_pd(w1);
xmm1 = _mm_load_pd(w0);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm0);
a2 = _mm_add_pd(a2, xmm0);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm1);
a2 = _mm_add_pd(a2, xmm1);
/* p = 2^k; */
k1 = _mm_add_epi32(k1, offset);
k2 = _mm_add_epi32(k2, offset);
k1 = _mm_slli_epi32(k1, 20);
k2 = _mm_slli_epi32(k2, 20);
k1 = _mm_shuffle_epi32(k1, _MM_SHUFFLE(1,3,0,2));
k2 = _mm_shuffle_epi32(k2, _MM_SHUFFLE(1,3,0,2));
p1 = _mm_castsi128_pd(k1);
p2 = _mm_castsi128_pd(k2);
/* a *= 2^k. */
a1 = _mm_mul_pd(a1, p1);
a2 = _mm_mul_pd(a2, p2);
/* Store the results. */
_mm_store_pd(values+i, a1);
_mm_store_pd(values+i+2, a2);
}
}
void remez9_0_log2_sse(double *values, int num)
{
int i;
CONST_128D(one, 1.);
CONST_128D(log2e, 1.4426950408889634073599);
CONST_128D(maxlog, 7.09782712893383996843e2); // log(2**1024)
CONST_128D(minlog, -7.08396418532264106224e2); // log(2**-1022)
CONST_128D(c1, 6.93145751953125E-1);
CONST_128D(c2, 1.42860682030941723212E-6);
CONST_128D(w9, 3.9099787920346160288874633639268318097077213911751e-6);
CONST_128D(w8, 2.299608440919942766555719515783308016700833740918e-5);
CONST_128D(w7, 1.99930498409474044486498978862963995247838069436646e-4);
CONST_128D(w6, 1.38812674551586429265054343505879910146775323730237e-3);
CONST_128D(w5, 8.3335688409829575034112982839739473866857586300664e-3);
CONST_128D(w4, 4.1666622504201078708502686068113075402683415962893e-2);
CONST_128D(w3, 0.166666671414320541875332123507829990378055646330574);
CONST_128D(w2, 0.49999999974109940909767965915362308135415179642286);
CONST_128D(w1, 1.0000000000054730504284163017295863259125942049362);
CONST_128D(w0, 0.99999999999998091336479463057053516986466888462081);
const __m128i offset = _mm_setr_epi32(1023, 1023, 0, 0);
for (i = 0;i < num;i += 4) {
__m128i k1, k2;
__m128d p1, p2;
__m128d a1, a2;
__m128d xmm0, xmm1;
__m128d x1, x2;
/* Load four double values. */
xmm0 = _mm_load_pd(maxlog);
xmm1 = _mm_load_pd(minlog);
x1 = _mm_load_pd(values+i);
x2 = _mm_load_pd(values+i+2);
x1 = _mm_min_pd(x1, xmm0);
x2 = _mm_min_pd(x2, xmm0);
x1 = _mm_max_pd(x1, xmm1);
x2 = _mm_max_pd(x2, xmm1);
/* a = x / log2; */
xmm0 = _mm_load_pd(log2e);
xmm1 = _mm_setzero_pd();
a1 = _mm_mul_pd(x1, xmm0);
a2 = _mm_mul_pd(x2, xmm0);
/* k = (int)floor(a); p = (float)k; */
p1 = _mm_cmplt_pd(a1, xmm1);
p2 = _mm_cmplt_pd(a2, xmm1);
xmm0 = _mm_load_pd(one);
p1 = _mm_and_pd(p1, xmm0);
p2 = _mm_and_pd(p2, xmm0);
a1 = _mm_sub_pd(a1, p1);
a2 = _mm_sub_pd(a2, p2);
k1 = _mm_cvttpd_epi32(a1);
k2 = _mm_cvttpd_epi32(a2);
p1 = _mm_cvtepi32_pd(k1);
p2 = _mm_cvtepi32_pd(k2);
/* x -= p * log2; */
xmm0 = _mm_load_pd(c1);
xmm1 = _mm_load_pd(c2);
a1 = _mm_mul_pd(p1, xmm0);
a2 = _mm_mul_pd(p2, xmm0);
x1 = _mm_sub_pd(x1, a1);
x2 = _mm_sub_pd(x2, a2);
a1 = _mm_mul_pd(p1, xmm1);
a2 = _mm_mul_pd(p2, xmm1);
x1 = _mm_sub_pd(x1, a1);
x2 = _mm_sub_pd(x2, a2);
/* Compute e^x using a polynomial approximation. */
xmm0 = _mm_load_pd(w9);
xmm1 = _mm_load_pd(w8);
a1 = _mm_mul_pd(x1, xmm0);
a2 = _mm_mul_pd(x2, xmm0);
a1 = _mm_add_pd(a1, xmm1);
a2 = _mm_add_pd(a2, xmm1);
xmm0 = _mm_load_pd(w7);
xmm1 = _mm_load_pd(w6);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm0);
a2 = _mm_add_pd(a2, xmm0);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm1);
a2 = _mm_add_pd(a2, xmm1);
xmm0 = _mm_load_pd(w5);
xmm1 = _mm_load_pd(w4);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm0);
a2 = _mm_add_pd(a2, xmm0);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm1);
a2 = _mm_add_pd(a2, xmm1);
xmm0 = _mm_load_pd(w3);
xmm1 = _mm_load_pd(w2);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm0);
a2 = _mm_add_pd(a2, xmm0);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm1);
a2 = _mm_add_pd(a2, xmm1);
xmm0 = _mm_load_pd(w1);
xmm1 = _mm_load_pd(w0);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm0);
a2 = _mm_add_pd(a2, xmm0);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm1);
a2 = _mm_add_pd(a2, xmm1);
/* p = 2^k; */
k1 = _mm_add_epi32(k1, offset);
k2 = _mm_add_epi32(k2, offset);
k1 = _mm_slli_epi32(k1, 20);
k2 = _mm_slli_epi32(k2, 20);
k1 = _mm_shuffle_epi32(k1, _MM_SHUFFLE(1,3,0,2));
k2 = _mm_shuffle_epi32(k2, _MM_SHUFFLE(1,3,0,2));
p1 = _mm_castsi128_pd(k1);
p2 = _mm_castsi128_pd(k2);
/* a *= 2^k. */
a1 = _mm_mul_pd(a1, p1);
a2 = _mm_mul_pd(a2, p2);
/* Store the results. */
_mm_store_pd(values+i, a1);
_mm_store_pd(values+i+2, a2);
}
}
void remez11_0_log2_sse(double *values, int num)
{
int i;
CONST_128D(one, 1.);
CONST_128D(log2e, 1.4426950408889634073599);
CONST_128D(maxlog, 7.09782712893383996843e2); // log(2**1024)
CONST_128D(minlog, -7.08396418532264106224e2); // log(2**-1022)
CONST_128D(c1, 6.93145751953125E-1);
CONST_128D(c2, 1.42860682030941723212E-6);
CONST_128D(w11, 3.5524625185478232665958141148891055719216674475023e-8);
CONST_128D(w10, 2.5535368519306500343384723775435166753084614063349e-7);
CONST_128D(w9, 2.77750562801295315877005242757916081614772210463065e-6);
CONST_128D(w8, 2.47868893393199945541176652007657202642495832996107e-5);
CONST_128D(w7, 1.98419213985637881240770890090795533564573406893163e-4);
CONST_128D(w6, 1.3888869684178659239014256260881685824525255547326e-3);
CONST_128D(w5, 8.3333337052009872221152811550156335074160546333973e-3);
CONST_128D(w4, 4.1666666621080810610346717440523105184720007971655e-2);
CONST_128D(w3, 0.166666666669960803484477734308515404418108830469798);
CONST_128D(w2, 0.499999999999877094481580370323249951329122224389189);
CONST_128D(w1, 1.0000000000000017952745258419615282194236357388884);
CONST_128D(w0, 0.99999999999999999566016490920259318691496540598896);
const __m128i offset = _mm_setr_epi32(1023, 1023, 0, 0);
for (i = 0;i < num;i += 4) {
__m128i k1, k2;
__m128d p1, p2;
__m128d a1, a2;
__m128d xmm0, xmm1;
__m128d x1, x2;
/* Load four double values. */
xmm0 = _mm_load_pd(maxlog);
xmm1 = _mm_load_pd(minlog);
x1 = _mm_load_pd(values+i);
x2 = _mm_load_pd(values+i+2);
x1 = _mm_min_pd(x1, xmm0);
x2 = _mm_min_pd(x2, xmm0);
x1 = _mm_max_pd(x1, xmm1);
x2 = _mm_max_pd(x2, xmm1);
/* a = x / log2; */
xmm0 = _mm_load_pd(log2e);
xmm1 = _mm_setzero_pd();
a1 = _mm_mul_pd(x1, xmm0);
a2 = _mm_mul_pd(x2, xmm0);
/* k = (int)floor(a); p = (float)k; */
p1 = _mm_cmplt_pd(a1, xmm1);
p2 = _mm_cmplt_pd(a2, xmm1);
xmm0 = _mm_load_pd(one);
p1 = _mm_and_pd(p1, xmm0);
p2 = _mm_and_pd(p2, xmm0);
a1 = _mm_sub_pd(a1, p1);
a2 = _mm_sub_pd(a2, p2);
k1 = _mm_cvttpd_epi32(a1);
k2 = _mm_cvttpd_epi32(a2);
p1 = _mm_cvtepi32_pd(k1);
p2 = _mm_cvtepi32_pd(k2);
/* x -= p * log2; */
xmm0 = _mm_load_pd(c1);
xmm1 = _mm_load_pd(c2);
a1 = _mm_mul_pd(p1, xmm0);
a2 = _mm_mul_pd(p2, xmm0);
x1 = _mm_sub_pd(x1, a1);
x2 = _mm_sub_pd(x2, a2);
a1 = _mm_mul_pd(p1, xmm1);
a2 = _mm_mul_pd(p2, xmm1);
x1 = _mm_sub_pd(x1, a1);
x2 = _mm_sub_pd(x2, a2);
/* Compute e^x using a polynomial approximation. */
xmm0 = _mm_load_pd(w11);
xmm1 = _mm_load_pd(w10);
a1 = _mm_mul_pd(x1, xmm0);
a2 = _mm_mul_pd(x2, xmm0);
a1 = _mm_add_pd(a1, xmm1);
a2 = _mm_add_pd(a2, xmm1);
xmm0 = _mm_load_pd(w9);
xmm1 = _mm_load_pd(w8);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm0);
a2 = _mm_add_pd(a2, xmm0);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm1);
a2 = _mm_add_pd(a2, xmm1);
xmm0 = _mm_load_pd(w7);
xmm1 = _mm_load_pd(w6);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm0);
a2 = _mm_add_pd(a2, xmm0);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm1);
a2 = _mm_add_pd(a2, xmm1);
xmm0 = _mm_load_pd(w5);
xmm1 = _mm_load_pd(w4);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm0);
a2 = _mm_add_pd(a2, xmm0);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm1);
a2 = _mm_add_pd(a2, xmm1);
xmm0 = _mm_load_pd(w3);
xmm1 = _mm_load_pd(w2);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm0);
a2 = _mm_add_pd(a2, xmm0);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm1);
a2 = _mm_add_pd(a2, xmm1);
xmm0 = _mm_load_pd(w1);
xmm1 = _mm_load_pd(w0);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm0);
a2 = _mm_add_pd(a2, xmm0);
a1 = _mm_mul_pd(a1, x1);
a2 = _mm_mul_pd(a2, x2);
a1 = _mm_add_pd(a1, xmm1);
a2 = _mm_add_pd(a2, xmm1);
/* p = 2^k; */
k1 = _mm_add_epi32(k1, offset);
k2 = _mm_add_epi32(k2, offset);
k1 = _mm_slli_epi32(k1, 20);
k2 = _mm_slli_epi32(k2, 20);
k1 = _mm_shuffle_epi32(k1, _MM_SHUFFLE(1,3,0,2));
k2 = _mm_shuffle_epi32(k2, _MM_SHUFFLE(1,3,0,2));
p1 = _mm_castsi128_pd(k1);
p2 = _mm_castsi128_pd(k2);
/* a *= 2^k. */
a1 = _mm_mul_pd(a1, p1);
a2 = _mm_mul_pd(a2, p2);
/* Store the results. */
_mm_store_pd(values+i, a1);
_mm_store_pd(values+i+2, a2);
}
}
void remez5_0_log2(double *values, int num)
{
int i;
for (i = 0;i < num;++i) {
int n;
double a, px, x = values[i];
ieee754 u;
/* n = floor(x / log 2) */
a = LOG2E * x;
a -= (a < 0);
n = (int)a;
/* x -= n * log2 */
px = (double)n;
x -= px * C1;
x -= px * C2;
/* Compute e^x using a polynomial approximation. */
a = 1.185268231308989403584147407056378360798378534739e-2;
a *= x;
a += 3.87412011356070379615759057344100690905653320886699e-2;
a *= x;
a += 0.16775408658617866431779970932853611481292418818223;
a *= x;
a += 0.49981934577169208735732248650232562589934399402426;
a *= x;
a += 1.00001092396453942157124178508842412412025643386873;
a *= x;
a += 0.99999989311082729779536722205742989232069120354073;
/* Build 2^n in double. */
u.d = 0;
n += 1023;
u.s[3] = (unsigned short)((n << 4) & 0x7FF0);
values[i] = a * u.d;
}
}
void remez7_0_log2(double *values, int num)
{
int i;
for (i = 0;i < num;++i) {
int n;
double a, px, x = values[i];
ieee754 u;
/* n = floor(x / log 2) */
a = LOG2E * x;
a -= (a < 0);
n = (int)a;
/* x -= n * log2 */
px = (double)n;
x -= px * C1;
x -= px * C2;
/* Compute e^x using a polynomial approximation. */
a = 2.8177033701883768252768416240498659720165776534637e-4;
a *= x;
a += 1.2890480764261415880540047007217296074165060445915e-3;
a *= x;
a += 8.3937617380265463210782576208636760245107957044397e-3;
a *= x;
a += 4.16466393513638680290956986766301695821268933286258e-2;
a *= x;
a += 0.16667025104576204064704670601843171167692234960251;
a *= x;
a += 0.49999968588211828322575537372426392301459912262932;
a *= x;
a += 1.00000001047169007093329328214028329840435313172085;
a *= x;
a += 0.99999999994275232965232540108706952362552348759169;
/* Build 2^n in double. */
u.d = 0;
n += 1023;
u.s[3] = (unsigned short)((n << 4) & 0x7FF0);
values[i] = a * u.d;
}
}
void remez9_0_log2(double *values, int num)
{
int i;
for (i = 0;i < num;++i) {
int n;
double a, px, x = values[i];
ieee754 u;
/* n = floor(x / log 2) */
a = LOG2E * x;
a -= (a < 0);
n = (int)a;
/* x -= n * log2 */
px = (double)n;
x -= px * C1;
x -= px * C2;
/* Compute e^x using a polynomial approximation. */
a = 3.9099787920346160288874633639268318097077213911751e-6;
a *= x;
a += 2.299608440919942766555719515783308016700833740918e-5;
a *= x;
a += 1.99930498409474044486498978862963995247838069436646e-4;
a *= x;
a += 1.38812674551586429265054343505879910146775323730237e-3;
a *= x;
a += 8.3335688409829575034112982839739473866857586300664e-3;
a *= x;
a += 4.1666622504201078708502686068113075402683415962893e-2;
a *= x;
a += 0.166666671414320541875332123507829990378055646330574;
a *= x;
a += 0.49999999974109940909767965915362308135415179642286;
a *= x;
a += 1.0000000000054730504284163017295863259125942049362;
a *= x;
a += 0.99999999999998091336479463057053516986466888462081;
/* Build 2^n in double. */
u.d = 0;
n += 1023;
u.s[3] = (unsigned short)((n << 4) & 0x7FF0);
values[i] = a * u.d;
}
}
void remez11_0_log2(double *values, int num)
{
int i;
for (i = 0;i < num;++i) {
int n;
double a, px, x = values[i];
ieee754 u;
/* n = floor(x / log 2) */
a = LOG2E * x;
a -= (a < 0);
n = (int)a;
/* x -= n * log2 */
px = (double)n;
x -= px * C1;
x -= px * C2;
/* Compute e^x using a polynomial approximation. */
a = 3.5524625185478232665958141148891055719216674475023e-8;
a *= x;
a += 2.5535368519306500343384723775435166753084614063349e-7;
a *= x;
a += 2.77750562801295315877005242757916081614772210463065e-6;
a *= x;
a += 2.47868893393199945541176652007657202642495832996107e-5;
a *= x;
a += 1.98419213985637881240770890090795533564573406893163e-4;
a *= x;
a += 1.3888869684178659239014256260881685824525255547326e-3;
a *= x;
a += 8.3333337052009872221152811550156335074160546333973e-3;
a *= x;
a += 4.1666666621080810610346717440523105184720007971655e-2;
a *= x;
a += 0.166666666669960803484477734308515404418108830469798;
a *= x;
a += 0.499999999999877094481580370323249951329122224389189;
a *= x;
a += 1.0000000000000017952745258419615282194236357388884;
a *= x;
a += 0.99999999999999999566016490920259318691496540598896;
/* Build 2^n in double. */
u.d = 0;
n += 1023;
u.s[3] = (unsigned short)((n << 4) & 0x7FF0);
values[i] = a * u.d;
}
}
void remez13_0_log2(double *values, int num)
{
int i;
for (i = 0;i < num;++i) {
int n;
double a, px, x = values[i];
ieee754 u;
/* n = floor(x / log 2) */
a = LOG2E * x;
a -= (a < 0);
n = (int)a;
/* x -= n * log2 */
px = (double)n;
x -= px * C1;
x -= px * C2;
/* Compute e^x using a polynomial approximation. */
a = 2.2762932529150460619497906285755631573256951680928e-10;
a *= x;
a += 1.93367224471636363463651078554697568501742984768854e-9;
a *= x;
a += 2.52543927629810215309605055241333435158089916072836e-8;
a *= x;
a += 2.7540144801860636516400552945824228138166510066936e-7;
a *= x;
a += 2.75583147053220552447847947870361102182338458956017e-6;
a *= x;
a += 2.4801546952196268386551625341270924245448949820144e-5;
a *= x;
a += 1.98412709907914555147826749325174275063119441397255e-4;
a *= x;
a += 1.38888888661019235625906849288500655817315802824057e-3;
a *= x;
a += 8.333333333639598748175600716777066054934952111002e-3;
a *= x;
a += 4.16666666666400203177888506088903193769823751590758e-2;
a *= x;
a += 0.16666666666666805461760233769080472413024408831279;
a *= x;
a += 0.499999999999999962289068852705038969538821467770838;
a *= x;
a += 1.0000000000000000004034789178104888993769939051368;
a *= x;
a += 0.99999999999999999999928421898456045238677548461656;
/* Build 2^n in double. */
u.d = 0;
n += 1023;
u.s[3] = (unsigned short)((n << 4) & 0x7FF0);
values[i] = a * u.d;
}
}
void vecexp_remez5_05_05(double *values, int num)
{
int i;
for (i = 0;i < num;++i) {
int n;
double a, px, x = values[i];
ieee754 u;
/* n = round(x / log 2) */
a = LOG2E * x + 0.5;
n = (int)a;
n -= (a < 0);
/* x -= n * log2 */
px = (double)n;
x -= px * C1;
x -= px * C2;
/* Compute e^x using a polynomial approximation. */
a = 8.4330373844473099813810362019505406631926085091323e-3;
a *= x;
a += 4.21902068347366333044988703875445200415500811946999e-2;
a *= x;
a += 0.166651841763284853673636070741102751158699113784309;
a *= x;
a += 0.499950835134960574061545109283769441275186262483568;
a *= x;
a += 1.00000058571014555295260592037508552163073364874505;
a *= x;
a += 1.00000068337676053288862820129773498043487842807125;
/* Build 2^n in double. */
u.d = 0;
n += 1023;
u.s[3] = (unsigned short)((n << 4) & 0x7FF0);
values[i] = a * u.d;
}
}
void vecexp_remez7_05_05(double *values, int num)
{
int i;
for (i = 0;i < num;++i) {
int n;
double a, px, x = values[i];
ieee754 u;
/* n = round(x / log 2) */
a = LOG2E * x + 0.5;
n = (int)a;
n -= (a < 0);
/* x -= n * log2 */
px = (double)n;
x -= px * C1;
x -= px * C2;
/* Compute e^x using a polynomial approximation. */
a = 2.00141610818912228248905083859257590733113260526108e-4;
a *= x;
a += 1.40133811518724818098307842775997566282188412967947e-3;
a *= x;
a += 8.3329654693594055317268781464679686290363549855757e-3;
a *= x;
a += 4.166471896964041208094102436547981534517053537243e-2;
a *= x;
a += 0.166666696439690960142075219571022911788612657807629;
a *= x;
a += 0.50000009744621522508878089211950835930852534338768;
a *= x;
a += 0.99999999932306797486003312890413784132553482368976;
a *= x;
a += 0.99999999923843009897272522306505037302103095520883;
/* Build 2^n in double. */
u.d = 0;
n += 1023;
u.s[3] = (unsigned short)((n << 4) & 0x7FF0);
values[i] = a * u.d;
}
}
void vecexp_remez9_05_05(double *values, int num)
{
int i;
for (i = 0;i < num;++i) {
int n;
double a, px, x = values[i];
ieee754 u;
/* n = round(x / log 2) */
a = LOG2E * x + 0.5;
n = (int)a;
n -= (a < 0);
/* x -= n * log2 */
px = (double)n;
x -= px * C1;
x -= px * C2;
/* Compute e^x using a polynomial approximation. */
a = 2.7745788375840179681283717651801389074131600858024e-6;
a *= x;
a += 2.4974359052778718615679108099578563503364071042705e-5;
a *= x;
a += 1.98407490538622509507316519103953447383315553069056e-4;
a *= x;
a += 1.38885105592078787329129443877314802341416378733678e-3;
a *= x;
a += 8.3333339723641809789104572730499312924226776107297e-3;
a *= x;
a += 4.16666700463431739324947223978815539635978032483308e-2;
a *= x;
a += 0.166666666634013689476902841482489499264356163708049;
a *= x;
a += 0.49999999989435323455294486871496414870608658630883;
a *= x;
a += 1.00000000000048028927516239905073191568245953555709;
a *= x;
a += 1.00000000000052833535680371242873466617726870356609;
/* Build 2^n in double. */
u.d = 0;
n += 1023;
u.s[3] = (unsigned short)((n << 4) & 0x7FF0);
values[i] = a * u.d;
}
}
void vecexp_remez11_05_05(double *values, int num)
{
int i;
for (i = 0;i < num;++i) {
int n;
double a, px, x = values[i];
ieee754 u;
/* n = round(x / log 2) */
a = LOG2E * x + 0.5;
n = (int)a;
n -= (a < 0);
/* x -= n * log2 */
px = (double)n;
x -= px * C1;
x -= px * C2;
/* Compute e^x using a polynomial approximation. */
a = 2.51929868958558990100711431834993496570205380722036e-8;
a *= x;
a += 2.7714302972476470857291821430459163185492029688565e-7;
a *= x;
a += 2.7556840833993121572460386718599369519143729132621e-6;
a *= x;
a += 2.48011454027599465130934662141475239085795284566129e-5;
a *= x;
a += 1.98412706284313544001695569595213030418978110809054e-4;
a *= x;
a += 1.38888894619673011407756064437561180652889917616113e-3;
a *= x;
a += 8.3333333326960625302778843136952722007056714994369e-3;
a *= x;
a += 4.1666666663307925829807454939715320634892173110281e-2;
a *= x;
a += 0.16666666666668912253462202236231047014798117425631;
a *= x;
a += 0.50000000000007198509320027789925087826877822951327;
a *= x;
a += 0.99999999999999976925988096235016345824070346859229;
a *= x;
a += 0.9999999999999997500224010478732616185808429624127;
/* Build 2^n in double. */
u.d = 0;
n += 1023;
u.s[3] = (unsigned short)((n << 4) & 0x7FF0);
values[i] = a * u.d;
}
}
void vecexp_remez13_05_05(double *values, int num)
{
int i;
for (i = 0;i < num;++i) {
int n;
double a, px, x = values[i];
ieee754 u;
/* n = round(x / log 2) */
a = LOG2E * x + 0.5;
n = (int)a;
n -= (a < 0);
/* x -= n * log2 */
px = (double)n;
x -= px * C1;
x -= px * C2;
/* Compute e^x using a polynomial approximation. */
a = 1.61356848991757396195626449324190427883666354504187e-10;
a *= x;
a += 2.09773502429720720042832865218235470540902290986604e-9;
a *= x;
a += 2.50517997973487340520345973228530569065189960125308e-8;
a *= x;
a += 2.7556973177905466877853160305636358948188615327725e-7;
a *= x;
a += 2.7557319860870868162733628841564031959051602041504e-6;
a *= x;
a += 2.48015878916569323181318984106104816451563207545696e-5;
a *= x;
a += 1.98412698405602139306149764767811806786694849547848e-4;
a *= x;
a += 1.38888888883724638191056667131613472090590184962952e-3;
a *= x;
a += 8.333333333333743243093579329746662890641071879753e-3;
a *= x;
a += 4.16666666666688187524214985734081300874557496783939e-2;
a *= x;
a += 0.16666666666666665609766712753641211515446088211247;
a *= x;
a += 0.49999999999999996637019704330727593814705189927683;
a *= x;
a += 1.00000000000000000008007233620462705038544342452684;
a *= x;
a += 1.0000000000000000000857966908786376708355989802095;
/* Build 2^n in double. */
u.d = 0;
n += 1023;
u.s[3] = (unsigned short)((n << 4) & 0x7FF0);
values[i] = a * u.d;
}
}
void vecexp_taylor5(double *values, int num)
{
int i;
for (i = 0;i < num;++i) {
int n;
double a, px, x = values[i];
ieee754 u;
/* n = round(x / log 2) */
a = LOG2E * x + 0.5;
n = (int)a;
n -= (a < 0);
/* x -= n * log2 */
px = (double)n;
x -= px * C1;
x -= px * C2;
/* Compute e^x using a polynomial approximation. */
a = 1. / 120.;
a *= x;
a += 4.1666666666666666666666666666666666666666666666666e-2;
a *= x;
a += 0.166666666666666666666666666666666666666666666666665;
a *= x;
a += 0.5;
a *= x;
a += 1.0;
a *= x;
a += 1.0;
/* Build 2^n in double. */
u.d = 0;
n += 1023;
u.s[3] = (unsigned short)((n << 4) & 0x7FF0);
values[i] = a * u.d;
}
}
void vecexp_taylor7(double *values, int num)
{
int i;
for (i = 0;i < num;++i) {
int n;
double a, px, x = values[i];
ieee754 u;
/* n = round(x / log 2) */
a = LOG2E * x + 0.5;
n = (int)a;
n -= (a < 0);
/* x -= n * log2 */
px = (double)n;
x -= px * C1;
x -= px * C2;
/* Compute e^x using a polynomial approximation. */
a = 1. / 5040.;
a *= x;
a += 1.38888888888888888888888888888888888888888888888889e-3;
a *= x;
a += 8.3333333333333333333333333333333333333333333333333e-3;
a *= x;
a += 4.1666666666666666666666666666666666666666666666666e-2;
a *= x;
a += 0.166666666666666666666666666666666666666666666666665;
a *= x;
a += 0.5;
a *= x;
a += 1.0;
a *= x;
a += 1.0;
/* Build 2^n in double. */
u.d = 0;
n += 1023;
u.s[3] = (unsigned short)((n << 4) & 0x7FF0);
values[i] = a * u.d;
}
}
void vecexp_taylor9(double *values, int num)
{
int i;
for (i = 0;i < num;++i) {
int n;
double a, px, x = values[i];
ieee754 u;
/* n = round(x / log 2) */
a = LOG2E * x + 0.5;
n = (int)a;
n -= (a < 0);
/* x -= n * log2 */
px = (double)n;
x -= px * C1;
x -= px * C2;
/* Compute e^x using a polynomial approximation. */
a = 1. / 362880.;
a *= x;
a += 2.4801587301587301587301587301587301587301587301587e-5;
a *= x;
a += 1.98412698412698412698412698412698412698412698412698e-4;
a *= x;
a += 1.38888888888888888888888888888888888888888888888889e-3;
a *= x;
a += 8.3333333333333333333333333333333333333333333333333e-3;
a *= x;
a += 4.1666666666666666666666666666666666666666666666666e-2;
a *= x;
a += 0.166666666666666666666666666666666666666666666666665;
a *= x;
a += 0.5;
a *= x;
a += 1.0;
a *= x;
a += 1.0;
/* Build 2^n in double. */
u.d = 0;
n += 1023;
u.s[3] = (unsigned short)((n << 4) & 0x7FF0);
values[i] = a * u.d;
}
}
void vecexp_taylor11(double *values, int num)
{
int i;
for (i = 0;i < num;++i) {
int n;
double a, px, x = values[i];
ieee754 u;
/* n = round(x / log 2) */
a = LOG2E * x + 0.5;
n = (int)a;
n -= (a < 0);
/* x -= n * log2 */
px = (double)n;
x -= px * C1;
x -= px * C2;
/* Compute e^x using a polynomial approximation. */
a = 1. / 39916800.;
a *= x;
a += 2.7557319223985890652557319223985890652557319223986e-7;
a *= x;
a += 2.75573192239858906525573192239858906525573192239859e-6;
a *= x;
a += 2.4801587301587301587301587301587301587301587301587e-5;
a *= x;
a += 1.98412698412698412698412698412698412698412698412698e-4;
a *= x;
a += 1.38888888888888888888888888888888888888888888888889e-3;
a *= x;
a += 8.3333333333333333333333333333333333333333333333333e-3;
a *= x;
a += 4.1666666666666666666666666666666666666666666666666e-2;
a *= x;
a += 0.166666666666666666666666666666666666666666666666665;
a *= x;
a += 0.5;
a *= x;
a += 1.0;
a *= x;
a += 1.0;
/* Build 2^n in double. */
u.d = 0;
n += 1023;
u.s[3] = (unsigned short)((n << 4) & 0x7FF0);
values[i] = a * u.d;
}
}
void vecexp_taylor13(double *values, int num)
{
int i;
for (i = 0;i < num;++i) {
int n;
double a, px, x = values[i];
ieee754 u;
/* n = round(x / log 2) */
a = LOG2E * x + 0.5;
n = (int)a;
n -= (a < 0);
/* x -= n * log2 */
px = (double)n;
x -= px * C1;
x -= px * C2;
/* Compute e^x using a polynomial approximation. */
a = 1. / 6227020800;
a *= x;
a += 2.08767569878680989792100903212014323125434236545349e-9;
a *= x;
a += 2.50521083854417187750521083854417187750521083854419e-8;
a *= x;
a += 2.7557319223985890652557319223985890652557319223986e-7;
a *= x;
a += 2.75573192239858906525573192239858906525573192239859e-6;
a *= x;
a += 2.4801587301587301587301587301587301587301587301587e-5;
a *= x;
a += 1.98412698412698412698412698412698412698412698412698e-4;
a *= x;
a += 1.38888888888888888888888888888888888888888888888889e-3;
a *= x;
a += 8.3333333333333333333333333333333333333333333333333e-3;
a *= x;
a += 4.1666666666666666666666666666666666666666666666666e-2;
a *= x;
a += 0.166666666666666666666666666666666666666666666666665;
a *= x;
a += 0.5;
a *= x;
a += 1.0;
a *= x;
a += 1.0;
/* Build 2^n in double. */
u.d = 0;
n += 1023;
u.s[3] = (unsigned short)((n << 4) & 0x7FF0);
values[i] = a * u.d;
}
}
void vecexp_cephes(double *values, int num)
{
int i;
for (i = 0;i < num;++i) {
int n;
double x = values[i];
double a, xx, px, qx;
ieee754 u;
/* n = round(x / log 2) */
a = LOG2E * x + 0.5;
n = (int)a;
n -= (a < 0);
/* x -= n * log2 */
px = (double)n;
x -= px * C1;
x -= px * C2;
xx = x * x;
/* px = x * P(x**2). */
px = 1.26177193074810590878E-4;
px *= xx;
px += 3.02994407707441961300E-2;
px *= xx;
px += 9.99999999999999999910E-1;
px *= x;
/* Evaluate Q(x**2). */
qx = 3.00198505138664455042E-6;
qx *= xx;
qx += 2.52448340349684104192E-3;
qx *= xx;
qx += 2.27265548208155028766E-1;
qx *= xx;
qx += 2.00000000000000000009E0;
/* e**x = 1 + 2x P(x**2)/( Q(x**2) - P(x**2) ) */
x = px / (qx - px);
x = 1.0 + 2.0 * x;
/* Build 2^n in double. */
u.d = 0;
n += 1023;
u.s[3] = (unsigned short)((n << 4) & 0x7FF0);
values[i] = x * u.d;
}
}
void vecexp_libc(double *values, int n)
{
int i;
for (i = 0;i < n;++i) {
values[i] = exp(values[i]);
}
}
double *read_source(FILE *fp, int *num)
{
int n = 0;
char line[1024];
double *values = NULL;
while (fgets(line, 1023, fp) != NULL) {
values = realloc(values, sizeof(double) * (n+1));
values[n++] = atof(line);
}
*num = n;
return values;
}
void measure(performance_t *perf, double *values, int n)
{
int i;
performance_t *p;
for (p = perf;p->func != NULL;++p) {
p->values = (double*)_aligned_malloc(sizeof(double) * n, 16);
for (i = 0;i < n;++i) {
p->values[i] = values[i];
}
}
for (p = perf;p->func != NULL;++p) {
p->elapsed_time = clock();
p->func(p->values, n);
p->elapsed_time = clock() - p->elapsed_time;
}
for (p = perf;p->func != NULL;++p) {
for (i = 0;i < n;++i) {
double ex = perf[0].values[i];
double exf = p->values[i];
double err = fabs(exf - ex) / ex;
if (p->error_peak < err) {
p->error_peak = err;
}
p->error_rms += (err * err);
}
p->error_rms /= n;
p->error_rms = sqrt(p->error_rms);
}
}
int main(int argc, char *argv[])
{
int n;
double *values = NULL;
performance_t *p = NULL;
performance_t perf[] = {
{"libc", vecexp_libc, 0., 0., 0, NULL},
{"Cephes", vecexp_cephes, 0., 0., 0, NULL},
{"Taylor 5th", vecexp_taylor5, 0., 0., 0, NULL},
{"Taylor 7th", vecexp_taylor7, 0., 0., 0, NULL},
{"Taylor 9th", vecexp_taylor9, 0., 0., 0, NULL},
{"Taylor 11th", vecexp_taylor11, 0., 0., 0, NULL},
{"Taylor 13th", vecexp_taylor13, 0., 0., 0, NULL},
{"Remez 5th [-0.5,+0.5]", vecexp_remez5_05_05, 0., 0., 0, NULL},
{"Remez 7th [-0.5,+0.5]", vecexp_remez7_05_05, 0., 0., 0, NULL},
{"Remez 9th [-0.5,+0.5]", vecexp_remez9_05_05, 0., 0., 0, NULL},
{"Remez 11th [-0.5,+0.5]", vecexp_remez11_05_05, 0., 0., 0, NULL},
{"Remez 13th [-0.5,+0.5]", vecexp_remez13_05_05, 0., 0., 0, NULL},
{"Remez 5th [0,log2]", remez5_0_log2, 0., 0., 0, NULL},
{"Remez 7th [0,log2]", remez7_0_log2, 0., 0., 0, NULL},
{"Remez 9th [0,log2]", remez9_0_log2, 0., 0., 0, NULL},
{"Remez 11th [0,log2]", remez11_0_log2, 0., 0., 0, NULL},
{"Remez 13th [0,log2]", remez13_0_log2, 0., 0., 0, NULL},
{"Remez 5th [0,log2] SSE", remez5_0_log2_sse, 0., 0., 0, NULL},
{"Remez 7th [0,log2] SSE", remez7_0_log2_sse, 0., 0., 0, NULL},
{"Remez 9th [0,log2] SSE", remez9_0_log2_sse, 0., 0., 0, NULL},
{"Remez 11th [0,log2] SSE", remez11_0_log2_sse, 0., 0., 0, NULL},
{NULL, NULL, 0., 0., 0},
};
values = read_source(stdin, &n);
measure(perf, values, n);
for (p = perf;p->func != NULL;++p) {
printf(
"%s\t%f\t%e\t%e\n",
p->name,
p->elapsed_time / (double)CLOCKS_PER_SEC,
p->error_peak,
p->error_rms
);
}
return 0;
}