/*******************************************************************************
* Copyright 2018-2023 Intel Corporation.
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************/

/*
 *
 *  Content:
 *            MKL VM different accuracies usage example
 *            and accuracy vs performance tradeoff demonstration:
 *
 *            Call the scalar LIBM acosf, the MKL VM host vmsAcos and the MKL VM
 *            vmsAcos OMP offload single precision functions on a vector of random
 *            arguments from (-1.0f, 1.0f) (1M elements on host, 100M elements on
 *            device by default) using three MKL VM accuracy flavors:
 *                HA (High Accuracy)
 *                LA (Low Accuracy)
 *                EP (Enhanced Performance)
 *            Compare maximum observed relative errors, ulps (units in last place)
 *            and performance measured in geval/sec (giga evaluations per second)
 *
 *
 *******************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <omp.h>

#ifdef _WIN32
  #include <windows.h>
#else
  #include <time.h>
#endif

#include "mkl.h"
#include "mkl_omp_offload.h"

/**
 * General input constants.
 */
/* Vector lengths of the host and device VM calls (shorter when VM_QUICK_RUN is defined) */
#ifdef VM_QUICK_RUN
    static const int64_t host_length   = 100000;
    static const int64_t device_length = 1000000;
#else
    static const int64_t host_length   = 1000000;
    static const int64_t device_length = 100000000;
#endif
/* How many times the timed VM call is repeated on the host */
static const int host_repeats   = 1000;
/* How many times the timed VM call is repeated on the device */
static const int device_repeats = 100;
/* Lower bound of the argument range: -1.0+EPS */
static const float beg = -1.0f + 1.1e-07f;
/* Upper bound of the argument range: 1.0-EPS */
static const float end = +1.0f - 1.1e-07f;
/* Device number used by the OMP offload directives */
static const int devnum = 0;

/**
 * MKL VM accuracy mode constants listed in HA, LA, EP order.
 * The table is indexed by the accuracy loops as classic_accuracy_mode[a],
 * with a running from kHA up to (but not including) kAccNum.
 */
static const unsigned int classic_accuracy_mode[] = { VML_HA, VML_LA, VML_EP };

/**
 * Accuracy flavors of a VM function.
 * The values are used as indices into the per-accuracy result arrays.
 */
enum VmAccuracy
{
    kHA     = 0,  /* High Accuracy */
    kLA     = 1,  /* Low Accuracy */
    kEP     = 2,  /* Enhanced Performance */
    kAccNum = 3   /* Count of the accuracy flavors above */
};

/**
 * @brief Safe malloc
 *
 * own_safe_malloc allocates memory and checks the resulting pointer.
 * Reports an error on stderr and exits the application with code -1
 * if the allocation is unsuccessful.
 *
 * @param[in] size          Size in bytes
 * @return                  Pointer to allocated memory (never NULL)
 *
 */
static void* own_safe_malloc(size_t size)
{
    /* size is size_t so that byte counts built from sizeof products are not narrowed to int */
    void* ptr = malloc (size);
    if (ptr == NULL)
    {
       /* %zu is the conversion that matches size_t */
       fprintf (stderr, "\t\tERROR: %zu bytes allocated unsuccesfully\n", size);
       exit(-1);
    }

    return ptr;
}

/**
 * @brief Safe free
 *
 * own_safe_free releases memory previously obtained from the allocator.
 * A NULL pointer is treated as an error: a message is printed to stderr
 * and the application exits with code -1.
 *
 * @param[in] ptr          Pointer to memory
 *
 */
static void own_safe_free(void *ptr)
{
    /* Guard clause: refuse to deallocate a NULL pointer */
    if (ptr == NULL)
    {
        fprintf (stderr, "\t\tERROR: NULL pointer cannot be deallocated\n");
        exit(-1);
    }

    free (ptr);
}

/**
 * @brief Clock timer.
 *
 * own_get_nano reads a monotonic clock and returns the reading in nanoseconds.
 * Only the difference between two readings is meaningful.
 *
 * On Windows the performance counter ticks are converted to nanoseconds
 * using the performance counter frequency; elsewhere CLOCK_MONOTONIC is used.
 *
 * @return Clock reading in nanoseconds, or 0 if the clock could not be read.
 *
 */
static uint64_t own_get_nano(void)
{
    uint64_t timer = 0;
#ifdef _WIN32
    LARGE_INTEGER freq;
    LARGE_INTEGER ticks;
    if (QueryPerformanceFrequency (&freq) && freq.QuadPart > 0 && QueryPerformanceCounter (&ticks))
    {
        const uint64_t per_sec = (uint64_t)freq.QuadPart;
        const uint64_t count   = (uint64_t)ticks.QuadPart;
        /* Whole seconds and the remainder are scaled separately to avoid 64-bit overflow */
        timer = (count / per_sec) * UINT64_C(1000000000)
              + (count % per_sec) * UINT64_C(1000000000) / per_sec;
    }
#else
    struct timespec ts;
    if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0)
    {
        timer = (uint64_t)ts.tv_sec * UINT64_C(1000000000) + (uint64_t)ts.tv_nsec;
    }
#endif
    return timer;
}

/**
 * @brief Computation of maximum relative error and ulp.
 *
 * own_compute_err computes the maximum relative error and the maximum
 * simplified ulp error of a single precision result vector against a
 * double precision reference vector.
 *
 * Elements whose error evaluates to NaN (e.g. 0/0 when both the reference
 * and the difference are zero) do not contribute to the maxima.
 * For len <= 0 both outputs are set to 0.0.
 *
 * @param[in] len            Vectors length
 * @param[in] res            Resulted vector
 * @param[in] ref            Reference vector
 * @param[out] err           Computed maximum relative error
 * @param[out] ulp           Computed maximum ulp error
 *
 */
static void own_compute_err (int64_t len, const float* res, const double* ref, double* err, double* ulp)
{
    /* Spacing of the smallest (subnormal) single precision numbers: 2^-149 */
    const double min_ulp = 0x1.p-149;
    double maxerr = 0.0;
    double maxulp = 0.0;

    for (int64_t i = 0; i < len; i++)
    {
        const double diff = fabs ((double)res[i] - ref[i]);
        int    ex  = 0;
        double den = 1.0;

        /**
         * Simplified ulp formula: |res-ref|/2^(ex-24).
         * frexp gives ref = m * 2^ex with 0.5 <= |m| < 1, so with the 24-bit
         * single precision significand one ulp of ref is 2^(ex-24).
         */
        frexp (ref[i], &ex);                 /* ex: integral power of two of ref */
        den = ldexp (1.0, ex - 24);          /* den: ulp's denominator 2^(ex-24) */
        /* Single precision cannot resolve less than 2^-149; this also avoids divbyzero */
        if (den < min_ulp) { den = min_ulp; }

        /* max ulp = |res-ref|/den; a NaN candidate fails the comparison and is skipped */
        const double cur_ulp = diff / den;
        if (cur_ulp > maxulp) { maxulp = cur_ulp; }

        /* max relative error = |res-ref|/|ref| */
        const double cur_err = diff / fabs (ref[i]);
        if (cur_err > maxerr) { maxerr = cur_err; }
    }

    *err = maxerr;
    *ulp = maxulp;

    return;
}

/**
 * @brief Run scalar function on host
 *
 * Measures performance and accuracy of the scalar acosf host function.
 * One untimed pass is followed by one timed pass over the whole vector.
 * Only the kHA slot of the output arrays is written.
 *
 * @param[in]  len         Vector length
 * @param[in]  arg         Arguments array
 * @param[out] res         Results array
 * @param[in]  ref         Reference results array
 * @param[out] err         Resulted relative error
 * @param[out] ulp         Resulted ulp
 * @param[out] gev         Resulted performance (GEval/sec), i.e. evaluations per nanosecond
 *
 */
void own_libm_scalar (int64_t len, float* arg, float* res, double* ref, double* err, double* ulp, double* gev)
{
    int64_t k;

    /* Untimed warmup pass */
    #pragma novector
    for (k = 0; k < len; k++)
    {
        res[k] = acosf(arg[k]);
    }

    /* Timed pass */
    const uint64_t t_begin = own_get_nano ();
    #pragma novector
    for (k = 0; k < len; k++)
    {
        res[k] = acosf(arg[k]);
    }
    const uint64_t t_elapsed = own_get_nano () - t_begin;

    /* Evaluations per nanosecond */
    gev[kHA] = (double)len / t_elapsed;

    /* Relative error & ulp of the scalar results */
    own_compute_err (len, res, ref, err + kHA, ulp + kHA);
}

/**
 * @brief Run VM host (classic) API
 *
 * Measures performance and accuracy of the VM host vmsAcos call
 * for every accuracy flavor from kHA up to kAccNum.
 *
 * @param[in]  len         Vector length
 * @param[in]  rep         Number of repeats
 * @param[in]  arg         Arguments array
 * @param[out] res         Results array
 * @param[in]  ref         Reference results array
 * @param[out] err         Resulted relative errors (one per accuracy)
 * @param[out] ulp         Resulted ulps (one per accuracy)
 * @param[out] gev         Resulted performance (GEval/sec, one per accuracy)
 *
 */
void own_vm_host (int64_t len, int64_t rep, float* arg, float* res, double* ref, double* err, double* ulp, double* gev)
{
    for (int acc = kHA; acc < kAccNum; acc++)
    {
        const unsigned int mode = classic_accuracy_mode[acc];

        /* Untimed warmup call */
        vmsAcos (len, arg, res, mode);

        /* Timed section: rep identical calls */
        const uint64_t t_begin = own_get_nano ();
        for (int pass = 0; pass < rep; pass++)
        {
            vmsAcos (len, arg, res, mode);
        }
        const uint64_t t_elapsed = own_get_nano () - t_begin;

        /* Total evaluations per nanosecond */
        gev[acc] = (double)(len * rep) / t_elapsed;

        /* Relative error & ulp of the last call's results */
        own_compute_err (len, res, ref, err + acc, ulp + acc);
    }
}

/**
 * @brief Run VM OMP device offload API
 *
 * Measures performance and accuracy of the VM vmsAcos call dispatched to
 * device number devnum, for every accuracy flavor from kHA up to kAccNum.
 * A separate target data region is opened for each accuracy, so arg is
 * mapped to the device and res is mapped to and from the device once per accuracy.
 * The timer readings are taken inside the data region, so the mapping done
 * at region entry/exit is not part of the measured time.
 *
 * @param[in]  len         Vector length
 * @param[in]  rep         Number of repeats
 * @param[in]  arg         Arguments array
 * @param[out] res         Results array
 * @param[in]  ref         Reference results array
 * @param[out] err         Resulted relative errors (one per accuracy)
 * @param[out] ulp         Resulted ulps (one per accuracy)
 * @param[out] gev         Resulted performance (GEval/sec, one per accuracy)
 *
 */
void own_vm_offload (int64_t len, int64_t rep, float* arg, float* res, double* ref, double* err, double* ulp, double* gev)
{
    /* Loop by accuracies */
    for (int a = kHA; a < kAccNum; a++)
    {
        /* Create device buffers. Arguments will be copied to device */
        #pragma omp target data map(to:arg[0:len]) map(tofrom:res[0:len]) device(devnum)
        {
            /* Warmup call (untimed). */
            /* NOTE(review): "target variant dispatch" presumably selects the device variant of vmsAcos */
            /* and use_device_ptr passes the device addresses of arg/res - confirm against compiler docs. */
            #pragma omp target variant dispatch device(devnum) use_device_ptr(arg, res) nowait
            vmsAcos (len, arg, res, classic_accuracy_mode[a]);
            /* The dispatch above is nowait, so wait for it before reading the timer */
            #pragma omp taskwait

            uint64_t ns = own_get_nano ();
            /* Do several repeats */
            for (int j = 0; j < rep; j++)
            {
                #pragma omp target variant dispatch device(devnum) use_device_ptr(arg, res) nowait
                vmsAcos (len, arg, res, classic_accuracy_mode[a]);
                /* Each repeat is waited for before the next one is issued */
                #pragma omp taskwait
            }
            /* Total evaluations per elapsed nanosecond */
            gev[a] = (double)(len * rep) / (own_get_nano () - ns);

        /* End of device buffers scope. Results to be copied to host */
        }
        /* Compute relative error & ulp */
        own_compute_err (len, res, ref, &(err[a]), &(ulp[a]));
    }
}

/**
 * @brief Main function for VM accuracy example
 *
 * Generates random arguments and double precision acos() references,
 * runs the scalar, VM host and VM offload variants, prints the
 * accuracy/performance tables and reports PASS or FAIL depending on
 * the VML error status.
 *
 * @param[in] argc         Number of arguments (unused)
 * @param[in] argv         Pointer to argument strings (unused)
 * @return                 -1 for FAIL or 0 for PASS
 *
 */
int main(int argc, char **argv)
{
    /* Horizontal rules of the result tables */
    static const char *const heavy_rule = "\t======================================================\n";
    static const char *const light_rule = "\t------------------------------------------------------\n";

    (void)argc;
    (void)argv;

    /* All vectors are sized for the longer of the host and device runs */
    const int64_t max_length = (host_length < device_length) ? device_length : host_length;

    /* Relative error, ulp and gevals results: scalar */
    double scal_err[kAccNum] = {0};
    double scal_ulp[kAccNum] = {0};
    double scal_gev[kAccNum] = {0};
    /* Relative error, ulp and gevals results: VM host */
    double host_err[kAccNum] = {0};
    double host_ulp[kAccNum] = {0};
    double host_gev[kAccNum] = {0};
    /* Relative error, ulp and gevals results: VM offload */
    double off_err[kAccNum]  = {0};
    double off_ulp[kAccNum]  = {0};
    double off_gev[kAccNum]  = {0};

    /* Argument, result (shared by all runs) and reference vectors */
    float  *arg = own_safe_malloc (max_length * sizeof(float));
    float  *res = own_safe_malloc (max_length * sizeof(float));
    double *ref = own_safe_malloc (max_length * sizeof(double));

    printf("omp offload vm_perf_accuracy: started...\n");
    fflush(stdout);

    /**
     * Arguments: pseudo-random numbers on the [beg, end] range (fixed seed);
     * references: scalar double precision acos() of each generated argument.
     */
    srand(777);
    for (int64_t k = 0; k < max_length; k++)
    {
        const float r = (float)(rand());
        arg[k] = (float)(beg + (end - beg) * r / (float)(RAND_MAX));
        ref[k] = acos ((double)arg[k]);
    }

    /* Scalar LIBM */
    own_libm_scalar (host_length, arg, res, ref, scal_err, scal_ulp, scal_gev);
    /* VM host (classic) API */
    own_vm_host (host_length, host_repeats, arg, res, ref, host_err, host_ulp, host_gev);
    /* VM device OMP offload API */
    own_vm_offload (device_length, device_repeats, arg, res, ref, off_err, off_ulp, off_gev);

    /* Accuracy and raw performance table */
    fputs(heavy_rule, stdout);
    printf("\t%15s,%12s,%12s,%12s\n", "<acosf>","Scalar", "VM Host", "VM offload");
    fputs(heavy_rule, stdout);
    printf("\t%15s,%12.3le,%12.3le,%12.3le\n", "Relative err HA",  scal_err[kHA], host_err[kHA], off_err[kHA]);
    printf("\t%15s,%12s,%12.3le,%12.3le\n", "Relative err LA", "", host_err[kLA], off_err[kLA]);
    printf("\t%15s,%12s,%12.3le,%12.3le\n", "Relative err EP", "", host_err[kEP], off_err[kEP]);
    fputs(light_rule, stdout);
    printf("\t%15s,%12.3lg,%12.3lg,%12.3lg\n", "Ulp err HA",   scal_ulp[kHA], host_ulp[kHA], off_ulp[kHA]);
    printf("\t%15s,%12s,%12.3lg,%12.3lg\n", "Ulp err LA",  "", host_ulp[kLA], off_ulp[kLA]);
    printf("\t%15s,%12s,%12.3lg,%12.3lg\n", "Ulp err EP",  "", host_ulp[kEP], off_ulp[kEP]);
    fputs(light_rule, stdout);
    printf("\t%15s,%12.3lf,%12.3lf,%12.3lf\n", "GEval/sec HA",   scal_gev[kHA], host_gev[kHA], off_gev[kHA]);
    printf("\t%15s,%12s,%12.3lf,%12.3lf\n", "GEval/sec LA",  "", host_gev[kLA], off_gev[kLA]);
    printf("\t%15s,%12s,%12.3lf,%12.3lf\n", "GEval/sec EP",  "", host_gev[kEP], off_gev[kEP]);
    fputs(heavy_rule, stdout);

    /* Speedup table; the scalar run fills only its kHA slot */
    const double scalar_rate = scal_gev[kHA];
    printf("\t GEval/sec performance comparisons:\n");
    fputs(heavy_rule, stdout);
    printf("\t%15s,%12s,%11.2lfx,%11.2lfx\n", "HA vs Scalar",  "", host_gev[kHA]/scalar_rate, off_gev[kHA]/scalar_rate);
    printf("\t%15s,%12s,%11.2lfx,%11.2lfx\n", "LA vs Scalar",  "", host_gev[kLA]/scalar_rate, off_gev[kLA]/scalar_rate);
    printf("\t%15s,%12s,%11.2lfx,%11.2lfx\n", "EP vs Scalar",  "", host_gev[kEP]/scalar_rate, off_gev[kEP]/scalar_rate);
    fputs(light_rule, stdout);
    printf("\t%15s,%12s,%11.2lfx,%11.2lfx\n", "LA vs HA",  "", host_gev[kLA]/host_gev[kHA], off_gev[kLA]/off_gev[kHA]);
    printf("\t%15s,%12s,%11.2lfx,%11.2lfx\n", "EP vs LA",  "", host_gev[kEP]/host_gev[kLA], off_gev[kEP]/off_gev[kLA]);
    printf("\t%15s,%12s,%11.2lfx,%11.2lfx\n", "EP vs HA",  "", host_gev[kEP]/host_gev[kHA], off_gev[kEP]/off_gev[kHA]);
    fputs(heavy_rule, stdout);

    /* Release the vectors */
    own_safe_free (arg);
    own_safe_free (res);
    own_safe_free (ref);

    /* A VML error status below VML_STATUS_OK marks the run as failed */
    const int failed = (vmlGetErrStatus() < VML_STATUS_OK);
    printf("omp offload vm_perf_accuracy: %s\n\n", failed ? "FAIL" : "PASS");

    return failed ? -1 : 0;
}

