/* GPU Reduction Skeleton (sum elements in an array)
 * CS61C Sp14 Lab 10
 * @skeleton_author Sagar Karandikar
 */

// cuda libraries
#include<cuda_runtime.h> // required for all CUDA functionality
#include<cutil.h> // lets us use the CUT_CHECK_ERROR and CUDA_SAFE_CALL macros

// standard C stuff
#include<stdio.h>
#include<stdlib.h>

// timing functionality
#include<time.h>
#include<sys/time.h>

// kernel for reduction on GPU
__global__ void reductionKernel(float* A, int len, int level) {
    // YOUR CODE HERE

}

// Compute reduction of elements in A
// - Result should be placed in A[0]
// - You are free to overwrite elements in A as long as you don't plan to use them
// in the future in your computation! (in fact you will probably need to do so)
// - This code only needs to handle the cases where len is a power of 2 greater than
// or equal to 2^9
void reductionGPU(float* A, int len) {
    // we'll stick with 256 threads_per_block. DO NOT CHANGE this value
    int threads_per_block = 256; // initially, each block takes care of 512 elements
   
    // YOUR CODE HERE
    // we've added comments where we think you should add code

    // set an initial number of blocks_per_grid


    int level = 1;
    while (level != len) { 
        // YOUR CODE HERE
        // we've added comments where we think you should add code

        // create 3-dim vector objects to initialize values

        // launch kernel on GPU
    
        // wait for GPU to finish computation

        // DO NOT remove the following line. This is for your own debugging sanity.
        CUT_CHECK_ERROR("");

        // scale up level


        // Scale down the number of threads, required to prevent index overflow
        // in the kernel (and the resulting bad memory accesses)
        // Don't forget to consider what happens when dividing blocks_per_grid
        // in half results in zero


    }
}

// naive CPU reduction
float reductionCPU(float* A, int len) {
    float result = 0.0;
    for (int i = 0; i < len ; i++) {
        result += A[i];
    }
    return result;
}

// Tester for result correctness. You may ignore this function.
void checkEquality(float* one, float* two, int len) {
    for (int i = 0; i < len; i++) {
        // absolute difference
        float diff = one[i] > two[i] ? one[i] - two[i] : two[i] - one[i];
        if (diff/one[i] > 0.01) {
            // disregard error less than 1%
            // see link at bottom of lab10 for why
            printf("ERR in GPU computation beginning at index %d\n", i);
            printf("%f, %f\n", one[i], two[i]);
            return;
        }
    }
}

int main() {
    cudaDeviceReset(); // prep CUDA device

    int compute_size = 512;
    int increase_factor = 2;
    
    // run 15 tests
    for (int i = 0; i < 15; i++) {
        printf("Benchmark #%d. Compute size: %d elements.\n", i+1, compute_size);

        // compute arraySize in bytes for all the mallocs
        size_t arraySize = compute_size*sizeof(float); 

        // allocate input array
        float* a = (float*)malloc(arraySize);

        if (a == NULL) {
            printf("Unable to allocate space");
            exit(EXIT_FAILURE);
        }

        // initialize input arrays
        for (int j = 0; j < compute_size; j++) {
            a[j] = (float)(rand() % 10);
        }

        // pointers to feed to GPU for compute
        float* gpu_A;

        // allocate inputs/outputs in GPU-addressible memory
        // cudaMalloc is taking in a POINTER to the float* - it will do some magic
        // to allocate space addressible to the GPU and set the pointer whose address
        // you feed it to the address of the space it has allocated
        CUDA_SAFE_CALL(cudaMalloc(&gpu_A, arraySize));

        // copy data from input to GPU memory
        CUDA_SAFE_CALL(cudaMemcpy(gpu_A, a, arraySize, cudaMemcpyHostToDevice));

        // setup timing
        struct timeval start, end;
        float seconds;
        gettimeofday( &start, NULL );
        // run GPU code
        reductionGPU(gpu_A, compute_size);
        // finish timing
        gettimeofday( &end, NULL );
        seconds = (end.tv_sec - start.tv_sec) + 1.0e-6 * (end.tv_usec - start.tv_usec);
        printf("GPU %f Gflops\n", ((float)compute_size)/seconds/1000000000.0);

        float gpu_result;
        CUDA_SAFE_CALL(cudaMemcpy(&gpu_result, gpu_A, sizeof(float), cudaMemcpyDeviceToHost));
        
        // copy results back to CPU memory, free on GPU
        CUDA_SAFE_CALL(cudaFree(gpu_A));

        // setup timing
        struct timeval start2, end2;
        float seconds2;
        gettimeofday( &start2, NULL );
        // run CPU code
        float cpu_result = reductionCPU(a, compute_size);
        // finish timing
        gettimeofday( &end2, NULL );
        seconds2 = (end2.tv_sec - start2.tv_sec) + 1.0e-6 * (end2.tv_usec - start2.tv_usec);
        printf("CPU %f Gflops\n", ((float)compute_size)/(seconds2*1000000000.0));

        // check that GPU/CPU computation matches
        checkEquality(&gpu_result, &cpu_result, 1);

        // free everything
        free(a);

        compute_size *= increase_factor;
        printf("---------------\n");
    }
}