/* GPU Weighted Vector Addition Example
 * CS61C Sp14 Lab 10
 * @author Sagar Karandikar */

// cuda libraries
#include <cuda.h>   // required for all CUDA functionality
#include <cutil.h>  // lets us use the CUT_CHECK_ERROR and CUDA_SAFE_CALL macros

// standard C stuff
#include <stdio.h>
#include <stdlib.h>

// timing functionality
#include <sys/time.h>
#include <time.h>

// kernel for weighted vector addition on GPU
__global__ void weightedVecAddKernel(float* out, float* A, float* B, int len,
                                     float weight_a, float weight_b) {
    int thisThreadIndex = blockIdx.x*blockDim.x + threadIdx.x;
    if (thisThreadIndex < len) {
        out[thisThreadIndex] = A[thisThreadIndex]*weight_a + B[thisThreadIndex]*weight_b;
    }
}

// compute weighted vector addition on GPU: out = weight_a*A + weight_b*B
void weightedVecAdd(float* out, float* A, float* B, int len,
                    float weight_a, float weight_b) {
    // we need to figure out how to fit our computation to the "geometry"
    // of the GPU. We'll stick with the max 512 threads_per_block and then
    // compute the right number of blocks_per_grid to cover our entire addition
    int threads_per_block = 512;
    int blocks_per_grid = (len/threads_per_block)+1;
    printf("Blocks in a grid: %d\n", blocks_per_grid);

    // 3-dim vector objects to initialize values
    dim3 dim_blocks_per_grid(blocks_per_grid, 1);
    dim3 dim_threads_per_block(threads_per_block, 1, 1);

    // launch kernel on GPU
    weightedVecAddKernel<<<dim_blocks_per_grid, dim_threads_per_block>>>(out, A, B, len,
                                                                         weight_a, weight_b);

    // wait for GPU to finish computation
    // (a cutil-free version of this check is sketched after main below)
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("");
}

// compute weighted vector addition on CPU: out = weight_a*A + weight_b*B
void cpuWeightedVectorAdd(float* out, float* A, float* B, int len,
                          float weight_a, float weight_b) {
    for (int i = 0; i < len; i++) {
        out[i] = A[i]*weight_a + B[i]*weight_b;
    }
}

// Tester for result correctness. You may ignore this function.
void checkEquality(float* one, float* two, int len) {
    for (int i = 0; i < len; i++) {
        // absolute difference
        float diff = one[i] > two[i] ? one[i] - two[i] : two[i] - one[i];
        if (diff/one[i] > 0.000001) { // disregard relative error less than 0.0001%
                                      // see link at bottom of lab10 for why
            printf("ERR in GPU computation beginning at index %d\n", i);
            printf("%f, %f\n", one[i], two[i]);
            return;
        }
    }
}

// Launches 9 tests. Last one fails on purpose.
int main() {
    cudaDeviceReset(); // prep CUDA device
    float weight_a = 10.0;
    float weight_b = 11.0;
    //int compute_size = 33553919;
    int compute_size = 15000;
    int increase_factor = 3;

    // run 9 tests
    for (int i = 0; i < 9; i++) {
        printf("Benchmark #%d. Compute size: %d elements.\n", i+1, compute_size);

        // compute arraySize in bytes for all the mallocs
        size_t arraySize = compute_size*sizeof(float);

        if (i == 8) {
            printf("The following computation should fail. See Checkoff Question 1\n");
        }

        // allocate input arrays
        float* a = (float*)malloc(arraySize);
        float* b = (float*)malloc(arraySize);

        // allocate output arrays
        float* Gout = (float*)malloc(arraySize);
        float* Cout = (float*)malloc(arraySize);

        if (a == NULL || b == NULL || Gout == NULL || Cout == NULL) {
            printf("Unable to allocate space");
            exit(EXIT_FAILURE);
        }

        // initialize input arrays
        for (int j = 0; j < compute_size; j++) {
            a[j] = (float)(rand() % 1000000);
            b[j] = (float)(rand() % 1000000);
        }

        // pointers to feed to GPU for compute
        float* gpu_out;
        float* gpu_A;
        float* gpu_B;

        // allocate inputs/outputs in GPU-addressable memory
        // cudaMalloc takes a POINTER to the float* - it will do some magic
        // to allocate space addressable by the GPU and set the pointer whose
        // address you feed it to the address of the space it has allocated
        CUDA_SAFE_CALL(cudaMalloc(&gpu_out, arraySize));
        CUDA_SAFE_CALL(cudaMalloc(&gpu_A, arraySize));
        CUDA_SAFE_CALL(cudaMalloc(&gpu_B, arraySize));

        // copy data from input to GPU memory
        CUDA_SAFE_CALL(cudaMemcpy(gpu_A, a, arraySize, cudaMemcpyHostToDevice));
        CUDA_SAFE_CALL(cudaMemcpy(gpu_B, b, arraySize, cudaMemcpyHostToDevice));

        // setup timing
        struct timeval start, end;
        float seconds;
        gettimeofday(&start, NULL);

        // run GPU code
        weightedVecAdd(gpu_out, gpu_A, gpu_B, compute_size, weight_a, weight_b);

        // finish timing
        gettimeofday(&end, NULL);
        seconds = (end.tv_sec - start.tv_sec) + 1.0e-6 * (end.tv_usec - start.tv_usec);
        printf("GPU %f Gflops\n", ((float)compute_size*3)/seconds/1000000000.0);

        // copy results back to CPU memory, free on GPU
        CUDA_SAFE_CALL(cudaMemcpy(Gout, gpu_out, arraySize, cudaMemcpyDeviceToHost));
        CUDA_SAFE_CALL(cudaFree(gpu_out));
        CUDA_SAFE_CALL(cudaFree(gpu_A));
        CUDA_SAFE_CALL(cudaFree(gpu_B));

        // setup timing
        struct timeval start2, end2;
        float seconds2;
        gettimeofday(&start2, NULL);

        // run CPU code
        cpuWeightedVectorAdd(Cout, a, b, compute_size, weight_a, weight_b);

        // finish timing
        gettimeofday(&end2, NULL);
        seconds2 = (end2.tv_sec - start2.tv_sec) + 1.0e-6 * (end2.tv_usec - start2.tv_usec);
        printf("CPU %f Gflops\n", ((float)compute_size*3)/(seconds2*1000000000.0));

        // check that GPU/CPU computation matches
        checkEquality(Gout, Cout, compute_size);

        // free everything
        free(a);
        free(b);
        free(Gout);
        free(Cout);

        compute_size *= increase_factor;
        printf("---------------\n");
    }
}
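
/* Editor's note (a sketch, not part of the original lab code): CUDA_SAFE_CALL and
 * CUT_CHECK_ERROR above come from the legacy cutil helper library shipped with older
 * CUDA SDK samples, and cudaThreadSynchronize() has since been deprecated in favor of
 * cudaDeviceSynchronize(). If cutil.h is unavailable, a minimal equivalent using only
 * the standard CUDA runtime API could look like the following. The names CHECK_CUDA
 * and checkLastKernelLaunch are illustrative, not from the lab. */

// wrap a runtime API call and abort with a message if it reports an error
#define CHECK_CUDA(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                       \
                    cudaGetErrorString(err_), __FILE__, __LINE__);            \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// check both launch-time and execution-time errors after a kernel launch,
// roughly what cudaThreadSynchronize() + CUT_CHECK_ERROR("") accomplish above
static void checkLastKernelLaunch(void) {
    CHECK_CUDA(cudaGetLastError());      // catches bad launch configurations
    CHECK_CUDA(cudaDeviceSynchronize()); // catches errors during kernel execution
}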