/* GPU Reduction Skeleton (sum elements in an array) * CS61C Sp14 Lab 10 * @skeleton_author Sagar Karandikar */ // cuda libraries #include // required for all CUDA functionality #include // lets us use the CUT_CHECK_ERROR and CUDA_SAFE_CALL macros // standard C stuff #include #include // timing functionality #include #include // kernel for reduction on GPU __global__ void reductionKernel(float* A, int len, int level) { // YOUR CODE HERE } // Compute reduction of elements in A // - Result should be placed in A[0] // - You are free to overwrite elements in A as long as you don't plan to use them // in the future in your computation! (in fact you will probably need to do so) // - This code only needs to handle the cases where len is a power of 2 greater than // or equal to 2^9 void reductionGPU(float* A, int len) { // we'll stick with 256 threads_per_block. DO NOT CHANGE this value int threads_per_block = 256; // initially, each block takes care of 512 elements // YOUR CODE HERE // we've added comments where we think you should add code // set an initial number of blocks_per_grid int level = 1; while (level != len) { // YOUR CODE HERE // we've added comments where we think you should add code // create 3-dim vector objects to initialize values // launch kernel on GPU // wait for GPU to finish computation // DO NOT remove the following line. This is for your own debugging sanity. CUT_CHECK_ERROR(""); // scale up level // Scale down the number of threads, required to prevent index overflow // in the kernel (and the resulting bad memory accesses) // Don't forget to consider what happens when dividing blocks_per_grid // in half results in zero } } // naive CPU reduction float reductionCPU(float* A, int len) { float result = 0.0; for (int i = 0; i < len ; i++) { result += A[i]; } return result; } // Tester for result correctness. You may ignore this function. void checkEquality(float* one, float* two, int len) { for (int i = 0; i < len; i++) { // absolute difference float diff = one[i] > two[i] ? one[i] - two[i] : two[i] - one[i]; if (diff/one[i] > 0.01) { // disregard error less than 1% // see link at bottom of lab10 for why printf("ERR in GPU computation beginning at index %d\n", i); printf("%f, %f\n", one[i], two[i]); return; } } } int main() { cudaDeviceReset(); // prep CUDA device int compute_size = 512; int increase_factor = 2; // run 15 tests for (int i = 0; i < 15; i++) { printf("Benchmark #%d. Compute size: %d elements.\n", i+1, compute_size); // compute arraySize in bytes for all the mallocs size_t arraySize = compute_size*sizeof(float); // allocate input array float* a = (float*)malloc(arraySize); if (a == NULL) { printf("Unable to allocate space"); exit(EXIT_FAILURE); } // initialize input arrays for (int j = 0; j < compute_size; j++) { a[j] = (float)(rand() % 10); } // pointers to feed to GPU for compute float* gpu_A; // allocate inputs/outputs in GPU-addressible memory // cudaMalloc is taking in a POINTER to the float* - it will do some magic // to allocate space addressible to the GPU and set the pointer whose address // you feed it to the address of the space it has allocated CUDA_SAFE_CALL(cudaMalloc(&gpu_A, arraySize)); // copy data from input to GPU memory CUDA_SAFE_CALL(cudaMemcpy(gpu_A, a, arraySize, cudaMemcpyHostToDevice)); // setup timing struct timeval start, end; float seconds; gettimeofday( &start, NULL ); // run GPU code reductionGPU(gpu_A, compute_size); // finish timing gettimeofday( &end, NULL ); seconds = (end.tv_sec - start.tv_sec) + 1.0e-6 * (end.tv_usec - start.tv_usec); printf("GPU %f Gflops\n", ((float)compute_size)/seconds/1000000000.0); float gpu_result; CUDA_SAFE_CALL(cudaMemcpy(&gpu_result, gpu_A, sizeof(float), cudaMemcpyDeviceToHost)); // copy results back to CPU memory, free on GPU CUDA_SAFE_CALL(cudaFree(gpu_A)); // setup timing struct timeval start2, end2; float seconds2; gettimeofday( &start2, NULL ); // run CPU code float cpu_result = reductionCPU(a, compute_size); // finish timing gettimeofday( &end2, NULL ); seconds2 = (end2.tv_sec - start2.tv_sec) + 1.0e-6 * (end2.tv_usec - start2.tv_usec); printf("CPU %f Gflops\n", ((float)compute_size)/(seconds2*1000000000.0)); // check that GPU/CPU computation matches checkEquality(&gpu_result, &cpu_result, 1); // free everything free(a); compute_size *= increase_factor; printf("---------------\n"); } }