/* GPU Weighted Vector Addition Example
 * CS61C Sp14 Lab 10
 * @author Sagar Karandikar */

// cuda libraries
#include <cuda.h>   // required for all CUDA functionality
#include <cutil.h>  // lets us use the CUT_CHECK_ERROR and CUDA_SAFE_CALL macros

// standard C stuff
#include <stdio.h>
#include <stdlib.h>

// timing functionality
#include <sys/time.h>
#include <time.h>

// kernel for weighted vector addition on GPU
__global__ void weightedVecAddKernel(float* out, float* A, float* B, int len,
                                     float weight_a, float weight_b) {
    int thisThreadIndex = blockIdx.x*blockDim.x + threadIdx.x;
    if (thisThreadIndex < len) {
        out[thisThreadIndex] = A[thisThreadIndex]*weight_a + B[thisThreadIndex]*weight_b;
    }
}

// compute weighted vector addition on GPU: out = weight_a*A + weight_b*B
void weightedVecAdd(float* out, float* A, float* B, int len,
                    float weight_a, float weight_b) {
    // we need to figure out how to fit our computation to the "geometry"
    // of the GPU. We'll stick with the max 512 threads_per_block and then
    // compute the right number of blocks_per_grid to cover our entire addition
    int threads_per_block = 512;
    int blocks_per_grid = (len/threads_per_block)+1;
    printf("Blocks in a grid: %d\n", blocks_per_grid);

    // 3-dim vector objects to initialize values
    dim3 dim_blocks_per_grid(blocks_per_grid, 1);
    dim3 dim_threads_per_block(threads_per_block, 1, 1);

    // launch kernel on GPU
    weightedVecAddKernel<<<dim_blocks_per_grid, dim_threads_per_block>>>(out, A, B, len,
                                                                         weight_a, weight_b);

    // wait for GPU to finish computation
    // (a cutil-free version of this check is sketched after main below)
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("");
}

// compute weighted vector addition on CPU: out = weight_a*A + weight_b*B
void cpuWeightedVectorAdd(float* out, float* A, float* B, int len,
                          float weight_a, float weight_b) {
    for (int i = 0; i < len; i++) {
        out[i] = A[i]*weight_a + B[i]*weight_b;
    }
}

// Tester for result correctness. You may ignore this function.
void checkEquality(float* one, float* two, int len) {
    for (int i = 0; i < len; i++) {
        // absolute difference
        float diff = one[i] > two[i] ? one[i] - two[i] : two[i] - one[i];
        if (diff/one[i] > 0.000001) { // disregard relative error less than 0.0001%
                                      // see link at bottom of lab10 for why
            printf("ERR in GPU computation beginning at index %d\n", i);
            printf("%f, %f\n", one[i], two[i]);
            return;
        }
    }
}

// Launches 9 tests. Last one fails on purpose.
int main() {
    cudaDeviceReset(); // prep CUDA device
    float weight_a = 10.0;
    float weight_b = 11.0;
    //int compute_size = 33553919;
    int compute_size = 15000;
    int increase_factor = 3;

    // run 9 tests
    for (int i = 0; i < 9; i++) {
        printf("Benchmark #%d. Compute size: %d elements.\n", i+1, compute_size);

        // compute arraySize in bytes for all the mallocs
        size_t arraySize = compute_size*sizeof(float);

        if (i == 8) {
            printf("The following computation should fail. See Checkoff Question 1\n");
        }

        // allocate input arrays
        float* a = (float*)malloc(arraySize);
        float* b = (float*)malloc(arraySize);

        // allocate output arrays
        float* Gout = (float*)malloc(arraySize);
        float* Cout = (float*)malloc(arraySize);

        if (a == NULL || b == NULL || Gout == NULL || Cout == NULL) {
            printf("Unable to allocate space");
            exit(EXIT_FAILURE);
        }

        // initialize input arrays
        for (int j = 0; j < compute_size; j++) {
            a[j] = (float)(rand() % 1000000);
            b[j] = (float)(rand() % 1000000);
        }

        // pointers to feed to GPU for compute
        float* gpu_out;
        float* gpu_A;
        float* gpu_B;

        // allocate inputs/outputs in GPU-addressable memory
        // cudaMalloc takes a POINTER to the float* - it will do some magic
        // to allocate space addressable by the GPU and set the pointer whose
        // address you feed it to the address of the space it has allocated
        CUDA_SAFE_CALL(cudaMalloc(&gpu_out, arraySize));
        CUDA_SAFE_CALL(cudaMalloc(&gpu_A, arraySize));
        CUDA_SAFE_CALL(cudaMalloc(&gpu_B, arraySize));

        // copy data from input to GPU memory
        CUDA_SAFE_CALL(cudaMemcpy(gpu_A, a, arraySize, cudaMemcpyHostToDevice));
        CUDA_SAFE_CALL(cudaMemcpy(gpu_B, b, arraySize, cudaMemcpyHostToDevice));

        // setup timing
        struct timeval start, end;
        float seconds;
        gettimeofday(&start, NULL);

        // run GPU code
        weightedVecAdd(gpu_out, gpu_A, gpu_B, compute_size, weight_a, weight_b);

        // finish timing
        gettimeofday(&end, NULL);
        seconds = (end.tv_sec - start.tv_sec) + 1.0e-6 * (end.tv_usec - start.tv_usec);
        printf("GPU %f Gflops\n", ((float)compute_size*3)/seconds/1000000000.0);

        // copy results back to CPU memory, free on GPU
        CUDA_SAFE_CALL(cudaMemcpy(Gout, gpu_out, arraySize, cudaMemcpyDeviceToHost));
        CUDA_SAFE_CALL(cudaFree(gpu_out));
        CUDA_SAFE_CALL(cudaFree(gpu_A));
        CUDA_SAFE_CALL(cudaFree(gpu_B));

        // setup timing
        struct timeval start2, end2;
        float seconds2;
        gettimeofday(&start2, NULL);

        // run CPU code
        cpuWeightedVectorAdd(Cout, a, b, compute_size, weight_a, weight_b);

        // finish timing
        gettimeofday(&end2, NULL);
        seconds2 = (end2.tv_sec - start2.tv_sec) + 1.0e-6 * (end2.tv_usec - start2.tv_usec);
        printf("CPU %f Gflops\n", ((float)compute_size*3)/(seconds2*1000000000.0));

        // check that GPU/CPU computation matches
        checkEquality(Gout, Cout, compute_size);

        // free everything
        free(a);
        free(b);
        free(Gout);
        free(Cout);

        compute_size *= increase_factor;
        printf("---------------\n");
    }
}
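
/* Editor's note (a sketch, not part of the original lab code): CUDA_SAFE_CALL and
 * CUT_CHECK_ERROR above come from the legacy cutil helper library shipped with older
 * CUDA SDK samples, and cudaThreadSynchronize() has since been deprecated in favor of
 * cudaDeviceSynchronize(). If cutil.h is unavailable, a minimal equivalent using only
 * the standard CUDA runtime API could look like the following. The names CHECK_CUDA
 * and checkLastKernelLaunch are illustrative, not from the lab. */

// wrap a runtime API call and abort with a message if it reports an error
#define CHECK_CUDA(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                       \
                    cudaGetErrorString(err_), __FILE__, __LINE__);            \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// check both launch-time and execution-time errors after a kernel launch,
// roughly what cudaThreadSynchronize() + CUT_CHECK_ERROR("") accomplish above
static void checkLastKernelLaunch(void) {
    CHECK_CUDA(cudaGetLastError());      // catches bad launch configurations
    CHECK_CUDA(cudaDeviceSynchronize()); // catches errors during kernel execution
}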