diff --git a/src/01-double-elements.cu b/src/01-double-elements.cu
new file mode 100644
index 0000000..3604ebe
--- /dev/null
+++ b/src/01-double-elements.cu
@@ -0,0 +1,81 @@
+#include <stdio.h>
+
+/*
+ * Initialize array values on the host.
+ */
+
+void init(int *a, int N)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+  {
+    a[i] = i;
+  }
+}
+
+/*
+ * Double elements in parallel on the GPU.
+ */
+
+__global__ void doubleElements(int *a, int N)
+{
+  int i;
+  i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < N)
+  {
+    a[i] *= 2;
+  }
+}
+
+/*
+ * Check all elements have been doubled on the host.
+ */
+
+bool checkElementsAreDoubled(int *a, int N)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+  {
+    if (a[i] != i * 2)
+      return false;
+  }
+  return true;
+}
+
+int main()
+{
+  int N = 100;
+  int *a;
+
+  size_t size = N * sizeof(int);
+
+  /*
+   * Refactor this memory allocation to provide a pointer
+   * `a` that can be used on both the host and the device.
+   */
+
+  cudaMallocManaged(&a, size);
+
+  init(a, N);
+
+  size_t threads_per_block = 10;
+  size_t number_of_blocks = 10;
+
+  /*
+   * This launch will not work until the pointer `a` is also
+   * available to the device.
+   */
+
+  doubleElements<<<number_of_blocks, threads_per_block>>>(a, N);
+  cudaDeviceSynchronize();
+
+  bool areDoubled = checkElementsAreDoubled(a, N);
+  printf("All elements were doubled? %s\n", areDoubled ? "TRUE" : "FALSE");
+
+  /*
+   * Refactor to free memory that has been allocated to be
+   * accessed by both the host and the device.
+   */
+
+  cudaFree(a);
+}
diff --git a/src/01-first-parallel.cu b/src/01-first-parallel.cu
new file mode 100644
index 0000000..fcadb7d
--- /dev/null
+++ b/src/01-first-parallel.cu
@@ -0,0 +1,27 @@
+#include <stdio.h>
+
+/*
+ * Refactor firstParallel so that it can run on the GPU.
+ */
+
+__global__ void firstParallel()
+{
+  printf("This should be running in parallel.\n");
+}
+
+int main()
+{
+  /*
+   * Refactor this call to firstParallel to execute in parallel
+   * on the GPU.
+   */
+
+  firstParallel<<<10, 10>>>();
+
+  /*
+   * Some code is needed below so that the CPU will wait
+   * for the GPU kernels to complete before proceeding.
+   */
+
+  cudaDeviceSynchronize();
+}
diff --git a/src/01-hello-gpu.cu b/src/01-hello-gpu.cu
new file mode 100644
index 0000000..58f0117
--- /dev/null
+++ b/src/01-hello-gpu.cu
@@ -0,0 +1,38 @@
+#include <stdio.h>
+
+void helloCPU()
+{
+  printf("Hello from the CPU.\n");
+}
+
+/*
+ * Refactor the `helloGPU` definition to be a kernel
+ * that can be launched on the GPU. Update its message
+ * to read "Hello from the GPU!"
+ */
+
+__global__ void helloGPU()
+{
+  printf("Hello from the GPU!\n");
+}
+
+int main()
+{
+  helloCPU();
+
+  /*
+   * Refactor this call to `helloGPU` so that it launches
+   * as a kernel on the GPU.
+   */
+
+  helloGPU<<<1, 1>>>();
+
+  /*
+   * Add code below to synchronize on the completion of the
+   * `helloGPU` kernel before continuing the CPU thread.
+   */
+
+  cudaDeviceSynchronize();
+}
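Note: the files above launch kernels and call cudaDeviceSynchronize() without ever inspecting the returned error codes, so a bad launch would fail silently. As a minimal sketch outside the exercise diffs (the CHECK macro name is illustrative; cudaGetLastError, cudaGetErrorString, and the cudaError_t return of cudaDeviceSynchronize are standard CUDA runtime API), error checking might look like this:

#include <stdio.h>

// Illustrative launch-error checking; the macro name is an assumption.
#define CHECK(call)                                          \
do {                                                         \
  cudaError_t err = (call);                                  \
  if (err != cudaSuccess)                                    \
  {                                                          \
    printf("CUDA error: %s\n", cudaGetErrorString(err));     \
  }                                                          \
} while (0)

__global__ void helloGPU()
{
  printf("Hello from the GPU!\n");
}

int main()
{
  helloGPU<<<1, 1>>>();
  CHECK(cudaGetLastError());       // catches invalid launch configurations
  CHECK(cudaDeviceSynchronize());  // catches errors raised during execution
}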
diff --git a/src/01-single-block-loop.cu b/src/01-single-block-loop.cu
new file mode 100644
index 0000000..aa5bc2f
--- /dev/null
+++ b/src/01-single-block-loop.cu
@@ -0,0 +1,27 @@
+#include <stdio.h>
+
+/*
+ * Refactor `loop` to be a CUDA kernel. The new kernel should
+ * only do the work of 1 iteration of the original loop.
+ */
+
+__global__ void loop()
+{
+  printf("This is iteration number %d\n", threadIdx.x);
+}
+
+int main()
+{
+  /*
+   * When refactoring `loop` to launch as a kernel, be sure
+   * to use the execution configuration to control how many
+   * "iterations" to perform.
+   *
+   * For this exercise, only use 1 block of threads.
+   */
+
+  int N = 10;
+  loop<<<1, N>>>();
+
+  cudaDeviceSynchronize();
+}
diff --git a/src/01-thread-and-block-idx.cu b/src/01-thread-and-block-idx.cu
new file mode 100644
index 0000000..0f33c91
--- /dev/null
+++ b/src/01-thread-and-block-idx.cu
@@ -0,0 +1,22 @@
+#include <stdio.h>
+
+__global__ void printSuccessForCorrectExecutionConfiguration()
+{
+  if (threadIdx.x == 1023 && blockIdx.x == 255)
+  {
+    printf("Success!\n");
+  }
+}
+
+int main()
+{
+  /*
+   * Update the execution configuration so that the kernel
+   * will print `"Success!"`.
+   */
+
+  printSuccessForCorrectExecutionConfiguration<<<256, 1024>>>();
+
+  cudaDeviceSynchronize();
+}
diff --git a/src/02-mismatched-config-loop.cu b/src/02-mismatched-config-loop.cu
new file mode 100644
index 0000000..fb9a7a7
--- /dev/null
+++ b/src/02-mismatched-config-loop.cu
@@ -0,0 +1,69 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+ * Currently, `initializeElementsTo`, if executed in a thread whose
+ * `i` is calculated to be greater than or equal to `N`, will try to
+ * access a value outside the range of `a`.
+ *
+ * Refactor the kernel definition to prevent out-of-range accesses.
+ */
+
+__global__ void initializeElementsTo(int initialValue, int *a, int N)
+{
+  int i = threadIdx.x + blockIdx.x * blockDim.x;
+  if (i < N)
+  {
+    a[i] = initialValue;
+  }
+}
+
+int main()
+{
+  /*
+   * Do not modify `N`.
+   */
+
+  int N = 1000;
+
+  int *a;
+  size_t size = N * sizeof(int);
+
+  cudaMallocManaged(&a, size);
+
+  /*
+   * Assume we have reason to want the number of threads
+   * fixed at `256`: do not modify `threads_per_block`.
+   */
+
+  size_t threads_per_block = 256;
+
+  /*
+   * Assign a value to `number_of_blocks` that will
+   * allow for a working execution configuration given
+   * the fixed values for `N` and `threads_per_block`.
+   */
+
+  size_t number_of_blocks = 4;
+
+  int initialValue = 6;
+
+  initializeElementsTo<<<number_of_blocks, threads_per_block>>>(initialValue, a, N);
+  cudaDeviceSynchronize();
+
+  /*
+   * Check to make sure all values in `a` were initialized.
+   */
+
+  for (int i = 0; i < N; ++i)
+  {
+    if (a[i] != initialValue)
+    {
+      printf("FAILURE: target value: %d\t a[%d]: %d\n", initialValue, i, a[i]);
+      cudaFree(a);
+      exit(1);
+    }
+  }
+  printf("SUCCESS!\n");
+
+  cudaFree(a);
+}
diff --git a/src/02-multi-block-loop.cu b/src/02-multi-block-loop.cu
new file mode 100644
index 0000000..440a06a
--- /dev/null
+++ b/src/02-multi-block-loop.cu
@@ -0,0 +1,28 @@
+#include <stdio.h>
+
+/*
+ * Refactor `loop` to be a CUDA kernel. The new kernel should
+ * only do the work of 1 iteration of the original loop.
+ */
+
+__global__ void loop()
+{
+  int i = threadIdx.x + blockIdx.x * blockDim.x;
+  printf("This is iteration number %d\n", i);
+}
+
+int main()
+{
+  /*
+   * When refactoring `loop` to launch as a kernel, be sure
+   * to use the execution configuration to control how many
+   * "iterations" to perform.
+   *
+   * For this exercise, be sure to use more than 1 block in
+   * the execution configuration.
+   */
+
+  loop<<<2, 5>>>();
+
+  cudaDeviceSynchronize();
+}
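Note: 02-mismatched-config-loop.cu hard-codes number_of_blocks = 4, which works only because 4 * 256 = 1024 >= 1000. The general idiom is ceiling division, which rounds the block count up so the grid always covers all N elements. A standalone sketch, not part of the diff above (the printf is just to show the result):

#include <stdio.h>

int main()
{
  int N = 1000;
  size_t threads_per_block = 256;
  // Round up: for N = 1000 this yields (1000 + 255) / 256 = 4 blocks,
  // matching the hard-coded value in the exercise.
  size_t number_of_blocks = (N + threads_per_block - 1) / threads_per_block;
  printf("%zu blocks of %zu threads cover %d elements\n",
         number_of_blocks, threads_per_block, N);
  return 0;
}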
diff --git a/src/03-grid-stride-double.cu b/src/03-grid-stride-double.cu
new file mode 100644
index 0000000..f7c77a0
--- /dev/null
+++ b/src/03-grid-stride-double.cu
@@ -0,0 +1,68 @@
+#include <stdio.h>
+
+void init(int *a, int N)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+  {
+    a[i] = i;
+  }
+}
+
+/*
+ * In the current application, `N` is larger than the grid.
+ * Refactor this kernel to use a grid-stride loop so that
+ * each parallel thread works on more than one element of the array.
+ */
+
+__global__ void doubleElements(int *a, int N)
+{
+  int indexWithinTheGrid = threadIdx.x + blockIdx.x * blockDim.x;
+  int gridStride = gridDim.x * blockDim.x;
+
+  for (int i = indexWithinTheGrid; i < N; i += gridStride)
+  {
+    a[i] *= 2;
+  }
+}
+
+bool checkElementsAreDoubled(int *a, int N)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+  {
+    if (a[i] != i * 2)
+      return false;
+  }
+  return true;
+}
+
+int main()
+{
+  /*
+   * `N` is greater than the size of the grid (see below).
+   */
+
+  int N = 10000;
+  int *a;
+
+  size_t size = N * sizeof(int);
+  cudaMallocManaged(&a, size);
+
+  init(a, N);
+
+  /*
+   * The size of this grid is 256 * 32 = 8192.
+   */
+
+  size_t threads_per_block = 256;
+  size_t number_of_blocks = 32;
+
+  doubleElements<<<number_of_blocks, threads_per_block>>>(a, N);
+  cudaDeviceSynchronize();
+
+  bool areDoubled = checkElementsAreDoubled(a, N);
+  printf("All elements were doubled? %s\n", areDoubled ? "TRUE" : "FALSE");
+
+  cudaFree(a);
+}
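Note: with a grid of 8192 threads and N = 10000, threads 0 through 1807 each double two elements (i and i + 8192) while threads 1808 through 8191 double one, accounting for all 1808 * 2 + 6384 = 10000 elements. A side benefit of the grid-stride loop is that the launch stays correct for any grid size, so the sizes can even be chosen by the runtime. A sketch under that assumption, not part of the exercise (cudaOccupancyMaxPotentialBlockSize is the CUDA runtime's occupancy helper; its use here is illustrative):

#include <stdio.h>

// Same grid-stride shape as doubleElements in 03-grid-stride-double.cu.
__global__ void doubleElements(int *a, int N)
{
  int stride = gridDim.x * blockDim.x;
  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += stride)
  {
    a[i] *= 2;
  }
}

int main()
{
  int N = 10000;
  int *a;
  cudaMallocManaged(&a, N * sizeof(int));
  for (int i = 0; i < N; ++i) a[i] = i;

  // Let the runtime suggest a block size and minimum grid size for
  // full occupancy; the grid-stride loop keeps any choice correct.
  int minGridSize = 0, blockSize = 0;
  cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, doubleElements, 0, 0);
  doubleElements<<<minGridSize, blockSize>>>(a, N);
  cudaDeviceSynchronize();

  printf("a[%d] = %d\n", N - 1, a[N - 1]);  // expect 19998
  cudaFree(a);
}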