Section 1

2026-02-04 02:13:09 +00:00 · 2023-11-03 11:34:44 -03:00
parent 467a657e23
commit 7e3159c2dd
8 changed files with 360 additions and 0 deletions
--- a/src/01-double-elements.cu
+++ b/src/01-double-elements.cu
@@ -0,0 +1,81 @@
 #include <stdio.h>
 /*
 * Host-side initializer: stores each index into its own slot, so that
 * a[k] == k for every k in [0, N). Nothing is written when N <= 0.
 */
 void init(int *a, int N)
 {
    int value = 0;
    while (value < N)
    {
        a[value] = value;
        ++value;
    }
 }
 /*
 * Kernel: each thread doubles at most one element of `a`.
 *
 * The element handled by a thread is its flat 1D index within the grid
 * (blockIdx.x * blockDim.x + threadIdx.x). Threads whose index is not
 * below N do nothing, so the grid may contain more threads than elements.
 */
 __global__ void doubleElements(int *a, int N)
 {
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= N)
    {
        return;
    }
    a[tid] = a[tid] * 2;
 }
 /*
 * Host-side verification: returns true when a[k] == 2 * k holds for every
 * k in [0, N), and false as soon as one element differs. An empty range
 * (N <= 0) is reported as true.
 */
 bool checkElementsAreDoubled(int *a, int N)
 {
    bool allDoubled = true;
    for (int k = 0; allDoubled && k < N; ++k)
    {
        allDoubled = (a[k] == k * 2);
    }
    return allDoubled;
 }
 /*
 * Allocates N ints that are usable from both host and device, initializes
 * them on the host, doubles them on the GPU and verifies the result on the
 * host. Returns 0 when every CUDA call succeeded, 1 otherwise.
 */
 int main()
 {
    int N = 100;
    int *a = NULL;
    size_t size = N * sizeof(int);
    /*
     * cudaMallocManaged provides a pointer `a` that can be used on both
     * the host and the device.
     */
    cudaError_t err = cudaMallocManaged(&a, size);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMallocManaged failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    init(a, N);
    size_t threads_per_block = 10;
    /*
     * Round up so the grid always holds at least N threads; the kernel's
     * bounds check discards any surplus threads.
     */
    size_t number_of_blocks = (N + threads_per_block - 1) / threads_per_block;
    doubleElements<<<number_of_blocks, threads_per_block>>>(a, N);
    /*
     * The launch expression yields no status: launch errors are read back
     * with cudaGetLastError(), and errors raised while the kernel runs are
     * returned by the synchronizing call.
     */
    err = cudaGetLastError();
    if (err == cudaSuccess)
    {
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess)
    {
        fprintf(stderr, "doubleElements failed: %s\n", cudaGetErrorString(err));
        cudaFree(a);
        return 1;
    }
    bool areDoubled = checkElementsAreDoubled(a, N);
    printf("All elements were doubled? %s\n", areDoubled ? "TRUE" : "FALSE");
    /*
     * Release the memory that was shared by the host and the device.
     */
    cudaFree(a);
    return 0;
 }
--- a/src/01-first-parallel.cu
+++ b/src/01-first-parallel.cu
@@ -0,0 +1,27 @@
 #include <stdio.h>
 /*
 * Kernel (declared `__global__`, so it is launched from the host and runs
 * on the GPU). Every thread that executes it prints the same fixed
 * message once; it takes no arguments and touches no memory.
 */
 __global__ void firstParallel()
 {
    printf("This should be running in parallel.\n");
 }
 /*
 * Launches `firstParallel` on the GPU and waits for it to finish.
 * Returns 0 on success, 1 if the launch or the execution failed.
 */
 int main()
 {
    /*
     * Execute firstParallel in parallel on the GPU: 10 blocks of
     * 10 threads each.
     */
    firstParallel<<<10, 10>>>();
    /*
     * The launch expression yields no status; launch errors are read
     * back with cudaGetLastError().
     */
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
    {
        /*
         * Make the CPU wait for the GPU kernel to complete before
         * proceeding; errors raised during execution are returned here.
         */
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess)
    {
        fprintf(stderr, "firstParallel failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
 }
--- a/src/01-hello-gpu.cu
+++ b/src/01-hello-gpu.cu
@@ -0,0 +1,38 @@
 #include <stdio.h>
 /*
 * Plain host function (no CUDA qualifier): prints "Hello from the CPU."
 * followed by a newline.
 */
 void helloCPU()
 {
    printf("Hello from the CPU.\n");
 }
 /*
 * Kernel that can be launched on the GPU. Every thread that executes it
 * prints "Hello from the GPU!" followed by a newline.
 */
 __global__ void helloGPU()
 {
    /* The exercise requires the message to read "Hello from the GPU!". */
    printf("Hello from the GPU!\n");
 }
 /*
 * Prints the CPU greeting, then launches `helloGPU` on the GPU and waits
 * for it. Returns 0 on success, 1 if the launch or the execution failed.
 */
 int main()
 {
    helloCPU();
    /*
     * Launch `helloGPU` as a kernel on the GPU: one block of one thread.
     */
    helloGPU<<<1, 1>>>();
    /*
     * The launch expression yields no status; launch errors are read
     * back with cudaGetLastError().
     */
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
    {
        /*
         * Synchronize on the completion of the `helloGPU` kernel before
         * continuing the CPU thread.
         */
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess)
    {
        fprintf(stderr, "helloGPU failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
 }
--- a/src/01-single-block-loop.cu
+++ b/src/01-single-block-loop.cu
@@ -0,0 +1,27 @@
 #include <stdio.h>
 /*
 * Kernel that does the work of one loop iteration per thread: each thread
 * prints its own threadIdx.x as the iteration number. Only threadIdx.x is
 * used, so the block index does not influence the printed value.
 */
 __global__ void loop()
 {
    /* threadIdx.x is unsigned; convert it so it matches the %d specifier. */
    const int iteration = threadIdx.x;
    printf("This is iteration number %d\n", iteration);
 }
 /*
 * Launches `loop` with a single block of N threads and waits for it.
 * Returns 0 on success, 1 if the launch or the execution failed.
 */
 int main()
 {
    /*
     * The execution configuration controls how many "iterations" are
     * performed: 1 block of N threads gives N iterations.
     */
    int N = 10;
    loop<<<1, N>>>();
    /*
     * The launch expression yields no status; launch errors are read
     * back with cudaGetLastError().
     */
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
    {
        /* Wait for the kernel; execution errors are returned here. */
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess)
    {
        fprintf(stderr, "loop failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
 }
--- a/src/01-thread-and-block-idx.cu
+++ b/src/01-thread-and-block-idx.cu
@@ -0,0 +1,22 @@
 #include <stdio.h>
 /*
 * Kernel: prints "Success!" from the one thread whose threadIdx.x is 1023
 * inside the block whose blockIdx.x is 255. All other threads print
 * nothing, so the message only appears when the launch contains that
 * thread.
 */
 __global__ void printSuccessForCorrectExecutionConfiguration()
 {
    const bool threadMatches = (threadIdx.x == 1023);
    const bool blockMatches = (blockIdx.x == 255);
    if (!(threadMatches && blockMatches))
    {
        return;
    }
    printf("Success!\n");
 }
 /*
 * Launches the kernel with 256 blocks of 1024 threads and waits for it.
 * Returns 0 on success, 1 if the launch or the execution failed.
 */
 int main()
 {
    /*
     * 256 blocks of 1024 threads: this configuration contains the thread
     * (threadIdx.x == 1023, blockIdx.x == 255) that prints `"Success!"`.
     */
    printSuccessForCorrectExecutionConfiguration<<<256, 1024>>>();
    /*
     * The launch expression yields no status; a configuration the device
     * rejects is only visible through cudaGetLastError().
     */
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
    {
        /* Wait for the kernel; execution errors are returned here. */
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess)
    {
        fprintf(stderr, "printSuccessForCorrectExecutionConfiguration failed: %s\n",
                cudaGetErrorString(err));
        return 1;
    }
    return 0;
 }
--- a/src/02-mismatched-config-loop.cu
+++ b/src/02-mismatched-config-loop.cu
@@ -0,0 +1,69 @@
 #include <stdio.h>
 /*
 * Kernel: each thread writes `initialValue` into at most one element of
 * `a`, selected by the thread's flat 1D index within the grid.
 *
 * A thread whose index is greater than or equal to `N` returns without
 * touching `a`, which prevents out of range accesses when the grid holds
 * more threads than there are elements.
 */
 __global__ void initializeElementsTo(int initialValue, int *a, int N)
 {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= N)
    {
        return;
    }
    a[tid] = initialValue;
 }
 /*
 * Fills an array of N ints with a constant on the GPU, using a grid whose
 * thread count need not match N exactly, then verifies every element on
 * the host. Returns 0 on success and 1 on any CUDA error or wrong value.
 */
 int main()
 {
    /*
     * Problem size; the exercise keeps `N` fixed.
     */
    int N = 1000;
    int *a = NULL;
    size_t size = N * sizeof(int);
    cudaError_t err = cudaMallocManaged(&a, size);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMallocManaged failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    /*
     * The exercise keeps the number of threads per block fixed at `256`.
     */
    size_t threads_per_block = 256;
    /*
     * Ceiling division: the smallest block count whose total thread count
     * is at least `N` (4 for N = 1000). The kernel's bounds check discards
     * the surplus threads of the last block.
     */
    size_t number_of_blocks = (N + threads_per_block - 1) / threads_per_block;
    int initialValue = 6;
    initializeElementsTo<<<number_of_blocks, threads_per_block>>>(initialValue, a, N);
    /*
     * The launch expression yields no status: launch errors are read back
     * with cudaGetLastError(), and errors raised while the kernel runs are
     * returned by the synchronizing call.
     */
    err = cudaGetLastError();
    if (err == cudaSuccess)
    {
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess)
    {
        fprintf(stderr, "initializeElementsTo failed: %s\n", cudaGetErrorString(err));
        cudaFree(a);
        return 1;
    }
    /*
     * Check to make sure all values in `a` were initialized.
     */
    for (int i = 0; i < N; ++i)
    {
        if (a[i] != initialValue)
        {
            printf("FAILURE: target value: %d\t a[%d]: %d\n", initialValue, i, a[i]);
            cudaFree(a);
            return 1;
        }
    }
    printf("SUCCESS!\n");
    cudaFree(a);
    return 0;
 }
--- a/src/02-multi-block-loop.cu
+++ b/src/02-multi-block-loop.cu
@@ -0,0 +1,28 @@
 #include <stdio.h>
 /*
 * Kernel that does the work of one loop iteration per thread. The
 * iteration number printed by a thread is its flat 1D index within the
 * grid, so each thread of a multi-block launch prints a distinct number.
 */
 __global__ void loop()
 {
    const int iteration = blockIdx.x * blockDim.x + threadIdx.x;
    printf("This is iteration number %d\n", iteration);
 }
 /*
 * Launches `loop` with more than one block and waits for it.
 * Returns 0 on success, 1 if the launch or the execution failed.
 */
 int main()
 {
    /*
     * The execution configuration controls how many "iterations" are
     * performed: 2 blocks of 5 threads give 10 iterations spread over
     * more than 1 block.
     */
    loop<<<2, 5>>>();
    /*
     * The launch expression yields no status; launch errors are read
     * back with cudaGetLastError().
     */
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
    {
        /* Wait for the kernel; execution errors are returned here. */
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess)
    {
        fprintf(stderr, "loop failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
 }
--- a/src/03-grid-stride-double.cu
+++ b/src/03-grid-stride-double.cu
@@ -0,0 +1,68 @@
 #include <stdio.h>
 /*
 * Host-side initializer: writes 0, 1, ..., N - 1 into consecutive slots
 * of `a`. Nothing is written when N <= 0.
 */
 void init(int *a, int N)
 {
    int next = 0;
    for (int *slot = a; next < N; ++slot)
    {
        *slot = next++;
    }
 }
 /*
 * Kernel: doubles every element of `a` in [0, N) using a grid-stride loop.
 *
 * Each thread starts at its flat 1D index within the grid and then
 * advances by the total number of threads in the grid
 * (gridDim.x * blockDim.x), so `N` may be larger than the grid and each
 * thread may work on more than one element.
 */
 __global__ void doubleElements(int *a, int N)
 {
    const int stride = blockDim.x * gridDim.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    while (i < N)
    {
        a[i] = a[i] * 2;
        i += stride;
    }
 }
 /*
 * Host-side verification: scans `a` from the front and stops at the first
 * element that differs from twice its index. Returns true only when the
 * scan reached N without finding such an element (always true for N <= 0).
 */
 bool checkElementsAreDoubled(int *a, int N)
 {
    int k = 0;
    while (k < N && a[k] == k * 2)
    {
        ++k;
    }
    return k >= N;
 }
 /*
 * Doubles N ints on the GPU with a grid that is smaller than N, then
 * verifies the result on the host. Returns 0 when every CUDA call
 * succeeded, 1 otherwise.
 */
 int main()
 {
    /*
     * `N` is greater than the size of the grid (see below).
     */
    int N = 10000;
    int *a = NULL;
    size_t size = N * sizeof(int);
    cudaError_t err = cudaMallocManaged(&a, size);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMallocManaged failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    init(a, N);
    /*
     * The size of this grid is 256*32 = 8192, deliberately below `N`;
     * the kernel's loop covers the remaining elements.
     */
    size_t threads_per_block = 256;
    size_t number_of_blocks = 32;
    doubleElements<<<number_of_blocks, threads_per_block>>>(a, N);
    /*
     * The launch expression yields no status: launch errors are read back
     * with cudaGetLastError(), and errors raised while the kernel runs are
     * returned by the synchronizing call.
     */
    err = cudaGetLastError();
    if (err == cudaSuccess)
    {
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess)
    {
        fprintf(stderr, "doubleElements failed: %s\n", cudaGetErrorString(err));
        cudaFree(a);
        return 1;
    }
    bool areDoubled = checkElementsAreDoubled(a, N);
    printf("All elements were doubled? %s\n", areDoubled ? "TRUE" : "FALSE");
    cudaFree(a);
    return 0;
 }