Section 1

This commit is contained in:
Guilherme Werner
2023-11-03 11:34:44 -03:00
parent 467a657e23
commit 7e3159c2dd
8 changed files with 360 additions and 0 deletions

81
src/01-double-elements.cu Normal file
View File

@ -0,0 +1,81 @@
#include <stdio.h>
/*
* Initialize array values on the host.
*/
void init(int *a, int N)
{
int i;
for (i = 0; i < N; ++i)
{
a[i] = i;
}
}
/*
* Double elements in parallel on the GPU.
*/
__global__ void doubleElements(int *a, int N)
{
int i;
i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N)
{
a[i] *= 2;
}
}
/*
* Check all elements have been doubled on the host.
*/
bool checkElementsAreDoubled(int *a, int N)
{
int i;
for (i = 0; i < N; ++i)
{
if (a[i] != i * 2)
return false;
}
return true;
}
int main()
{
int N = 100;
int *a;
size_t size = N * sizeof(int);
/*
* Refactor this memory allocation to provide a pointer
* `a` that can be used on both the host and the device.
*/
cudaMallocManaged(&a, size);
init(a, N);
size_t threads_per_block = 10;
size_t number_of_blocks = 10;
/*
* This launch will not work until the pointer `a` is also
* available to the device.
*/
doubleElements<<<number_of_blocks, threads_per_block>>>(a, N);
cudaDeviceSynchronize();
bool areDoubled = checkElementsAreDoubled(a, N);
printf("All elements were doubled? %s\n", areDoubled ? "TRUE" : "FALSE");
/*
* Refactor to free memory that has been allocated to be
* accessed by both the host and the device.
*/
cudaFree(a);
}

27
src/01-first-parallel.cu Normal file
View File

@ -0,0 +1,27 @@
#include <stdio.h>
/*
* Refactor firstParallel so that it can run on the GPU.
*/
__global__ void firstParallel()
{
printf("This should be running in parallel.\n");
}
int main()
{
/*
* Refactor this call to firstParallel to execute in parallel
* on the GPU.
*/
firstParallel<<<10, 10>>>();
/*
* Some code is needed below so that the CPU will wait
* for the GPU kernels to complete before proceeding.
*/
cudaDeviceSynchronize();
}

38
src/01-hello-gpu.cu Normal file
View File

@ -0,0 +1,38 @@
#include <stdio.h>
void helloCPU()
{
printf("Hello from the CPU.\n");
}
/*
* Refactor the `helloGPU` definition to be a kernel
* that can be launched on the GPU. Update its message
* to read "Hello from the GPU!"
*/
__global__ void helloGPU()
{
printf("Hello from the GPU.\n");
}
int main()
{
helloCPU();
/*
* Refactor this call to `helloGPU` so that it launches
* as a kernel on the GPU.
*/
helloGPU<<<1, 1>>>();
/*
* Add code below to synchronize on the completion of the
* `helloGPU` kernel completion before continuing the CPU
* thread.
*/
cudaDeviceSynchronize();
}

View File

@ -0,0 +1,27 @@
#include <stdio.h>
/*
* Refactor `loop` to be a CUDA Kernel. The new kernel should
* only do the work of 1 iteration of the original loop.
*/
__global__ void loop()
{
printf("This is iteration number %d\n", threadIdx.x);
}
int main()
{
/*
* When refactoring `loop` to launch as a kernel, be sure
* to use the execution configuration to control how many
* "iterations" to perform.
*
* For this exercise, only use 1 block of threads.
*/
int N = 10;
loop<<<1, N>>>();
cudaDeviceSynchronize();
}

View File

@ -0,0 +1,22 @@
#include <stdio.h>
__global__ void printSuccessForCorrectExecutionConfiguration()
{
if (threadIdx.x == 1023 && blockIdx.x == 255)
{
printf("Success!\n");
}
}
int main()
{
/*
* Update the execution configuration so that the kernel
* will print `"Success!"`.
*/
printSuccessForCorrectExecutionConfiguration<<<256, 1024>>>();
cudaDeviceSynchronize();
}

View File

@ -0,0 +1,69 @@
#include <stdio.h>
/*
* Currently, `initializeElementsTo`, if executed in a thread whose
* `i` is calculated to be greater than `N`, will try to access a value
* outside the range of `a`.
*
* Refactor the kernel definition to prevent out of range accesses.
*/
__global__ void initializeElementsTo(int initialValue, int *a, int N)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < N)
{
a[i] = initialValue;
}
}
int main()
{
/*
* Do not modify `N`.
*/
int N = 1000;
int *a;
size_t size = N * sizeof(int);
cudaMallocManaged(&a, size);
/*
* Assume we have reason to want the number of threads
* fixed at `256`: do not modify `threads_per_block`.
*/
size_t threads_per_block = 256;
/*
* Assign a value to `number_of_blocks` that will
* allow for a working execution configuration given
* the fixed values for `N` and `threads_per_block`.
*/
size_t number_of_blocks = 4;
int initialValue = 6;
initializeElementsTo<<<number_of_blocks, threads_per_block>>>(initialValue, a, N);
cudaDeviceSynchronize();
/*
* Check to make sure all values in `a`, were initialized.
*/
for (int i = 0; i < N; ++i)
{
if (a[i] != initialValue)
{
printf("FAILURE: target value: %d\t a[%d]: %d\n", initialValue, i, a[i]);
cudaFree(a);
exit(1);
}
}
printf("SUCCESS!\n");
cudaFree(a);
}

View File

@ -0,0 +1,28 @@
#include <stdio.h>
/*
* Refactor `loop` to be a CUDA Kernel. The new kernel should
* only do the work of 1 iteration of the original loop.
*/
__global__ void loop()
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
printf("This is iteration number %d\n", i);
}
int main()
{
/*
* When refactoring `loop` to launch as a kernel, be sure
* to use the execution configuration to control how many
* "iterations" to perform.
*
* For this exercise, be sure to use more than 1 block in
* the execution configuration.
*/
loop<<<2, 5>>>();
cudaDeviceSynchronize();
}

View File

@ -0,0 +1,68 @@
#include <stdio.h>
void init(int *a, int N)
{
int i;
for (i = 0; i < N; ++i)
{
a[i] = i;
}
}
/*
* In the current application, `N` is larger than the grid.
* Refactor this kernel to use a grid-stride loop in order that
* each parallel thread work on more than one element of the array.
*/
__global__ void doubleElements(int *a, int N)
{
int indexWithinTheGrid = threadIdx.x + blockIdx.x * blockDim.x;
int gridStride = gridDim.x * blockDim.x;
for (int i = indexWithinTheGrid; i < N; i += gridStride)
{
a[i] *= 2;
}
}
bool checkElementsAreDoubled(int *a, int N)
{
int i;
for (i = 0; i < N; ++i)
{
if (a[i] != i * 2)
return false;
}
return true;
}
int main()
{
/*
* `N` is greater than the size of the grid (see below).
*/
int N = 10000;
int *a;
size_t size = N * sizeof(int);
cudaMallocManaged(&a, size);
init(a, N);
/*
* The size of this grid is 256*32 = 8192.
*/
size_t threads_per_block = 256;
size_t number_of_blocks = 32;
doubleElements<<<number_of_blocks, threads_per_block>>>(a, N);
cudaDeviceSynchronize();
bool areDoubled = checkElementsAreDoubled(a, N);
printf("All elements were doubled? %s\n", areDoubled ? "TRUE" : "FALSE");
cudaFree(a);
}