mirror of
https://github.com/guilhermewerner/nvidia-cuda
synced 2025-06-15 04:34:19 +00:00
Section 1
This commit is contained in:
81
src/01-double-elements.cu
Normal file
81
src/01-double-elements.cu
Normal file
@ -0,0 +1,81 @@
|
||||
#include <stdio.h>
|
||||
|
||||
/*
 * Fill `a` with the sequence 0..N-1 on the host (CPU-side setup).
 */

void init(int *a, int N)
{
  for (int idx = 0; idx < N; idx++)
  {
    a[idx] = idx;
  }
}
|
||||
|
||||
/*
 * Double each element of `a` in parallel on the GPU.
 *
 * Uses a grid-stride loop so the kernel is correct for ANY launch
 * configuration: if the grid has fewer threads than `N`, each thread
 * simply processes several elements. For grids with >= N threads the
 * behavior is identical to the original one-element-per-thread form.
 */

__global__ void doubleElements(int *a, int N)
{
  int stride = gridDim.x * blockDim.x;

  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride)
  {
    a[i] *= 2;
  }
}
|
||||
|
||||
/*
 * Host-side verification: true iff a[i] == 2 * i for every i in [0, N).
 */

bool checkElementsAreDoubled(int *a, int N)
{
  bool allDoubled = true;
  for (int idx = 0; idx < N && allDoubled; ++idx)
  {
    allDoubled = (a[idx] == 2 * idx);
  }
  return allDoubled;
}
|
||||
|
||||
int main()
{
  int N = 100;
  int *a;

  size_t size = N * sizeof(int);

  /*
   * `cudaMallocManaged` provides a single pointer that is valid on
   * both the host (init / verification) and the device (kernel).
   */

  cudaMallocManaged(&a, size);

  init(a, N);

  size_t threads_per_block = 10;

  /*
   * Round the block count up so the grid covers all `N` elements even
   * when `N` is not a multiple of the block size (here: 10 blocks,
   * same as the original hard-coded value).
   */

  size_t number_of_blocks = (N + threads_per_block - 1) / threads_per_block;

  doubleElements<<<number_of_blocks, threads_per_block>>>(a, N);

  /*
   * Kernel launches are asynchronous and do not return errors
   * directly: check for launch-configuration errors explicitly, then
   * synchronize so the host can safely read the managed memory.
   */

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    printf("Kernel launch failed: %s\n", cudaGetErrorString(err));
  }
  cudaDeviceSynchronize();

  bool areDoubled = checkElementsAreDoubled(a, N);
  printf("All elements were doubled? %s\n", areDoubled ? "TRUE" : "FALSE");

  /*
   * Managed allocations are released with `cudaFree`, just like plain
   * device allocations.
   */

  cudaFree(a);
}
|
27
src/01-first-parallel.cu
Normal file
27
src/01-first-parallel.cu
Normal file
@ -0,0 +1,27 @@
|
||||
#include <stdio.h>
|
||||
|
||||
/*
 * Kernel version of `firstParallel`: every GPU thread that executes
 * it prints the message once.
 */

__global__ void firstParallel()
{
  printf("This should be running in parallel.\n");
}
|
||||
|
||||
int main()
{
  /*
   * Launch `firstParallel` over 10 blocks of 10 threads each, so the
   * message is printed by 100 GPU threads in parallel.
   */

  const int numBlocks = 10;
  const int threadsPerBlock = 10;

  firstParallel<<<numBlocks, threadsPerBlock>>>();

  /*
   * Kernel launches are asynchronous: block the CPU here until the
   * GPU work (and its printf output) has completed.
   */

  cudaDeviceSynchronize();
}
|
38
src/01-hello-gpu.cu
Normal file
38
src/01-hello-gpu.cu
Normal file
@ -0,0 +1,38 @@
|
||||
#include <stdio.h>
|
||||
|
||||
/*
 * Host-side greeting; runs on the CPU like any ordinary C function.
 */
void helloCPU()
{
  printf("Hello from the CPU.\n");
}
|
||||
|
||||
/*
 * `helloGPU` is a kernel (`__global__`) so it can be launched on the
 * GPU. The exercise requires its message to read
 * "Hello from the GPU!" — the previous version still printed
 * "Hello from the GPU." and never satisfied that requirement.
 */

__global__ void helloGPU()
{
  printf("Hello from the GPU!\n");
}
|
||||
|
||||
int main()
{
  helloCPU();

  /*
   * `helloGPU` is a kernel, so it is launched with an execution
   * configuration: here a single block containing a single thread.
   */

  helloGPU<<<1, 1>>>();

  /*
   * The launch returns immediately; wait here for the kernel to
   * finish so its output is flushed before the CPU thread continues.
   */

  cudaDeviceSynchronize();
}
|
27
src/01-single-block-loop.cu
Normal file
27
src/01-single-block-loop.cu
Normal file
@ -0,0 +1,27 @@
|
||||
#include <stdio.h>
|
||||
|
||||
/*
 * CUDA kernel replacing the original serial loop: each thread
 * performs exactly one "iteration", identified by its thread index
 * within the (single) block.
 */

__global__ void loop()
{
  int iteration = threadIdx.x;
  printf("This is iteration number %d\n", iteration);
}
|
||||
|
||||
int main()
{
  /*
   * The execution configuration controls how many "iterations" run:
   * as the exercise requires, exactly 1 block is used, with one
   * thread per former loop iteration.
   */

  const int iterations = 10;

  loop<<<1, iterations>>>();

  /* Wait for the GPU (and its printf output) before exiting. */
  cudaDeviceSynchronize();
}
|
22
src/01-thread-and-block-idx.cu
Normal file
22
src/01-thread-and-block-idx.cu
Normal file
@ -0,0 +1,22 @@
|
||||
#include <stdio.h>
|
||||
|
||||
/*
 * Prints "Success!" only from the thread with threadIdx.x == 1023 in
 * the block with blockIdx.x == 255; every other thread does nothing,
 * so the launch configuration must provide at least 256 blocks of
 * 1024 threads for the message to appear.
 */

__global__ void printSuccessForCorrectExecutionConfiguration()
{
  bool isTargetThread = (threadIdx.x == 1023) && (blockIdx.x == 255);
  if (isTargetThread)
  {
    printf("Success!\n");
  }
}
|
||||
|
||||
int main()
{
  /*
   * The kernel only prints when it observes threadIdx.x == 1023 and
   * blockIdx.x == 255, so the grid needs 256 blocks of 1024 threads.
   */

  const int numBlocks = 256;
  const int threadsPerBlock = 1024;

  printSuccessForCorrectExecutionConfiguration<<<numBlocks, threadsPerBlock>>>();

  /* Block the host until the kernel's printf output is complete. */
  cudaDeviceSynchronize();
}
|
69
src/02-mismatched-config-loop.cu
Normal file
69
src/02-mismatched-config-loop.cu
Normal file
@ -0,0 +1,69 @@
|
||||
#include <stdio.h>
|
||||
|
||||
/*
 * Set a[i] = initialValue, one element per thread.
 *
 * The grid may contain more threads than elements (the launch rounds
 * the block count up), so any thread whose global index falls past
 * the end of the array must do nothing — the guard below prevents the
 * out-of-range accesses described in the exercise.
 */

__global__ void initializeElementsTo(int initialValue, int *a, int N)
{
  int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
  if (globalIdx >= N)
    return;
  a[globalIdx] = initialValue;
}
|
||||
|
||||
int main()
{
  /*
   * Do not modify `N`.
   */

  int N = 1000;

  int *a;
  size_t size = N * sizeof(int);

  cudaMallocManaged(&a, size);

  /*
   * Assume we have reason to want the number of threads
   * fixed at `256`: do not modify `threads_per_block`.
   */

  size_t threads_per_block = 256;

  /*
   * Round the block count up so the grid covers all `N` elements for
   * any `N` (the previous hard-coded `4` only happened to work while
   * N <= 1024). The kernel's bounds check absorbs the surplus
   * threads in the final block. For N = 1000 this still yields 4.
   */

  size_t number_of_blocks = (N + threads_per_block - 1) / threads_per_block;

  int initialValue = 6;

  initializeElementsTo<<<number_of_blocks, threads_per_block>>>(initialValue, a, N);
  cudaDeviceSynchronize();

  /*
   * Check to make sure all values in `a`, were initialized.
   */

  for (int i = 0; i < N; ++i)
  {
    if (a[i] != initialValue)
    {
      printf("FAILURE: target value: %d\t a[%d]: %d\n", initialValue, i, a[i]);
      cudaFree(a);
      exit(1);
    }
  }
  printf("SUCCESS!\n");

  cudaFree(a);
}
|
28
src/02-multi-block-loop.cu
Normal file
28
src/02-multi-block-loop.cu
Normal file
@ -0,0 +1,28 @@
|
||||
#include <stdio.h>
|
||||
|
||||
/*
 * Kernel form of the original serial loop: every launched thread
 * handles one "iteration", numbered by its global (grid-wide) thread
 * index so the numbering spans multiple blocks.
 */

__global__ void loop()
{
  int globalThread = blockIdx.x * blockDim.x + threadIdx.x;
  printf("This is iteration number %d\n", globalThread);
}
|
||||
|
||||
int main()
{
  /*
   * Ten total "iterations", spread across more than 1 block as the
   * exercise requires: 2 blocks of 5 threads each.
   */

  const int numBlocks = 2;
  const int threadsPerBlock = 5;

  loop<<<numBlocks, threadsPerBlock>>>();

  /* Wait for all GPU printf output before the program exits. */
  cudaDeviceSynchronize();
}
|
68
src/03-grid-stride-double.cu
Normal file
68
src/03-grid-stride-double.cu
Normal file
@ -0,0 +1,68 @@
|
||||
#include <stdio.h>
|
||||
|
||||
/* Host-side initialization: a[n] = n for every n in [0, N). */
void init(int *a, int N)
{
  int n = 0;
  while (n < N)
  {
    a[n] = n;
    ++n;
  }
}
|
||||
|
||||
/*
 * Double every element of `a` using a grid-stride loop: `N` exceeds
 * the total number of threads in the launch, so each thread advances
 * through the array by the full grid size and processes several
 * elements rather than exactly one.
 */

__global__ void doubleElements(int *a, int N)
{
  int start = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;

  for (int i = start; i < N; i += stride)
    a[i] *= 2;
}
|
||||
|
||||
/*
 * Host-side check: returns true when every a[i] equals 2*i, false as
 * soon as any element fails the test.
 */
bool checkElementsAreDoubled(int *a, int N)
{
  for (int idx = N - 1; idx >= 0; --idx)
  {
    if (a[idx] != idx + idx)
      return false;
  }
  return true;
}
|
||||
|
||||
int main()
{
  /*
   * `N` is greater than the size of the grid (see below).
   */

  const int N = 10000;

  int *a = NULL;
  cudaMallocManaged(&a, N * sizeof(int));

  init(a, N);

  /*
   * 32 blocks x 256 threads = 8192 total threads, fewer than `N`, so
   * the grid-stride loop in `doubleElements` must wrap around.
   */

  const size_t threads_per_block = 256;
  const size_t number_of_blocks = 32;

  doubleElements<<<number_of_blocks, threads_per_block>>>(a, N);
  cudaDeviceSynchronize();

  const bool areDoubled = checkElementsAreDoubled(a, N);
  printf("All elements were doubled? %s\n", areDoubled ? "TRUE" : "FALSE");

  cudaFree(a);
}
|
Reference in New Issue
Block a user