mirror of
https://github.com/guilhermewerner/nvidia-cuda
synced 2025-06-15 04:34:19 +00:00
Section 2
This commit is contained in:
30
src/01-get-device-properties.cu
Normal file
30
src/01-get-device-properties.cu
Normal file
@ -0,0 +1,30 @@
|
||||
#include <stdio.h>
|
||||
|
||||
int main()
{
    /*
     * Query the currently active GPU and print a handful of its
     * hardware properties.
     */

    int deviceId;
    cudaGetDevice(&deviceId);

    cudaDeviceProp props;
    cudaGetDeviceProperties(&props, deviceId);

    // Pull the requested fields out of the property struct.
    const int computeCapabilityMajor = props.major;
    const int computeCapabilityMinor = props.minor;
    const int multiProcessorCount = props.multiProcessorCount;
    const int warpSize = props.warpSize;

    printf("Device ID: %d\nNumber of SMs: %d\nCompute Capability Major: %d\nCompute Capability Minor: %d\nWarp Size: %d\n", deviceId, multiProcessorCount, computeCapabilityMajor, computeCapabilityMinor, warpSize);
}
|
28
src/01-page-faults-solution-cpu-only.cu
Normal file
28
src/01-page-faults-solution-cpu-only.cu
Normal file
@ -0,0 +1,28 @@
|
||||
/*
 * Writes 1 into every element of `a` using a grid-stride loop, so any
 * launch configuration covers the whole array.
 */
__global__ void deviceKernel(int *a, int N)
{
    int first = threadIdx.x + blockIdx.x * blockDim.x;
    int hop = blockDim.x * gridDim.x;

    for (int i = first; i < N; i += hop)
        a[i] = 1;
}
|
||||
|
||||
/*
 * CPU counterpart of deviceKernel: writes 1 into each of the N entries.
 */
void hostFunction(int *a, int N)
{
    int i = 0;
    while (i < N)
    {
        a[i] = 1;
        ++i;
    }
}
|
||||
|
||||
int main()
{
    /*
     * Allocate managed memory and touch it only from the CPU, so every
     * page fault in this program happens on the host side.
     */
    int N = 2 << 24;
    size_t size = N * sizeof(int);

    int *a;
    cudaMallocManaged(&a, size);

    hostFunction(a, N);

    cudaFree(a);
}
|
68
src/01-saxpy.cu
Normal file
68
src/01-saxpy.cu
Normal file
@ -0,0 +1,68 @@
|
||||
#include <stdio.h>
|
||||
|
||||
#define N 2048 * 2048 // Number of elements in each vector
|
||||
|
||||
/*
|
||||
* Optimize this already-accelerated codebase. Work iteratively,
|
||||
* and use nsys to support your work.
|
||||
*
|
||||
* Aim to profile `saxpy` (without modifying `N`) running under
|
||||
* 20us.
|
||||
*
|
||||
* Some bugs have been placed in this codebase for your edification.
|
||||
*/
|
||||
|
||||
/*
 * SAXPY via a grid-stride loop: c[i] = 2 * a[i] + b[i] for all N elements.
 *
 * BUG FIX: the flat thread index was computed as
 * `blockIdx.x * blockDim.x * threadIdx.x` (multiplication). That makes
 * every lane-0 thread start at index 0 and leaves most elements of `c`
 * unwritten; the correct flat index ADDS threadIdx.x.
 */
__global__ void saxpy(int *a, int *b, int *c)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int i = tid; i < N; i += stride)
    {
        c[i] = 2 * a[i] + b[i];
    }
}
|
||||
|
||||
int main()
{
    int *a, *b, *c;

    size_t size = N * sizeof(int); // The total number of bytes per vector

    int deviceId;
    int numberOfSms;

    cudaGetDevice(&deviceId);
    cudaDeviceGetAttribute(&numberOfSms, cudaDevAttrMultiProcessorCount, deviceId);

    cudaMallocManaged(&a, size);
    cudaMallocManaged(&b, size);
    cudaMallocManaged(&c, size);

    // Initialize memory
    for (int i = 0; i < N; ++i)
    {
        a[i] = 2;
        b[i] = 1;
        c[i] = 0;
    }

    // Prefetch the vectors to the GPU so the kernel does not stall on
    // host-to-device page migrations.
    cudaMemPrefetchAsync(a, size, deviceId);
    cudaMemPrefetchAsync(b, size, deviceId);
    cudaMemPrefetchAsync(c, size, deviceId);

    /*
     * FIX: `numberOfSms` was queried but never used; the grid was sized
     * one-thread-per-element ((N / 128) + 1 blocks). Size the grid as a
     * multiple of the SM count instead — the grid-stride kernel covers
     * any remainder.
     */
    int threads_per_block = 256;
    int number_of_blocks = 32 * numberOfSms;

    saxpy<<<number_of_blocks, threads_per_block>>>(a, b, c);

    // Launch-configuration errors surface immediately ...
    cudaError_t launchErr = cudaGetLastError();
    if (launchErr != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(launchErr));

    // ... asynchronous execution errors surface at synchronization.
    cudaError_t syncErr = cudaDeviceSynchronize();
    if (syncErr != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(syncErr));

    // Migrate the results back before the CPU reads them.
    cudaMemPrefetchAsync(c, size, cudaCpuDeviceId);

    // Print out the first and last 5 values of c for a quality check
    for (int i = 0; i < 5; ++i)
        printf("c[%d] = %d, ", i, c[i]);
    printf("\n");
    for (int i = N - 5; i < N; ++i)
        printf("c[%d] = %d, ", i, c[i]);
    printf("\n");

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
}
|
101
src/01-vector-add-init-in-kernel-solution.cu
Normal file
101
src/01-vector-add-init-in-kernel-solution.cu
Normal file
@ -0,0 +1,101 @@
|
||||
#include <stdio.h>
|
||||
|
||||
/*
|
||||
* Refactor host function to run as CUDA kernel
|
||||
*/
|
||||
|
||||
/*
 * Device-side fill: stores `num` into all N elements of `a` via a
 * grid-stride loop.
 */
__global__ void initWith(float num, float *a, int N)
{
    int offset = threadIdx.x + blockIdx.x * blockDim.x;
    int span = blockDim.x * gridDim.x;

    for (int i = offset; i < N; i += span)
        a[i] = num;
}
|
||||
|
||||
/*
 * Element-wise sum: result[i] = a[i] + b[i], covered by a grid-stride
 * loop so any launch configuration processes all N elements.
 */
__global__ void addArraysInto(float *result, float *a, float *b, int N)
{
    int offset = threadIdx.x + blockIdx.x * blockDim.x;
    int span = blockDim.x * gridDim.x;

    for (int i = offset; i < N; i += span)
        result[i] = a[i] + b[i];
}
|
||||
|
||||
/*
 * Host-side verification: confirms every entry of `array` equals
 * `target`; exits with status 1 on the first mismatch, otherwise
 * reports success.
 */
void checkElementsAre(float target, float *array, int N)
{
    int i = 0;
    while (i < N)
    {
        if (array[i] != target)
        {
            printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", i, array[i], target);
            exit(1);
        }
        ++i;
    }
    printf("Success! All values calculated correctly.\n");
}
|
||||
|
||||
int main()
{
    // Query the active device and its SM count to size the grid.
    int deviceId;
    int numberOfSMs;
    cudaGetDevice(&deviceId);
    cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
    printf("Device ID: %d\tNumber of SMs: %d\n", deviceId, numberOfSMs);

    const int N = 2 << 24;
    size_t size = N * sizeof(float);

    float *a;
    float *b;
    float *c;
    cudaMallocManaged(&a, size);
    cudaMallocManaged(&b, size);
    cudaMallocManaged(&c, size);

    // An SM-multiple grid pairs naturally with grid-stride kernels.
    size_t threadsPerBlock = 256;
    size_t numberOfBlocks = 32 * numberOfSMs;

    // Initialization now runs on the GPU.
    initWith<<<numberOfBlocks, threadsPerBlock>>>(3, a, N);
    initWith<<<numberOfBlocks, threadsPerBlock>>>(4, b, N);
    initWith<<<numberOfBlocks, threadsPerBlock>>>(0, c, N);

    // Host code must wait for device-side initialization to complete
    // before depending on its results.
    cudaDeviceSynchronize();

    addArraysInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);

    // Launch-configuration errors surface immediately ...
    cudaError_t addArraysErr = cudaGetLastError();
    if (addArraysErr != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(addArraysErr));

    // ... asynchronous execution errors surface at synchronization.
    cudaError_t asyncErr = cudaDeviceSynchronize();
    if (asyncErr != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(asyncErr));

    checkElementsAre(7, c, N);

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
}
|
90
src/01-vector-add-prefetch-solution.cu
Normal file
90
src/01-vector-add-prefetch-solution.cu
Normal file
@ -0,0 +1,90 @@
|
||||
#include <stdio.h>
|
||||
|
||||
/*
 * Device-side fill: stores `num` into all N elements of `a` via a
 * grid-stride loop.
 */
__global__ void initWith(float num, float *a, int N)
{
    int offset = threadIdx.x + blockIdx.x * blockDim.x;
    int span = blockDim.x * gridDim.x;

    for (int i = offset; i < N; i += span)
        a[i] = num;
}
|
||||
|
||||
/*
 * Element-wise vector sum: result[i] = a[i] + b[i], grid-stride so any
 * launch configuration covers all N elements.
 */
__global__ void addVectorsInto(float *result, float *a, float *b, int N)
{
    int offset = threadIdx.x + blockIdx.x * blockDim.x;
    int span = blockDim.x * gridDim.x;

    for (int i = offset; i < N; i += span)
        result[i] = a[i] + b[i];
}
|
||||
|
||||
/*
 * Host-side verification: confirms every entry of `vector` equals
 * `target`; exits with status 1 on the first mismatch, otherwise
 * reports success.
 */
void checkElementsAre(float target, float *vector, int N)
{
    int i = 0;
    while (i < N)
    {
        if (vector[i] != target)
        {
            printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
            exit(1);
        }
        ++i;
    }
    printf("Success! All values calculated correctly.\n");
}
|
||||
|
||||
int main()
{
    // Query the active device and its SM count to size the grid.
    int deviceId;
    int numberOfSMs;
    cudaGetDevice(&deviceId);
    cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);

    const int N = 2 << 24;
    size_t size = N * sizeof(float);

    float *a;
    float *b;
    float *c;
    cudaMallocManaged(&a, size);
    cudaMallocManaged(&b, size);
    cudaMallocManaged(&c, size);

    // Migrate all three vectors to the GPU up front so the kernels do
    // not pay page-fault costs on first touch.
    cudaMemPrefetchAsync(a, size, deviceId);
    cudaMemPrefetchAsync(b, size, deviceId);
    cudaMemPrefetchAsync(c, size, deviceId);

    // An SM-multiple grid pairs naturally with grid-stride kernels.
    size_t threadsPerBlock = 256;
    size_t numberOfBlocks = 32 * numberOfSMs;

    // Initialization and the add run in the same (default) stream, so
    // no explicit synchronization is needed between the launches.
    initWith<<<numberOfBlocks, threadsPerBlock>>>(3, a, N);
    initWith<<<numberOfBlocks, threadsPerBlock>>>(4, b, N);
    initWith<<<numberOfBlocks, threadsPerBlock>>>(0, c, N);

    addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);

    // Launch-configuration errors surface immediately ...
    cudaError_t addVectorsErr = cudaGetLastError();
    if (addVectorsErr != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(addVectorsErr));

    // ... asynchronous execution errors surface at synchronization.
    cudaError_t asyncErr = cudaDeviceSynchronize();
    if (asyncErr != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(asyncErr));

    checkElementsAre(7, c, N);

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
}
|
103
src/01-vector-add.cu
Normal file
103
src/01-vector-add.cu
Normal file
@ -0,0 +1,103 @@
|
||||
#include <stdio.h>
|
||||
|
||||
/*
|
||||
* Host function to initialize vector elements. This function
|
||||
* simply initializes each element to equal its index in the
|
||||
* vector.
|
||||
*/
|
||||
|
||||
/*
 * Host-side fill: stores `num` into each of the N elements of `a`.
 */
void initWith(float num, float *a, int N)
{
    int i = 0;
    while (i < N)
    {
        a[i] = num;
        ++i;
    }
}
|
||||
|
||||
/*
|
||||
* Device kernel stores into `result` the sum of each
|
||||
* same-indexed value of `a` and `b`.
|
||||
*/
|
||||
|
||||
/*
 * Device kernel: stores into `result` the element-wise sum of `a` and
 * `b`, using a grid-stride loop so any launch configuration covers all
 * N elements.
 */
__global__ void addVectorsInto(float *result, float *a, float *b, int N)
{
    int offset = threadIdx.x + blockIdx.x * blockDim.x;
    int span = blockDim.x * gridDim.x;

    for (int i = offset; i < N; i += span)
        result[i] = a[i] + b[i];
}
|
||||
|
||||
/*
|
||||
* Host function to confirm values in `vector`. This function
|
||||
* assumes all values are the same `target` value.
|
||||
*/
|
||||
|
||||
/*
 * Host-side verification: confirms every entry of `vector` equals
 * `target`; exits with status 1 on the first mismatch, otherwise
 * reports success.
 */
void checkElementsAre(float target, float *vector, int N)
{
    int i = 0;
    while (i < N)
    {
        if (vector[i] != target)
        {
            printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
            exit(1);
        }
        ++i;
    }
    printf("Success! All values calculated correctly.\n");
}
|
||||
|
||||
int main()
{
    const int N = 2 << 24;
    size_t size = N * sizeof(float);

    float *a;
    float *b;
    float *c;
    cudaMallocManaged(&a, size);
    cudaMallocManaged(&b, size);
    cudaMallocManaged(&c, size);

    // Initialization still happens on the host in this version.
    initWith(3, a, N);
    initWith(4, b, N);
    initWith(0, c, N);

    size_t threadsPerBlock;
    size_t numberOfBlocks;

    /*
     * nsys should register performance changes when execution
     * configuration is updated.
     */

    // Size the grid from the device's SM count.
    int deviceId;
    cudaGetDevice(&deviceId);
    cudaDeviceProp props;
    cudaGetDeviceProperties(&props, deviceId);
    int multiProcessorCount = props.multiProcessorCount;

    threadsPerBlock = 32;
    numberOfBlocks = 2 * multiProcessorCount;

    addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);

    // Launch-configuration errors surface immediately ...
    cudaError_t addVectorsErr = cudaGetLastError();
    if (addVectorsErr != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(addVectorsErr));

    // ... asynchronous execution errors surface at synchronization.
    cudaError_t asyncErr = cudaDeviceSynchronize();
    if (asyncErr != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(asyncErr));

    checkElementsAre(7, c, N);

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
}
|
31
src/02-page-faults-solution-gpu-only.cu
Normal file
31
src/02-page-faults-solution-gpu-only.cu
Normal file
@ -0,0 +1,31 @@
|
||||
/*
 * Writes 1 into every element of `a` using a grid-stride loop, so any
 * launch configuration covers the whole array.
 */
__global__ void deviceKernel(int *a, int N)
{
    int first = threadIdx.x + blockIdx.x * blockDim.x;
    int hop = blockDim.x * gridDim.x;

    for (int i = first; i < N; i += hop)
        a[i] = 1;
}
|
||||
|
||||
/*
 * CPU counterpart of deviceKernel: writes 1 into each of the N entries.
 * (Unused in this GPU-only variant, kept for parity with the others.)
 */
void hostFunction(int *a, int N)
{
    int i = 0;
    while (i < N)
    {
        a[i] = 1;
        ++i;
    }
}
|
||||
|
||||
int main()
{
    /*
     * Allocate managed memory and touch it only from the GPU, so every
     * page fault in this program happens on the device side.
     */
    int N = 2 << 24;
    size_t size = N * sizeof(int);

    int *a;
    cudaMallocManaged(&a, size);

    deviceKernel<<<256, 256>>>(a, N);
    cudaDeviceSynchronize();

    cudaFree(a);
}
|
92
src/02-vector-add-prefetch-solution-cpu-also.cu
Normal file
92
src/02-vector-add-prefetch-solution-cpu-also.cu
Normal file
@ -0,0 +1,92 @@
|
||||
#include <stdio.h>
|
||||
|
||||
/*
 * Device-side fill: stores `num` into all N elements of `a` via a
 * grid-stride loop.
 */
__global__ void initWith(float num, float *a, int N)
{
    int offset = threadIdx.x + blockIdx.x * blockDim.x;
    int span = blockDim.x * gridDim.x;

    for (int i = offset; i < N; i += span)
        a[i] = num;
}
|
||||
|
||||
/*
 * Element-wise vector sum: result[i] = a[i] + b[i], grid-stride so any
 * launch configuration covers all N elements.
 */
__global__ void addVectorsInto(float *result, float *a, float *b, int N)
{
    int offset = threadIdx.x + blockIdx.x * blockDim.x;
    int span = blockDim.x * gridDim.x;

    for (int i = offset; i < N; i += span)
        result[i] = a[i] + b[i];
}
|
||||
|
||||
/*
 * Host-side verification: confirms every entry of `vector` equals
 * `target`; exits with status 1 on the first mismatch, otherwise
 * reports success.
 */
void checkElementsAre(float target, float *vector, int N)
{
    int i = 0;
    while (i < N)
    {
        if (vector[i] != target)
        {
            printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
            exit(1);
        }
        ++i;
    }
    printf("Success! All values calculated correctly.\n");
}
|
||||
|
||||
int main()
{
    // Query the active device and its SM count to size the grid.
    int deviceId;
    int numberOfSMs;
    cudaGetDevice(&deviceId);
    cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);

    const int N = 2 << 24;
    size_t size = N * sizeof(float);

    float *a;
    float *b;
    float *c;
    cudaMallocManaged(&a, size);
    cudaMallocManaged(&b, size);
    cudaMallocManaged(&c, size);

    // Migrate all three vectors to the GPU up front so the kernels do
    // not pay page-fault costs on first touch.
    cudaMemPrefetchAsync(a, size, deviceId);
    cudaMemPrefetchAsync(b, size, deviceId);
    cudaMemPrefetchAsync(c, size, deviceId);

    // An SM-multiple grid pairs naturally with grid-stride kernels.
    size_t threadsPerBlock = 256;
    size_t numberOfBlocks = 32 * numberOfSMs;

    // Initialization and the add run in the same (default) stream, so
    // no explicit synchronization is needed between the launches.
    initWith<<<numberOfBlocks, threadsPerBlock>>>(3, a, N);
    initWith<<<numberOfBlocks, threadsPerBlock>>>(4, b, N);
    initWith<<<numberOfBlocks, threadsPerBlock>>>(0, c, N);

    addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);

    // Launch-configuration errors surface immediately ...
    cudaError_t addVectorsErr = cudaGetLastError();
    if (addVectorsErr != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(addVectorsErr));

    // ... asynchronous execution errors surface at synchronization.
    cudaError_t asyncErr = cudaDeviceSynchronize();
    if (asyncErr != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(asyncErr));

    // Migrate the results back to the CPU before verifying them there.
    cudaMemPrefetchAsync(c, size, cudaCpuDeviceId);

    checkElementsAre(7, c, N);

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
}
|
32
src/03-page-faults-solution-cpu-then-gpu.cu
Normal file
32
src/03-page-faults-solution-cpu-then-gpu.cu
Normal file
@ -0,0 +1,32 @@
|
||||
/*
 * Writes 1 into every element of `a` using a grid-stride loop, so any
 * launch configuration covers the whole array.
 */
__global__ void deviceKernel(int *a, int N)
{
    int first = threadIdx.x + blockIdx.x * blockDim.x;
    int hop = blockDim.x * gridDim.x;

    for (int i = first; i < N; i += hop)
        a[i] = 1;
}
|
||||
|
||||
/*
 * CPU counterpart of deviceKernel: writes 1 into each of the N entries.
 */
void hostFunction(int *a, int N)
{
    int i = 0;
    while (i < N)
    {
        a[i] = 1;
        ++i;
    }
}
|
||||
|
||||
int main()
{
    /*
     * Touch the managed allocation on the CPU first and on the GPU
     * second, producing host faults followed by device faults.
     */
    int N = 2 << 24;
    size_t size = N * sizeof(int);

    int *a;
    cudaMallocManaged(&a, size);

    hostFunction(a, N);
    deviceKernel<<<256, 256>>>(a, N);
    cudaDeviceSynchronize();

    cudaFree(a);
}
|
30
src/04-page-faults-solution-gpu-then-cpu.cu
Normal file
30
src/04-page-faults-solution-gpu-then-cpu.cu
Normal file
@ -0,0 +1,30 @@
|
||||
/*
 * Writes 1 into every element of `a` using a grid-stride loop, so any
 * launch configuration covers the whole array.
 */
__global__ void deviceKernel(int *a, int N)
{
    int first = threadIdx.x + blockIdx.x * blockDim.x;
    int hop = blockDim.x * gridDim.x;

    for (int i = first; i < N; i += hop)
        a[i] = 1;
}
|
||||
|
||||
/*
 * CPU counterpart of deviceKernel: writes 1 into each of the N entries.
 */
void hostFunction(int *a, int N)
{
    int i = 0;
    while (i < N)
    {
        a[i] = 1;
        ++i;
    }
}
|
||||
|
||||
int main()
{
    /*
     * Touch the managed allocation on the GPU first and on the CPU
     * second, producing device faults followed by host faults.
     */
    int N = 2 << 24;
    size_t size = N * sizeof(int);

    int *a;
    cudaMallocManaged(&a, size);

    deviceKernel<<<256, 256>>>(a, N);
    cudaDeviceSynchronize();
    hostFunction(a, N);

    cudaFree(a);
}
|
Reference in New Issue
Block a user