mirror of
https://github.com/guilhermewerner/nvidia-cuda
synced 2025-06-16 13:14:18 +00:00
Section 2
This commit is contained in:
90
src/01-vector-add-prefetch-solution.cu
Normal file
90
src/01-vector-add-prefetch-solution.cu
Normal file
@ -0,0 +1,90 @@
|
||||
#include <stdio.h>
|
||||
|
||||
// Fills the first N elements of `a` with the value `num`.
//
// Launch layout: 1D grid of 1D blocks. Any grid/block size is valid because
// every thread walks the array in a grid-stride loop. Uses no shared memory.
//
// Parameters:
//   num - value written to every element.
//   a   - device-accessible buffer holding at least N floats.
//   N   - number of elements to fill; N <= 0 makes the kernel a no-op.
__global__ void initWith(float num, float *a, int N)
{
    // Index math is done in signed 64-bit: with a 32-bit counter, `i += stride`
    // overflows once N is within one stride of INT_MAX, and the product
    // blockIdx.x * blockDim.x can wrap for very large grids.
    const long long stride = (long long)blockDim.x * gridDim.x;
    long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    for (; i < N; i += stride)
    {
        a[i] = num;
    }
}
|
||||
|
||||
// Element-wise vector addition: result[i] = a[i] + b[i] for 0 <= i < N.
//
// Launch layout: 1D grid of 1D blocks. Any grid/block size is valid because
// every thread walks the arrays in a grid-stride loop. Uses no shared memory.
//
// Parameters:
//   result - device-accessible output buffer holding at least N floats.
//   a, b   - device-accessible input buffers holding at least N floats each.
//   N      - number of elements to add; N <= 0 makes the kernel a no-op.
//
// Each element is read and written by exactly one thread at the same index,
// so `result` may be the same buffer as `a` or `b`.
__global__ void addVectorsInto(float *result, float *a, float *b, int N)
{
    // Index math is done in signed 64-bit: with a 32-bit counter, `i += stride`
    // overflows once N is within one stride of INT_MAX, and the product
    // blockIdx.x * blockDim.x can wrap for very large grids.
    const long long stride = (long long)blockDim.x * gridDim.x;
    long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    for (; i < N; i += stride)
    {
        result[i] = a[i] + b[i];
    }
}
|
||||
|
||||
// Host-side verification: confirms that each of the first N entries of
// `vector` compares equal to `target`.
//
// On the first mismatch a FAIL line naming the index, the value found and the
// expected value is printed and the process exits with status 1. When every
// entry matches (including N <= 0) a success line is printed.
void checkElementsAre(float target, float *vector, int N)
{
    // Locate the first entry that is not equal to the target, if any.
    int firstMismatch = -1;
    for (int pos = 0; pos < N && firstMismatch < 0; ++pos)
    {
        if (!(vector[pos] == target))
        {
            firstMismatch = pos;
        }
    }

    if (firstMismatch >= 0)
    {
        printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", firstMismatch, vector[firstMismatch], target);
        exit(1);
    }

    printf("Success! All values calculated correctly.\n");
}
|
||||
|
||||
// Reports a failed CUDA runtime call.
// Returns true when `status` is cudaSuccess; otherwise prints the failing
// step together with the runtime's error string and returns false.
static bool cudaSucceeded(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    printf("Error: %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Best-effort prefetch of a managed allocation to `dstDevice`.
// Prefetching is only a performance hint, so a failure is reported as a
// warning and the recorded error is cleared; otherwise the next
// cudaGetLastError() would blame it on a kernel launch.
static void prefetchOrWarn(const void *ptr, size_t bytes, int dstDevice)
{
    cudaError_t status = cudaMemPrefetchAsync(ptr, bytes, dstDevice);
    if (status != cudaSuccess)
    {
        printf("Warning: cudaMemPrefetchAsync: %s\n", cudaGetErrorString(status));
        cudaGetLastError(); // reset the non-sticky error state
    }
}

// Adds two vectors of 2^25 floats on the GPU using managed memory that is
// prefetched to the device, then verifies on the host that every element of
// the result equals 7.
//
// Returns 0 on success and 1 when any CUDA runtime call or kernel fails.
// (checkElementsAre terminates the process itself on a wrong result.)
int main()
{
    int deviceId = 0;
    int numberOfSMs = 0;

    if (!cudaSucceeded(cudaGetDevice(&deviceId), "cudaGetDevice") ||
        !cudaSucceeded(cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId),
                       "cudaDeviceGetAttribute"))
    {
        return 1;
    }

    const int N = 2 << 24; // 2^25 elements
    const size_t size = N * sizeof(float);

    float *a = NULL;
    float *b = NULL;
    float *c = NULL;

    bool ok = cudaSucceeded(cudaMallocManaged(&a, size), "cudaMallocManaged(a)") &&
              cudaSucceeded(cudaMallocManaged(&b, size), "cudaMallocManaged(b)") &&
              cudaSucceeded(cudaMallocManaged(&c, size), "cudaMallocManaged(c)");

    if (ok)
    {
        // Move the pages to the GPU up front so the kernels do not page-fault
        // them in on first touch.
        prefetchOrWarn(a, size, deviceId);
        prefetchOrWarn(b, size, deviceId);
        prefetchOrWarn(c, size, deviceId);

        const unsigned int threadsPerBlock = 256;
        const unsigned int numberOfBlocks = 32 * numberOfSMs;

        initWith<<<numberOfBlocks, threadsPerBlock>>>(3.0f, a, N);
        initWith<<<numberOfBlocks, threadsPerBlock>>>(4.0f, b, N);
        initWith<<<numberOfBlocks, threadsPerBlock>>>(0.0f, c, N);

        addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);

        // Launch-configuration errors surface here; the error state is not
        // cleared by successful launches, so one check covers all four.
        ok = cudaSucceeded(cudaGetLastError(), "kernel launch");
    }

    if (ok)
    {
        // Queue the result's migration back to the host behind the kernels so
        // the verification loop below does not fault it in page by page.
        prefetchOrWarn(c, size, cudaCpuDeviceId);

        // Errors raised while the kernels execute surface at this sync.
        ok = cudaSucceeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    }

    if (ok)
    {
        // Only read the result on the host once the device work is known good.
        checkElementsAre(7, c, N);
    }

    // cudaFree accepts null pointers, so this is safe after a partial allocation.
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);

    return ok ? 0 : 1;
}
|
Reference in New Issue
Block a user