#include __global__ void initWith(float num, float *a, int N) { int index = threadIdx.x + blockIdx.x * blockDim.x; int stride = blockDim.x * gridDim.x; for (int i = index; i < N; i += stride) { a[i] = num; } } __global__ void addVectorsInto(float *result, float *a, float *b, int N) { int index = threadIdx.x + blockIdx.x * blockDim.x; int stride = blockDim.x * gridDim.x; for (int i = index; i < N; i += stride) { result[i] = a[i] + b[i]; } } void checkElementsAre(float target, float *vector, int N) { for (int i = 0; i < N; i++) { if (vector[i] != target) { printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target); exit(1); } } printf("Success! All values calculated correctly.\n"); } int main() { int deviceId; int numberOfSMs; cudaGetDevice(&deviceId); cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId); const int N = 2 << 24; size_t size = N * sizeof(float); float *a; float *b; float *c; cudaMallocManaged(&a, size); cudaMallocManaged(&b, size); cudaMallocManaged(&c, size); cudaMemPrefetchAsync(a, size, deviceId); cudaMemPrefetchAsync(b, size, deviceId); cudaMemPrefetchAsync(c, size, deviceId); size_t threadsPerBlock; size_t numberOfBlocks; threadsPerBlock = 256; numberOfBlocks = 32 * numberOfSMs; cudaError_t addVectorsErr; cudaError_t asyncErr; initWith<<>>(3, a, N); initWith<<>>(4, b, N); initWith<<>>(0, c, N); addVectorsInto<<>>(c, a, b, N); addVectorsErr = cudaGetLastError(); if (addVectorsErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(addVectorsErr)); asyncErr = cudaDeviceSynchronize(); if (asyncErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(asyncErr)); checkElementsAre(7, c, N); cudaFree(a); cudaFree(b); cudaFree(c); }