Section 2

2025-06-16 13:14:18 +00:00 · 2023-11-03 15:13:55 -03:00
parent 7e3159c2dd
commit 163599ad50
10 changed files with 605 additions and 0 deletions
--- a/src/01-vector-add-prefetch-solution.cu
+++ b/src/01-vector-add-prefetch-solution.cu
@ -0,0 +1,90 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+ * Fills the first N elements of `a` with the value `num`.
+ *
+ * Written as a grid-stride loop over a 1D launch: each thread starts at its
+ * flat global index and advances by the total number of launched threads
+ * (blockDim.x * gridDim.x), so any 1D grid/block configuration covers all
+ * N elements. Uses no shared memory. Does nothing when N <= 0.
+ *
+ *   num - value written to every element
+ *   a   - destination buffer; must be device-accessible and hold >= N floats
+ *   N   - number of elements to write
+ */
+__global__ void initWith(float num, float *a, int N)
+{
+    // Nothing to do for a non-positive count; checking first also keeps the
+    // unsigned conversion below from turning a negative N into a huge bound.
+    if (N <= 0)
+        return;
+
+    // Index math is done in size_t: with int, `i += stride` overflows
+    // (undefined behavior) once N is within one stride of INT_MAX, and the
+    // unsigned product blockIdx.x * blockDim.x can wrap on very large grids.
+    const size_t count = (size_t)N;
+    const size_t stride = (size_t)blockDim.x * gridDim.x;
+
+    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < count; i += stride)
+    {
+        a[i] = num;
+    }
+}
+
+/*
+ * Element-wise vector sum: result[i] = a[i] + b[i] for every i in [0, N).
+ *
+ * Written as a grid-stride loop over a 1D launch: each thread starts at its
+ * flat global index and advances by the total number of launched threads
+ * (blockDim.x * gridDim.x), so any 1D grid/block configuration covers all
+ * N elements. Uses no shared memory. Does nothing when N <= 0.
+ *
+ *   result - destination buffer; must be device-accessible and hold >= N floats
+ *   a, b   - input buffers; must be device-accessible and hold >= N floats each
+ *   N      - number of elements to process
+ */
+__global__ void addVectorsInto(float *result, float *a, float *b, int N)
+{
+    // Nothing to do for a non-positive count; checking first also keeps the
+    // unsigned conversion below from turning a negative N into a huge bound.
+    if (N <= 0)
+        return;
+
+    // Index math is done in size_t: with int, `i += stride` overflows
+    // (undefined behavior) once N is within one stride of INT_MAX, and the
+    // unsigned product blockIdx.x * blockDim.x can wrap on very large grids.
+    const size_t count = (size_t)N;
+    const size_t stride = (size_t)blockDim.x * gridDim.x;
+
+    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < count; i += stride)
+    {
+        result[i] = a[i] + b[i];
+    }
+}
+
+/*
+ * Host-side verification that the first N entries of `vector` all equal
+ * `target` (exact floating-point comparison).
+ *
+ * On the first mismatch, prints the offending index together with the found
+ * and expected values, then terminates the process with exit status 1.
+ * If every element matches (including the N <= 0 case, where nothing is
+ * inspected), prints a success message and returns.
+ */
+void checkElementsAre(float target, float *vector, int N)
+{
+    int idx = 0;
+    while (idx < N)
+    {
+        const float actual = vector[idx];
+        if (!(actual == target))
+        {
+            printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", idx, actual, target);
+            exit(1);
+        }
+        ++idx;
+    }
+
+    printf("Success! All values calculated correctly.\n");
+}
+
+/*
+ * Aborts the program if a CUDA runtime call failed.
+ * Prints the failing step and the CUDA error string to stderr, then exits
+ * with status 1 (the same status checkElementsAre uses on failure).
+ */
+static void checkCuda(cudaError_t status, const char *what)
+{
+    if (status != cudaSuccess)
+    {
+        fprintf(stderr, "Error (%s): %s\n", what, cudaGetErrorString(status));
+        exit(1);
+    }
+}
+
+/*
+ * Allocates three managed float vectors, asks the runtime to prefetch them to
+ * the current device, initializes a=3, b=4, c=0 on the device, computes
+ * c = a + b, and verifies on the host that every element of c equals 7.
+ *
+ * Returns 0 on success; any CUDA failure other than a prefetch failure
+ * terminates the process with exit status 1.
+ */
+int main()
+{
+    int deviceId;
+    int numberOfSMs;
+
+    checkCuda(cudaGetDevice(&deviceId), "cudaGetDevice");
+    checkCuda(cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId),
+              "cudaDeviceGetAttribute");
+
+    // 2 << 24 == 2^25 == 33,554,432 elements, i.e. 128 MiB per float vector.
+    const int N = 2 << 24;
+    const size_t size = N * sizeof(float);
+
+    float *a = NULL;
+    float *b = NULL;
+    float *c = NULL;
+
+    checkCuda(cudaMallocManaged(&a, size), "cudaMallocManaged(a)");
+    checkCuda(cudaMallocManaged(&b, size), "cudaMallocManaged(b)");
+    checkCuda(cudaMallocManaged(&c, size), "cudaMallocManaged(c)");
+
+    // Prefetching is treated as a best-effort performance hint: a failure is
+    // reported but is not fatal. The error is cleared so that the
+    // cudaGetLastError() checks after the kernel launches below do not
+    // misreport a stale prefetch error as a launch failure.
+    float *vectors[] = {a, b, c};
+    for (int v = 0; v < 3; v++)
+    {
+        cudaError_t prefetchErr = cudaMemPrefetchAsync(vectors[v], size, deviceId);
+        if (prefetchErr != cudaSuccess)
+        {
+            fprintf(stderr, "Warning: prefetch to device %d failed: %s\n",
+                    deviceId, cudaGetErrorString(prefetchErr));
+            cudaGetLastError();
+        }
+    }
+
+    // Launch configuration scaled by the number of SMs on the device; the
+    // kernels loop over the data, so the grid does not need to match N.
+    const size_t threadsPerBlock = 256;
+    const size_t numberOfBlocks = 32 * numberOfSMs;
+
+    initWith<<<numberOfBlocks, threadsPerBlock>>>(3.0f, a, N);
+    checkCuda(cudaGetLastError(), "initWith(a) launch");
+    initWith<<<numberOfBlocks, threadsPerBlock>>>(4.0f, b, N);
+    checkCuda(cudaGetLastError(), "initWith(b) launch");
+    initWith<<<numberOfBlocks, threadsPerBlock>>>(0.0f, c, N);
+    checkCuda(cudaGetLastError(), "initWith(c) launch");
+
+    addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);
+    checkCuda(cudaGetLastError(), "addVectorsInto launch");
+
+    // Wait for all kernels to finish; errors raised during kernel execution
+    // surface here. The host must not read c before this returns successfully.
+    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
+
+    checkElementsAre(7, c, N);
+
+    checkCuda(cudaFree(a), "cudaFree(a)");
+    checkCuda(cudaFree(b), "cudaFree(b)");
+    checkCuda(cudaFree(c), "cudaFree(c)");
+
+    return 0;
+}