diff --git a/src/01-get-device-properties.cu b/src/01-get-device-properties.cu
new file mode 100644
index 0000000..90d3019
--- /dev/null
+++ b/src/01-get-device-properties.cu
@@ -0,0 +1,30 @@
+#include <stdio.h>
+
+int main()
+{
+  /*
+   * Assign values to these variables so that the output string below prints the
+   * requested properties of the currently active GPU.
+   */
+
+  int deviceId;
+  int computeCapabilityMajor;
+  int computeCapabilityMinor;
+  int multiProcessorCount;
+  int warpSize;
+
+  /*
+   * There should be no need to modify the output string below.
+   */
+
+  cudaGetDevice(&deviceId);
+  cudaDeviceProp props;
+  cudaGetDeviceProperties(&props, deviceId);
+
+  computeCapabilityMajor = props.major;
+  computeCapabilityMinor = props.minor;
+  multiProcessorCount = props.multiProcessorCount;
+  warpSize = props.warpSize;
+
+  printf("Device ID: %d\nNumber of SMs: %d\nCompute Capability Major: %d\nCompute Capability Minor: %d\nWarp Size: %d\n", deviceId, multiProcessorCount, computeCapabilityMajor, computeCapabilityMinor, warpSize);
+}
diff --git a/src/01-page-faults-solution-cpu-only.cu b/src/01-page-faults-solution-cpu-only.cu
new file mode 100644
index 0000000..bbe2e1e
--- /dev/null
+++ b/src/01-page-faults-solution-cpu-only.cu
@@ -0,0 +1,28 @@
+__global__ void deviceKernel(int *a, int N)
+{
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for (int i = idx; i < N; i += stride)
+  {
+    a[i] = 1;
+  }
+}
+
+void hostFunction(int *a, int N)
+{
+  for (int i = 0; i < N; ++i)
+  {
+    a[i] = 1;
+  }
+}
+
+int main()
+{
+  int N = 2 << 24;
+  size_t size = N * sizeof(int);
+  int *a;
+  cudaMallocManaged(&a, size);
+  hostFunction(a, N);
+  cudaFree(a);
+}
diff --git a/src/01-saxpy.cu b/src/01-saxpy.cu
new file mode 100644
index 0000000..3bc9235
--- /dev/null
+++ b/src/01-saxpy.cu
@@ -0,0 +1,68 @@
+#include <stdio.h>
+
+#define N 2048 * 2048 // Number of elements in each vector
+
+/*
+ * Optimize this already-accelerated codebase. Work iteratively,
+ * and use nsys to support your work.
+ *
+ * Aim to profile `saxpy` (without modifying `N`) running under
+ * 20us.
+ *
+ * Some bugs have been placed in this codebase for your edification.
+ */
+
+__global__ void saxpy(int *a, int *b, int *c)
+{
+  int tid = blockIdx.x * blockDim.x * threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for (int i = tid; i < N; i += stride)
+  {
+    c[i] = 2 * a[i] + b[i];
+  }
+}
+
+int main()
+{
+  int *a, *b, *c;
+
+  int size = N * sizeof(int); // The total number of bytes per vector
+
+  int deviceId;
+  int numberOfSms;
+
+  cudaGetDevice(&deviceId);
+  cudaDeviceGetAttribute(&numberOfSms, cudaDevAttrMultiProcessorCount, deviceId);
+
+  cudaMallocManaged(&a, size);
+  cudaMallocManaged(&b, size);
+  cudaMallocManaged(&c, size);
+
+  // Initialize memory
+  for (int i = 0; i < N; ++i)
+  {
+    a[i] = 2;
+    b[i] = 1;
+    c[i] = 0;
+  }
+
+  int threads_per_block = 128;
+  int number_of_blocks = (N / threads_per_block) + 1;
+
+  saxpy<<<number_of_blocks, threads_per_block>>>(a, b, c);
+
+  cudaDeviceSynchronize();
+
+  // Print out the first and last 5 values of c for a quality check
+  for (int i = 0; i < 5; ++i)
+    printf("c[%d] = %d, ", i, c[i]);
+  printf("\n");
+  for (int i = N - 5; i < N; ++i)
+    printf("c[%d] = %d, ", i, c[i]);
+  printf("\n");
+
+  cudaFree(a);
+  cudaFree(b);
+  cudaFree(c);
+}
diff --git a/src/01-vector-add-init-in-kernel-solution.cu b/src/01-vector-add-init-in-kernel-solution.cu
new file mode 100644
index 0000000..c216252
--- /dev/null
+++ b/src/01-vector-add-init-in-kernel-solution.cu
@@ -0,0 +1,101 @@
+#include <stdio.h>
+
+/*
+ * Refactor host function to run as CUDA kernel
+ */
+
+__global__ void initWith(float num, float *a, int N)
+{
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for (int i = index; i < N; i += stride)
+  {
+    a[i] = num;
+  }
+}
+
+__global__ void addArraysInto(float *result, float *a, float *b, int N)
+{
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for (int i = index; i < N; i += stride)
+  {
+    result[i] = a[i] + b[i];
+  }
+}
+
+void checkElementsAre(float target, float *array, int N)
+{
+  for (int i = 0; i < N; i++)
+  {
+    if (array[i] != target)
+    {
+      printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", i, array[i], target);
+      exit(1);
+    }
+  }
+  printf("Success! All values calculated correctly.\n");
+}
+
+int main()
+{
+  int deviceId;
+  int numberOfSMs;
+
+  cudaGetDevice(&deviceId);
+  cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
+  printf("Device ID: %d\tNumber of SMs: %d\n", deviceId, numberOfSMs);
+
+  const int N = 2 << 24;
+  size_t size = N * sizeof(float);
+
+  float *a;
+  float *b;
+  float *c;
+
+  cudaMallocManaged(&a, size);
+  cudaMallocManaged(&b, size);
+  cudaMallocManaged(&c, size);
+
+  size_t threadsPerBlock;
+  size_t numberOfBlocks;
+
+  threadsPerBlock = 256;
+  numberOfBlocks = 32 * numberOfSMs;
+
+  cudaError_t addArraysErr;
+  cudaError_t asyncErr;
+
+  /*
+   * Launch kernels.
+   */
+
+  initWith<<<numberOfBlocks, threadsPerBlock>>>(3, a, N);
+  initWith<<<numberOfBlocks, threadsPerBlock>>>(4, b, N);
+  initWith<<<numberOfBlocks, threadsPerBlock>>>(0, c, N);
+
+  /*
+   * Now that initialization is happening on a GPU, host code
+   * must be synchronized to wait for its completion.
+   */
+
+  cudaDeviceSynchronize();
+
+  addArraysInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);
+
+  addArraysErr = cudaGetLastError();
+  if (addArraysErr != cudaSuccess)
+    printf("Error: %s\n", cudaGetErrorString(addArraysErr));
+
+  asyncErr = cudaDeviceSynchronize();
+  if (asyncErr != cudaSuccess)
+    printf("Error: %s\n", cudaGetErrorString(asyncErr));
+
+  checkElementsAre(7, c, N);
+
+  cudaFree(a);
+  cudaFree(b);
+  cudaFree(c);
+}
diff --git a/src/01-vector-add-prefetch-solution.cu b/src/01-vector-add-prefetch-solution.cu
new file mode 100644
index 0000000..c890458
--- /dev/null
+++ b/src/01-vector-add-prefetch-solution.cu
@@ -0,0 +1,90 @@
+#include <stdio.h>
+
+__global__ void initWith(float num, float *a, int N)
+{
+
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for (int i = index; i < N; i += stride)
+  {
+    a[i] = num;
+  }
+}
+
+__global__ void addVectorsInto(float *result, float *a, float *b, int N)
+{
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for (int i = index; i < N; i += stride)
+  {
+    result[i] = a[i] + b[i];
+  }
+}
+
+void checkElementsAre(float target, float *vector, int N)
+{
+  for (int i = 0; i < N; i++)
+  {
+    if (vector[i] != target)
+    {
+      printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
+      exit(1);
+    }
+  }
+  printf("Success! All values calculated correctly.\n");
+}
+
+int main()
+{
+  int deviceId;
+  int numberOfSMs;
+
+  cudaGetDevice(&deviceId);
+  cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
+
+  const int N = 2 << 24;
+  size_t size = N * sizeof(float);
+
+  float *a;
+  float *b;
+  float *c;
+
+  cudaMallocManaged(&a, size);
+  cudaMallocManaged(&b, size);
+  cudaMallocManaged(&c, size);
+
+  cudaMemPrefetchAsync(a, size, deviceId);
+  cudaMemPrefetchAsync(b, size, deviceId);
+  cudaMemPrefetchAsync(c, size, deviceId);
+
+  size_t threadsPerBlock;
+  size_t numberOfBlocks;
+
+  threadsPerBlock = 256;
+  numberOfBlocks = 32 * numberOfSMs;
+
+  cudaError_t addVectorsErr;
+  cudaError_t asyncErr;
+
+  initWith<<<numberOfBlocks, threadsPerBlock>>>(3, a, N);
+  initWith<<<numberOfBlocks, threadsPerBlock>>>(4, b, N);
+  initWith<<<numberOfBlocks, threadsPerBlock>>>(0, c, N);
+
+  addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);
+
+  addVectorsErr = cudaGetLastError();
+  if (addVectorsErr != cudaSuccess)
+    printf("Error: %s\n", cudaGetErrorString(addVectorsErr));
+
+  asyncErr = cudaDeviceSynchronize();
+  if (asyncErr != cudaSuccess)
+    printf("Error: %s\n", cudaGetErrorString(asyncErr));
+
+  checkElementsAre(7, c, N);
+
+  cudaFree(a);
+  cudaFree(b);
+  cudaFree(c);
+}
diff --git a/src/01-vector-add.cu b/src/01-vector-add.cu
new file mode 100644
index 0000000..555217f
--- /dev/null
+++ b/src/01-vector-add.cu
@@ -0,0 +1,103 @@
+#include <stdio.h>
+
+/*
+ * Host function to initialize vector elements. This function
+ * simply initializes each element to equal its index in the
+ * vector.
+ */
+
+void initWith(float num, float *a, int N)
+{
+  for (int i = 0; i < N; ++i)
+  {
+    a[i] = num;
+  }
+}
+
+/*
+ * Device kernel stores into `result` the sum of each
+ * same-indexed value of `a` and `b`.
+ */
+
+__global__ void addVectorsInto(float *result, float *a, float *b, int N)
+{
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for (int i = index; i < N; i += stride)
+  {
+    result[i] = a[i] + b[i];
+  }
+}
+
+/*
+ * Host function to confirm values in `vector`. This function
+ * assumes all values are the same `target` value.
+ */
+
+void checkElementsAre(float target, float *vector, int N)
+{
+  for (int i = 0; i < N; i++)
+  {
+    if (vector[i] != target)
+    {
+      printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
+      exit(1);
+    }
+  }
+  printf("Success! All values calculated correctly.\n");
+}
+
+int main()
+{
+  const int N = 2 << 24;
+  size_t size = N * sizeof(float);
+
+  float *a;
+  float *b;
+  float *c;
+
+  cudaMallocManaged(&a, size);
+  cudaMallocManaged(&b, size);
+  cudaMallocManaged(&c, size);
+
+  initWith(3, a, N);
+  initWith(4, b, N);
+  initWith(0, c, N);
+
+  size_t threadsPerBlock;
+  size_t numberOfBlocks;
+
+  /*
+   * nsys should register performance changes when execution configuration
+   * is updated.
+   */
+
+  int deviceId;
+  cudaGetDevice(&deviceId);
+  cudaDeviceProp props;
+  cudaGetDeviceProperties(&props, deviceId);
+  int multiProcessorCount = props.multiProcessorCount;
+
+  threadsPerBlock = 32;
+  numberOfBlocks = 2 * multiProcessorCount;
+
+  cudaError_t addVectorsErr;
+  cudaError_t asyncErr;
+
+  addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);
+
+  addVectorsErr = cudaGetLastError();
+  if (addVectorsErr != cudaSuccess)
+    printf("Error: %s\n", cudaGetErrorString(addVectorsErr));
+
+  asyncErr = cudaDeviceSynchronize();
+  if (asyncErr != cudaSuccess)
+    printf("Error: %s\n", cudaGetErrorString(asyncErr));
+
+  checkElementsAre(7, c, N);
+
+  cudaFree(a);
+  cudaFree(b);
+  cudaFree(c);
+}
diff --git a/src/02-page-faults-solution-gpu-only.cu b/src/02-page-faults-solution-gpu-only.cu
new file mode 100644
index 0000000..c5406c2
--- /dev/null
+++ b/src/02-page-faults-solution-gpu-only.cu
@@ -0,0 +1,31 @@
+__global__ void deviceKernel(int *a, int N)
+{
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for (int i = idx; i < N; i += stride)
+  {
+    a[i] = 1;
+  }
+}
+
+void hostFunction(int *a, int N)
+{
+  for (int i = 0; i < N; ++i)
+  {
+    a[i] = 1;
+  }
+}
+
+int main()
+{
+  int N = 2 << 24;
+  size_t size = N * sizeof(int);
+  int *a;
+  cudaMallocManaged(&a, size);
+
+  deviceKernel<<<256, 256>>>(a, N);
+  cudaDeviceSynchronize();
+
+  cudaFree(a);
+}
diff --git a/src/02-vector-add-prefetch-solution-cpu-also.cu b/src/02-vector-add-prefetch-solution-cpu-also.cu
new file mode 100644
index 0000000..54a95dd
--- /dev/null
+++ b/src/02-vector-add-prefetch-solution-cpu-also.cu
@@ -0,0 +1,92 @@
+#include <stdio.h>
+
+__global__ void initWith(float num, float *a, int N)
+{
+
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for (int i = index; i < N; i += stride)
+  {
+    a[i] = num;
+  }
+}
+
+__global__ void addVectorsInto(float *result, float *a, float *b, int N)
+{
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for (int i = index; i < N; i += stride)
+  {
+    result[i] = a[i] + b[i];
+  }
+}
+
+void checkElementsAre(float target, float *vector, int N)
+{
+  for (int i = 0; i < N; i++)
+  {
+    if (vector[i] != target)
+    {
+      printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
+      exit(1);
+    }
+  }
+  printf("Success! All values calculated correctly.\n");
+}
+
+int main()
+{
+  int deviceId;
+  int numberOfSMs;
+
+  cudaGetDevice(&deviceId);
+  cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
+
+  const int N = 2 << 24;
+  size_t size = N * sizeof(float);
+
+  float *a;
+  float *b;
+  float *c;
+
+  cudaMallocManaged(&a, size);
+  cudaMallocManaged(&b, size);
+  cudaMallocManaged(&c, size);
+
+  cudaMemPrefetchAsync(a, size, deviceId);
+  cudaMemPrefetchAsync(b, size, deviceId);
+  cudaMemPrefetchAsync(c, size, deviceId);
+
+  size_t threadsPerBlock;
+  size_t numberOfBlocks;
+
+  threadsPerBlock = 256;
+  numberOfBlocks = 32 * numberOfSMs;
+
+  cudaError_t addVectorsErr;
+  cudaError_t asyncErr;
+
+  initWith<<<numberOfBlocks, threadsPerBlock>>>(3, a, N);
+  initWith<<<numberOfBlocks, threadsPerBlock>>>(4, b, N);
+  initWith<<<numberOfBlocks, threadsPerBlock>>>(0, c, N);
+
+  addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);
+
+  addVectorsErr = cudaGetLastError();
+  if (addVectorsErr != cudaSuccess)
+    printf("Error: %s\n", cudaGetErrorString(addVectorsErr));
+
+  asyncErr = cudaDeviceSynchronize();
+  if (asyncErr != cudaSuccess)
+    printf("Error: %s\n", cudaGetErrorString(asyncErr));
+
+  cudaMemPrefetchAsync(c, size, cudaCpuDeviceId);
+
+  checkElementsAre(7, c, N);
+
+  cudaFree(a);
+  cudaFree(b);
+  cudaFree(c);
+}
diff --git a/src/03-page-faults-solution-cpu-then-gpu.cu b/src/03-page-faults-solution-cpu-then-gpu.cu
new file mode 100644
index 0000000..c478532
--- /dev/null
+++ b/src/03-page-faults-solution-cpu-then-gpu.cu
@@ -0,0 +1,32 @@
+__global__ void deviceKernel(int *a, int N)
+{
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for (int i = idx; i < N; i += stride)
+  {
+    a[i] = 1;
+  }
+}
+
+void hostFunction(int *a, int N)
+{
+  for (int i = 0; i < N; ++i)
+  {
+    a[i] = 1;
+  }
+}
+
+int main()
+{
+  int N = 2 << 24;
+  size_t size = N * sizeof(int);
+  int *a;
+  cudaMallocManaged(&a, size);
+
+  hostFunction(a, N);
+  deviceKernel<<<256, 256>>>(a, N);
+  cudaDeviceSynchronize();
+
+  cudaFree(a);
+}
diff --git a/src/04-page-faults-solution-gpu-then-cpu.cu b/src/04-page-faults-solution-gpu-then-cpu.cu
new file mode 100644
index 0000000..f6b46bb
--- /dev/null
+++ b/src/04-page-faults-solution-gpu-then-cpu.cu
@@ -0,0 +1,30 @@
+__global__ void deviceKernel(int *a, int N)
+{
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for (int i = idx; i < N; i += stride)
+  {
+    a[i] = 1;
+  }
+}
+
+void hostFunction(int *a, int N)
+{
+  for (int i = 0; i < N; ++i)
+  {
+    a[i] = 1;
+  }
+}
+
+int main()
+{
+  int N = 2 << 24;
+  size_t size = N * sizeof(int);
+  int *a;
+  cudaMallocManaged(&a, size);
+  deviceKernel<<<256, 256>>>(a, N);
+  cudaDeviceSynchronize();
+  hostFunction(a, N);
+  cudaFree(a);
+}