From 5165824ea5d420b0b001fc0ff791cb604c40435c Mon Sep 17 00:00:00 2001 From: Guilherme Werner Date: Fri, 3 Nov 2023 17:52:39 -0300 Subject: [PATCH] Section 3 --- src/01-init-kernel-solution.cu | 90 ++++++++++++++++++++ src/05-stream-intro.cu | 19 +++++ src/09-nbody-solution.cu | 145 +++++++++++++++++++++++++++++++++ src/09-nbody.cu | 138 +++++++++++++++++++++++++++++++ 4 files changed, 392 insertions(+) create mode 100644 src/01-init-kernel-solution.cu create mode 100644 src/05-stream-intro.cu create mode 100644 src/09-nbody-solution.cu create mode 100644 src/09-nbody.cu diff --git a/src/01-init-kernel-solution.cu b/src/01-init-kernel-solution.cu new file mode 100644 index 0000000..c890458 --- /dev/null +++ b/src/01-init-kernel-solution.cu @@ -0,0 +1,90 @@ +#include + +__global__ void initWith(float num, float *a, int N) +{ + + int index = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + + for (int i = index; i < N; i += stride) + { + a[i] = num; + } +} + +__global__ void addVectorsInto(float *result, float *a, float *b, int N) +{ + int index = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + + for (int i = index; i < N; i += stride) + { + result[i] = a[i] + b[i]; + } +} + +void checkElementsAre(float target, float *vector, int N) +{ + for (int i = 0; i < N; i++) + { + if (vector[i] != target) + { + printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target); + exit(1); + } + } + printf("Success! All values calculated correctly.\n"); +} + +int main() +{ + int deviceId; + int numberOfSMs; + + cudaGetDevice(&deviceId); + cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId); + + const int N = 2 << 24; + size_t size = N * sizeof(float); + + float *a; + float *b; + float *c; + + cudaMallocManaged(&a, size); + cudaMallocManaged(&b, size); + cudaMallocManaged(&c, size); + + cudaMemPrefetchAsync(a, size, deviceId); + cudaMemPrefetchAsync(b, size, deviceId); + cudaMemPrefetchAsync(c, size, deviceId); + + size_t threadsPerBlock; + size_t numberOfBlocks; + + threadsPerBlock = 256; + numberOfBlocks = 32 * numberOfSMs; + + cudaError_t addVectorsErr; + cudaError_t asyncErr; + + initWith<<>>(3, a, N); + initWith<<>>(4, b, N); + initWith<<>>(0, c, N); + + addVectorsInto<<>>(c, a, b, N); + + addVectorsErr = cudaGetLastError(); + if (addVectorsErr != cudaSuccess) + printf("Error: %s\n", cudaGetErrorString(addVectorsErr)); + + asyncErr = cudaDeviceSynchronize(); + if (asyncErr != cudaSuccess) + printf("Error: %s\n", cudaGetErrorString(asyncErr)); + + checkElementsAre(7, c, N); + + cudaFree(a); + cudaFree(b); + cudaFree(c); +} diff --git a/src/05-stream-intro.cu b/src/05-stream-intro.cu new file mode 100644 index 0000000..4cea36e --- /dev/null +++ b/src/05-stream-intro.cu @@ -0,0 +1,19 @@ +#include +#include + +__global__ void printNumber(int number) +{ + printf("%d\n", number); +} + +int main() +{ + for (int i = 0; i < 5; ++i) + { + cudaStream_t stream; // CUDA streams are of type `cudaStream_t`. + cudaStreamCreate(&stream); // Note that a pointer must be passed to `cudaCreateStream`. + printNumber<<<1, 2, 0, stream>>>(i); // `stream` is passed as 4th EC argument. + cudaStreamDestroy(stream); + } + cudaDeviceSynchronize(); +} diff --git a/src/09-nbody-solution.cu b/src/09-nbody-solution.cu new file mode 100644 index 0000000..7fb5321 --- /dev/null +++ b/src/09-nbody-solution.cu @@ -0,0 +1,145 @@ +#include +#include +#include +#include "timer.h" +#include "files.h" + +#define SOFTENING 1e-9f + +/* + * Each body contains x, y, and z coordinate positions, + * as well as velocities in the x, y, and z directions. + */ + +typedef struct +{ + float x, y, z, vx, vy, vz; +} Body; + +/* + * Calculate the gravitational impact of all bodies in the system + * on all others. + */ + +__global__ void bodyForce(Body *p, float dt, int n) +{ + int tid = threadIdx.x + blockDim.x * blockIdx.x; + int stride = blockDim.x * gridDim.x; + + for (int i = tid; i < n; i += stride) + { + float Fx = 0.0f; + float Fy = 0.0f; + float Fz = 0.0f; + + for (int j = 0; j < n; j++) + { + float dx = p[j].x - p[i].x; + float dy = p[j].y - p[i].y; + float dz = p[j].z - p[i].z; + float distSqr = dx * dx + dy * dy + dz * dz + SOFTENING; + float invDist = rsqrtf(distSqr); + float invDist3 = invDist * invDist * invDist; + + Fx += dx * invDist3; + Fy += dy * invDist3; + Fz += dz * invDist3; + } + + p[i].vx += dt * Fx; + p[i].vy += dt * Fy; + p[i].vz += dt * Fz; + } +} + +int main(const int argc, const char **argv) +{ + + // The assessment will test against both 2<11 and 2<15. + // Feel free to pass the command line argument 15 when you generate ./nbody report files + int nBodies = 2 << 11; + if (argc > 1) + nBodies = 2 << atoi(argv[1]); + + // The assessment will pass hidden initialized values to check for correctness. + // You should not make changes to these files, or else the assessment will not work. + const char *initialized_values; + const char *solution_values; + + if (nBodies == 2 << 11) + { + initialized_values = "09-nbody/files/initialized_4096"; + solution_values = "09-nbody/files/solution_4096"; + } + else + { // nBodies == 2<<15 + initialized_values = "09-nbody/files/initialized_65536"; + solution_values = "09-nbody/files/solution_65536"; + } + + if (argc > 2) + initialized_values = argv[2]; + if (argc > 3) + solution_values = argv[3]; + + const float dt = 0.01f; // Time step + const int nIters = 10; // Simulation iterations + + int bytes = nBodies * sizeof(Body); + float *buf; + + // buf = (float *)malloc(bytes); + cudaMallocManaged(&buf, bytes); + + Body *p = (Body *)buf; + + read_values_from_file(initialized_values, buf, bytes); + + double totalTime = 0.0; + + /* + * This simulation will run for 10 cycles of time, calculating gravitational + * interaction amongst bodies, and adjusting their positions to reflect. + */ + + for (int iter = 0; iter < nIters; iter++) + { + StartTimer(); + + /* + * You will likely wish to refactor the work being done in `bodyForce`, + * and potentially the work to integrate the positions. + */ + + bodyForce<<<120, 256>>>(p, dt, nBodies); // compute interbody forces + + cudaDeviceSynchronize(); + + /* + * This position integration cannot occur until this round of `bodyForce` has completed. + * Also, the next round of `bodyForce` cannot begin until the integration is complete. + */ + + for (int i = 0; i < nBodies; i++) + { // integrate position + p[i].x += p[i].vx * dt; + p[i].y += p[i].vy * dt; + p[i].z += p[i].vz * dt; + } + + const double tElapsed = GetTimer() / 1000.0; + totalTime += tElapsed; + } + + double avgTime = totalTime / (double)(nIters); + float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime; + write_values_to_file(solution_values, buf, bytes); + + // You will likely enjoy watching this value grow as you accelerate the application, + // but beware that a failure to correctly synchronize the device might result in + // unrealistically high values. + printf("%0.3f Billion Interactions / second\n", billionsOfOpsPerSecond); + + // free(buf); + cudaFree(buf); +} diff --git a/src/09-nbody.cu b/src/09-nbody.cu new file mode 100644 index 0000000..fab3068 --- /dev/null +++ b/src/09-nbody.cu @@ -0,0 +1,138 @@ +#include +#include +#include +#include "timer.h" +#include "files.h" + +#define SOFTENING 1e-9f + +/* + * Each body contains x, y, and z coordinate positions, + * as well as velocities in the x, y, and z directions. + */ + +typedef struct +{ + float x, y, z, vx, vy, vz; +} Body; + +/* + * Calculate the gravitational impact of all bodies in the system + * on all others. + */ + +void bodyForce(Body *p, float dt, int n) +{ + for (int i = 0; i < n; ++i) + { + float Fx = 0.0f; + float Fy = 0.0f; + float Fz = 0.0f; + + for (int j = 0; j < n; j++) + { + float dx = p[j].x - p[i].x; + float dy = p[j].y - p[i].y; + float dz = p[j].z - p[i].z; + float distSqr = dx * dx + dy * dy + dz * dz + SOFTENING; + float invDist = rsqrtf(distSqr); + float invDist3 = invDist * invDist * invDist; + + Fx += dx * invDist3; + Fy += dy * invDist3; + Fz += dz * invDist3; + } + + p[i].vx += dt * Fx; + p[i].vy += dt * Fy; + p[i].vz += dt * Fz; + } +} + +int main(const int argc, const char **argv) +{ + + // The assessment will test against both 2<11 and 2<15. + // Feel free to pass the command line argument 15 when you generate ./nbody report files + int nBodies = 2 << 11; + if (argc > 1) + nBodies = 2 << atoi(argv[1]); + + // The assessment will pass hidden initialized values to check for correctness. + // You should not make changes to these files, or else the assessment will not work. + const char *initialized_values; + const char *solution_values; + + if (nBodies == 2 << 11) + { + initialized_values = "09-nbody/files/initialized_4096"; + solution_values = "09-nbody/files/solution_4096"; + } + else + { // nBodies == 2<<15 + initialized_values = "09-nbody/files/initialized_65536"; + solution_values = "09-nbody/files/solution_65536"; + } + + if (argc > 2) + initialized_values = argv[2]; + if (argc > 3) + solution_values = argv[3]; + + const float dt = 0.01f; // Time step + const int nIters = 10; // Simulation iterations + + int bytes = nBodies * sizeof(Body); + float *buf; + + buf = (float *)malloc(bytes); + + Body *p = (Body *)buf; + + read_values_from_file(initialized_values, buf, bytes); + + double totalTime = 0.0; + + /* + * This simulation will run for 10 cycles of time, calculating gravitational + * interaction amongst bodies, and adjusting their positions to reflect. + */ + + for (int iter = 0; iter < nIters; iter++) + { + StartTimer(); + + /* + * You will likely wish to refactor the work being done in `bodyForce`, + * and potentially the work to integrate the positions. + */ + + bodyForce(p, dt, nBodies); // compute interbody forces + + /* + * This position integration cannot occur until this round of `bodyForce` has completed. + * Also, the next round of `bodyForce` cannot begin until the integration is complete. + */ + + for (int i = 0; i < nBodies; i++) + { // integrate position + p[i].x += p[i].vx * dt; + p[i].y += p[i].vy * dt; + p[i].z += p[i].vz * dt; + } + + const double tElapsed = GetTimer() / 1000.0; + totalTime += tElapsed; + } + + double avgTime = totalTime / (double)(nIters); + float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime; + write_values_to_file(solution_values, buf, bytes); + + // You will likely enjoy watching this value grow as you accelerate the application, + // but beware that a failure to correctly synchronize the device might result in + // unrealistically high values. + printf("%0.3f Billion Interactions / second\n", billionsOfOpsPerSecond); + + free(buf); +}