Section 3

This commit is contained in:
Guilherme Werner
2023-11-03 17:52:39 -03:00
parent 163599ad50
commit 5165824ea5
4 changed files with 392 additions and 0 deletions

View File

@ -0,0 +1,90 @@
#include <stdio.h>
__global__ void initWith(float num, float *a, int N)
{
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < N; i += stride)
{
a[i] = num;
}
}
__global__ void addVectorsInto(float *result, float *a, float *b, int N)
{
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < N; i += stride)
{
result[i] = a[i] + b[i];
}
}
void checkElementsAre(float target, float *vector, int N)
{
for (int i = 0; i < N; i++)
{
if (vector[i] != target)
{
printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
exit(1);
}
}
printf("Success! All values calculated correctly.\n");
}
int main()
{
int deviceId;
int numberOfSMs;
cudaGetDevice(&deviceId);
cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
const int N = 2 << 24;
size_t size = N * sizeof(float);
float *a;
float *b;
float *c;
cudaMallocManaged(&a, size);
cudaMallocManaged(&b, size);
cudaMallocManaged(&c, size);
cudaMemPrefetchAsync(a, size, deviceId);
cudaMemPrefetchAsync(b, size, deviceId);
cudaMemPrefetchAsync(c, size, deviceId);
size_t threadsPerBlock;
size_t numberOfBlocks;
threadsPerBlock = 256;
numberOfBlocks = 32 * numberOfSMs;
cudaError_t addVectorsErr;
cudaError_t asyncErr;
initWith<<<numberOfBlocks, threadsPerBlock>>>(3, a, N);
initWith<<<numberOfBlocks, threadsPerBlock>>>(4, b, N);
initWith<<<numberOfBlocks, threadsPerBlock>>>(0, c, N);
addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);
addVectorsErr = cudaGetLastError();
if (addVectorsErr != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(addVectorsErr));
asyncErr = cudaDeviceSynchronize();
if (asyncErr != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(asyncErr));
checkElementsAre(7, c, N);
cudaFree(a);
cudaFree(b);
cudaFree(c);
}

19
src/05-stream-intro.cu Normal file
View File

@ -0,0 +1,19 @@
#include <stdio.h>
#include <unistd.h>
__global__ void printNumber(int number)
{
printf("%d\n", number);
}
int main()
{
for (int i = 0; i < 5; ++i)
{
cudaStream_t stream; // CUDA streams are of type `cudaStream_t`.
cudaStreamCreate(&stream); // Note that a pointer must be passed to `cudaCreateStream`.
printNumber<<<1, 2, 0, stream>>>(i); // `stream` is passed as 4th EC argument.
cudaStreamDestroy(stream);
}
cudaDeviceSynchronize();
}

145
src/09-nbody-solution.cu Normal file
View File

@ -0,0 +1,145 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"
#include "files.h"
#define SOFTENING 1e-9f
/*
* Each body contains x, y, and z coordinate positions,
* as well as velocities in the x, y, and z directions.
*/
typedef struct
{
float x, y, z, vx, vy, vz;
} Body;
/*
* Calculate the gravitational impact of all bodies in the system
* on all others.
*/
__global__ void bodyForce(Body *p, float dt, int n)
{
int tid = threadIdx.x + blockDim.x * blockIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = tid; i < n; i += stride)
{
float Fx = 0.0f;
float Fy = 0.0f;
float Fz = 0.0f;
for (int j = 0; j < n; j++)
{
float dx = p[j].x - p[i].x;
float dy = p[j].y - p[i].y;
float dz = p[j].z - p[i].z;
float distSqr = dx * dx + dy * dy + dz * dz + SOFTENING;
float invDist = rsqrtf(distSqr);
float invDist3 = invDist * invDist * invDist;
Fx += dx * invDist3;
Fy += dy * invDist3;
Fz += dz * invDist3;
}
p[i].vx += dt * Fx;
p[i].vy += dt * Fy;
p[i].vz += dt * Fz;
}
}
int main(const int argc, const char **argv)
{
// The assessment will test against both 2<11 and 2<15.
// Feel free to pass the command line argument 15 when you generate ./nbody report files
int nBodies = 2 << 11;
if (argc > 1)
nBodies = 2 << atoi(argv[1]);
// The assessment will pass hidden initialized values to check for correctness.
// You should not make changes to these files, or else the assessment will not work.
const char *initialized_values;
const char *solution_values;
if (nBodies == 2 << 11)
{
initialized_values = "09-nbody/files/initialized_4096";
solution_values = "09-nbody/files/solution_4096";
}
else
{ // nBodies == 2<<15
initialized_values = "09-nbody/files/initialized_65536";
solution_values = "09-nbody/files/solution_65536";
}
if (argc > 2)
initialized_values = argv[2];
if (argc > 3)
solution_values = argv[3];
const float dt = 0.01f; // Time step
const int nIters = 10; // Simulation iterations
int bytes = nBodies * sizeof(Body);
float *buf;
// buf = (float *)malloc(bytes);
cudaMallocManaged(&buf, bytes);
Body *p = (Body *)buf;
read_values_from_file(initialized_values, buf, bytes);
double totalTime = 0.0;
/*
* This simulation will run for 10 cycles of time, calculating gravitational
* interaction amongst bodies, and adjusting their positions to reflect.
*/
for (int iter = 0; iter < nIters; iter++)
{
StartTimer();
/*
* You will likely wish to refactor the work being done in `bodyForce`,
* and potentially the work to integrate the positions.
*/
bodyForce<<<120, 256>>>(p, dt, nBodies); // compute interbody forces
cudaDeviceSynchronize();
/*
* This position integration cannot occur until this round of `bodyForce` has completed.
* Also, the next round of `bodyForce` cannot begin until the integration is complete.
*/
for (int i = 0; i < nBodies; i++)
{ // integrate position
p[i].x += p[i].vx * dt;
p[i].y += p[i].vy * dt;
p[i].z += p[i].vz * dt;
}
const double tElapsed = GetTimer() / 1000.0;
totalTime += tElapsed;
}
double avgTime = totalTime / (double)(nIters);
float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime;
write_values_to_file(solution_values, buf, bytes);
// You will likely enjoy watching this value grow as you accelerate the application,
// but beware that a failure to correctly synchronize the device might result in
// unrealistically high values.
printf("%0.3f Billion Interactions / second\n", billionsOfOpsPerSecond);
// free(buf);
cudaFree(buf);
}

138
src/09-nbody.cu Normal file
View File

@ -0,0 +1,138 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"
#include "files.h"
#define SOFTENING 1e-9f
/*
* Each body contains x, y, and z coordinate positions,
* as well as velocities in the x, y, and z directions.
*/
typedef struct
{
float x, y, z, vx, vy, vz;
} Body;
/*
* Calculate the gravitational impact of all bodies in the system
* on all others.
*/
void bodyForce(Body *p, float dt, int n)
{
for (int i = 0; i < n; ++i)
{
float Fx = 0.0f;
float Fy = 0.0f;
float Fz = 0.0f;
for (int j = 0; j < n; j++)
{
float dx = p[j].x - p[i].x;
float dy = p[j].y - p[i].y;
float dz = p[j].z - p[i].z;
float distSqr = dx * dx + dy * dy + dz * dz + SOFTENING;
float invDist = rsqrtf(distSqr);
float invDist3 = invDist * invDist * invDist;
Fx += dx * invDist3;
Fy += dy * invDist3;
Fz += dz * invDist3;
}
p[i].vx += dt * Fx;
p[i].vy += dt * Fy;
p[i].vz += dt * Fz;
}
}
int main(const int argc, const char **argv)
{
// The assessment will test against both 2<11 and 2<15.
// Feel free to pass the command line argument 15 when you generate ./nbody report files
int nBodies = 2 << 11;
if (argc > 1)
nBodies = 2 << atoi(argv[1]);
// The assessment will pass hidden initialized values to check for correctness.
// You should not make changes to these files, or else the assessment will not work.
const char *initialized_values;
const char *solution_values;
if (nBodies == 2 << 11)
{
initialized_values = "09-nbody/files/initialized_4096";
solution_values = "09-nbody/files/solution_4096";
}
else
{ // nBodies == 2<<15
initialized_values = "09-nbody/files/initialized_65536";
solution_values = "09-nbody/files/solution_65536";
}
if (argc > 2)
initialized_values = argv[2];
if (argc > 3)
solution_values = argv[3];
const float dt = 0.01f; // Time step
const int nIters = 10; // Simulation iterations
int bytes = nBodies * sizeof(Body);
float *buf;
buf = (float *)malloc(bytes);
Body *p = (Body *)buf;
read_values_from_file(initialized_values, buf, bytes);
double totalTime = 0.0;
/*
* This simulation will run for 10 cycles of time, calculating gravitational
* interaction amongst bodies, and adjusting their positions to reflect.
*/
for (int iter = 0; iter < nIters; iter++)
{
StartTimer();
/*
* You will likely wish to refactor the work being done in `bodyForce`,
* and potentially the work to integrate the positions.
*/
bodyForce(p, dt, nBodies); // compute interbody forces
/*
* This position integration cannot occur until this round of `bodyForce` has completed.
* Also, the next round of `bodyForce` cannot begin until the integration is complete.
*/
for (int i = 0; i < nBodies; i++)
{ // integrate position
p[i].x += p[i].vx * dt;
p[i].y += p[i].vy * dt;
p[i].z += p[i].vz * dt;
}
const double tElapsed = GetTimer() / 1000.0;
totalTime += tElapsed;
}
double avgTime = totalTime / (double)(nIters);
float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime;
write_values_to_file(solution_values, buf, bytes);
// You will likely enjoy watching this value grow as you accelerate the application,
// but beware that a failure to correctly synchronize the device might result in
// unrealistically high values.
printf("%0.3f Billion Interactions / second\n", billionsOfOpsPerSecond);
free(buf);
}