mirror of
https://github.com/guilhermewerner/nvidia-cuda
synced 2025-06-14 20:24:21 +00:00
Section 3
This commit is contained in:
90
src/01-init-kernel-solution.cu
Normal file
90
src/01-init-kernel-solution.cu
Normal file
@ -0,0 +1,90 @@
|
||||
#include <stdio.h>
|
||||
|
||||
__global__ void initWith(float num, float *a, int N)
|
||||
{
|
||||
|
||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
int stride = blockDim.x * gridDim.x;
|
||||
|
||||
for (int i = index; i < N; i += stride)
|
||||
{
|
||||
a[i] = num;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void addVectorsInto(float *result, float *a, float *b, int N)
|
||||
{
|
||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
int stride = blockDim.x * gridDim.x;
|
||||
|
||||
for (int i = index; i < N; i += stride)
|
||||
{
|
||||
result[i] = a[i] + b[i];
|
||||
}
|
||||
}
|
||||
|
||||
void checkElementsAre(float target, float *vector, int N)
|
||||
{
|
||||
for (int i = 0; i < N; i++)
|
||||
{
|
||||
if (vector[i] != target)
|
||||
{
|
||||
printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
printf("Success! All values calculated correctly.\n");
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
int deviceId;
|
||||
int numberOfSMs;
|
||||
|
||||
cudaGetDevice(&deviceId);
|
||||
cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
|
||||
|
||||
const int N = 2 << 24;
|
||||
size_t size = N * sizeof(float);
|
||||
|
||||
float *a;
|
||||
float *b;
|
||||
float *c;
|
||||
|
||||
cudaMallocManaged(&a, size);
|
||||
cudaMallocManaged(&b, size);
|
||||
cudaMallocManaged(&c, size);
|
||||
|
||||
cudaMemPrefetchAsync(a, size, deviceId);
|
||||
cudaMemPrefetchAsync(b, size, deviceId);
|
||||
cudaMemPrefetchAsync(c, size, deviceId);
|
||||
|
||||
size_t threadsPerBlock;
|
||||
size_t numberOfBlocks;
|
||||
|
||||
threadsPerBlock = 256;
|
||||
numberOfBlocks = 32 * numberOfSMs;
|
||||
|
||||
cudaError_t addVectorsErr;
|
||||
cudaError_t asyncErr;
|
||||
|
||||
initWith<<<numberOfBlocks, threadsPerBlock>>>(3, a, N);
|
||||
initWith<<<numberOfBlocks, threadsPerBlock>>>(4, b, N);
|
||||
initWith<<<numberOfBlocks, threadsPerBlock>>>(0, c, N);
|
||||
|
||||
addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);
|
||||
|
||||
addVectorsErr = cudaGetLastError();
|
||||
if (addVectorsErr != cudaSuccess)
|
||||
printf("Error: %s\n", cudaGetErrorString(addVectorsErr));
|
||||
|
||||
asyncErr = cudaDeviceSynchronize();
|
||||
if (asyncErr != cudaSuccess)
|
||||
printf("Error: %s\n", cudaGetErrorString(asyncErr));
|
||||
|
||||
checkElementsAre(7, c, N);
|
||||
|
||||
cudaFree(a);
|
||||
cudaFree(b);
|
||||
cudaFree(c);
|
||||
}
|
19
src/05-stream-intro.cu
Normal file
19
src/05-stream-intro.cu
Normal file
@ -0,0 +1,19 @@
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
|
||||
__global__ void printNumber(int number)
|
||||
{
|
||||
printf("%d\n", number);
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
for (int i = 0; i < 5; ++i)
|
||||
{
|
||||
cudaStream_t stream; // CUDA streams are of type `cudaStream_t`.
|
||||
cudaStreamCreate(&stream); // Note that a pointer must be passed to `cudaCreateStream`.
|
||||
printNumber<<<1, 2, 0, stream>>>(i); // `stream` is passed as 4th EC argument.
|
||||
cudaStreamDestroy(stream);
|
||||
}
|
||||
cudaDeviceSynchronize();
|
||||
}
|
145
src/09-nbody-solution.cu
Normal file
145
src/09-nbody-solution.cu
Normal file
@ -0,0 +1,145 @@
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "timer.h"
|
||||
#include "files.h"
|
||||
|
||||
#define SOFTENING 1e-9f
|
||||
|
||||
/*
|
||||
* Each body contains x, y, and z coordinate positions,
|
||||
* as well as velocities in the x, y, and z directions.
|
||||
*/
|
||||
|
||||
typedef struct
|
||||
{
|
||||
float x, y, z, vx, vy, vz;
|
||||
} Body;
|
||||
|
||||
/*
|
||||
* Calculate the gravitational impact of all bodies in the system
|
||||
* on all others.
|
||||
*/
|
||||
|
||||
__global__ void bodyForce(Body *p, float dt, int n)
|
||||
{
|
||||
int tid = threadIdx.x + blockDim.x * blockIdx.x;
|
||||
int stride = blockDim.x * gridDim.x;
|
||||
|
||||
for (int i = tid; i < n; i += stride)
|
||||
{
|
||||
float Fx = 0.0f;
|
||||
float Fy = 0.0f;
|
||||
float Fz = 0.0f;
|
||||
|
||||
for (int j = 0; j < n; j++)
|
||||
{
|
||||
float dx = p[j].x - p[i].x;
|
||||
float dy = p[j].y - p[i].y;
|
||||
float dz = p[j].z - p[i].z;
|
||||
float distSqr = dx * dx + dy * dy + dz * dz + SOFTENING;
|
||||
float invDist = rsqrtf(distSqr);
|
||||
float invDist3 = invDist * invDist * invDist;
|
||||
|
||||
Fx += dx * invDist3;
|
||||
Fy += dy * invDist3;
|
||||
Fz += dz * invDist3;
|
||||
}
|
||||
|
||||
p[i].vx += dt * Fx;
|
||||
p[i].vy += dt * Fy;
|
||||
p[i].vz += dt * Fz;
|
||||
}
|
||||
}
|
||||
|
||||
int main(const int argc, const char **argv)
|
||||
{
|
||||
|
||||
// The assessment will test against both 2<11 and 2<15.
|
||||
// Feel free to pass the command line argument 15 when you generate ./nbody report files
|
||||
int nBodies = 2 << 11;
|
||||
if (argc > 1)
|
||||
nBodies = 2 << atoi(argv[1]);
|
||||
|
||||
// The assessment will pass hidden initialized values to check for correctness.
|
||||
// You should not make changes to these files, or else the assessment will not work.
|
||||
const char *initialized_values;
|
||||
const char *solution_values;
|
||||
|
||||
if (nBodies == 2 << 11)
|
||||
{
|
||||
initialized_values = "09-nbody/files/initialized_4096";
|
||||
solution_values = "09-nbody/files/solution_4096";
|
||||
}
|
||||
else
|
||||
{ // nBodies == 2<<15
|
||||
initialized_values = "09-nbody/files/initialized_65536";
|
||||
solution_values = "09-nbody/files/solution_65536";
|
||||
}
|
||||
|
||||
if (argc > 2)
|
||||
initialized_values = argv[2];
|
||||
if (argc > 3)
|
||||
solution_values = argv[3];
|
||||
|
||||
const float dt = 0.01f; // Time step
|
||||
const int nIters = 10; // Simulation iterations
|
||||
|
||||
int bytes = nBodies * sizeof(Body);
|
||||
float *buf;
|
||||
|
||||
// buf = (float *)malloc(bytes);
|
||||
cudaMallocManaged(&buf, bytes);
|
||||
|
||||
Body *p = (Body *)buf;
|
||||
|
||||
read_values_from_file(initialized_values, buf, bytes);
|
||||
|
||||
double totalTime = 0.0;
|
||||
|
||||
/*
|
||||
* This simulation will run for 10 cycles of time, calculating gravitational
|
||||
* interaction amongst bodies, and adjusting their positions to reflect.
|
||||
*/
|
||||
|
||||
for (int iter = 0; iter < nIters; iter++)
|
||||
{
|
||||
StartTimer();
|
||||
|
||||
/*
|
||||
* You will likely wish to refactor the work being done in `bodyForce`,
|
||||
* and potentially the work to integrate the positions.
|
||||
*/
|
||||
|
||||
bodyForce<<<120, 256>>>(p, dt, nBodies); // compute interbody forces
|
||||
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
/*
|
||||
* This position integration cannot occur until this round of `bodyForce` has completed.
|
||||
* Also, the next round of `bodyForce` cannot begin until the integration is complete.
|
||||
*/
|
||||
|
||||
for (int i = 0; i < nBodies; i++)
|
||||
{ // integrate position
|
||||
p[i].x += p[i].vx * dt;
|
||||
p[i].y += p[i].vy * dt;
|
||||
p[i].z += p[i].vz * dt;
|
||||
}
|
||||
|
||||
const double tElapsed = GetTimer() / 1000.0;
|
||||
totalTime += tElapsed;
|
||||
}
|
||||
|
||||
double avgTime = totalTime / (double)(nIters);
|
||||
float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime;
|
||||
write_values_to_file(solution_values, buf, bytes);
|
||||
|
||||
// You will likely enjoy watching this value grow as you accelerate the application,
|
||||
// but beware that a failure to correctly synchronize the device might result in
|
||||
// unrealistically high values.
|
||||
printf("%0.3f Billion Interactions / second\n", billionsOfOpsPerSecond);
|
||||
|
||||
// free(buf);
|
||||
cudaFree(buf);
|
||||
}
|
138
src/09-nbody.cu
Normal file
138
src/09-nbody.cu
Normal file
@ -0,0 +1,138 @@
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "timer.h"
|
||||
#include "files.h"
|
||||
|
||||
#define SOFTENING 1e-9f
|
||||
|
||||
/*
|
||||
* Each body contains x, y, and z coordinate positions,
|
||||
* as well as velocities in the x, y, and z directions.
|
||||
*/
|
||||
|
||||
typedef struct
|
||||
{
|
||||
float x, y, z, vx, vy, vz;
|
||||
} Body;
|
||||
|
||||
/*
|
||||
* Calculate the gravitational impact of all bodies in the system
|
||||
* on all others.
|
||||
*/
|
||||
|
||||
void bodyForce(Body *p, float dt, int n)
|
||||
{
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
float Fx = 0.0f;
|
||||
float Fy = 0.0f;
|
||||
float Fz = 0.0f;
|
||||
|
||||
for (int j = 0; j < n; j++)
|
||||
{
|
||||
float dx = p[j].x - p[i].x;
|
||||
float dy = p[j].y - p[i].y;
|
||||
float dz = p[j].z - p[i].z;
|
||||
float distSqr = dx * dx + dy * dy + dz * dz + SOFTENING;
|
||||
float invDist = rsqrtf(distSqr);
|
||||
float invDist3 = invDist * invDist * invDist;
|
||||
|
||||
Fx += dx * invDist3;
|
||||
Fy += dy * invDist3;
|
||||
Fz += dz * invDist3;
|
||||
}
|
||||
|
||||
p[i].vx += dt * Fx;
|
||||
p[i].vy += dt * Fy;
|
||||
p[i].vz += dt * Fz;
|
||||
}
|
||||
}
|
||||
|
||||
int main(const int argc, const char **argv)
|
||||
{
|
||||
|
||||
// The assessment will test against both 2<11 and 2<15.
|
||||
// Feel free to pass the command line argument 15 when you generate ./nbody report files
|
||||
int nBodies = 2 << 11;
|
||||
if (argc > 1)
|
||||
nBodies = 2 << atoi(argv[1]);
|
||||
|
||||
// The assessment will pass hidden initialized values to check for correctness.
|
||||
// You should not make changes to these files, or else the assessment will not work.
|
||||
const char *initialized_values;
|
||||
const char *solution_values;
|
||||
|
||||
if (nBodies == 2 << 11)
|
||||
{
|
||||
initialized_values = "09-nbody/files/initialized_4096";
|
||||
solution_values = "09-nbody/files/solution_4096";
|
||||
}
|
||||
else
|
||||
{ // nBodies == 2<<15
|
||||
initialized_values = "09-nbody/files/initialized_65536";
|
||||
solution_values = "09-nbody/files/solution_65536";
|
||||
}
|
||||
|
||||
if (argc > 2)
|
||||
initialized_values = argv[2];
|
||||
if (argc > 3)
|
||||
solution_values = argv[3];
|
||||
|
||||
const float dt = 0.01f; // Time step
|
||||
const int nIters = 10; // Simulation iterations
|
||||
|
||||
int bytes = nBodies * sizeof(Body);
|
||||
float *buf;
|
||||
|
||||
buf = (float *)malloc(bytes);
|
||||
|
||||
Body *p = (Body *)buf;
|
||||
|
||||
read_values_from_file(initialized_values, buf, bytes);
|
||||
|
||||
double totalTime = 0.0;
|
||||
|
||||
/*
|
||||
* This simulation will run for 10 cycles of time, calculating gravitational
|
||||
* interaction amongst bodies, and adjusting their positions to reflect.
|
||||
*/
|
||||
|
||||
for (int iter = 0; iter < nIters; iter++)
|
||||
{
|
||||
StartTimer();
|
||||
|
||||
/*
|
||||
* You will likely wish to refactor the work being done in `bodyForce`,
|
||||
* and potentially the work to integrate the positions.
|
||||
*/
|
||||
|
||||
bodyForce(p, dt, nBodies); // compute interbody forces
|
||||
|
||||
/*
|
||||
* This position integration cannot occur until this round of `bodyForce` has completed.
|
||||
* Also, the next round of `bodyForce` cannot begin until the integration is complete.
|
||||
*/
|
||||
|
||||
for (int i = 0; i < nBodies; i++)
|
||||
{ // integrate position
|
||||
p[i].x += p[i].vx * dt;
|
||||
p[i].y += p[i].vy * dt;
|
||||
p[i].z += p[i].vz * dt;
|
||||
}
|
||||
|
||||
const double tElapsed = GetTimer() / 1000.0;
|
||||
totalTime += tElapsed;
|
||||
}
|
||||
|
||||
double avgTime = totalTime / (double)(nIters);
|
||||
float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime;
|
||||
write_values_to_file(solution_values, buf, bytes);
|
||||
|
||||
// You will likely enjoy watching this value grow as you accelerate the application,
|
||||
// but beware that a failure to correctly synchronize the device might result in
|
||||
// unrealistically high values.
|
||||
printf("%0.3f Billion Interactions / second\n", billionsOfOpsPerSecond);
|
||||
|
||||
free(buf);
|
||||
}
|
Reference in New Issue
Block a user