diff --git a/src/01-double-elements.cu b/src/01-double-elements.cu
new file mode 100644
index 0000000..3604ebe
--- /dev/null
+++ b/src/01-double-elements.cu
@@ -0,0 +1,81 @@
+#include <stdio.h>
+
+/*
+ * Initialize array values on the host.
+ */
+
+void init(int *a, int N)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+  {
+    a[i] = i;
+  }
+}
+
+/*
+ * Double elements in parallel on the GPU.
+ */
+
+__global__ void doubleElements(int *a, int N)
+{
+  int i;
+  i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < N)
+  {
+    a[i] *= 2;
+  }
+}
+
+/*
+ * Check all elements have been doubled on the host.
+ */
+
+bool checkElementsAreDoubled(int *a, int N)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+  {
+    if (a[i] != i * 2)
+      return false;
+  }
+  return true;
+}
+
+int main()
+{
+  int N = 100;
+  int *a;
+
+  size_t size = N * sizeof(int);
+
+  /*
+   * Refactor this memory allocation to provide a pointer
+   * `a` that can be used on both the host and the device.
+   */
+
+  cudaMallocManaged(&a, size);
+
+  init(a, N);
+
+  size_t threads_per_block = 10;
+  size_t number_of_blocks = 10;
+
+  /*
+   * This launch will not work until the pointer `a` is also
+   * available to the device.
+   */
+
+  doubleElements<<<number_of_blocks, threads_per_block>>>(a, N);
+  cudaDeviceSynchronize();
+
+  bool areDoubled = checkElementsAreDoubled(a, N);
+  printf("All elements were doubled? %s\n", areDoubled ? "TRUE" : "FALSE");
+
+  /*
+   * Refactor to free memory that has been allocated to be
+   * accessed by both the host and the device.
+   */
+
+  cudaFree(a);
+}
diff --git a/src/01-first-parallel.cu b/src/01-first-parallel.cu
new file mode 100644
index 0000000..fcadb7d
--- /dev/null
+++ b/src/01-first-parallel.cu
@@ -0,0 +1,27 @@
+#include <stdio.h>
+
+/*
+ * Refactor firstParallel so that it can run on the GPU.
+ */
+
+__global__ void firstParallel()
+{
+  printf("This should be running in parallel.\n");
+}
+
+int main()
+{
+  /*
+   * Refactor this call to firstParallel to execute in parallel
+   * on the GPU.
+   */
+
+  firstParallel<<<10, 10>>>();
+
+  /*
+   * Some code is needed below so that the CPU will wait
+   * for the GPU kernels to complete before proceeding.
+   */
+
+  cudaDeviceSynchronize();
+}
diff --git a/src/01-hello-gpu.cu b/src/01-hello-gpu.cu
new file mode 100644
index 0000000..58f0117
--- /dev/null
+++ b/src/01-hello-gpu.cu
@@ -0,0 +1,38 @@
+#include <stdio.h>
+
+void helloCPU()
+{
+  printf("Hello from the CPU.\n");
+}
+
+/*
+ * Refactor the `helloGPU` definition to be a kernel
+ * that can be launched on the GPU. Update its message
+ * to read "Hello from the GPU!"
+ */
+
+__global__ void helloGPU()
+{
+  printf("Hello from the GPU!\n");
+}
+
+int main()
+{
+  helloCPU();
+
+  /*
+   * Refactor this call to `helloGPU` so that it launches
+   * as a kernel on the GPU.
+   */
+
+  helloGPU<<<1, 1>>>();
+
+  /*
+   * Add code below to synchronize on the completion of the
+   * `helloGPU` kernel before continuing the CPU thread.
+   */
+
+  cudaDeviceSynchronize();
+}
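Note: the files above launch kernels and call cudaDeviceSynchronize() without ever inspecting the returned error codes, so a bad launch would fail silently. As a minimal sketch outside the exercise diffs (the CHECK macro name is illustrative; cudaGetLastError, cudaGetErrorString, and the cudaError_t return of cudaDeviceSynchronize are standard CUDA runtime API), error checking might look like this:

#include <stdio.h>

// Illustrative launch-error checking; the macro name is an assumption.
#define CHECK(call)                                          \
do {                                                         \
  cudaError_t err = (call);                                  \
  if (err != cudaSuccess)                                    \
  {                                                          \
    printf("CUDA error: %s\n", cudaGetErrorString(err));     \
  }                                                          \
} while (0)

__global__ void helloGPU()
{
  printf("Hello from the GPU!\n");
}

int main()
{
  helloGPU<<<1, 1>>>();
  CHECK(cudaGetLastError());       // catches invalid launch configurations
  CHECK(cudaDeviceSynchronize());  // catches errors raised during execution
}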
diff --git a/src/01-single-block-loop.cu b/src/01-single-block-loop.cu
new file mode 100644
index 0000000..aa5bc2f
--- /dev/null
+++ b/src/01-single-block-loop.cu
@@ -0,0 +1,27 @@
+#include <stdio.h>
+
+/*
+ * Refactor `loop` to be a CUDA kernel. The new kernel should
+ * only do the work of 1 iteration of the original loop.
+ */
+
+__global__ void loop()
+{
+  printf("This is iteration number %d\n", threadIdx.x);
+}
+
+int main()
+{
+  /*
+   * When refactoring `loop` to launch as a kernel, be sure
+   * to use the execution configuration to control how many
+   * "iterations" to perform.
+   *
+   * For this exercise, only use 1 block of threads.
+   */
+
+  int N = 10;
+  loop<<<1, N>>>();
+
+  cudaDeviceSynchronize();
+}
diff --git a/src/01-thread-and-block-idx.cu b/src/01-thread-and-block-idx.cu
new file mode 100644
index 0000000..0f33c91
--- /dev/null
+++ b/src/01-thread-and-block-idx.cu
@@ -0,0 +1,22 @@
+#include <stdio.h>
+
+__global__ void printSuccessForCorrectExecutionConfiguration()
+{
+  if (threadIdx.x == 1023 && blockIdx.x == 255)
+  {
+    printf("Success!\n");
+  }
+}
+
+int main()
+{
+  /*
+   * Update the execution configuration so that the kernel
+   * will print `"Success!"`.
+   */
+
+  printSuccessForCorrectExecutionConfiguration<<<256, 1024>>>();
+
+  cudaDeviceSynchronize();
+}
diff --git a/src/02-mismatched-config-loop.cu b/src/02-mismatched-config-loop.cu
new file mode 100644
index 0000000..fb9a7a7
--- /dev/null
+++ b/src/02-mismatched-config-loop.cu
@@ -0,0 +1,69 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+ * Currently, `initializeElementsTo`, if executed in a thread whose
+ * `i` is calculated to be greater than or equal to `N`, will try to
+ * access a value outside the range of `a`.
+ *
+ * Refactor the kernel definition to prevent out-of-range accesses.
+ */
+
+__global__ void initializeElementsTo(int initialValue, int *a, int N)
+{
+  int i = threadIdx.x + blockIdx.x * blockDim.x;
+  if (i < N)
+  {
+    a[i] = initialValue;
+  }
+}
+
+int main()
+{
+  /*
+   * Do not modify `N`.
+   */
+
+  int N = 1000;
+
+  int *a;
+  size_t size = N * sizeof(int);
+
+  cudaMallocManaged(&a, size);
+
+  /*
+   * Assume we have reason to want the number of threads
+   * fixed at `256`: do not modify `threads_per_block`.
+   */
+
+  size_t threads_per_block = 256;
+
+  /*
+   * Assign a value to `number_of_blocks` that will
+   * allow for a working execution configuration given
+   * the fixed values for `N` and `threads_per_block`.
+   */
+
+  size_t number_of_blocks = 4;
+
+  int initialValue = 6;
+
+  initializeElementsTo<<<number_of_blocks, threads_per_block>>>(initialValue, a, N);
+  cudaDeviceSynchronize();
+
+  /*
+   * Check to make sure all values in `a` were initialized.
+   */
+
+  for (int i = 0; i < N; ++i)
+  {
+    if (a[i] != initialValue)
+    {
+      printf("FAILURE: target value: %d\t a[%d]: %d\n", initialValue, i, a[i]);
+      cudaFree(a);
+      exit(1);
+    }
+  }
+  printf("SUCCESS!\n");
+
+  cudaFree(a);
+}
diff --git a/src/02-multi-block-loop.cu b/src/02-multi-block-loop.cu
new file mode 100644
index 0000000..440a06a
--- /dev/null
+++ b/src/02-multi-block-loop.cu
@@ -0,0 +1,28 @@
+#include <stdio.h>
+
+/*
+ * Refactor `loop` to be a CUDA kernel. The new kernel should
+ * only do the work of 1 iteration of the original loop.
+ */
+
+__global__ void loop()
+{
+  int i = threadIdx.x + blockIdx.x * blockDim.x;
+  printf("This is iteration number %d\n", i);
+}
+
+int main()
+{
+  /*
+   * When refactoring `loop` to launch as a kernel, be sure
+   * to use the execution configuration to control how many
+   * "iterations" to perform.
+   *
+   * For this exercise, be sure to use more than 1 block in
+   * the execution configuration.
+   */
+
+  loop<<<2, 5>>>();
+
+  cudaDeviceSynchronize();
+}
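Note: 02-mismatched-config-loop.cu hard-codes number_of_blocks = 4, which works only because 4 * 256 = 1024 >= 1000. The general idiom is ceiling division, which rounds the block count up so the grid always covers all N elements. A standalone sketch, not part of the diff above (the printf is just to show the result):

#include <stdio.h>

int main()
{
  int N = 1000;
  size_t threads_per_block = 256;
  // Round up: for N = 1000 this yields (1000 + 255) / 256 = 4 blocks,
  // matching the hard-coded value in the exercise.
  size_t number_of_blocks = (N + threads_per_block - 1) / threads_per_block;
  printf("%zu blocks of %zu threads cover %d elements\n",
         number_of_blocks, threads_per_block, N);
  return 0;
}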
diff --git a/src/03-grid-stride-double.cu b/src/03-grid-stride-double.cu
new file mode 100644
index 0000000..f7c77a0
--- /dev/null
+++ b/src/03-grid-stride-double.cu
@@ -0,0 +1,68 @@
+#include <stdio.h>
+
+void init(int *a, int N)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+  {
+    a[i] = i;
+  }
+}
+
+/*
+ * In the current application, `N` is larger than the grid.
+ * Refactor this kernel to use a grid-stride loop so that
+ * each parallel thread works on more than one element of the array.
+ */
+
+__global__ void doubleElements(int *a, int N)
+{
+  int indexWithinTheGrid = threadIdx.x + blockIdx.x * blockDim.x;
+  int gridStride = gridDim.x * blockDim.x;
+
+  for (int i = indexWithinTheGrid; i < N; i += gridStride)
+  {
+    a[i] *= 2;
+  }
+}
+
+bool checkElementsAreDoubled(int *a, int N)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+  {
+    if (a[i] != i * 2)
+      return false;
+  }
+  return true;
+}
+
+int main()
+{
+  /*
+   * `N` is greater than the size of the grid (see below).
+   */
+
+  int N = 10000;
+  int *a;
+
+  size_t size = N * sizeof(int);
+  cudaMallocManaged(&a, size);
+
+  init(a, N);
+
+  /*
+   * The size of this grid is 256 * 32 = 8192.
+   */
+
+  size_t threads_per_block = 256;
+  size_t number_of_blocks = 32;
+
+  doubleElements<<<number_of_blocks, threads_per_block>>>(a, N);
+  cudaDeviceSynchronize();
+
+  bool areDoubled = checkElementsAreDoubled(a, N);
+  printf("All elements were doubled? %s\n", areDoubled ? "TRUE" : "FALSE");
+
+  cudaFree(a);
+}
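Note: with a grid of 8192 threads and N = 10000, threads 0 through 1807 each double two elements (i and i + 8192) while threads 1808 through 8191 double one, accounting for all 1808 * 2 + 6384 = 10000 elements. A side benefit of the grid-stride loop is that the launch stays correct for any grid size, so the sizes can even be chosen by the runtime. A sketch under that assumption, not part of the exercise (cudaOccupancyMaxPotentialBlockSize is the CUDA runtime's occupancy helper; its use here is illustrative):

#include <stdio.h>

// Same grid-stride shape as doubleElements in 03-grid-stride-double.cu.
__global__ void doubleElements(int *a, int N)
{
  int stride = gridDim.x * blockDim.x;
  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += stride)
  {
    a[i] *= 2;
  }
}

int main()
{
  int N = 10000;
  int *a;
  cudaMallocManaged(&a, N * sizeof(int));
  for (int i = 0; i < N; ++i) a[i] = i;

  // Let the runtime suggest a block size and minimum grid size for
  // full occupancy; the grid-stride loop keeps any choice correct.
  int minGridSize = 0, blockSize = 0;
  cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, doubleElements, 0, 0);
  doubleElements<<<minGridSize, blockSize>>>(a, N);
  cudaDeviceSynchronize();

  printf("a[%d] = %d\n", N - 1, a[N - 1]);  // expect 19998
  cudaFree(a);
}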