diff --git a/saxpy-mpi-cuda/Makefile b/saxpy-mpi-cuda/Makefile
new file mode 100644
index 0000000..32c3dc7
--- /dev/null
+++ b/saxpy-mpi-cuda/Makefile
@@ -0,0 +1,21 @@
+MPI_DIR = $(OPENMPI_ROOT)
+NVCC = nvcc
+NVCC_FLAGS = -I$(MPI_DIR)/include
+CUDA_INCLUDE = $(CUDA_HOME)/include
+LIBS = -L$(MPI_DIR)/lib -lmpi -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt
+PROFILE_FLAGS = -lineinfo
+SRCS = saxpy-mpi-cuda.cu
+OBJS = saxpy-mpi-cuda.o
+EXECUTABLE = saxpy-mpi-cuda
+ARCH = #-arch=sm_80
+
+all: $(EXECUTABLE)
+$(EXECUTABLE): $(OBJS)
+	$(NVCC) $(OBJS) -o $(EXECUTABLE) $(LIBS)
+
+$(OBJS): $(SRCS)
+	$(NVCC) $(NVCC_FLAGS) $(ARCH) $(PROFILE_FLAGS) -I$(CUDA_INCLUDE) -c $(SRCS) -o $(OBJS)
+
+clean:
+	rm -rf $(OBJS) $(EXECUTABLE)
+
diff --git a/saxpy-mpi-cuda/README.md b/saxpy-mpi-cuda/README.md
new file mode 100644
index 0000000..6fc3bfd
--- /dev/null
+++ b/saxpy-mpi-cuda/README.md
@@ -0,0 +1,15 @@
+## MPI+CUDA implementation of the saxpy program
+
+#### Benchmark information
+- Uses two processes and one GPU per process.
+- Rank 0 initializes the input arrays and sends them to Rank 1.
+- Both processes perform the same saxpy calculation.
+- Rank 1 sends its results back to Rank 0.
+
+The following parameters can be used to run the program:
+- `-i` -> number of iterations.
+- `-N` -> the problem size.
+
+#### Example
+`./saxpy-mpi-cuda -i 5 -N 32768000`
+
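Since the benchmark expects exactly two MPI ranks and maps rank `i` to GPU `i` via `cudaSetDevice`, the executable has to be started through the MPI launcher rather than directly as in the README example. A minimal build-and-run sketch, assuming Open MPI's `mpirun` and the `OPENMPI_ROOT`/`CUDA_HOME` environment variables the Makefile reads:

```sh
# Build (OPENMPI_ROOT and CUDA_HOME must point at the MPI and CUDA installations)
make

# Run with two ranks on a node with at least two GPUs; rank i uses device i
mpirun -np 2 ./saxpy-mpi-cuda -i 5 -N 32768000
```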
diff --git a/saxpy-mpi-cuda/saxpy-mpi-cuda.cu b/saxpy-mpi-cuda/saxpy-mpi-cuda.cu
new file mode 100644
index 0000000..82e7467
--- /dev/null
+++ b/saxpy-mpi-cuda/saxpy-mpi-cuda.cu
@@ -0,0 +1,146 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include <unistd.h>
+#include <getopt.h>
+#include <mpi.h>
+#include <cuda_runtime.h>
+#include "nvToolsExt.h"
+
+__global__ void saxpy(double *z, double *x, double *y, double alpha, int N) {
+
+    int idx = blockDim.x*blockIdx.x + threadIdx.x;
+    int stride = blockDim.x * gridDim.x;
+
+    for (int i = idx; i < N; i += stride) {
+        // grid-stride loop: each thread covers every (blockDim.x*gridDim.x)-th element
+        z[i] = alpha*x[i] + y[i];
+    }
+}
+
+int main(int argc, char *argv[]) {
+
+    double *h_z, *h_x, *h_y;
+    double *d_z, *d_x, *d_y;
+    double alpha = 1.5;
+    int N = 4096;
+    int iterations = 2;
+
+    MPI_Init(&argc, &argv);
+    int commSize, commRank;
+    MPI_Comm_size(MPI_COMM_WORLD, &commSize);
+    MPI_Comm_rank(MPI_COMM_WORLD, &commRank);
+
+    cudaSetDevice(commRank);
+
+    int c;
+    char* endp;
+    // parse arguments
+    while ((c = getopt(argc, argv, "N:i:h")) != -1) {
+        switch (c) {
+        case 'N':
+            N = strtol(optarg, &endp, 10);
+            break;
+        case 'i':
+            iterations = strtol(optarg, &endp, 10);
+            break;
+        case 'h':
+            printf("-N => default: -N 4096\n");
+            printf("-i => default: -i 2\n");
+            exit(0);
+            break;
+        case '?':
+            printf("Unknown argument. Use -h to see the options.\n");
+            exit(1);
+            break;
+        }
+    }
+
+    if (commRank == 0) {
+        printf("Number of iterations: %d\n", iterations);
+        printf("Problem size (N): %d\n", N);
+    }
+
+    int deviceCount = 0;
+    cudaGetDeviceCount(&deviceCount);
+    printf("Rank %d - Number of GPUs: %d\n", commRank, deviceCount);
+
+    h_z = new double[N];
+    h_x = new double[N];
+    h_y = new double[N];
+
+    for (int it = 0; it < iterations; it++) {
+        // initialize
+        if (commRank == 0) {
+            for (int i = 0; i < N; i += 1) {
+                h_x[i] = 5.0;
+                h_y[i] = -2.0;
+                h_z[i] = 0.0;
+            }
+        }
+
+        // send the input arrays to the other process.
+        if (commRank == 0) {
+            MPI_Send(h_x, N, MPI_DOUBLE, 1, it+0, MPI_COMM_WORLD);
+            MPI_Send(h_y, N, MPI_DOUBLE, 1, it+1, MPI_COMM_WORLD);
+        }
+        else if (commRank == 1) {
+            MPI_Recv(h_x, N, MPI_DOUBLE, 0, it+0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+            MPI_Recv(h_y, N, MPI_DOUBLE, 0, it+1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+        }
+
+        cudaMalloc(&d_z, N*sizeof(double));
+        cudaMalloc(&d_y, N*sizeof(double));
+        cudaMalloc(&d_x, N*sizeof(double));
+
+        // copy arrays from host to device
+        cudaMemcpy(d_x, h_x, N*sizeof(double), cudaMemcpyHostToDevice);
+        cudaMemcpy(d_y, h_y, N*sizeof(double), cudaMemcpyHostToDevice);
+
+        int threadsPerBlock = 512;
+        int numBlocks = 2; //N/threadsPerBlock + (N % threadsPerBlock != 0);
+
+        // kernel call
+        nvtxRangePushA("saxpy");
+        saxpy<<<numBlocks, threadsPerBlock>>>(d_z, d_x, d_y, alpha, N);
+        nvtxRangePop();
+
+        // copy arrays back to the host
+        cudaMemcpy(h_z, d_z, N*sizeof(double), cudaMemcpyDeviceToHost);
+
+        // check if the results are correct
+        bool success = true;
+        for (size_t i = 0; i < N; i += 1) {
+            if (std::abs(h_z[i] - (1.5*5.0-2.0)) > 1E-8) {
+                success = false;
+            }
+        }
+        if (!success) {
+            printf("Rank %d => Error: incorrect results! it: %d\n", commRank, it);
+        }
+        else {
+            printf("Rank %d => Correct results! it: %d\n", commRank, it);
+        }
+
+        // send the result to rank 0.
+        if (commRank == 1) {
+            MPI_Send(h_z, N, MPI_DOUBLE, 0, it+2, MPI_COMM_WORLD);
+        }
+        else if (commRank == 0) {
+            MPI_Recv(h_z, N, MPI_DOUBLE, 1, it+2, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+        }
+    }
+
+    // cleaning
+    delete[] h_x;
+    delete[] h_y;
+    delete[] h_z;
+
+    cudaFree(d_z);
+    cudaFree(d_x);
+    cudaFree(d_y);
+
+    MPI_Finalize();
+
+    return 0;
+}
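The kernel launch is wrapped in an NVTX range and the Makefile compiles with `-lineinfo` and links `-lnvToolsExt`, which points at profiling runs. A possible Nsight Systems invocation is sketched below; the `nsys` tool, its trace set, and the report name are assumptions about the local toolchain, not something this patch provides:

```sh
# Capture CUDA, NVTX, and MPI activity for both ranks (single-node run assumed)
nsys profile --trace=cuda,nvtx,mpi -o saxpy-report \
    mpirun -np 2 ./saxpy-mpi-cuda -i 5 -N 32768000
```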