科研实习Week 3 | 星渊小站

以下是科研实习第三周的进度

psc-config/psc 代码学习

论文学习

psc-config/psc 的论文以The Plasma Simulation Code: A modern particle-in-cell code with patch-based load-balancing为题发表在Journal of Computational Physics上，为了更好的理解psc-config/psc的代码，翻阅一下论文。

论文第四章中讲述了psc-config/psc中的并行策略，但是由于并行策略是关于粒子系统网格化的详细讨论，对于当前项目没有特别大的意义，但是其中的负载均衡策略是值得学习的。

负载均衡：用于在多个服务器或资源之间分配工作负载，以确保系统的高可用性和高性能。其主要目标是优化资源使用、最大化吞吐量、最小化响应时间，并避免某个单一资源过载。

论文指出：给定一定数量的处理单元N_proc,将域划分成比处理单元数量更多的补丁N_patch,每个处理单元处理多个补丁(通常10-100个)。通过动态调整补丁到处理单元的分配,我们可以确保每个处理单元被分配到几乎相等的负载。

demo:

我们将域划分为16×16个子域,如图所示。由于现在Patch(256个)远多于进程(16个),每个进程需要处理多个Patch,因此需要定义一个将Patch分配给进程的策略。图(b)中沿着1维曲线,每个Patch被访问一次。然后将这条256个Patch的曲线划分成与进程数量相同的段,在这种情况下我们得到16个每个包含16个Patch的段。然后将这些Patch段依次分配给每个MPI进程。最终(见图(c)),得到的空间分解与使用4×4子域的标准分区方式基本相同,但获得了额外的灵活性,可以通过将Patch从负载较高的进程移动到负载较低的相邻进程来应对负载变化。

简单说就是进一步拆分一个进程中的处理，使一个进程中的处理离散化，从而可以使一个进程中的运行项可以成比例的产生负载，每个进程的Patch可以伪动态进行分配。

代码学习-变量/方法名的统一性

#ifdef USE_CUDA
using PscConfig = PscConfig1vbecCuda<Dim>;
#else
using PscConfig = PscConfig1vbecDouble<Dim>;
#endif

在主程序中，我们会引用不同的变量，对于同一个功能不同设备的变量，尽可能保证两个设备上的变量的成员变量具有一定的统一性，这样我们完全可以使用同一个变量来指代不同设备上的配置。

同样，我们可以在不同的头文件中定义同一个方法，具有相同的参数、返回值，然后引入的头文件不同，实现不同位置的计算：

代码学习-使用类对底层进行封装

使用类进行封装，使其在主要代码中不可知底层实现(同样以上周的为例)，我们在后面的引用中，只使用Psc这个类(C++的结构体和类已经泛化，下统称类)，使用C++模板语法，传入的PscConfig可以根据需求使用不同设备的PscConfig（不同设备的PscConfigXXXX均继承PscConfig类）

template <typename PscConfig, typename Diagnostics, typename InjectParticles>
struct Psc
{
  using Mparticles = typename PscConfig::Mparticles;
  using MfieldsState = typename PscConfig::MfieldsState;
  using Balance = typename PscConfig::Balance;
  using Sort = typename PscConfig::Sort;
  using Collision = typename PscConfig::Collision;
  using Checks = typename PscConfig::Checks;
  using Marder = typename PscConfig::Marder;
  using PushParticles = typename PscConfig::PushParticles;
  using PushFields = typename PscConfig::PushFields;
  using Bnd = typename PscConfig::Bnd;
  using BndFields = typename PscConfig::BndFields;
  using BndParticles = typename PscConfig::BndParticles;
  using Dim = typename PscConfig::Dim;
//...省略
}

代码学习-CMAKE模板的使用

#ifndef PSCCONFIG_H_
#define PSCCONFIG_H_

/* PSC Version Information */
#define PSC_VERSION_MAJOR @PSC_VERSION_MAJOR@
#define PSC_VERSION_MINOR @PSC_VERSION_MINOR@
#define PSC_VERSION_PATCH @PSC_VERSION_PATCH@
#define PSC_VERSION       @PSC_VERSION@

/*
 * PSC Build Information:
 *
 * Compiler:
 *   C: @CMAKE_C_COMPILER@
 *     Id: @CMAKE_C_COMPILER_ID@ @CMAKE_C_COMPILER_VERSION@
 *   CXX: @CMAKE_CXX_COMPILER@
 *     Id: @CMAKE_CXX_COMPILER_ID@ @CMAKE_CXX_COMPILER_VERSION@
 *   Fortran: @CMAKE_CXX_COMPILER@
 *     Id: @CMAKE_Fortran_COMPILER_ID@ @CMAKE_CXX_COMPILER_VERSION@
 */

#cmakedefine USE_CUDA
#cmakedefine USE_VPIC

/* Everything past this line is programatically generated */

@PSC_CONFIG_DEFINES@

#endif /* PSCCONFIG_H_ */

使用Cmake模板，自动生成配置在CMakeLists.txt中的头文件，这里将cmake定义的环境变量引入了代码。

代码学习-rmm的使用

RAPIDS Memory Manager(rmm)可以轻松的将host上的内存转换到device上的显存，同时如无显卡则不变，是一个快速的内存管理系统。

psc-config/psc中大量使用了rmm来简化内存分配问题

  static bool first_time = true;
  if (!first_time)
    return;

  first_time = false;

#ifdef PSC_HAVE_RMM
  rmm::logger().set_level(spdlog::level::trace);

  device_mr_type* mr =
    rmm::mr::get_current_device_resource(); // Points to `cuda_memory_resource`
  static log_mr_type _log_mr{mr, std::cout, true};
  static pool_mr_type pool_mr{&_log_mr, 15000000000};
  static track_mr_type track_mr{&pool_mr};
#if 0
  static log_mr_type log_mr{&track_mr, std::cout, true};
  rmm::mr::set_current_device_resource(&log_mr);

代码实践

使用psc-config/psc类似的方法，求解二维方形边界上关于第一类边界条件的拉普拉斯方程。

文件结构：

gp-demo/
├── CMakeLists.txt
├── include
│   ├── laplace.cuh
│   └── laplace.h
└── src
    ├── CMakeLists.txt
    ├── laplace.cpp
    ├── laplace.cu
    └── main.cpp

根据psc-config的思路，我们在CMakeLists.txt中指定环境变量

# ./CMakeLists.txt


cmake_minimum_required(VERSION 3.0)
project(gp_demo)
option(USE_CUDA "Build CUDA components" ON)

if (USE_CUDA)
    enable_language(CUDA CXX)
else ()
    enable_language(CXX)
endif()
set(CMAKE_CUDA_ARCHITECTURES all-major)
set(CMAKE_CUDA_STANDARD 17)

include_directories("./include")
add_subdirectory(./src)

# ./src/CMakeLists.txt

include_directories("/usr/local/cuda-11.8/targets/x86_64-linux/include")

if (USE_CUDA)
    find_package(CUDA REQUIRED)
    add_executable(gp-main main.cpp
            laplace.cu)
    set_target_properties(gp-main PROPERTIES
            CUDA_SEPARABLE_COMPILATION ON)
else()
    add_executable(gp-main main.cpp
        laplace.cpp)
endif()

在C++代码中，我们使用psc-config/psc的方法

// ./src/main.cpp

#ifdef USE_CUDA
#include <laplace.cuh>
#else
#include <laplace.h>
#endif

然后我们采用不同文件下的同名方法

// ./include/laplace.h

#ifndef LAPLACE_H
#define LAPLACE_H

#define IDX(i, j, cols) ((i) * (cols) + (j))
void solveLaplace(double* grid, int rows, int cols, int max_iter = 10000, double tol = 1e-6);
#endif


// ./include/laplace.cuh

#ifndef LAPLACE_CUH
#define LAPLACE_CUH
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#define IDX(i, j, cols) ((i) * (cols) + (j))
__global__ void laplaceKernel(double* grid, double* new_grid, int rows, int cols);
void solveLaplace(double* grid, int rows, int cols, int max_iter = 10000, double tol = 1e-6);
#endif 

对应不同的实现为：

// openmp
void solveLaplace(double* grid, int rows, int cols, int max_iter, double tol) {
    double* new_grid = new double[rows * cols];

    for (int iter = 0; iter < max_iter; ++iter) {
        double max_error = 0.0;
#pragma omp parallel for reduction(max:max_error)
        for (int i = 1; i < rows - 1; ++i) {
            for (int j = 1; j < cols - 1; ++j) {
                int idx = IDX(i, j, cols);
                new_grid[idx] = 0.25 * (grid[IDX(i-1, j, cols)] + grid[IDX(i+1, j, cols)] + grid[IDX(i, j-1, cols)] + grid[IDX(i, j+1, cols)]);
                double error = std::fabs(new_grid[idx] - grid[idx]);
                max_error = std::max(max_error, error);
            }
        }
#pragma omp parallel for
        for (int i = 1; i < rows - 1; ++i) {
            for (int j = 1; j < cols - 1; ++j) {
                grid[IDX(i, j, cols)] = new_grid[IDX(i, j, cols)];
            }
        }
        if (max_error < tol) {
            std::cout << "Converged after " << iter + 1 << " iterations with error " << max_error << std::endl;
            break;
        }
    }
    delete[] new_grid;
}


// cuda
__global__ void laplaceKernel(double* grid, double* new_grid, int rows, int cols) {
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;

    if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1) {
        new_grid[IDX(i, j, cols)] = 0.25 * (grid[IDX(i-1, j, cols)] + grid[IDX(i+1, j, cols)]
                                            + grid[IDX(i, j-1, cols)] + grid[IDX(i, j+1, cols)]);
    }
}

void solveLaplace(double* grid, int rows, int cols, int max_iter, double tol) {
    double *d_grid, *d_new_grid;
    cudaMalloc(&d_grid, rows * cols * sizeof(double));
    cudaMalloc(&d_new_grid, rows * cols * sizeof(double));

    cudaMemcpy(d_grid, grid, rows * cols * sizeof(double), cudaMemcpyHostToDevice);

    //这里我指定了大小
    dim3 blockSize(10, 10);
    dim3 gridSize(10,10);

    for (int iter = 0; iter < max_iter; ++iter) {
        laplaceKernel<<<gridSize, blockSize>>>(d_grid, d_new_grid, rows, cols);
        cudaDeviceSynchronize();
        cudaMemcpy(grid, d_new_grid, rows * cols * sizeof(double), cudaMemcpyDeviceToHost);

        double max_error = 0.0;
        for (int i = 1; i < rows - 1; ++i) {
            for (int j = 1; j < cols - 1; ++j) {
                double error = std::fabs(grid[IDX(i, j, cols)] - d_grid[IDX(i, j, cols)]);
                if (error > max_error) {
                    max_error = error;
                }
            }
        }

        if (max_error < tol) {
            std::cout << "Converged after " << iter + 1 << " iterations with error " << max_error << std::endl;
            break;
        }

        std::swap(d_grid, d_new_grid);
    }

    cudaMemcpy(grid, d_grid, rows * cols * sizeof(double), cudaMemcpyDeviceToHost);

    cudaFree(d_grid);
    cudaFree(d_new_grid);
}

主函数信息：

int main() {
    int rows = 100;
    int cols = 100;
    double* grid = new double[rows * cols];

    for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
            grid[IDX(i, j, cols)] = 0.0;
        }
    }

    for (int i = 0; i < rows; ++i) {
        grid[IDX(i, 0, cols)] = 0.0; // 左边界条件
        grid[IDX(i, cols-1, cols)] = 1.0; // 右边界条件
    }
    for (int j = 0; j < cols; ++j) {
        grid[IDX(0, j, cols)] = 0.0; // 上边界条件
        grid[IDX(rows-1, j, cols)] = 1.0; // 下边界条件
    }

    solveLaplace(grid, rows, cols);

    for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
            std::cout << grid[IDX(i, j, cols)] << " ";
        }
        std::cout << std::endl;
    }
    delete[] grid;
    return 0;
}