20180413并行程序设计原理HW1

并行程序设计原理 HW1

冯浩然 1600013009

1Intro

  • 本次作业完成过程中,我从初学到探索,由简单到优化地尝试了三种实现矩阵转置的方法。本报告将依次介绍三种实现,并展示和分析它们的效率。

2实现

1. 外围函数
  • 参考CUDA手册给出的例子和作业要求,我们假设矩阵为1024×1024的float型矩阵。首先给出生成矩阵的generator()函数和测试转置结果正确性的check()函数

    其中generator()函数中生成随机数的部分参考了https://zhidao.baidu.com/question/1514652593460316220.html中的回答,check()函数采用不考虑性能的简单实现

/*
* generate random cuda matrix with "curand.h", and copy back to the host as a normal matrix
* cm represents "cuda matrix", m represents "matrix"
*/
/*
 * Fill the device buffer cm with N*N uniform random floats via cuRAND,
 * then mirror the data into the host buffer m for later verification.
 * cm: device pointer ("cuda matrix"), m: host pointer ("matrix").
 */
void generator(float *cm, float *m)
{
    curandGenerator_t rng;

    curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT);
    /* fixed seed so every run is reproducible */
    curandSetPseudoRandomGeneratorSeed(rng, 1234ULL);
    curandGenerateUniform(rng, cm, N * N);
    curandDestroyGenerator(rng);

    /* blocking copy: host copy is in sync with device data on return */
    cudaMemcpy(m, cm, N * N * sizeof(float), cudaMemcpyDeviceToHost);
}

/*
* check if the result is correct
* dst represents the tranposed, src represents the previous
*/
/*
 * Verify that dst is the transpose of src: dst[col][row] must equal
 * src[row][col] for every element. Simple O(N^2) host-side scan;
 * performance is deliberately not a concern here.
 * Returns true iff the two matrices are exact transposes.
 */
bool check(float *dst, float *src)
{
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++) {
            /* exact float compare is fine: the transpose only copies bits */
            if (dst[col * N + row] != src[row * N + col])
                return false;
        }
    }

    return true;
}
  • 我们在main()中添加计时的部分,以衡量方法的时间效率。main()函数主体实现如下

    其中计时部分参考了https://blog.csdn.net/litdaguang/article/details/50520549中的GPU计时函数部分

    省略了执行矩阵转置的`matrix_trans()`函数和一些变量准备

#include <stdio.h>
#include <stdlib.h>
#include <ctime>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand.h>
#include <curand_kernel.h>
using namespace std;
#pragma comment(lib, "curand.lib")
#define N 1024
#define TILE 32

/*
* generate random cuda matrix with "curand.h", and copy back to the host as a normal matrix
* cm represents "cuda matrix", m represents "matrix"
*/
/*
 * Generate an N*N matrix of uniform random floats directly on the device
 * (buffer cm) with cuRAND, and copy the result back to the host buffer m.
 * cm: device pointer ("cuda matrix"), m: host pointer ("matrix").
 */
void generator(float *cm, float *m)
{
    curandGenerator_t rng;

    curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(rng, 1234ULL);  /* fixed seed: reproducible */
    curandGenerateUniform(rng, cm, N * N);
    curandDestroyGenerator(rng);

    /* keep a host-side copy so check() can validate the transpose later */
    cudaMemcpy(m, cm, N * N * sizeof(float), cudaMemcpyDeviceToHost);
}

/*
* check if the result is correct
* dst represents the tranposed, src represents the previous
*/
/*
 * Host-side correctness check: returns true iff dst is exactly the
 * transpose of src (dst[j][i] == src[i][j] for all i, j).
 * Bails out on the first mismatch.
 */
bool check(float *dst, float *src)
{
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            bool match = (dst[j * N + i] == src[i * N + j]);
            if (!match)
                return false;
        }
    }

    return true;
}

/*
 * Driver: allocate host/device buffers, generate a random N x N source
 * matrix, time the (elided) transpose kernel with CUDA events, verify the
 * result on the host and report the elapsed time.
 */
int main()
{

    float *cdst, *csrc;   /* device (cuda) buffers */
    float *dst, *src;     /* host buffers */

    cudaMalloc(&cdst, N * N * sizeof(float));
    cudaMalloc(&csrc, N * N * sizeof(float));
    dst = (float *)malloc(N * N * sizeof(float));
    src = (float *)malloc(N * N * sizeof(float));

    generator(csrc, src);

    /* initialize variables like dimBlock and dimGrid here */

    /* time record: CUDA events measure device-side elapsed time */
    cudaEvent_t start, stop;
    float elapsedTime = 0.0f;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    /* execute the matrix_trans() kernel here */

    /* end time record */
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    /* NOTE: cudaEventElapsedTime reports MILLISECONDS, not seconds */
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    /* blocking copy; also synchronizes with the preceding kernel */
    cudaMemcpy(dst, cdst, N * N * sizeof(float), cudaMemcpyDeviceToHost);

    if (check(dst, src))
        printf("correct answer!\n");
    else
        printf("wrong answer!\n");
    /* fixed: the event timer yields ms (the old message claimed seconds) */
    printf("the execution takes %f ms.\n", elapsedTime);

    free(dst);
    free(src);
    cudaFree(cdst);
    cudaFree(csrc);

    return 0;
}
2.主体转置函数

2.1 naive方法

  • 矩阵共1024×1024个元素,于是我们将一个Grid分配为1024个Block,每个Block负责一行1024个元素,每个Block中1024个Thread,每个Thread负责一个元素

  • /*
    * transpose matrix src, and store the result in matrix dst
    * dst represents the tranposed, src represents the previous
    * naive implementation: one block takes one row, one threads takes one elements
    */
    /*
     * Naive transpose kernel: dst = src^T, one thread per element.
     * Expected launch layout: dimBlock(N, 1), dimGrid(1, N).
     */
    __global__ void matrix_trans_1(float *dst, float *src)
    {
        int col = blockDim.x * blockIdx.x + threadIdx.x;
        int row = blockDim.y * blockIdx.y + threadIdx.y;

        /* read src row-contiguously (threadIdx.x varies fastest);
           the write to dst is strided by N */
        dst[col * N + row] = src[row * N + col];
    }
    
    int main()
    {
        /* one 1024-thread block per matrix row */
        dim3 dimBlock(N, 1);
        dim3 dimGrid(1, N);

        /*...*/

        /* fixed: the kernel is named matrix_trans_1, not naive_trans_1 */
        matrix_trans_1<<<dimGrid, dimBlock>>>(cdst, csrc);

        /*...*/
    }

2.2 优化step1

  • 考虑shared memory的性质。我们不妨考虑将矩阵切为一个个32×32的单位块进行移动,提高访问的聚合性,从而利用shared memory带来的局部性。

    参考了https://blog.csdn.net/bruce_0712/article/details/65018223中有关内容

    /*
    * transpose matrix src, and store the result in matrix dst
    * dst represents the tranposed, src represents the previous
    * optimized step1: a unit is a 32 * 32 matrix, move unit by unit
    */
    /*
     * Step-1 attempt: stage each element of a 32x32 tile in shared memory
     * before storing it transposed.
     * Expected launch layout: dimBlock(TILE, TILE), dimGrid(N/TILE, N/TILE).
     * NOTE(review): every thread writes and then reads back the SAME shared
     * slot, so the staging (and the barrier) does not reroute any data, and
     * the global store dst[col * N + row] stays strided — consistent with
     * the report's observation that this version is slower than the naive one.
     */
    __global__ void matrix_trans_2(float *dst, float *src)
    {
        int col = blockDim.x * blockIdx.x + threadIdx.x;
        int row = blockDim.y * blockIdx.y + threadIdx.y;

        __shared__ float tile[TILE * TILE];
        int slot = threadIdx.x * TILE + threadIdx.y;

        tile[slot] = src[row * N + col];

        __syncthreads();

        dst[col * N + row] = tile[slot];
    }
    
    int main()
    {
        /* one 32x32 thread block per 32x32 tile */
        dim3 dimBlock(TILE, TILE);
        dim3 dimGrid(N / TILE, N / TILE);

        /*...*/

        /* fixed: the kernel is named matrix_trans_2, not naive_trans_2 */
        matrix_trans_2<<<dimGrid, dimBlock>>>(cdst, csrc);

        /*...*/
    }
  • 然而进行测试后,发现matrix_trans_2()效率还不如matrix_trans_1(),猜测原因是我们对32×32的unit的聚合性和局部性的利用并不好。于是我们将在下一步优化中调整分块,以求得更好的聚合性和局部性

  • naive版本这么快也是有原因的:每个warp是连续访问的,程序有很好的聚合性

2.3 优化step2

  • 仍保持32×32的分块,每个block管理一个分块,但改进线程分配
  • 每个block中有8×32个thread,每个thread每次读4×1的4个元素到tile中,然后直接转置后放入dst矩阵中,以实现更好的凝聚访问
/*
* transpose matrix src, and store the result in matrix dst
* dst represents the tranposed, src represents the previous
* optimized step2: a unit is a 32 * 32 matrix, move by 4 * 1 elements
*/
/*
 * Step-2 tiled transpose: each block handles one 32x32 tile; each thread
 * moves a 4x1 column strip (4 consecutive rows of one column).
 * Expected launch layout: dimBlock(TILE, TILE/4), dimGrid(N/TILE, N/TILE).
 * Load phase stores the tile already transposed in shared memory
 * (tile[column][row]); store phase then writes rows of dst contiguously,
 * so both global read and write are coalesced along threadIdx.x.
 */
__global__ void matrix_trans_3(float *dst, float *src)
{
    /* NOTE(review): no +1 padding; with a 32-float row stride the column
       writes tile[tile_j][tile_i] below hit the same shared-memory bank
       across the warp — tile[TILE][TILE + 1] would likely avoid this. */
    __shared__ float tile[TILE][TILE];

    int i = blockIdx.x * TILE + threadIdx.x;      /* global column of src */
    int j = blockIdx.y * TILE + threadIdx.y * 4;  /* first of 4 global rows */
    int ind = j * N + i;                          /* linear index into src */
    int tile_i = threadIdx.y * 4;                 /* row within the tile */
    int tile_j = threadIdx.x;                     /* column within the tile */

    /* load 4 rows of this column, transposed into shared memory
       (note: this loop clobbers `i` as its counter — safe only because
       `i` is recomputed before its next use below) */
    for (i = 0; i < 4; i++){
        tile[tile_j][tile_i] = src[ind];
        tile_i++;
        ind += N;
    }

    /* tile is read below by threads other than the ones that wrote it */
    __syncthreads();

    /* recompute indices for the mirrored tile position in dst */
    i = blockIdx.y * TILE + threadIdx.x;          /* global column of dst */
    j = blockIdx.x * TILE + threadIdx.y * 4;      /* first of 4 global rows */
    ind = j * N + i;
    tile_i = threadIdx.x;
    tile_j = threadIdx.y * 4;

    /* write 4 rows of dst; consecutive threadIdx.x hit consecutive
       addresses, so each store is coalesced */
    for (int i = 0; i < 4; i++){
        dst[ind] = tile[tile_j][tile_i];
        tile_j++;
        ind += N;
    }
}

int main()
{
    /* 32x8 threads per block: each thread moves a 4x1 strip of a 32x32 tile */
    dim3 dimBlock(TILE, TILE / 4);
    dim3 dimGrid(N / TILE, N / TILE);

    /*...*/

    /* fixed: the kernel is named matrix_trans_3, not naive_trans_3 */
    matrix_trans_3<<<dimGrid, dimBlock>>>(cdst, csrc);

    /*...*/
}

3执行及性能

  • 三种实现分别对应提交的matrix_trans_1.cu, matrix_trans_2.cu, matrix_trans_3.cu文件

  • 运行情况如图

    注意到naive版本性能就很不错,step1优化后性能反而下降,step2优化后性能最佳,性能约是naive版本的3倍

    分析原因是

    step1时,我们对32×32的unit的聚合性和局部性的利用并不好。于是我们在下一步优化中调整了分块,以求得更好的聚合性和局部性

    step2时达到了比较好的聚合性和局部性

4特别注释

代码位置为

[email protected]:/home/manycore/users/Feng.Haoran 中的matrix_trans_1, 2, 3, hw1_1, 2, 3是编译完成的可执行文件

(1是naive实现,2是step1优化后,3是step2优化后)

版权声明:本文为ace313原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://blog.csdn.net/ace313/article/details/79923464