100x100 짜리 매트릭스 두 개를 곱하는 연산을, OpenCL (GPU) 을 이용하여 병렬처리 해보자.


아래는 main.c 라는 코드이다.


이것은 호스트 머신(CPU) 에서 돌아갈 것이다.


#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <math.h>
#include <sys/time.h>
#include <unistd.h>

//macro to print error
#define CHECK_ERROR(err)\
        if(err!=CL_SUCCESS){\
                printf("[%s:%d] OpenCL error %d/\n", __FILE__, __LINE__, err);\
        exit(EXIT_FAILURE);\
        }

double get_time() {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return (double)tv.tv_sec + (double)1e-6 * tv.tv_usec;
}

//function to read a source code for devices(GPU, FPGS, .. etc)
char *get_source_code(const char *file_name, size_t *len) {
	char *source_code;
	size_t length;
	FILE *file = fopen(file_name, "r");
	if (file == NULL) {
		printf("[%s:%d] OpenCL error %d/\n", __FILE__, __LINE__, file_name);
		exit(EXIT_FAILURE);
	}

	fseek(file, 0, SEEK_END);
	length = (size_t) ftell(file);
	rewind(file);

	source_code = (char *) malloc(length + 1);
	fread(source_code, length, 1, file);
	source_code[length] = '\0';

	fclose(file);
	*len = length;
	return source_code;
}

//function to matrix multiply
//pointers are the space.
//integers are row and col of A, B.
void mat_mul_opencl(float *A, float *B, float *C, int ROW_A, int COL_A,
		int COL_B) {
	cl_platform_id platform; //for a platform
	cl_device_id device; //for a device
	cl_context context; //for a context
	cl_command_queue queue; //for a queue
	cl_program program; //for a program to run on devices
	char *kernel_source; //for a kernel source
	size_t kernel_source_size;
	cl_kernel kernel; //for a kernel
	cl_int err; //for an error for all

	err = clGetPlatformIDs(1, &platform, NULL); //get a platform and save into &platform
	CHECK_ERROR(err);

	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); //get a device id with platform and save into &device
	CHECK_ERROR(err);

	context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); //make a context and save into context
	CHECK_ERROR(err);

	queue = clCreateCommandQueue(context, device, 0, &err); //make a queue and save into queue.
	CHECK_ERROR(err);

	//get a source code named 'kernel.cl' and save its length into &kernel_source_size.
	kernel_source = get_source_code("kernel.cl", &kernel_source_size);
	//make a program with kernel_source and kernel_source_size.
	//this program runs on devices.
	program = clCreateProgramWithSource(context, 1,
			(const char**) &kernel_source, &kernel_source_size, &err);
	CHECK_ERROR(err);

	//start building the program with the source code on the device.
	err = clBuildProgram(program, 1, &device, "", NULL, NULL);
	//if build fails
	if (err == CL_BUILD_PROGRAM_FAILURE) {
		size_t log_size;
		char *log;

		//get log size from build info log first.
		err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0,
		NULL, &log_size);
		CHECK_ERROR(err);

		log = (char*) malloc(log_size + 1);
		//get log from build info log.
		err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
				log_size, log, NULL);
		CHECK_ERROR(err);

		log[log_size] = '\0';
		//show the log
		printf("comfiler error : \n%s\n", log);
		free(log);
		exit(0);
	}
	CHECK_ERROR(err);

	kernel = clCreateKernel(program, "mat_mul", &err); //make a kernel with program named mat_mul.
	CHECK_ERROR(err);

	//memory objects for buffer A,B, and C.
	//size is ROW_A * COL_A for bufA, COL_A * COL_B for bufB, and ROW_A * COL_B for bufC
	cl_mem bufA, bufB, bufC;
	bufA = clCreateBuffer(context, CL_MEM_READ_ONLY,
			sizeof(int) * ROW_A * COL_A, NULL, &err);
	CHECK_ERROR(err);
	bufB = clCreateBuffer(context, CL_MEM_READ_ONLY,
			sizeof(int) * COL_A * COL_B, NULL, &err);
	CHECK_ERROR(err);
	bufC = clCreateBuffer(context, CL_MEM_READ_WRITE,
			sizeof(int) * ROW_A * COL_B, NULL, &err);
	CHECK_ERROR(err);

	//make time to elapse
	double start_time = get_time();

	//write these buffers into the queue which you already made before.
	//the A's data (from 0 to int*rowA*colA) is enqueued.
	//this means, the device's global memory will get the A's data so that the device (kernal func) can use the A's data.
	err = clEnqueueWriteBuffer(queue, bufA, CL_FALSE, 0,
			sizeof(int) * ROW_A * COL_A, A, 0, NULL, NULL);
	CHECK_ERROR(err);
	//the same situation.
	err = clEnqueueWriteBuffer(queue, bufB, CL_FALSE, 0,
			sizeof(int) * COL_A * COL_B, B, 0, NULL, NULL);
	CHECK_ERROR(err);

	//this is how the kernel function can use buffer's data from the host.
	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufA);
	CHECK_ERROR(err);
	err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufB);
	CHECK_ERROR(err);
	err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufC);
	CHECK_ERROR(err);
	err = clSetKernelArg(kernel, 3, sizeof(cl_int), &ROW_A);
	CHECK_ERROR(err);
	err = clSetKernelArg(kernel, 4, sizeof(cl_int), &COL_A);
	CHECK_ERROR(err);
	err = clSetKernelArg(kernel, 5, sizeof(cl_int), &COL_B);
	CHECK_ERROR(err);

	size_t global_size[2] = { COL_B, ROW_A };
	size_t local_size[2] = { 16, 16 };
	//ceiling
	global_size[0] = (global_size[0] + local_size[0] - 1) / local_size[0]
			* local_size[0];
	global_size[1] = (global_size[1] + local_size[1] - 1) / local_size[1]
			* local_size[1];

	//run kernal function named "mat_mul".
	//the result from "mat_mul" is saved into bufC because that is what "mat_mul" does.
	err = clEnqueueNDRangeKernel(queue, kernel, 2, //two dimension
			NULL, global_size, //global size
			local_size, //local size
			0, NULL, NULL); //default is non blocking.
	CHECK_ERROR(err);

	//read buffer
	err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0,
			sizeof(int) * ROW_A * COL_B, C, 0, NULL, NULL);
	CHECK_ERROR(err);

	double end_time = get_time();
	printf("Elapsed time (excl. initialization) : %f sec\n",
			end_time - start_time);

	clReleaseMemObject(bufA);
	clReleaseMemObject(bufB);
	clReleaseMemObject(bufC);
	clReleaseKernel(kernel);
	clReleaseProgram(program);
	clReleaseCommandQueue(queue);
	clReleaseContext(context);

	printf("finish!\n");
}

void verify(float *A, float *B, float *C, int ROW_A, int COL_A, int COL_B) {
	int i, j, k;
	float sum;

	for (i = 0; i < ROW_A; i++) {
		for (j = 0; j < COL_B; j++) {
			sum = 0.0f;
			for (k = 0; k < COL_A; k++) {
				sum += A[i * COL_A + k] * B[k * COL_B + j];
			}
			if (fabsf(C[i * COL_B + j] - sum) > 0.1) {
				printf("Verification failed! C[%d][%d]: %f vs. %f\n", i, j,
						C[i * COL_B + j], sum);
				return;
			}
		}
	}
	printf("Verification success!\n");
}

static int ROW_A = 100;
static int COL_A = 100;
static int COL_B = 100;

int main(int argc, char *argv[]) {
	float *A = (float*) malloc(sizeof(float) * ROW_A * COL_A);
	float *B = (float*) malloc(sizeof(float) * COL_A * COL_B);
	float *C = (float*) malloc(sizeof(float) * ROW_A * COL_B);
	int i, j;

	for (i = 0; i < ROW_A; i++) {
		for (j = 0; j < COL_A; j++) {
			A[i * COL_A + j] = (float) (rand() % 1000) / 100.0f;
		}
	}
	for (i = 0; i < COL_A; i++) {
		for (j = 0; j < COL_B; j++) {
			B[i * COL_B + j] = (float) (rand() % 1000) / 100.0f;
		}
	}

	printf("Matrix Multiplication\n");
	printf("C[%d X %d] = A[%d X %d] X B[%d X %d]\n", ROW_A, COL_B, ROW_A, COL_A,
			COL_A, COL_B);

	double start_time = get_time();
	mat_mul_opencl(A, B, C, ROW_A, COL_A, COL_B);
	double end_time = get_time();
	printf("Elapsed time (incl. initialization): %f sec\n",
			end_time - start_time);

	verify(A, B, C, ROW_A, COL_A, COL_B);

	free(A);
	free(B);
	free(C);
	return 0;
}




아래는 커널(GPU, FPGA ..etc) 에서 돌아갈 커널 함수이다.


__kernel void mat_mul(__global float *A,__global float *B,__global float *C, int ROW_A, int COL_A, int COL_B){
        int i = get_global_id(1);
        int j = get_global_id(0);
        int k;
        float sum = 0.0f;

        if(i<ROW_A && j <COL_B){
                for(k=0; k<COL_A; k++){
                        sum+=A[i * COL_A + k] * B[k * COL_B + j];
                }
                C[i * COL_B + j] = sum;
        }
}




아래 명령어로 컴파일하며 main 이라는 이름의 바이너리 파일을 얻는다.


gcc -o main main.c -lOpenCL 


얻은 바이너리 파일을 실행시킨다.


결과가 아래와 같이 나온다.


Matrix Multiplication

C[100 X 100] = A[100 X 100] X B[100 X 100]

Elapsed time (excl. initialization) : 0.013704 sec

finish!

Elapsed time (incl. initialization): 1.077035 sec

Verification success! 





+ Recent posts