I am trying to apply Gaussian Filter on image using CUDA.
(我正在尝试使用CUDA在图像上应用高斯滤波器。)
int main() {
double GKernel[5][5];
FilterCreation(GKernel);
double * kernel = new double[25];
int i,j,k = 0;
for( int i = 0; i < 5; i++){
for( int j = 0; j < 5; j++){
kernel[k++] = GKernel[i][j];
}
}
double * deviceKernel;
cudaMalloc((void **)&deviceKernel, 25 * sizeof(double));
cudaMemcpy(deviceKernel, kernel, 25 * sizeof(double), cudaMemcpyHostToDevice);
Info dat = readBMP("konik.bmp");
unsigned char * devPtr;
unsigned char * devPtrFilter;
size_t pitch;
unsigned char * test= new unsigned char [dat.size - dat.offset ];
cudaMalloc (( void **)& devPtr , dat.size * sizeof ( unsigned char ));
cudaMalloc (( void **)& devPtrFilter , dat.size * sizeof ( unsigned char ));
cudaMemcpy ( devPtr , dat.data , sizeof ( unsigned char ) * dat.size , cudaMemcpyHostToDevice );
greyScale<<<dat.height,dat.width>>>(devPtr,dat.height,dat.width);
CreateGaussFilter<<<dat.height,dat.width>>>(devPtr,devPtrFilter,5,5,deviceKernel,dat.height,dat.width);
cudaMemcpy ( test, devPtrFilter , sizeof ( unsigned char ) * dat.size ,cudaMemcpyDeviceToHost );
cudaDeviceSynchronize ();
ofstream fout;
fout.open("output.bmp", ios::binary | ios::out);
fout.write( reinterpret_cast<char *>(dat.info), dat.offset);
fout.write( reinterpret_cast<char *>(test), dat.size - dat.offset );
fout.close();
return 0;
}
First i call greyScale kernel that works as it should.
(首先,我称其为greyScale内核,它可以正常工作。)
__global__ void greyScale( unsigned char * src , int rows, int cols){
int i = blockDim.x * blockIdx.x * threadIdx.x;
int j = blockDim.y * blockIdx.y * threadIdx.y;
if( i >= rows || j >= cols ) {
return;
}
unsigned char r = src[3 * (i * cols + j)];
unsigned char g = src[3 * (i * cols + j) + 1];
unsigned char b = src[3 * (i * cols + j) + 2];
unsigned char linearIntensity = (unsigned char)(0.2126f * r + 0.7512f * g + 0);
src[3 * (i * cols + j)] = linearIntensity;
src[3 * (i * cols + j) + 1] = linearIntensity;
src[3 * (i * cols + j) + 2] = linearIntensity;
}
But what bothers me is my CreateGaussFilter kernel function:
(但是令我困扰的是我的CreateGaussFilter内核函数:)
__global__ void CreateGaussFilter(unsigned char * src, unsigned char * dst, int kernalHeight, int kernalWidth, double *kernalArray, int rows, int cols){
int verticleImageBound=(kernalHeight-1)/2;
int horizontalImageBound=(kernalWidth-1)/2;
int row = ( blockDim.x * blockIdx.x * threadIdx.x ) + verticleImageBound;
int col = ( blockDim.y * blockIdx.y * threadIdx.y ) + horizontalImageBound;
if ( row >= rows - verticleImageBound || col >= cols - horizontalImageBound ) return;
float value=0;
for(int kRow=0;kRow<kernalHeight;kRow++){
for(int kCol=0;kCol<kernalWidth;kCol++){
float pixel = src[ 3 * ((kRow+row-verticleImageBound ) * cols + (kCol+col-horizontalImageBound)) ] * kernalArray[kRow + kCol * kernalWidth];
value+=pixel;
}
}
printf("value = %i
",value);
dst[3 * ( row * cols + col )] = round(value);
dst[3 * ( row * cols + col ) + 1] = round(value);
dst[3 * ( row * cols + col ) + 2] = round(value);
}
it actually outputs black image ( this works when im running it on CPU without CUDA ), after debugging with printf
it seems like the code wont execute after for loop, im not sure why, the boundaries works in non CUDA version and block size are same for greyScale kernel that works.
(它实际上输出黑色图像(当我在没有CUDA的CPU上运行它时可以正常工作),使用printf
调试后,似乎代码不会在for循环后执行,我不确定为什么,边界在非CUDA版本中有效并且块大小相同适用于有效的greyScale内核。)
(为什么在for循环之后不执行代码?)
Is error occuring somewhere and shutting down the thread?(是否在某处发生错误并关闭了线程?)
Im quite confused.(我很困惑。)
Code for loading images:
(加载图像的代码:)
struct Info{
int width;
int height;
int offset;
unsigned char * info;
unsigned char * data;
int size;
};
Info readBMP(char* filename)
{
int i;
std::ifstream is(filename, std::ifstream::binary);
is.seekg(0, is.end);
i = is.tellg();
is.seekg(0);
unsigned char *info = new unsigned char[i];
is.read((char *)info,i);
int width = *(int*)&info[18];
int height = *(int*)&info[22];
int offset = *(int*)&info[10];
unsigned char a[offset];
unsigned char *b = new unsigned char[i - offset];
std::copy(info,
info + offset,
a);
std::copy(info + offset,
info + i,
b + 0);
Info dat;
dat.width = width;
dat.height = height;
dat.offset = offset;
dat.size = i;
dat.info = new unsigned char[offset - 1];
dat.data = new unsigned char[i - offset + 1];
for( int j = 0; j < offset ; j++ ){
dat.info[j] = a[j];
}
for( int j = 0; j < i - offset; j++ ){
dat.data[j] = b[j];
}
return dat;
}
Thanks for help!
(感谢帮助!)
ask by Darlyn translate from so