I am trying to apply Gaussian Filter on image using CUDA.
int main() {
double GKernel[5][5];
double * kernel = new double[25];
int i,j,k = 0;
for( int i = 0; i < 5; i++){
for( int j = 0; j < 5; j++){
kernel[k++] = GKernel[i][j];
double * deviceKernel;
cudaMalloc((void **)&deviceKernel, 25 * sizeof(double));
cudaMemcpy(deviceKernel, kernel, 25 * sizeof(double), cudaMemcpyHostToDevice);
Info dat = readBMP("konik.bmp");
unsigned char * devPtr;
unsigned char * devPtrFilter;
size_t pitch;
unsigned char * test= new unsigned char [dat.size - dat.offset ];
cudaMalloc (( void **)& devPtr , dat.size * sizeof ( unsigned char ));
cudaMalloc (( void **)& devPtrFilter , dat.size * sizeof ( unsigned char ));
cudaMemcpy ( devPtr , dat.data , sizeof ( unsigned char ) * dat.size , cudaMemcpyHostToDevice );
cudaMemcpy ( test, devPtrFilter , sizeof ( unsigned char ) * dat.size ,cudaMemcpyDeviceToHost );
cudaDeviceSynchronize ();
ofstream fout;
fout.open("output.bmp", ios::binary | ios::out);
fout.write( reinterpret_cast<char *>(dat.info), dat.offset);
fout.write( reinterpret_cast<char *>(test), dat.size - dat.offset );
return 0;
First i call greyScale kernel that works as it should.
__global__ void greyScale( unsigned char * src , int rows, int cols){
int i = blockDim.x * blockIdx.x * threadIdx.x;
int j = blockDim.y * blockIdx.y * threadIdx.y;
if( i >= rows || j >= cols ) {
unsigned char r = src[3 * (i * cols + j)];
unsigned char g = src[3 * (i * cols + j) + 1];
unsigned char b = src[3 * (i * cols + j) + 2];
unsigned char linearIntensity = (unsigned char)(0.2126f * r + 0.7512f * g + 0);
src[3 * (i * cols + j)] = linearIntensity;
src[3 * (i * cols + j) + 1] = linearIntensity;
src[3 * (i * cols + j) + 2] = linearIntensity;
But what bothers me is my CreateGaussFilter kernel function:
__global__ void CreateGaussFilter(unsigned char * src, unsigned char * dst, int kernalHeight, int kernalWidth, double *kernalArray, int rows, int cols){
int verticleImageBound=(kernalHeight-1)/2;
int horizontalImageBound=(kernalWidth-1)/2;
int row = ( blockDim.x * blockIdx.x * threadIdx.x ) + verticleImageBound;
int col = ( blockDim.y * blockIdx.y * threadIdx.y ) + horizontalImageBound;
if ( row >= rows - verticleImageBound || col >= cols - horizontalImageBound ) return;
float value=0;
for(int kRow=0;kRow<kernalHeight;kRow++){
for(int kCol=0;kCol<kernalWidth;kCol++){
float pixel = src[ 3 * ((kRow+row-verticleImageBound ) * cols + (kCol+col-horizontalImageBound)) ] * kernalArray[kRow + kCol * kernalWidth];
printf("value = %i
dst[3 * ( row * cols + col )] = round(value);
dst[3 * ( row * cols + col ) + 1] = round(value);
dst[3 * ( row * cols + col ) + 2] = round(value);
it actually outputs black image ( this works when im running it on CPU without CUDA ), after debugging with printf
it seems like the code wont execute after for loop, im not sure why, the boundaries works in non CUDA version and block size are same for greyScale kernel that works.
Is error occuring somewhere and shutting down the thread?(是否在某处发生错误并关闭了线程?)
Im quite confused.(我很困惑。)
Code for loading images:
struct Info{
int width;
int height;
int offset;
unsigned char * info;
unsigned char * data;
int size;
Info readBMP(char* filename)
int i;
std::ifstream is(filename, std::ifstream::binary);
is.seekg(0, is.end);
i = is.tellg();
unsigned char *info = new unsigned char[i];
is.read((char *)info,i);
int width = *(int*)&info[18];
int height = *(int*)&info[22];
int offset = *(int*)&info[10];
unsigned char a[offset];
unsigned char *b = new unsigned char[i - offset];
info + offset,
std::copy(info + offset,
info + i,
b + 0);
Info dat;
dat.width = width;
dat.height = height;
dat.offset = offset;
dat.size = i;
dat.info = new unsigned char[offset - 1];
dat.data = new unsigned char[i - offset + 1];
for( int j = 0; j < offset ; j++ ){
dat.info[j] = a[j];
for( int j = 0; j < i - offset; j++ ){
dat.data[j] = b[j];
return dat;
Thanks for help!
ask by Darlyn translate from so