Welcome to ShenZhenJia Knowledge Sharing Community for programmer and developer-Open, Learning and Share
menu search
person
Welcome To Ask or Share your Answers For Others

Categories

It's a follow-up question to this one
Now I have the code:

#include <iostream>
#include <cmath>
#include <omp.h>


#define max(a, b) (a)>(b)?(a):(b)

const int m = 2001;
const int n = 2000;
const int p = 4;

double v[m + 2][m + 2];
double x[m + 2];
double y[m + 2];
double _new[m + 2][m + 2];
double maxdiffA[p + 1];
int icol, jrow;

int main() {
    omp_set_num_threads(p);

    double h = 1.0 / (n + 1);

    double start = omp_get_wtime();

    #pragma omp parallel for private(icol) shared(x, y, v, _new)
    for (icol = 0; icol <= n + 1; ++icol) {
        x[icol] = y[icol] = icol * h;

        _new[icol][0] = v[icol][0] = 6 - 2 * x[icol];

        _new[n + 1][icol] = v[n + 1][icol] = 4 - 2 * y[icol];

        _new[icol][n + 1] = v[icol][n + 1] = 3 - x[icol];

        _new[0][icol] = v[0][icol] = 6 - 3 * y[icol];
    }


    const double eps = 0.01;


    #pragma omp parallel private(icol, jrow) shared(_new, v, maxdiffA)
    {
        while (true) { //for [iters=1 to maxiters by 2]
            #pragma omp single
            for (int i = 0; i < p; i++) maxdiffA[i] = 0;
            #pragma omp for
            for (icol = 1; icol <= n; icol++)
                for (jrow = 1; jrow <= n; jrow++)
                    _new[icol][jrow] =
                            (v[icol - 1][jrow] + v[icol + 1][jrow] + v[icol][jrow - 1] + v[icol][jrow + 1]) / 4;
            #pragma omp for
            for (icol = 1; icol <= n; icol++)
                for (jrow = 1; jrow <= n; jrow++)
                    v[icol][jrow] = (_new[icol - 1][jrow] + _new[icol + 1][jrow] + _new[icol][jrow - 1] +
                                     _new[icol][jrow + 1]) / 4;

            #pragma omp for
            for (icol = 1; icol <= n; icol++)
                for (jrow = 1; jrow <= n; jrow++)
                    maxdiffA[omp_get_thread_num()] = max(maxdiffA[omp_get_thread_num()],
                                                         fabs(_new[icol][jrow] - v[icol][jrow]));

            #pragma omp barrier

            double maxdiff = 0.0;
            for (int k = 0; k < p; ++k) {
                maxdiff = max(maxdiff, maxdiffA[k]);
            }


            if (maxdiff < eps)
                break;
            #pragma omp barrier
            //#pragma omp single
            //std::cout << maxdiff << std::endl;
        }
    }
    double end = omp_get_wtime();
    printf("start = %.16lf
end = %.16lf
diff = %.16lf
", start, end, end - start);

    return 0;
}

But why it works 2-3 times slower (32sec vs 18sec) than serial analog:

#include <iostream>
#include <cmath>
#include <omp.h>

#define max(a,b) (a)>(b)?(a):(b)

const int m = 2001;
const int n = 2000;
double v[m + 2][m + 2];
double x[m + 2];
double y[m + 2];
double _new[m + 2][m + 2];

int main() {
    double h = 1.0 / (n + 1);

    double start = omp_get_wtime();

    for (int i = 0; i <= n + 1; ++i) {
        x[i] = y[i] = i * h;

        _new[i][0]=v[i][0] = 6 - 2 * x[i];

        _new[n + 1][i]=v[n + 1][i] = 4 - 2 * y[i];

        _new[i][n + 1]=v[i][n + 1] = 3 - x[i];

        _new[0][i]=v[0][i] = 6 - 3 * y[i];
    }

    const double eps=0.01;
    while(true){ //for [iters=1 to maxiters by 2]
        double maxdiff=0.0;
        for (int i=1;i<=n;i++)
            for (int j=1;j<=n;j++)
                _new[i][j]=(v[i-1][j]+v[i+1][j]+v[i][j-1]+v[i][j+1])/4;
        for (int i=1;i<=n;i++)
            for (int j=1;j<=n;j++)
                v[i][j]=(_new[i-1][j]+_new[i+1][j]+_new[i][j-1]+_new[i][j+1])/4;

        for (int i=1;i<=n;i++)
            for (int j=1;j<=n;j++)
                maxdiff=max(maxdiff, fabs(_new[i][j]-v[i][j]));

        if(maxdiff<eps) break;
        std::cout << maxdiff<<std::endl;
    }

    double end = omp_get_wtime();
    printf("start = %.16lf
end = %.16lf
diff = %.16lf
", start, end, end - start);

    return 0;
}

Also interesting that it works SAME TIME as version (I can post it here if you say so) which looks like so

while(true){ //106 iteratins here!!!
#pragma omp paralell for
for(...)
#pragma omp paralell for
for(...)
#pragma omp paralell for
for(...)
}

But I thought that what making omp code slow is spawning threads inside while loop 106 times... But no! Then probably threads simultaneously write to the same array cells.. But where does it happen? I don't see it could you show me please?
Maybe it's because too much barriers? But Lecturer told me to implement the code like so and "analyse it" Maybe the answer is "Jacobi algorithm isn't meant to run well in parallel"? Or it's just my lame coding?

See Question&Answers more detail:os

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
thumb_up_alt 0 like thumb_down_alt 0 dislike
196 views
Welcome To Ask or Share your Answers For Others

1 Answer

So the root of evel was

max(maxdiffA[w],fabs(_new[icol][jrow] - v[icol][jrow]))

because it's

#define max(a, b) (a)>(b)?(a):(b)

It's probably creating TOO much branching ('if's ) Without this thing parallel version works 8 times faster and loading CPU 68% instead of 99%.. The starange thing: same "max" doesn't affect serioal version


与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
thumb_up_alt 0 like thumb_down_alt 0 dislike
Welcome to ShenZhenJia Knowledge Sharing Community for programmer and developer-Open, Learning and Share
...