Implementation:

Implemented as fork & join. Thread is split in two at every invocation of recursive mergesort, until a depth log2(num of total threads) is reached. In OpenMP, parallel sections and nesting is used. In TBB, the parallel invoke is used.

Merge Function

void merge(int *arr, int * temp, int lo, int mid, int hi){

        int i = lo;

        int j = mid;

    for (int k = lo; k < hi; k++) {       

        if (i < mid && (j >= hi || arr[i] <= arr[j])) {

            temp[k] = arr[i++];           

        } else {

            temp[k] = arr[j++];           

        }

    }     

}

OpenMP implementation

void mpsort(int * arr,int * temp, int lo, int hi, int depth = 0){

        if(hi - lo < 2) return;

        int mid = (lo + hi) / 2;

        if(depth < maxDepth){                

                #pragma omp parallel sections

                {                                                                        

                        #pragma omp section

                        {

                                omp_set_nested(1);

                                omp_set_num_threads(2);        

                                mpsort(temp,arr, lo, mid, depth+1);

                        }

                        #pragma omp section

                        {

                                omp_set_nested(1);

                                omp_set_num_threads(2);        

                                mpsort(temp,arr, mid, hi, depth+1);

                        }

                }

        }

        else {

                mpsort(temp,arr, lo, mid, depth+1);

                mpsort(temp,arr, mid, hi, depth+1);

        }

        merge(arr,temp, lo,mid,hi);

}

TBB implementation

void tbbsort(int * arr,int * temp, int lo, int hi, int depth = 0){

        if(hi - lo < 2) return;

        int mid = (lo + hi) / 2;

        if(depth < maxDepth){

                tbb::parallel_invoke(

                        [=](){ tbbsort(temp,arr, lo, mid, depth+1); },

                        [=](){ tbbsort(temp,arr, mid, hi, depth+1); }

                );

        }

        else {

                tbbsort(temp,arr, lo, mid, depth+1);

                tbbsort(temp,arr, mid, hi, depth+1);

        }

        merge(arr,temp, lo,mid,hi);

}

Results:

Compiled using icpc -std=c++11

Array of 4-byte integers of size 1024000.

Average of 64 runs for number of cores = 1,2,4,8,16,32,64,128 and 256.

Time of single core ~ 0.4s