OpenMP Examples

1Example 1: Arrays¶

OpenMP C program: for.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
#include <stdio.h>
#include <omp.h>
/*
 * Demonstrates `omp parallel for` on a small array: each loop iteration
 * reports which thread ran it and adds 10 * (that thread's id) to the
 * element it owns. The array is printed sequentially afterwards.
 */
int main() {
  int a[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
  const int len = sizeof(a)/sizeof(int);

  // Ask the OpenMP runtime for 4 threads in subsequent parallel regions.
  omp_set_num_threads(4);

  #pragma omp parallel for
  for (int idx = 0; idx < len; ++idx) {
    const int tid = omp_get_thread_num();
    printf("thread %d, i = %2d\n", tid, idx);
    a[idx] += 10 * tid;
  }

  // Back on a single thread: dump the updated array on one line.
  int pos = 0;
  while (pos < len) {
    printf("%02d ", a[pos]);
    ++pos;
  }
  printf("\n");
  return 0;
}

Output:

$ gcc -fopenmp -o for for.c
$ ./for
thread 0, i =  0
thread 1, i =  3
thread 2, i =  6
thread 3, i =  8
thread 0, i =  1
thread 1, i =  4
thread 2, i =  7
thread 3, i =  9
thread 0, i =  2
thread 1, i =  5
00 01 02 13 14 15 26 27 38 39

2Example 2: Computing $\pi$ ¶

🎥 Lecture Video

We can compute $\pi$ with numerical integration^[1]:

\pi = \int_0^1 \frac{4.0}{1+x^2} dx

We can use a Riemann sum to approximate the integral as a sum of $N$ rectangles:

\pi \approx \sum_{i=0}^{N-1} F(x_i) \Delta x,

where the $i$-th rectangle has width $\Delta x = \frac{1}{N}$ and height $F(x_i) = \frac{4}{1+x_i^2}$, with $x_i = (i + 0.5) \Delta x$ the midpoint of interval $i$.

Figure 1:Slides version of the code below, with some additional diagrams.

Sequential Pi

Parallelize 1

Parallelize 2

Parallelize 3: Scale Up

#include <stdio.h>

/*
 * Sequential approximation of pi: a midpoint Riemann sum of 4/(1+x^2)
 * over [0, 1] using num_steps rectangles of width `step`.
 * Prints the result with 12 digits after the decimal point.
 */
int main(void) {
  const long num_steps = 10;
  // Width of each rectangle; the interval [0, 1] is split into num_steps pieces.
  double step = 1.0/((double) num_steps);
  double sum = 0.0;
  for (int i = 0; i < num_steps; i++) {
    // x is the midpoint of the i-th sub-interval.
    double x = (i + 0.5) * step;
    sum += 4.0 * step/(1.0 + x*x);
  }
  printf("pi = %6.12f\n", sum);

  return 0;
}

Output:

pi = 3.142425985001

Resembles $\pi$ , but not very accurate. Let’s increase num_steps and parallelize.

#include <stdio.h>
#include <omp.h>

/*
 * First parallelization attempt: the sequential midpoint Riemann sum of
 * 4/(1+x^2) over [0, 1], with a pragma placed on the loop. Every iteration
 * accumulates into the single variable `sum`.
 */
int main(void) {
  const long num_steps = 10;
  // Width of each rectangle; the interval [0, 1] is split into num_steps pieces.
  double step = 1.0/((double) num_steps);
  double sum = 0.0;
  // NOTE(review): this directive is spelled without `omp`. OpenMP directives
  // begin with `#pragma omp`, so a compiler presumably treats this line as an
  // unknown pragma and runs the loop on one thread. Spelling it
  // `#pragma omp parallel for` would instead have every thread update the
  // shared `sum` without synchronization. Confirm which form is intended.
  #pragma parallel for
  for (int i = 0; i < num_steps; i++) {
    // x is the midpoint of the i-th sub-interval.
    double x = (i + 0.5) * step;
    sum += 4.0 * step/(1.0 + x*x);
  }
  printf("pi = %6.12f\n", sum);

  return 0;
}

Problem: Each thread needs access to the shared variable sum. Code runs sequentially!

Compute a sum array, chunked into components of the Riemann’s Sum. Then, add up the elements of the sum array.

#include <stdio.h>
#include <omp.h>

/*
 * Approximates pi with a 10-step midpoint Riemann sum of 4/(1+x^2) on [0, 1],
 * dividing the steps among threads. Each thread accumulates into its own
 * slot of a per-thread array; the slots are added together after the
 * parallel region and the total is printed.
 */
int main(void) {
  const int NUM_THREADS = 4;
  const long num_steps = 10;
  const double step = 1.0/((double) num_steps);

  omp_set_num_threads(NUM_THREADS);

  // One accumulator per thread id, cleared before any thread touches it.
  double partial[NUM_THREADS];
  int slot = 0;
  while (slot < NUM_THREADS) {
    partial[slot] = 0;
    ++slot;
  }

  #pragma omp parallel
  {
    const int me = omp_get_thread_num();
    // Thread `me` handles steps me, me + NUM_THREADS, me + 2*NUM_THREADS, ...
    int k = me;
    while (k < num_steps) {
      const double mid = (k + 0.5) * step;
      partial[me] += 4.0 * step/(1.0 + mid*mid);
      printf("i = %3d, tid = %3d\n", k, me);
      k += NUM_THREADS;
    }
  }

  // Combine the per-thread partial sums in thread-id order.
  double pi = 0;
  for (int t = 0; t != NUM_THREADS; ++t) {
    pi = pi + partial[t];
  }
  printf("pi = %6.12f\n", pi);

  return 0;
}

Output:

$ ./pi
i =  1,  id =  1
i =  0,  id =  0
i =  2,  id =  2
i =  3,  id =  3
i =  5,  id =  1
i =  4,  id =  0
i =  6,  id =  2
i =  7,  id =  3
i =  9,  id =  1
i =  8,  id =  0
pi = 3.142425985001

Scale up: num_steps = 10⁶

#include <stdio.h>
#include <omp.h>

/*
 * Scaled-up version of the per-thread-array pi computation: a midpoint
 * Riemann sum of 4/(1+x^2) on [0, 1] with one million steps. Each thread
 * accumulates into its own array slot; the slots are summed after the
 * parallel region and the total is printed.
 */
int main(void) {
  const int NUM_THREADS = 4;
  const long num_steps = 1000000;
  const double step = 1.0/((double) num_steps);

  omp_set_num_threads(NUM_THREADS);

  // One accumulator per thread id, cleared before any thread touches it.
  double partial[NUM_THREADS];
  int slot = 0;
  while (slot < NUM_THREADS) {
    partial[slot] = 0;
    ++slot;
  }

  #pragma omp parallel
  {
    const int me = omp_get_thread_num();
    // Thread `me` handles steps me, me + NUM_THREADS, me + 2*NUM_THREADS, ...
    // (No per-iteration printing here, unlike the 10-step version.)
    int k = me;
    while (k < num_steps) {
      const double mid = (k + 0.5) * step;
      partial[me] += 4.0 * step/(1.0 + mid*mid);
      k += NUM_THREADS;
    }
  }

  // Combine the per-thread partial sums in thread-id order.
  double pi = 0;
  for (int t = 0; t != NUM_THREADS; ++t) {
    pi = pi + partial[t];
  }
  printf("pi = %6.12f\n", pi);

  return 0;
}

Output:

$ ./pi
pi = 3.141592653590

Verify how many digits are correct!

This example is adapted from the OpenMP “Hands On Tutorial” from SC08.^[2] View the original slides PDF.

3Example 3: More on the `for` Directive¶

This section expands on the OpenMP `for` directive described in an earlier section. Consider the program below, which uses a loop to assign the elements of a giant heap-allocated array:

/* Number of array elements: 2^27. */
#define LENGTH (1 << 27)
/*
 * Sequential baseline: allocates LENGTH chars on the heap and assigns every
 * element in a single loop. The assigned value is elided (`...`) in this
 * snippet.
 */
int main(void) {
  char *arr = malloc(sizeof(char) * LENGTH);
  for (int i = 0; i < LENGTH; i++) {
      arr[i] = ...;
  }
}

Toggle between the cards below to compare different parallelizations of this program. Assume that OMP_NUM_THREADS on this machine is 12.

Code 1

Code 2

Code 3

Code 4

Code 5

/* Number of array elements: 2^27. */
#define LENGTH (1 << 27)
/*
 * Code 1: the entire loop sits inside a bare `omp parallel` region with no
 * work-sharing directive, so every thread in the team runs all LENGTH
 * iterations and each element is assigned once per thread.
 * The assigned value is elided (`...`) in this snippet.
 */
int main(void) {
  char *arr = malloc(sizeof(char) * LENGTH);
  #pragma omp parallel
  {
    // No `omp for` here: the loop below is executed in full by each thread.
    for(int i = 0; i < LENGTH; i++) {
      arr[i] = ...;
    }
  }
}

/* Number of array elements: 2^27. */
#define LENGTH (1 << 27)
/*
 * Code 2: splits the loop by hand inside an `omp parallel` region. Each
 * thread starts at its own thread id and advances by the team size, so the
 * threads write interleaved indices.
 */
int main(void) {
  char *arr = malloc(sizeof(char) * LENGTH);
  #pragma omp parallel
  {
    int tid = omp_get_thread_num();
    int num_threads = omp_get_num_threads();
    // Thread tid handles indices tid, tid + num_threads, tid + 2*num_threads, ...
    for(int i = tid; i < LENGTH; i+= num_threads) {
      // NOTE(review): `j` is not declared in this snippet; presumably it
      // stands in for the elided value written as `...` in the baseline — confirm.
      arr[i] = j;
    }
  }
}

/* Number of array elements: 2^27. */
#define LENGTH (1 << 27)
/*
 * Code 3: splits the loop by hand inside an `omp parallel` region. Each
 * thread computes the bounds of one contiguous range of indices from its
 * thread id and the team size, then assigns only the elements in that range.
 */
int main(void) {
  char *arr = malloc(sizeof(char) * LENGTH);
  #pragma omp parallel
  {
    int tid = omp_get_thread_num();
    int num_threads = omp_get_num_threads();
    // Widen before multiplying: in plain int arithmetic (tid+1) * LENGTH
    // reaches 2^31 once the team has 16 or more threads (16 * 2^27), which
    // overflows int. The quotients are at most LENGTH, so they fit in int.
    int thread_start = (int) ((long long) tid * LENGTH / num_threads);
    int thread_end = (int) ((long long) (tid + 1) * LENGTH / num_threads);
    for(int i = thread_start; i < thread_end; i++) {
      // NOTE(review): `j` is not declared in this snippet; presumably it
      // stands in for the elided value written as `...` in the baseline — confirm.
      arr[i] = j;
    }
  }
}

/* Number of array elements: 2^27. */
#define LENGTH (1 << 27)
/*
 * Code 4: places an `omp for` work-sharing directive on the loop inside the
 * `omp parallel` region, so the OpenMP runtime divides the loop iterations
 * among the team's threads instead of the program computing bounds by hand.
 */
int main(void) {
  char *arr = malloc(sizeof(char) * LENGTH);
  #pragma omp parallel
  {
    #pragma omp for
    for(int i = 0; i < LENGTH; i++) {
      // NOTE(review): `j` is not declared in this snippet; presumably it
      // stands in for the elided value written as `...` in the baseline — confirm.
      arr[i] = j;
    }
  }
}

Explanation 1

Explanation 2

Explanation 3

Explanation 4

Explanation 5

Duplicates work. The for-loop is repeated 12 times, so each array element is assigned 12 times.

Footnotes¶

Review calculus (e.g., wikipedia), or trust me. Briefly: a unit circle satisfies the Cartesian coordinate equation $x^2 + y^2 = 1$ . The arc length of the top-half of the circle is $\pi = \int_{-1}^1 \frac{dx}{\sqrt{1 - x^2}}$ . Use calculus rules to get $\pi = \int_0^1 \frac{4}{1+x^2} dx$ .
↩
Tim Mattson and Larry Meadows, SC08 OpenMP “Hands On Tutorial.” 2008. Access on OpenMP website
↩

1Example 1: Arrays¶

2Example 2: Computing $\pi$ ¶

3Example 3: More on the for Directive¶

2Example 2: Computing $\pi$ ¶

3Example 3: More on the `for` Directive¶