_OPTIMIZING IN A PARALLEL ENVIRONMENT_
by Barr E. Bauer

[LISTINÇ ONE]

      program test                                                      1
*                                                                       2
* purpose is to test SGI parallelization scheme for loop selection,     3
* numerically-intensive calculations, and total reduction. See text     4
* for details.                                                          5
*                                                                       6
      parameter (MAXFIRST=250, MAXSECOND=250, MAXTHIRD=10)              7
      real*8 a(MAXTHIRD,MAXSECOND,MAXFIRST)                             8
      real*8 b(MAXTHIRD,MAXSECOND,MAXFIRST)                             9
      real*8 sub_total(MAXFIRST), partial_total(4)                      10
      real*8 d(MAXTHIRD), c, tmp    ! local variables                   11
      real*8 dist(MAXSECOND,MAXFIRST), grand_total                      12
      real*8 grand_total      ! test for proper operation               13
      logical parallel        ! selects 2-version loops                 14
      integer*4 iflag         ! used to show LASTLOCAL value            15
                                                                        16
      data parallel /.false./                                           17
      data sub_total, iflag /MAXFIRST*0.0, 0/                           18
*                                                                       19
* outer loop: contains both interior loops                              20
*                                                                       21
                                                                        22
* C$doacross local(k,j,i,tmp,d,c), share(a,b,sub_total,dist),           23
* C$&        lastlocal(iflag)                                           24
                                                                        25
      do i = 1, MAXFIRST                                                26
*                                                                       27
* first inner loop: fills arrays a and b                                28
*                                                                       29
                                                                        30
* C$doacross local(j,k,c), share(i,a,b)                                 31
                                                                        32
        do j = 1, MAXSECOND                                             33
          do k = 1, MAXTHIRD                                            34
            a(k,j,i) = dsqrt(dfloat(i*j*k))                             35
            c = 1.0 - a(k,j,i)                                          36
            if (c .le. 0.0 .and. i .lt. j*k) then                       37
              c = -c                                                    38
            else                                                        39
              c = c**2                                                  40
            endif                                                       41
            b(k,j,i) = 32*(dcos(c)**5)*dsin(c)-                         42
     1                 32*(dcos(c)**3)*dsin(c)+                         43
     2                  6*dcos(c)*dsin(c)                               44
          enddo                                                         45
        enddo                                                           46
*                                                                       47
* seond inner loop: determines distance and starts summation            48
*                                                                       49
                                                                        50
* c$doacross local(j,k,d,tmp), share(i,a,b,dist,sub_total),             51
* c$&        lastlocal(iflag)                                           52
                                                                        53Š        do j=1, MAXSECOND                                               54
          tmp = 0.0                                                     55
          do k = 1, MAXTHIRD                                            56
            d(k) = a(k,j,i) - b(k,j,i)                                  57
          enddo                                                         58
          do k = 1, MAXTHIRD                                            59
            tmp = tmp + d(k)**2                                         60
          enddo                                                         61
          dist(j,i) = dsqrt(tmp)                                        62
          if (dist(j,i) .le. 0.1) iflag = iflag + 1                     63
          sub_total(j) = sub_total(j) + dist(j,i)                       64
        enddo                                                           65
      enddo                                                             66
*                                                                       67
* the next section is an example of sum reduction optimized to the      68
* parallel environment and the use of a more efficient 2 loop summation 69
*                                                                       70
* if -mp option is active, parallel is set to .true. which then         71
* selects the parallel version                                          72
*                                                                       73
                                                                        74
C$    parallel = .true.                                                 75
      grand_total = 0.0                                                 76
      if (parallel) then                     ! parallel version         77
C$      num_threads = mp_numthreads()                                   78
        ichunk = (MAXFIRST + (num_threads - 1))/num_threads             79
                                                                        80
C$doacross local(k,j),                                                  81
C$&        share(num_threads,partial_total,sub_total,ichunk)            82
                                                                        83
        do k = 1, num_threads ! this loop is parallelized               84
          partial_total(k) = 0.0                                        85
          do j = k*ichunk - ichunk + 1, min(k*ichunk,MAXFIRST)          86
            partial_total(k) = partial_total(k) + sub_total(j)          87
          enddo                                                         88
        enddo                                                           89
        do j = 1, num_threads   ! smaller loop handled as scalar        90
          grand_total = grand_total + partial_total(j)                  91
        enddo                                                           92
      else                                   ! the scalar version       93
        do j = 1, MAXFIRST                                              94
          grand_total = grand_total + sub_total(j)                      95
        enddo                                                           96
      endif                                                             97
                                                                        98
      if (parallel) then                                                99
C$      write (*,10) grand_total, num_threads                           100
C$      write (*,20) iflag                                              101
      else                                                              102
        write (*,30) grand_total                                        103
        write (*,40) iflag                                              104
      endif                                                             105
      stop                                                              106
C$10  format(1x,'grand total = ',g10.3,'threads = ',i4)                 107
C$20  format(1x,'parallel iflag = ',i10)                                108Š30    format(1x,'grand total = ',g10.3)                                 109
40    format(1x,'scalar iflag = ',i10)                                  110
      end                                                               111



[LISTINÇ TWO]


(source code)

      subroutine example(a, b, c, n)
      integer*4 n
      real*4 a(n), b(n), c(n)

      (additional code)

c$doacross local(i, x)
      do i=1, n
        x = a(n) * b(n)
        c(n) = x**2
      enddo

      (additional code)

      return
      end

(the loop is transformed to)

      subroutine _example_1(
     1  _local_start,             ! index starting valueŠ     2  _local_ntrip,             ! number of loop executions
     3  _incr,                    ! index increment
     4  _my_threadno)             ! unique process ID number

      integer*4 _local_start, _local_ntrip, _incr, _my_threadno

      integer*4  i                ! declared local
      real*4     x                ! declared local

      integer*4  _tmp             ! created local

      i = _local_start
      do _tmp = 1, _local_ntrip
        x = a(i) * b(i)
        c(i) = x**2
        i = i + _incr
      enddo
      return
      end


Examplĺ 1ş A typical DĎ loop

 
do i = 1, n
   a(i) = x * b(i)
enddo


Examplĺ 2ş Á DĎ loođ iî whicč thĺ arraů variablĺ referenceó á Ť
valuĺ thaô ió noô currenô witč ôhĺ index
 
do i = 2, n
   arr(i) = b(i) - arr(i-1)
enddo

Examplĺ 3ş Aî examplĺ oć loaä imbalance
 
      do i = 1, n
          do j = 1, i
               a(j, i) = a(j, i) * xmult
          enddo
      enddo


Examplĺ 4ş Loaä balancing

      num_threads = mp_numthreads()
c$doacross local(i, j, k)
      do k = 1, num_threads
          do i = k, n, num_threads
               do j = 1, i
                    a(j, i) = a(j, i) * xmult
               enddo
          enddo
      enddo
  