hpss

Since 2/12/13 01:55 pm

lens

Since 2/13/13 10:20 am

smoky

Since 2/13/13 08:05 am
OLCF User Assistance Center

Can't find the information you need below? Need advice from a real person? We're here to help.

OLCF support consultants are available to respond to your emails and phone calls from 9:00 a.m. to 5:00 p.m. EST, Monday through Friday, exclusive of holidays. Emails received outside of regular support hours will be addressed the next business day.

HMPP Game of Life

Bookmark and Share

 

Introduction

This sample shows the Game of Life CPU code converted to an HMPP accelerator directive version. Both codelet and region C and Fortran methods are presented. A second version is presented that fixes a performance issue incurred in the initial version. Please direct any questions or comments to help@nccs.gov

GOL-codelet.c

#include <stdio.h>
#include <stdlib.h>

#define SRAND_VALUE 1985

#pragma hmpp GOL codelet, target=CUDA, args[grid].io=inout, args[*].transfer=atcall, args[newGrid].io=none
void GOL(int dim, int grid[dim+2][dim+2], int newGrid[dim+2][dim+2], int maxIter)
{
    int i, j, iter;

    // Main game loop
    for (iter = 0; iter<maxIter; iter++) {

        // Left-Right columns
        for (i = 1; i<=dim; i++) {
            grid[i][0] = grid[i][dim]; //Copy first real column to right ghost column
            grid[i][dim+1] = grid[i][1]; //Copy last real column to left ghost column
        }

        // Top-Bottom rows
        for (j = 0; j<=dim+1; j++) {
            grid[0][j] = grid[dim][j]; //Copy first real row to bottom ghost row
            grid[dim+1][j] = grid[1][j]; //Copy last real row to top ghost row
        }

        int numNeighbors;
        // Now we loop over all cells and determine their fate
        for (i = 1; i<=dim; i++) {
            for (j = 1; j<=dim; j++) {
               // Get the number of neighbors for a given grid point
                numNeighbors = grid[i+1][j] + grid[i-1][j]      //upper lower
                             + grid[i][j+1] + grid[i][j-1]      //right left
                             + grid[i+1][j+1] + grid[i-1][j-1]  //diagonals
                             + grid[i-1][j+1] + grid[i+1][j-1];


                // Here we have explicitly all of the game rules
                if (grid[i][j] == 1 && numNeighbors < 2)
                    newGrid[i][j] = 0;
                else if (grid[i][j] == 1 && (numNeighbors == 2 || numNeighbors == 3))
                    newGrid[i][j] = 1;
                else if (grid[i][j] == 1 && numNeighbors > 3)
                    newGrid[i][j] = 0;
                else if (grid[i][j] == 0 && numNeighbors == 3)
                    newGrid[i][j] = 1;
                else
                    newGrid[i][j] = grid[i][j];
            }
        }

        // Can't switch pointers so we mannually have to copy array over
        for(i = 1; i<dim+2; i++) {
            for(j = 1; j<dim+2; j++) {
                grid[i][j] = newGrid[i][j];
            }
        }
    }

}

int main(int argc, char* argv[])
{
    int i,j,iter,numNeighbors;
    int dim = 1024;
    // Number of game iterations
    int maxIter = 1<<15;

    // Allocate square grid of (dim+2)^2 elements, 2 added for ghost cells
    int *grid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));

    // Allocate newGrid
    int *newGrid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));

    // Assign initial population randomly
    srand(SRAND_VALUE);
    for(i = 1; i<=dim; i++) {
        for(j = 1; j<=dim; j++) {
            grid[i*(dim+2)+j] = rand() % 2;
        }
    }

    // Call the GPU to execute codelet
    #pragma hmpp GOL callsite
    GOL(dim, grid, newGrid, maxIter);

    // Sum up alive cells and print results
    int total = 0;
    for (i = 1; i<=dim; i++) {
        for (j = 1; j<=dim; j++) {
            total += grid[i*(dim+2)+j];
        }
    }

    printf("Total Alive: %d\n", total);

    free(grid);
    free(newGrid);

    return 0;
}

Changes:

Lines 6-58

#pragma hmpp GOL codelet, target=CUDA, args[grid].io=inout, args[*].transfer=atcall, args[newGrid].io=none
void GOL(int dim, int grid[dim+2][dim+2], int newGrid[dim+2][dim+2], int maxIter)
{
    int i, j, iter;
 
    // Main game loop
    for (iter = 0; iter<maxIter; iter++) {
 
        // Left-Right columns
        for (i = 1; i<=dim; i++) {
            grid[i][0] = grid[i][dim]; //Copy first real column to right ghost column
            grid[i][dim+1] = grid[i][1]; //Copy last real column to left ghost column
        }
 
        // Top-Bottom rows
        for (j = 0; j<=dim+1; j++) {
            grid[0][j] = grid[dim][j]; //Copy first real row to bottom ghost row
            grid[dim+1][j] = grid[1][j]; //Copy last real row to top ghost row
        }
 
        int numNeighbors;
        // Now we loop over all cells and determine their fate
        for (i = 1; i<=dim; i++) {
            for (j = 1; j<=dim; j++) {
               // Get the number of neighbors for a given grid point
                numNeighbors = grid[i+1][j] + grid[i-1][j]      //upper lower
                             + grid[i][j+1] + grid[i][j-1]      //right left
                             + grid[i+1][j+1] + grid[i-1][j-1]  //diagonals
                             + grid[i-1][j+1] + grid[i+1][j-1];
 
                // Here we have explicitly all of the game rules
                if (grid[i][j] == 1 && numNeighbors < 2)
                    newGrid[i][j] = 0;
                else if (grid[i][j] == 1 && (numNeighbors == 2 || numNeighbors == 3))
                    newGrid[i][j] = 1;
                else if (grid[i][j] == 1 && numNeighbors > 3)
                    newGrid[i][j] = 0;
                else if (grid[i][j] == 0 && numNeighbors == 3)
                    newGrid[i][j] = 1;
                else
                    newGrid[i][j] = grid[i][j];
            }
        }
 
        // Can't switch pointers so we mannually have to copy array over
        for(i = 1; i<dim+2; i++) {
            for(j = 1; j<dim+2; j++) {
                grid[i][j] = newGrid[i][j];
            }
        }
    }
 
}

To use the codelet method all of the game logic was moved into its own function. With the directive option args[grid].io=inout we specify that grid is to be copied onto the GPU at the start of the codelet and copied back to the host at the end. newGrid is specified to have no transfer between the host and device. The main game loop is included inside of the codelet even though it is neccesarily serial to ensure that the game grids are not coppied to and from the host after each game iteration. We will look at a more explicit way to manage data transfer below. grid and newGrid are passed into our codelet with their full dimension stated, this is neccesary for HMPP to correctly copy the data over.

// Get the number of neighbors for a given grid point
numNeighbors = grid[i+1][j] + grid[i-1][j]          //upper lower
                 + grid[i][j+1] + grid[i][j-1]      //right left
                 + grid[i+1][j+1] + grid[i-1][j-1]  //diagonals
                 + grid[i-1][j+1] + grid[i+1][j-1];

In the CUDA programming model all functions must be effectively inlined. Here we manually inline our function to calculate the number of alive neighbors a cell has.

// Can't switch pointers so we mannually have to copy array over
for(i = 1; i<dim+2; i++) {
    for(j = 1; j<dim+2; j++) {
        grid[i][j] = newGrid[i][j];
    }
}

This modification is very important, device pointers are not accessible inside of the acc region. If you leave the pointer swap in place from the cpu code the compiler will not complain, the code will run, and you will get incorrect results. This is not a limitation of GPU programming as you will see in the CUDA GOL example but of the HMPP directive framework. The easiest way to fix this is to replace the pointer swap with a copy operation as we have done. A possible workaround for this operation is investigated below.

// Allocate square grid of (dim+2)^2 elements, 2 added for ghost cell
int *grid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));
 
// Allocate newGrid
int *newGrid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));
grid[i*(dim+2)+j] = rand() % 2;
total += grid[i*(dim+2)+j];

HMPP seems to currently be limited to copying single dimensional arrays from host the device. We are able to maintain our multidimensional array indexing in the codelet while on the host we must allocate and address linear arrays.


GOL-region.c

#include <stdio.h>
#include <stdlib.h>

#define SRAND_VALUE 1985
#define idx(i,j) ((i)*(dim+2)+(j))

int main(int argc, char* argv[])
{
    int i,j,numNeighbors;
    int dim = 1024;
    // Number of game iterations
    int maxIter = 1<<15;

    // Allocate square grid of (dim+2)^2 elements, 2 added for ghost cells
    int *grid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));

    // Allocate newGrid
    int *newGrid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));

    // Assign initial population randomly
    srand(SRAND_VALUE);
    for(i = 1; i<=dim; i++) {
        for(j = 1; j<=dim; j++) {
            grid[i*(dim+2)+j] = rand() % 2;
        }
    }

#pragma hmpp GOL region, target=CUDA, args[*].transfer=atcall, args[grid].io=inout &
#pragma hmpp & args[newGrid].io=none
{
    int i,j,iter;
    // Main game loop
    for (iter = 0; iter<maxIter; iter++) {

        #pragma hmppcg gridify
        // Left-Right columns
        for (i = 1; i<=dim; i++) {
            grid[idx(i,0)] = grid[idx(i,dim)]; //Copy first real column to right ghost column
            grid[idx(i,dim+1)] = grid[idx(i,1)]; //Copy last real column to left ghost column
        }

        #pragma hmppcg gridify
        // Top-Bottom rows
        for (j = 0; j<=dim+1; j++) {
            grid[idx(0,j)] = grid[idx(dim,j)]; //Copy first real row to bottom ghost row
            grid[idx(dim+1,j)] = grid[idx(1,j)]; //Copy last real row to top ghost row
        }

        int numNeighbors;
        // Now we loop over all cells and determine their fate
        #pragma hmppcg gridify (i,j)
        for (i = 1; i<=dim; i++) {
            for (j = 1; j<=dim; j++) {
               // Get the number of neighbors for a given grid point
                numNeighbors = grid[idx(i+1,j)] + grid[idx(i-1,j)]      //upper lower
                             + grid[idx(i,j+1)] + grid[idx(i,j-1)]      //right left
                             + grid[idx(i+1,j+1)] + grid[idx(i-1,j-1)]  //diagonals
                             + grid[idx(i-1,j+1)] + grid[idx(i+1,j-1)];

                // Here we have explicitly all of the game rules
                if (grid[idx(i,j)] == 1 && numNeighbors < 2)
                    newGrid[idx(i,j)] = 0;
                else if (grid[idx(i,j)] == 1 && (numNeighbors == 2 || numNeighbors == 3))
                    newGrid[idx(i,j)] = 1;
                else if (grid[idx(i,j)] == 1 && numNeighbors > 3)
                    newGrid[idx(i,j)] = 0;
                else if (grid[idx(i,j)] == 0 && numNeighbors == 3)
                    newGrid[idx(i,j)] = 1;
                else
                    newGrid[idx(i,j)] = grid[idx(i,j)];
            }
        }

        #pragma hmppcg gridify
        // Can't switch pointers so we mannually have to copy array over
        for(i = 1; i<dim+2; i++) {
            for(j = 1; j<dim+2; j++) {
                grid[idx(i,j)] = newGrid[idx(i,j)];
            }
        }
    }

}
    // Sum up alive cells and print results
    int total = 0;
    for (i = 1; i<=dim; i++) {
        for (j = 1; j<=dim; j++) {
            total += grid[idx(i,j)];
        }
    }

    printf("Total Alive: %d\n", total);

    free(grid);
    free(newGrid);

    return 0;
}

Changes:

#define idx(i,j) ((i)*(dim+2)+(j))

In the codelet version we were able to stretch the C language a bit and use multidimensional arrays in our codelet and a single dimensional array on our host, this is not the case with the region method. A convenience macro is defined and used whenever grid or newGrid is accessed.

// Allocate square grid of (dim+2)^2 elements, 2 added for ghost cells
int *grid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));
 
// Allocate newGrid
int *newGrid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));

As in the codelet version HMPP does not currently handle multidimensional arrays properly and so we revert back to a single dimensional array.

#pragma hmpp GOL region, target=CUDA, args[*].transfer=atcall, args[grid].io=inout &
#pragma hmpp & args[newGrid.io]=none

We add the directive option args[grid].io=inout we specify that grid is to be copied onto the GPU at the start of the region and copied back to the host at the end. newGrid is specified to have no transfer between the host and device. The main game loop is included inside of the region even though it is neccesarily serial to ensure that the game grids are not coppied to and from the host after each game iteration. We will look at a more explicit way to manage data transfer below.

 int i,j,iter;

At the time of this writing HMPP does not correctly identify the induction variables used in hmpp region that are declared outside of the region. The solution is to simply define our induction variables inside of the region.

#pragma hmppcg gridify
// Left-Right columns
for (i = 1; i<=dim; i++) {
#pragma hmppcg gridify
// Top-Bottom rows
for (j = 0; j<=dim+1; j++) {
// Now we loop over all cells and determine their fate
#pragma hmppcg gridify (i,j)
for (i = 1; i<=dim; i++) {
    for (j = 1; j<=dim; j++) {
#pragma hmppcg gridify
// Can't switch pointers so we mannually have to copy array over
for(i = 1; i<dim+2; i++) {
    for(j = 1; j<dim+2; j++) {

The hmppcg directives are used to guide the compiler at the loop level. In this particular case the compiler did not want to parallelize the loops due to the outermost game loop being sequential. Adding hmppcg gridify tells the compiler the loop immediately following the directive should be executed in parallel on the GPU.

// Get the number of neighbors for a given grid point
numNeighbors = grid[idx(i+1,j)] + grid[idx(i-1,j)]          //upper lower
                 + grid[idx(i,j+1)] + grid[idx(i,j-1)]      //right left
                 + grid[idx(i+1,j+1)] + grid[idx(i-1,j-1)]  //diagonals
                 + grid[idx(i-1,j+1)] + grid[idx(i+1,j-1)];

In the CUDA programming model all functions must be effectively inlined. Here we manually inline our function to calculate the number of alive neighbors a cell has.

#pragma hmppcg gridify
// Can't switch pointers so we mannually have to copy array over
for(i = 1; i<dim+2; i++) {
    for(j = 1; j<dim+2; j++) {
        grid[idx(i,j)] = newGrid[idx(i,j)];
    }
}

This modification is very important, device pointers are not accessible inside of the acc region. If you leave the pointer swap in place from the cpu code the compiler will not complain, the code will run, and you will get incorrect results. This is not a limitation of GPU programming as you will see in the CUDA GOL example but of the HMPP directive framework. The easiest way to fix this is to replace the pointer swap with a copy operation as we have done. A possible workaround for this operation is investigated below.

Compiling:

We add the –io-report option to hmpp to see compile time host/device memory transfer information.

$ hmpp --io-report cc GOL-codelet.c -o gol.out

or

$ hmpp --io-report cc GOL-region.c -o gol.out

Output:
The output will be similar for both the codelet and region methods.

hmpp: [Info] IO report :
In GROUP 'gol'
 REGION 'gol' at GOL-region.c:29, function '__hmpp_region__gol'
    Parameter  'dim' has intent IN
    Parameter  'maxIter' has intent IN
    Parameter  'grid' has intent INOUT
    Parameter  'newGrid' has intent INOUT

Here we see the intent, the transfer direction with respect to the GPU, of each of our variables. We will see more explicit methods of dealing with data transfer below.

hmpp: [Info] Generated codelet filename is "gol_cuda.cu".
hmppcg: [Message DPL3000] GOLHMPP-TRY0-REGION.c:37: Loop 'i' was gridified (1D)
hmppcg: [Message DPL3000] GOLHMPP-TRY0-REGION.c:44: Loop 'j' was gridified (1D)
hmppcg: [Message DPL3001] GOLHMPP-TRY0-REGION.c:52: Loops 'j' and 'i' were gridified (2D)
hmppcg: [Message DPL3000] GOLHMPP-TRY0-REGION.c:77: Loop 'i' was gridified (1D)
hmppcg: [Warning DPL3002] GOLHMPP-TRY0-REGION.c:33: Loop not gridified: will be executed on CPU

All of the loops have been gridified indicating a kernel has been generated for the given loop that will be executed in parallel on the GPU. A warming message tells us that the main game loop has not been gridified and will be executed serially on the CPU, this is the expected and correct behavior.


GOL-codelet.f90

!$hmpp GOL codelet, target=CUDA, args[grid].io=inout, args[*].transfer=atcall, args[newGrid].io=none
subroutine GOL(dim, grid, newGrid, maxIter)
    implicit none
    integer, intent(in) :: dim, maxIter
    integer, intent(inout) :: grid(dim+2,dim+2), newGrid(dim+2,dim+2)
    integer :: i,j,iter,numNeighbors

    ! Main game loop
    do iter=1,maxIter
        ! Top-Bottom ghost rows
        do j=2,dim+1
            grid(1,j) = grid(dim+1,j) !Copy first game grid row to bottom ghost row
            grid(dim+2,j) = grid(2,j) !Copy first game grid row to top ghost row
        enddo

        ! Left-Right ghost columns
        do i=1,dim+2
            grid(i,1) = grid(i, dim+1) !Copy first game grid column to right ghost column
            grid(i,dim+2) = grid(i,2)  !Copy last game grid column to left ghost column
        enddo

        ! Now we loop over all cells and determine their fate
        do j=2,dim+1
            do i=2,dim+1
                ! Get the number of neighbors for a given grid point
                numNeighbors = grid(i,j+1) + grid(i,j-1)& !left & right
                             + grid(i+1,j) + grid(i-1,j)& !upper & lower
                             + grid(i+1,j+1) + grid(i-1,j-1)& !diagonals
                             + grid(i-1,j+1) + grid(i+1,j-1)


                ! Here we have explicitly all of the game rules
                if(grid(i,j) == 1 .AND. numNeighbors < 2) then
                    newGrid(i,j) = 0
                elseif(grid(i,j) == 1 .AND. (numNeighbors == 2 .OR. numNeighbors == 3)) then
                    newGrid(i,j) = 1
                elseif(grid(i,j) == 1 .AND. numNeighbors > 3) then
                    newGrid(i,j) = 0
                elseif(grid(i,j) == 0 .AND. numNeighbors == 3) then
                    newGrid(i,j) = 1
                else
                    newGrid(i,j) = grid(i,j)
                endif
            enddo
        enddo

        ! Can't switch pointers so we mannually have to copy array over
        do j=2,dim+1
            do i=2,dim+1
                grid(i,j) = newGrid(i,j)
            enddo
        enddo

    enddo!end main game loop  

end subroutine GOL

program main
    implicit none

    integer :: i,j,seed(8),numNeighbors,total
    real :: randm
    ! Linear game grid dimension
    integer :: dim = 1024
    ! Number of game iterations
    integer :: maxIter = 2**10

    ! Game grid pointers
    integer,dimension(:,:),allocatable :: grid, newGrid, tmpGrid

    ! Allocate square grid of (dim+2)^2 elements, 2 added for ghost cells
    allocate(grid(dim+2,dim+2))
    allocate(newGrid(dim+2,dim+2))

    ! Assign initial population randomly
    seed = (/1985, 2011, 2012, 500, 24, 15, 99, 8/)
    call random_seed(PUT=seed)
    do j=1,dim
        do i=1,dim
            call random_number(randm)
            grid(i,j) = nint(randm)
        enddo
    enddo

    !$hmpp GOL callsite
    call GOL(dim, grid, newGrid, maxIter)

    ! Sum up alive cells and print results
    total = 0
    do j=2,dim+1
        do i=2,dim+1
            total = total + grid(i,j)
        enddo
    enddo
    print *, "Total Alive", total

    ! Release memory
    deallocate(grid)
    deallocate(newGrid)

end program

Changes

Lines 1-55:

!$hmpp GOL codelet, target=CUDA, args[grid].io=inout, args[*].transfer=atcall, args[newGrid].io=none
subroutine GOL(dim, grid, newGrid, maxIter)
    implicit none
    integer, intent(in) :: dim, maxIter
    integer, intent(inout) :: grid(dim+2,dim+2), newGrid(dim+2,dim+2)
    integer :: i,j,iter,numNeighbors
 
    ! Main game loop
    do iter=1,maxITer
        ! Top-Bottom ghost rows
        do j=2,dim+1
            grid(1,j) = grid(dim+1,j) !Copy first game grid row to bottom ghost row
            grid(dim+2,j) = grid(2,j) !Copy first game grid row to top ghost row
        enddo
 
        ! Left-Right ghost columns
        do i=1,dim+2
            grid(i,1) = grid(i, dim+1) !Copy first game grid column to right ghost column
            grid(i,dim+2) = grid(i,2)  !Copy last game grid column to left ghost column
        enddo
 
        ! Now we loop over all cells and determine their fate
        do j=2,dim+1
            do i=2,dim+1
                ! Get the number of neighbors for a given grid point
                numNeighbors = grid(i,j+1) + grid(i,j-1)& !left & right
                             + grid(i+1,j) + grid(i-1,j)& !upper & lower
                             + grid(i+1,j+1) + grid(i-1,j-1)& !diagonals
                             + grid(i-1,j+1) + grid(i+1,j-1)
 
                ! Here we have explicitly all of the game rules
                if(grid(i,j) == 1 .AND. numNeighbors < 2) then
                    newGrid(i,j) = 0
                elseif(grid(i,j) == 1 .AND. (numNeighbors == 2 .OR. numNeighbors == 3)) then
                    newGrid(i,j) = 1
                elseif(grid(i,j) == 1 .AND. numNeighbors > 3) then
                    newGrid(i,j) = 0
                elseif(grid(i,j) == 0 .AND. numNeighbors == 3) then
                    newGrid(i,j) = 1
                else
                    newGrid(i,j) = grid(i,j)
                endif
            enddo
        enddo
 
        ! Can't switch pointers so we mannually have to copy array over
        do j=2,dim+1
            do i=2,dim+1
                grid(i,j) = newGrid(i,j)
            enddo
        enddo
 
    enddo!end main game loop  
 
end subroutine GOL

To use the codelet method all of the game logic was moved into its own subroutine. With the directive option args[grid].io=inout we specify that grid is to be copied onto the GPU at the start of the codelet and copied back to the host at the end. newGrid is specified to have no transfer between the host and device. The main game loop is included inside of the codelet even though it is neccesarily serial to ensure that the game grids are not copied to and from the host after each game iteration. We will look at a more explicit way to manage data transfer below.

! Get the number of neighbors for a given grid point
numNeighbors = grid(i,j+1) + grid(i,j-1)& !left & right
             + grid(i+1,j) + grid(i-1,j)& !upper & lower
             + grid(i+1,j+1) + grid(i-1,j-1)& !diagonals
             + grid(i-1,j+1) + grid(i+1,j-1)

In the CUDA programming model all functions must be effectively inlined. Here we manually inline our subroutine to calculate the number of alive neighbors a cell has.

! Can't switch pointers so we mannually have to copy array over
do j=2,dim+1
    do i=2,dim+1
        grid(i,j) = newGrid(i,j)
    enddo
enddo

This modification is very important, device pointers are not accessible inside of the acc region. If you leave the pointer swap in place from the cpu code the compiler will not complain, the code will run, and you will get incorrect results. This is not a limitation of GPU programming as you will see in the CUDA GOL example but of the HMPP directive framework. The easiest way to fix this is to replace the pointer swap with a copy operation as we have done. A possible workaround for this operation is investigated below.

GOL-region.f90

program main
    implicit none

    integer :: i,j,iter,seed(8),numNeighbors,total
    real :: randm
    ! Linear game grid dimension
    integer :: dim = 1024
    ! Number of game iterations
    integer :: maxIter = 2**10

    ! Game grid pointers
    integer,dimension(:,:),allocatable :: grid, newGrid, tmpGrid

    ! Allocate square grid of (dim+2)^2 elements, 2 added for ghost cells
    allocate(grid(dim+2,dim+2))
    allocate(newGrid(dim+2,dim+2))

    ! Assign initial population randomly
    seed = (/1985, 2011, 2012, 500, 24, 15, 99, 8/)
    call random_seed(PUT=seed)
    do j=1,dim
        do i=1,dim
            call random_number(randm)
            grid(i,j) = nint(randm)
        enddo
    enddo

    !$hmpp GOL region, target=CUDA, args[grid].io=inout, args[*].transfer=atcall & 
    !$hmpp & args[newGrid].io=none

    ! Main game loop
    do iter=1,maxITer
        ! Top-Bottom ghost rows
        do j=2,dim+1
            grid(1,j) = grid(dim+1,j) !Copy first game grid row to bottom ghost row
            grid(dim+2,j) = grid(2,j) !Copy first game grid row to top ghost row
        enddo

        ! Left-Right ghost columns
        do i=1,dim+2
            grid(i,1) = grid(i, dim+1) !Copy first game grid column to right ghost column
            grid(i,dim+2) = grid(i,2)  !Copy last game grid column to left ghost column
        enddo

        ! Now we loop over all cells and determine their fate
        !$hmppcg gridify(j, i)
        do j=2,dim+1
            do i=2,dim+1
                ! Get the number of neighbors for a given grid point
                numNeighbors = grid(i,j+1) + grid(i,j-1)&    !left & right
                             + grid(i+1,j) + grid(i-1,j)&    !upper & lower
                             + grid(i+1,j+1) + grid(i-1,j-1)& !diagonals
                             + grid(i-1,j+1) + grid(i+1,j-1)


                !Here we have explicitly all of the game rules
                if(grid(i,j) == 1 .AND. numNeighbors < 2) then
                    newGrid(i,j) = 0
                elseif(grid(i,j) == 1 .AND. (numNeighbors == 2 .OR. numNeighbors == 3)) then
                    newGrid(i,j) = 1
                elseif(grid(i,j) == 1 .AND. numNeighbors > 3) then
                    newGrid(i,j) = 0
                elseif(grid(i,j) == 0 .AND. numNeighbors == 3) then
                    newGrid(i,j) = 1
                else
                    newGrid(i,j) = grid(i,j)
                endif
            enddo
        enddo

        ! Can't switch pointers so we mannually have to copy array over
        do j=2,dim+1
            do i=2,dim+1
                grid(i,j) = newGrid(i,j)
            enddo
        enddo

    enddo!end main game loop  

    !$hmpp GOL endregion

    ! Sum up alive cells and print results
    total = 0
    do j=2,dim+1
        do i=2,dim+1
            total = total + grid(i,j)
        enddo
    enddo
    print *, "Total Alive", total

    ! Release memory
    deallocate(grid)
    deallocate(newGrid)

end program

Changes

! Now we loop over all cells and determine their fate
!$hmppcg gridify(j, i)
    do j=2,dim+1
        do i=2,dim+1

By default the hmpp compiler did not want to gridify the loop over each cell. We can attempt to force the compiler to do so adding the hmppcg directive with gridify(j, i) option which specifies we want to gridify with respect to the induction variables j and i.

! Get the number of neighbors for a given grid point
numNeighbors = grid(i,j+1) + grid(i,j-1)&    !left & right
             + grid(i+1,j) + grid(i-1,j)&    !upper & lower
             + grid(i+1,j+1) + grid(i-1,j-1)& !diagonals
             + grid(i-1,j+1) + grid(i+1,j-1)

In the CUDA programming model all functions must be effectively inlined. Here we manually inline our subroutine to calculate the number of alive neighbors a cell has.

! Can't switch pointers so we mannually have to copy array over
do j=2,dim+1
    do i=2,dim+1
        grid(i,j) = newGrid(i,j)
        enddo 
    enddo

This modification is very important, device pointers are not accessible inside of the acc region. If you leave the pointer swap in place from the cpu code the compiler will not complain, the code will run, and you will get incorrect results. This is not a limitation of GPU programming as you will see in the CUDA GOL example but of the HMPP directive framework. The easiest way to fix this is to replace the pointer swap with a copy operation as we have done. A possible workaround for this operation is investigated below.

Compiling

We add the –io-report option to hmpp to see compile time host/device memory transfer information.

$ hmpp --io-report ftn GOL-codelet.f90 -o gol.out

or

$ hmpp --io-report ftn GOL-region.f90 -o gol.out

Output:
The output will be similar for both the codelet and region methods. In the most recent release as of this writing you may get some warning messages from the compiler about unused variables when using the region method.

hmpp: [Info] IO report :
In GROUP 'gol'
 CODELET 'gol' at GOLHMPP-TRY0-CODELET.f90:1, function 'gol'
    Parameter 'dim' has intent IN
    Parameter 'grid' has intent INOUT
    Parameter 'newgrid' has intent INOUT
    Parameter 'maxiter' has intent IN

Here we see the intent, the transfer direction with respect to the GPU, of each of our variables. We will see more explicit methods of dealing with data transfer below.

hmpp: [Info] Generated codelet filename is "gol_cuda.cu".
hmppcg: [Message DPL3000] GOLHMPP-TRY0-CODELET.f90:11: Loop 'j' was gridified (1D)
hmppcg: [Message DPL3000] GOLHMPP-TRY0-CODELET.f90:17: Loop 'i' was gridified (1D)
hmppcg: [Message DPL3001] GOLHMPP-TRY0-CODELET.f90:23: Loops 'i' and 'j' were gridified (2D)
hmppcg: [Message DPL3001] GOLHMPP-TRY0-CODELET.f90:47: Loops 'i' and 'j' were gridified (2D)
hmppcg: [Warning DPL3002] GOLHMPP-TRY0-CODELET.f90:9: Loop not gridified: will be executed on CPU

All of the loops have been gridified indicating a kernel has been generated for the given loop that will be executed in parallel on the GPU. A warming message tells us that the main game loop has not been gridified and will be executed serially on the CPU, this is the expected and correct behavior.

GOL-codelet-Try2.c

In this version we will remove the memory copy kernel that we needed to use since HMPP accelerate does not currently suppoort device pointers. As an added bonus we will also take a look at more advanced memory management techniques. There are numerous ways this could be approached but the methods shown highlight many good to know HMPP features.

#include <stdio.h>
#include <stdlib.h>

#define SRAND_VALUE 1985

#pragma hmpp <GOL> group target=CUDA
#pragma hmpp <GOL> mapbyname, dim
#pragma hmpp <GOL> map, args[even::grid;odd::newGrid]
#pragma hmpp <GOL> map, args[even::newGrid;odd::grid]
#pragma hmpp <GOL> even codelet, args[dim,grid,newGrid].transfer=manual
#pragma hmpp <GOL> odd  codelet, args[dim,grid,newGrid].transfer=manual
void GOL(int dim, int grid[dim+2][dim+2], int newGrid[dim+2][dim+2])
{
    int i, j, numNeighbors;

    // Left-Right columns
    for (i = 1; i<=dim; i++) {
        grid[i][0] = grid[i][dim]; //Copy first real column to right ghost column
        grid[i][dim+1] = grid[i][1]; //Copy last real column to left ghost column
    }
    // Top-Bottom rows
    for (j = 0; j<=dim+1; j++) {
        grid[0][j] = grid[dim][j]; //Copy first real row to bottom ghost row
        grid[dim+1][j] = grid[1][j]; //Copy last real row to top ghost row
    }

    // Now we loop over all cells and determine their fate
    for (i = 1; i<=dim; i++) {
        for (j = 1; j<=dim; j++) {
           // Get the number of neighbors for a given grid point
            numNeighbors = grid[i+1][j] + grid[i-1][j]      //upper lower
                         + grid[i][j+1] + grid[i][j-1]      //right left
                         + grid[i+1][j+1] + grid[i-1][j-1]  //Diagonals
                         + grid[i-1][j+1] + grid[i+1][j-1];

            // Here we have explicitly all of the game rules
            if (grid[i][j] == 1 && numNeighbors < 2)
                newGrid[i][j] = 0;
            else if (grid[i][j] == 1 && (numNeighbors == 2 || numNeighbors == 3))
                newGrid[i][j] = 1;
            else if (grid[i][j] == 1 && numNeighbors > 3)
                newGrid[i][j] = 0;
            else if (grid[i][j] == 0 && numNeighbors == 3)
                newGrid[i][j] = 1;
            else
                newGrid[i][j] = grid[i][j];
        }
    }
}

int main(int argc, char* argv[])
{
    int i,j,iter;
    // Linear grid dimension
    int dim = 1024;
    // Number of game iterations
    int maxIter = 1<<10;

    #pragma hmpp <GOL> allocate    
    // Allocate square grid of (dim+2)^2 elements, 2 added for ghost cells
    int *grid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));

    // Allocate newGrid
    int *newGrid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));

    // Assign initial population randomly
    srand(SRAND_VALUE);
    for(i = 1; i<=dim; i++) {
        for(j = 1; j<=dim; j++) {
            grid[i*(dim+2)+j] = rand() % 2;
        }
    }

    // Copy memory tp GPU
    #pragma hmpp <GOL> advancedload, args[even::dim;even::grid;even::newGrid]

    // Main game loop
    for (iter = 0; iter<maxIter; iter++) {
        if(iter%2 == 0) {
            #pragma hmpp <GOL> even callsite
            GOL(dim, grid, newGrid);
        }
        else {
            #pragma hmpp <GOL> odd callsite
            GOL(dim, grid, newGrid);
         }

    }//iter loop

    // Read back final configuration to confirm it worked
    #pragma hmpp <GOL> delegatedstore, args[odd::newGrid]

    // Sum up alive cells and print results
    int total = 0;
    for (i = 1; i<=dim; i++) {
        for (j = 1; j<=dim; j++) {
            total += newGrid[i*(dim+2)+j];
        }
    }

    printf("Total Alive: %d\n", total);

    #pragma hmpp <GOL> release 
    free(grid);
    free(newGrid);

    return 0;
}

Changes

#pragma hmpp <GOL> group target=CUDA
#pragma hmpp <GOL> mapbyname, dim
#pragma hmpp <GOL> map, args[even::grid;odd::newGrid]
#pragma hmpp <GOL> map, args[even::newGrid;odd::grid]
#pragma hmpp <GOL> even codelet, args[dim,grid,newGrid].transfer=manual
#pragma hmpp <GOL> odd  codelet, args[dim,grid,newGrid].transfer=manual

A major change to our code has been to get rid of the newGrid to grid memory copy. Instead we use some of the built in HMPP directive functionality.

#pragma hmpp <GOL> group target=CUDA

Here we create what is known as a codelet group, name GOL. This allows us to keep track of similar kernels and set common attributes.

#pragma hmpp <GOL> mapbyname, dim

This line will map the variable dim across all kernels. This means once dim is allocated and copied to the device for one kernel it will be available for use by all kernels in the GOL group.

#pragma hmpp <GOL> map, args[even::grid;odd::newGrid]
#pragma hmpp <GOL> map, args[even::newGrid;odd::grid]

The map directive is used to map arguments between different kernels. This will effectively switch our pointers for us. The notation args[CodeletName1::Argument1,CodeletName2::Argument2] is used such that Argument1 in CodeletName1 will refer to the same device memory as Argument2 in CodeletName2. We see in our case we will soon be defining two kernels, one named even and one named odd.

#pragma hmpp <GOL> even codelet
#pragma hmpp <GOL> odd  codelet, args[newGrid].io=none

The GOL group will contain two codelets, even and odd. Our game will oscillate between calling these codelets in the main game loop. The only difference between these codelets is caused by the map directive used above which effectively switches the device pointers to grid and newGrid.

#pragma hmpp <GOL> allocate

The allocate directive will allocate space on the GPU for all memory needed by the named group. Any memory objects shared by more than one codelet through the map or mapbyname directives will only be allocated once.

// Copy memory tp GPU
#pragma hmpp <GOL> advancedload, args[even::dim;even::grid;even::newGrid]

Once we have allocated space on the GPU we can copy the results from the host to the GPU with the advancedload directive. Notice when referencing what arguments will be copied we specify one codelet names, either codelet name could be specified.

// Main game loop
for (iter = 0; iter<maxIter; iter++) {
    if(iter%2 == 0) {
        #pragma hmpp <GOL> even callsite
        GOL(dim, grid, newGrid);
    }
    else {
        #pragma hmpp <GOL> odd callsite
        GOL(dim, grid, newGrid);
    }
}//iter loop

The main game loop induction variable is used to switch between calling our even codelet and our odd codelet. Although it looks as though the same GOL function is being called in both cases because of the map HMPP directive defined with the codelet we know the grid and newGrid GPU pointers will be switched.

// Read back final configuration to confirm it worked
#pragma hmpp <GOL> delegatedstore, args[odd::newGrid]

After the kernel is finished we copy the grid from the device to the host.

#pragma hmpp <GOL> release

Finally since we allocated the device memory we must free it with the directive hmpp GOL release .

GOL-region-Try2.c

#include <stdio.h>
#include <stdlib.h>

#define SRAND_VALUE 1985
#define idx(i,j) ((i)*(dim+2)+(j))

int main(int argc, char* argv[])
{
    int i,j,iter;
    int dim = 1024;
    //Number of game iterations
    int maxIter = 1<<10;

    #pragma hmpp GOL allocate, args[grid,newGrid].size={(dim+2)*(dim+2)} 
    // Allocate square grid of (dim+2)^2 elements, 2 added for perioid boundary condition ghost cells
    int *grid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));

    // Allocate newGrid
    int *newGrid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));

    // Assign initial population randomly
    srand(SRAND_VALUE);
    for(i = 1; i<=dim; i++) {
        for(j = 1; j<=dim; j++) {
            grid[i*(dim+2)+j] = rand() % 2;
        }
    }

    // Copy memory tp GPU
    #pragma hmpp GOL advancedload, args[dim;grid;newGrid]

    // Main game loop
    for (iter = 0; iter<maxIter; iter++) {
        #pragma hmpp GOL region, target=CUDA, args[dim,grid,newGrid].transfer=manual, args[iter].transfer=atcall
        {
            int i,j;
            if(iter%2 == 0)
            {
                #pragma hmppcg gridify
                // Left-Right columns
                for (i = 1; i<=dim; i++) {
                    grid[idx(i,0)] = grid[idx(i,dim)]; //Copy first real column to right ghost column
                    grid[idx(i,dim+1)] = grid[idx(i,1)]; //Copy last real column to left ghost column
                }

                #pragma hmppcg gridify
                // Top-Bottom rows
                for (j = 0; j<=dim+1; j++) {
                    grid[idx(0,j)] = grid[idx(dim,j)]; //Copy first real row to bottom ghost row
                    grid[idx(dim+1,j)] = grid[idx(1,j)]; //Copy last real row to top ghost row
                }

                int numNeighbors;
                // Now we loop over all cells and determine their fate
                #pragma hmppcg gridify (i,j)
                for (i = 1; i<=dim; i++) {
                    for (j = 1; j<=dim; j++) {
                        // Get the number of neighbors for a given grid point
                        numNeighbors = grid[idx(i+1,j)] + grid[idx(i-1,j)]      //upper lower
                                     + grid[idx(i,j+1)] + grid[idx(i,j-1)]      //right left
                                     + grid[idx(i+1,j+1)] + grid[idx(i-1,j-1)]  //diagonals
                                     + grid[idx(i-1,j+1)] + grid[idx(i+1,j-1)];


                        // Here we have explicitly all of the game rules
                        if (grid[idx(i,j)] == 1 && numNeighbors < 2)
                            newGrid[idx(i,j)] = 0;
                        else if (grid[idx(i,j)] == 1 && (numNeighbors == 2 || numNeighbors == 3))
                            newGrid[idx(i,j)] = 1;
                        else if (grid[idx(i,j)] == 1 && numNeighbors > 3)
                            newGrid[idx(i,j)] = 0;
                        else if (grid[idx(i,j)] == 0 && numNeighbors == 3)
                            newGrid[idx(i,j)] = 1;
                        else
                            newGrid[idx(i,j)] = grid[idx(i,j)];
                    }
                }

            } else {
            #pragma hmppcg gridify
            // Left-Right columns
            for (i = 1; i<=dim; i++) {
                newGrid[idx(i,0)] = newGrid[idx(i,dim)]; //Copy first real column to right ghost column
                newGrid[idx(i,dim+1)] = newGrid[idx(i,1)]; //Copy last real column to left ghost column
            }

            #pragma hmppcg gridify
            // Top-Bottom rows
            for (j = 0; j<=dim+1; j++) {
                newGrid[idx(0,j)] = newGrid[idx(dim,j)]; //Copy first real row to bottom ghost row
                newGrid[idx(dim+1,j)] = newGrid[idx(1,j)]; //Copy last real row to top ghost row
            }

            int numNeighbors;
            // Now we loop over all cells and determine their fate
            #pragma hmppcg gridify (i,j)
            for (i = 1; i<=dim; i++) {
                for (j = 1; j<=dim; j++) {
                    // Get the number of neighbors for a given grid point
                    numNeighbors = newGrid[idx(i+1,j)] + newGrid[idx(i-1,j)]      //upper lower
                                 + newGrid[idx(i,j+1)] + newGrid[idx(i,j-1)]      //right left
                                 + newGrid[idx(i+1,j+1)] + newGrid[idx(i-1,j-1)]  //diagonals
                                 + newGrid[idx(i-1,j+1)] + newGrid[idx(i+1,j-1)];


                    // Here we have explicitly all of the game rules
                    if (newGrid[idx(i,j)] == 1 && numNeighbors < 2)
                        grid[idx(i,j)] = 0;
                    else if (newGrid[idx(i,j)] == 1 && (numNeighbors == 2 || numNeighbors == 3))
                        grid[idx(i,j)] = 1;
                    else if (newGrid[idx(i,j)] == 1 && numNeighbors > 3)
                        grid[idx(i,j)] = 0;
                    else if (newGrid[idx(i,j)] == 0 && numNeighbors == 3)
                        grid[idx(i,j)] = 1;
                    else
                        grid[idx(i,j)] = newGrid[idx(i,j)];
                }
            }

          }
        }
    }//iter loop

    // Read back final configuration to confirm it worked
    #pragma hmpp GOL delegatedstore, args[grid]

    // Sum up alive cells and print results
    int total = 0;
    for (i = 1; i<=dim; i++) {
        for (j = 1; j<=dim; j++) {
            total += grid[i*(dim+2)+j];
        }
    }

    printf("Total Alive: %d\n", total);

    #pragma hmpp GOL release 
    free(grid);
    free(newGrid);

    return 0;
}

Changes

#pragma hmpp GOL allocate, args[grid,newGrid].size={(dim+2)*(dim+2)}

The hmpp GOL allocate directive will allocate space on the GPU for the named region. The compiler requires us to specify the size of the passed in arrays at allocation time.

hmpp GOL advancedload, args[dim;iter;grid;newGrid]

hmpp GOL advancedload, args[dim;iter;grid;newGrid] will copy the named arguments of the region from the host to the GPU.

#pragma hmpp GOL region, target=CUDA, args[dim,grid,newGrid].transfer=manual, args[iter].transfer=atcall

In this version of the code we have our region inside of our main game loop but no longer have to worry about unnecessary memory copies.

           if(iter%2 == 0)
            {
                #pragma hmppcg gridify
                // Left-Right columns
                for (i = 1; i<=dim; i++) {
                    grid[idx(i,0)] = grid[idx(i,dim)]; //Copy first real column to right ghost column
                    grid[idx(i,dim+1)] = grid[idx(i,1)]; //Copy last real column to left ghost column
                }
 
                #pragma hmppcg gridify
                // Top-Bottom rows
                for (j = 0; j<=dim+1; j++) {
                    grid[idx(0,j)] = grid[idx(dim,j)]; //Copy first real row to bottom ghost row
                    grid[idx(dim+1,j)] = grid[idx(1,j)]; //Copy last real row to top ghost row
                }
 
                int numNeighbors;
                // Now we loop over all cells and determine their fate
                #pragma hmppcg gridify (i,j)
                for (i = 1; i<=dim; i++) {
                    for (j = 1; j<=dim; j++) {
                        // Get the number of neighbors for a given grid point
                        numNeighbors = grid[idx(i+1,j)] + grid[idx(i-1,j)]      //upper lower
                                     + grid[idx(i,j+1)] + grid[idx(i,j-1)]      //right left
                                     + grid[idx(i+1,j+1)] + grid[idx(i-1,j-1)]  //diagonals
                                     + grid[idx(i-1,j+1)] + grid[idx(i+1,j-1)];
 
                        // Here we have explicitly all of the game rules
                        if (grid[idx(i,j)] == 1 && numNeighbors < 2)
                            newGrid[idx(i,j)] = 0;
                        else if (grid[idx(i,j)] == 1 && (numNeighbors == 2 || numNeighbors == 3))
                            newGrid[idx(i,j)] = 1;
                        else if (grid[idx(i,j)] == 1 && numNeighbors > 3)
                            newGrid[idx(i,j)] = 0;
                        else if (grid[idx(i,j)] == 0 && numNeighbors == 3)
                            newGrid[idx(i,j)] = 1;
                        else
                            newGrid[idx(i,j)] = grid[idx(i,j)];
                    }
                }
 
            } else {
            #pragma hmppcg gridify
            // Left-Right columns
            for (i = 1; i<=dim; i++) {
                newGrid[idx(i,0)] = newGrid[idx(i,dim)]; //Copy first real column to right ghost column
                newGrid[idx(i,dim+1)] = newGrid[idx(i,1)]; //Copy last real column to left ghost column
            }
 
            #pragma hmppcg gridify
            // Top-Bottom rows
            for (j = 0; j<=dim+1; j++) {
                newGrid[idx(0,j)] = newGrid[idx(dim,j)]; //Copy first real row to bottom ghost row
                newGrid[idx(dim+1,j)] = newGrid[idx(1,j)]; //Copy last real row to top ghost row
            }
 
            int numNeighbors;
            // Now we loop over all cells and determine their fate
            #pragma hmppcg gridify (i,j)
            for (i = 1; i<=dim; i++) {
                for (j = 1; j<=dim; j++) {
                    // Get the number of neighbors for a given grid point
                    numNeighbors = newGrid[idx(i+1,j)] + newGrid[idx(i-1,j)]      //upper lower
                                 + newGrid[idx(i,j+1)] + newGrid[idx(i,j-1)]      //right left
                                 + newGrid[idx(i+1,j+1)] + newGrid[idx(i-1,j-1)]  //diagonals
                                 + newGrid[idx(i-1,j+1)] + newGrid[idx(i+1,j-1)];
 
                    // Here we have explicitly all of the game rules
                    if (newGrid[idx(i,j)] == 1 && numNeighbors < 2)
                        grid[idx(i,j)] = 0;
                    else if (newGrid[idx(i,j)] == 1 && (numNeighbors == 2 || numNeighbors == 3))
                        grid[idx(i,j)] = 1;
                    else if (newGrid[idx(i,j)] == 1 && numNeighbors > 3)
                        grid[idx(i,j)] = 0;
                    else if (newGrid[idx(i,j)] == 0 && numNeighbors == 3)
                        grid[idx(i,j)] = 1;
                    else
                        grid[idx(i,j)] = newGrid[idx(i,j)];
                }
            }
 
          }

A major change to our code has been to get rid of the grid/newGrid switching kernel. We have explicitly coded the two cases and use the modulus of the main iteration variable to switch between the two pieces of code. This increases the code complexity but does get around a costly memory transfer.

// Read back final configuration to confirm it worked
#pragma hmpp GOL delegatedstore, args[grid]

After the kernel is finished we copy the grid from the device to the host with hmpp GOL delegatedstore, args[grid]

#pragma hmpp GOL release

Finally since we allocated the device memory we must free it with the directive hmpp GOL release .

GOL-codelet-Try2.f90

!$hmpp <GOL> group, target=CUDA
!$hmpp <GOL> mapbyname, dim
!$hmpp <GOL> map, args[even::grid;odd::newGrid]
!$hmpp <GOL> map, args[even::newGrid;odd::grid]
!$hmpp <GOL> even codelet, args[dim,grid,newGrid].transfer=manual
!$hmpp <GOL> odd  codelet, args[dim,grid,newGrid].transfer=manual
subroutine GOL(dim, grid, newGrid)
    implicit none
    integer, intent(in) :: dim
    integer, intent(inout) :: grid(dim+2,dim+2), newGrid(dim+2,dim+2)
    integer :: i,j,numNeighbors

    ! Top-Bottom ghost rows
    do j=2,dim+1
        grid(1,j) = grid(dim+1,j) !Copy first game grid row to bottom ghost row 
        grid(dim+2,j) = grid(2,j) !Copy first game grid row to top ghost row
    enddo

    ! Left-Right ghost columns
    do i=1,dim+2
        grid(i,1) = grid(i, dim+1) !Copy first game grid column to right ghost column
        grid(i,dim+2) = grid(i,2)  !Copy last game grid column to left ghost column
    enddo


    ! Now we loop over all cells and determine their fate
    do j=2,dim+1
       do i=2,dim+1
           ! Get the number of neighbors for a given grid point
            numNeighbors = grid(i,j+1) + grid(i,j-1)&     !right & left
                         + grid(i+1,j) + grid(i-1,j)&     !upper & lower
                         + grid(i+1,j+1) + grid(i-1,j-1)& !diagonals
                         + grid(i-1,j+1) + grid(i+1,j-1)

           ! Here we have explicitly all of the game rules
           if(grid(i,j) == 1 .AND. numNeighbors < 2) then
                newGrid(i,j) = 0
            elseif(grid(i,j) == 1 .AND. (numNeighbors == 2 .OR. numNeighbors == 3)) then
                newGrid(i,j) = 1
            elseif(grid(i,j) == 1 .AND. numNeighbors > 3) then
                newGrid(i,j) = 0
            elseif(grid(i,j) == 0 .AND. numNeighbors == 3) then
                newGrid(i,j) = 1
            else
                newGrid(i,j) = grid(i,j)
            endif
        enddo
    enddo
end subroutine GOL

program main
    implicit none

    integer :: i,j,seed(8),iter,total
    real :: randm
    ! Linear game grid dimension
    integer :: dim = 1024
    ! Number of game iterations
    integer :: maxIter = 2**10

    ! Game grid pointers
    integer,dimension(:,:),allocatable :: grid, newGrid, tmpGrid

    !$hmpp <GOL> allocate 
    ! Allocate square grid of (dim+2)^2 elements, 2 added for ghost cells
    allocate(grid(dim+2,dim+2))
    allocate(newGrid(dim+2,dim+2))

    ! Assign initial population randomly
    seed = (/1985, 2011, 2012, 500, 24, 15, 99, 8/)
    call random_seed(PUT=seed)
    do j=1,dim
        do i=1,dim
            call random_number(randm)
            grid(i,j) = nint(randm)
        enddo
    enddo

    ! Copy memory tp GPU
    !$hmpp <GOL> advancedload, args[even::dim;even::grid;even::newGrid]

    ! Main game loop
    do iter=1,maxITer
        if(MOD(iter,2) == 1) then
            !$hmpp <GOL> even callsite
             call GOL(dim, grid, newGrid)
         else
             !$hmpp <GOL> odd callsite
             call GOL(dim, grid, newGrid)
         endif
    enddo!end main game loop 

    ! Read back final configuration to confirm it worked
    !$hmpp <GOL> delegatedstore, args[odd::newGrid]

    ! Sum up alive cells and print results
    total = 0
    do j=2,dim+1
        do i=2,dim+1
            total = total + newGrid(i,j)
        enddo
    enddo
    print *, "Total Alive", total

    ! Release memory
    !$hmpp <GOL> release
    deallocate(grid)
    deallocate(newGrid)

end program

Changes

!$hmpp <GOL> group, target=CUDA
!$hmpp <GOL> mapbyname, dim
!$hmpp <GOL> map, args[even::grid;odd::newGrid]
!$hmpp <GOL> map, args[even::newGrid;odd::grid]
!$hmpp <GOL> even codelet
!$hmpp <GOL> odd  codelet

A major change to our code has been to get rid of the newGrid to grid memory copy. Instead we use some of the built in HMPP directive functionality.

!$hmpp <GOL> group, target=CUDA

Here we create what is known as a codelet group, name GOL. This allows us to keep track of similar kernels and set common attributes.

!$hmpp <GOL> mapbyname, dim

This line will map the variable dim across all kernels. This means once dim is allocated and copied to the device for one kernel it will be available for use by all kernels in the GOL group.

!$hmpp <GOL> map, args[even::grid;odd::newGrid]
!$hmpp <GOL> map, args[even::newGrid;odd::grid]

The map directive is used to map arguments between different kernels. This will effectively switch our pointers for us. The notation args[CodeletName1::Argument1,CodeletName2::Argument2] is used such that Argument1 in CodeletName1 will refer to the same device memory as Argument2 in CodeletName2. We see in our case we will soon be defining two kernels, one named even and one named odd.

!$hmpp <GOL> even codelet
!$hmpp <GOL> odd  codelet

The GOL group will contain two codelets, even and odd. Our game will oscillate between calling these codelets in the main game loop. The only difference between these codelets is caused by the map directive used above which effectively switches the device pointers to grid and newGrid.

!$hmpp <GOL> allocate

The allocate directive will allocate space on the GPU for all memory needed by the named group. Any memory objects shared by more than one codelet through the map or mapbyname directives will only be allocated once.

! Copy memory tp GPU
!$hmpp <GOL> advancedload, args[even::dim;even::grid;even::newGrid]

Once we have allocated space on the GPU we can copy the results from the host to the GPU with the advancedload directive. Notice when referencing what arguments will be copied we specify one codelet names, either codelet name could be specified.

if(MOD(iter,2) == 1) then
    !$hmpp <GOL> even callsite, args[grid].noupdate=true
    call GOL(dim, grid, newGrid)
else
    !$hmpp <GOL> odd callsite, args[grid].noupdate=true
    call GOL(dim, grid, newGrid)
endif

We see that we use our main game loop induction variable to switch between calling our even codelet and our odd codelet. Although it looks as though the same GOL function is being called in both cases because of the map HMPP directive defined with the codelet we know the grid and newGrid GPU pointers will be switched. args[grid].noupdate=true indicates that we do not want to have any implicit data movement between the host and device.

! Read back final configuration to confirm it worked
!$hmpp <GOL> delegatedstore, args[odd::newGrid]

After the kernel is finished we copy the grid from the device to the host with hmpp GOL delegatedstore, args[grid]

!$hmpp <GOL> release

Finally since we allocated the device memory we must free it with the directive hmpp GOL release .

GOL-region-Try2.f90

program main
    implicit none

    integer :: i,j,seed(8),iter,numNeighbors,total
    real :: randm
    ! Linear game grid dimension
    integer :: dim = 1024
    ! Number of game iterations
    integer :: maxIter = 2**10

    ! Game grid pointers
    integer,dimension(:,:),allocatable :: grid, newGrid, tmpGrid

    !$hmpp GOL allocate, args[grid,newGrid].size={(dim+2),(dim+2)}
    ! Allocate square grid of (dim+2)^2 elements, 2 added for ghost cells
    allocate(grid(dim+2,dim+2))
    allocate(newGrid(dim+2,dim+2))

    ! Assign initial population randomly
    seed = (/1985, 2011, 2012, 500, 24, 15, 99, 8/)
    call random_seed(PUT=seed)
    do j=1,dim
        do i=1,dim
            call random_number(randm)
            grid(i,j) = nint(randm)
        enddo
    enddo

    ! Copy memory tp GPU
    !$hmpp GOL advancedload, args[dim;grid;newGrid]

    ! Main game loop
    do iter=1,maxITer
    !$hmpp GOL region, target=CUDA, args[dim,grid,newGrid].transfer=manual, args[iter].transfer=atcall
        if(MOD(iter,2) == 1) then
            ! Top-Bottom ghost rows
            do j=2,dim+1
                grid(1,j) = grid(dim+1,j) !Copy first game grid row to bottom ghost row
                grid(dim+2,j) = grid(2,j) !Copy first game grid row to top ghost row
            enddo

            ! Left-Right ghost columns
            do i=1,dim+2
                grid(i,1) = grid(i, dim+1) !Copy first game grid column to right ghost column
                grid(i,dim+2) = grid(i,2)  !Copy last game grid column to left ghost column
            enddo

            ! Now we loop over all cells and determine their fate
            !$hmppcg gridify(j,i)
            do j=2,dim+1
                do i=2,dim+1
                    ! Get the number of neighbors for a given grid point
                    numNeighbors = grid(i,j+1) + grid(i,j-1)&     !right & left
                                 + grid(i+1,j) + grid(i-1,j)&     !upper & lower
                                 + grid(i+1,j+1) + grid(i-1,j-1)& !diagonals
                                 + grid(i-1,j+1) + grid(i+1,j-1)

                    ! Here we have explicitly all of the game rules
                    if(grid(i,j) == 1 .AND. numNeighbors < 2) then
                        newGrid(i,j) = 0
                    elseif(grid(i,j) == 1 .AND. (numNeighbors == 2 .OR. numNeighbors == 3)) then
                        newGrid(i,j) = 1
                    elseif(grid(i,j) == 1 .AND. numNeighbors > 3) then
                        newGrid(i,j) = 0
                    elseif(grid(i,j) == 0 .AND. numNeighbors == 3) then
                        newGrid(i,j) = 1
                    else
                        newGrid(i,j) = grid(i,j)
                    endif
                enddo
            enddo
        else
            ! Top-Bottom ghost rows
            do j=2,dim+1
                newGrid(1,j) = newGrid(dim+1,j) !Copy first game grid row to bottom ghost row
                newGrid(dim+2,j) = newGrid(2,j) !Copy first game grid row to top ghost row
            enddo

            ! Left-Right ghost columns
            do i=1,dim+2
                newGrid(i,1) = newGrid(i, dim+1) !Copy first game grid column to right ghost column
                newGrid(i,dim+2) = newGrid(i,2)  !Copy last game grid column to left ghost column
            enddo

            ! Now we loop over all cells and determine their fate
            !$hmppcg gridify(j,i)
            do j=2,dim+1
                do i=2,dim+1
                    ! Get the number of neighbors for a given grid point
                    numNeighbors = newGrid(i,j+1) + newGrid(i,j-1)&     !right & left
                                 + newGrid(i+1,j) + newGrid(i-1,j)&     !upper & lower
                                 + newGrid(i+1,j+1) + newGrid(i-1,j-1)& !diagonals
                                 + newGrid(i-1,j+1) + newGrid(i+1,j-1)

                    ! Here we have explicitly all of the game rules
                    if(newGrid(i,j) == 1 .AND. numNeighbors < 2) then
                        grid(i,j) = 0
                    elseif(newGrid(i,j) == 1 .AND. (numNeighbors == 2 .OR. numNeighbors == 3)) then
                        grid(i,j) = 1
                    elseif(newGrid(i,j) == 1 .AND. numNeighbors > 3) then
                        grid(i,j) = 0
                    elseif(newGrid(i,j) == 0 .AND. numNeighbors == 3) then
                        grid(i,j) = 1
                    else
                        grid(i,j) = newGrid(i,j)
                    endif
                enddo
            enddo
        endif !End grid newGrid switch
       !$hmpp GOL endregion

    enddo!end main game loop 

    ! Read back final configuration to confirm it worked
    !$hmpp GOL delegatedstore, args[grid]

    ! Sum up alive cells and print results
    total = 0
    do j=2,dim+1
        do i=2,dim+1
            total = total + grid(i,j)
        enddo
    enddo
    print *, "Total Alive", total

    ! Release memory
    !$hmpp GOL release
    deallocate(grid)
    deallocate(newGrid)

end program

Changes

!$hmpp GOL allocate, args[grid,newGrid].size={(dim+2),(dim+2)}

The hmpp GOL allocate directive will allocate space on the GPU for the named region. The compiler requires us to specify the size of the passed in arrays at allocation time with args[grid,newGrid].size={(dim+2),(dim+2)}

!$hmpp GOL advancedload, args[dim;grid;newGrid]

hmpp GOL advancedload, args[dim;iter;grid;newGrid] will copy the named arguments of the region from the host to the GPU.

hmpp GOL region, target=CUDA, args[dim,grid,newGrid].transfer=manual, args[iter].transfer=atcall

In this version of the code we have our region inside of our main game loop but no longer have to worry about unnecessary memory copies.

Lines 35-109:

        if(MOD(iter,2) == 1) then
            ! Top-Bottom ghost rows
            do j=2,dim+1
                grid(1,j) = grid(dim+1,j) !Copy first game grid row to bottom ghost row
                grid(dim+2,j) = grid(2,j) !Copy first game grid row to top ghost row
            enddo
 
            ! Left-Right ghost columns
            do i=1,dim+2
                grid(i,1) = grid(i, dim+1) !Copy first game grid column to right ghost column
                grid(i,dim+2) = grid(i,2)  !Copy last game grid column to left ghost column
            enddo
 
            ! Now we loop over all cells and determine their fate
            !$hmppcg gridify(j,i)
            do j=2,dim+1
                do i=2,dim+1
                    ! Get the number of neighbors for a given grid point
                    numNeighbors = grid(i,j+1) + grid(i,j-1)&     !right & left
                                 + grid(i+1,j) + grid(i-1,j)&     !upper & lower
                                 + grid(i+1,j+1) + grid(i-1,j-1)& !diagonals
                                 + grid(i-1,j+1) + grid(i+1,j-1)
 
                    ! Here we have explicitly all of the game rules
                    if(grid(i,j) == 1 .AND. numNeighbors < 2) then
                        newGrid(i,j) = 0
                    elseif(grid(i,j) == 1 .AND. (numNeighbors == 2 .OR. numNeighbors == 3)) then
                        newGrid(i,j) = 1
                    elseif(grid(i,j) == 1 .AND. numNeighbors > 3) then
                        newGrid(i,j) = 0
                    elseif(grid(i,j) == 0 .AND. numNeighbors == 3) then
                        newGrid(i,j) = 1
                    else
                        newGrid(i,j) = grid(i,j)
                    endif
                enddo
            enddo
        else
            ! Top-Bottom ghost rows
            do j=2,dim+1
                newGrid(1,j) = newGrid(dim+1,j) !Copy first game grid row to bottom ghost row
                newGrid(dim+2,j) = newGrid(2,j) !Copy first game grid row to top ghost row
            enddo
 
            ! Left-Right ghost columns
            do i=1,dim+2
                newGrid(i,1) = newGrid(i, dim+1) !Copy first game grid column to right ghost column
                newGrid(i,dim+2) = newGrid(i,2)  !Copy last game grid column to left ghost column
            enddo
 
            ! Now we loop over all cells and determine their fate
            !$hmppcg gridify(j,i)
            do j=2,dim+1
                do i=2,dim+1
                    ! Get the number of neighbors for a given grid point
                    numNeighbors = newGrid(i,j+1) + newGrid(i,j-1)&     !right & left
                                 + newGrid(i+1,j) + newGrid(i-1,j)&     !upper & lower
                                 + newGrid(i+1,j+1) + newGrid(i-1,j-1)& !diagonals
                                 + newGrid(i-1,j+1) + newGrid(i+1,j-1)
 
                    ! Here we have explicitly all of the game rules
                    if(newGrid(i,j) == 1 .AND. numNeighbors < 2) then
                        grid(i,j) = 0
                    elseif(newGrid(i,j) == 1 .AND. (numNeighbors == 2 .OR. numNeighbors == 3)) then
                        grid(i,j) = 1
                    elseif(newGrid(i,j) == 1 .AND. numNeighbors > 3) then
                        grid(i,j) = 0
                    elseif(newGrid(i,j) == 0 .AND. numNeighbors == 3) then
                        grid(i,j) = 1
                    else
                        grid(i,j) = newGrid(i,j)
                    endif
                enddo
            enddo
        endif !End grid newGrid switch

A major change to our code has been to get rid of the grid/newGrid switching kernel. We have explicitly coded the two cases and use the modulus of the main iteration variable to switch between the two pieces of code. This increases the code complexity but does get around a costly memory transfer.

! Now we loop over all cells and determine their fate
!$hmppcg gridify(j,i)
! Now we loop over all cells and determine their fate
!$hmppcg gridify(j,i)

By default the hmpp compiler did not want to gridify the loop over each cell. We can attempt to force the compiler to do so adding thehmppcg gridify(j, i) directive which specifies we want to gridify with respect to the induction variables j and i.

! Read back final configuration to confirm it worked
!$hmpp GOL delegatedstore, args[grid]

After the kernel is finished we copy the grid from the device to the host with hmpp GOL delegatedstore, args[grid]

! Release memory
!$hmpp GOL release

Finally since we allocated the device memory we must free it with the directive hmpp GOL release .