job_manage.fpp Source File


Source Code

# include "define.inc"

module job_manage

   implicit none
   private
   public :: timer_local
   public :: time_message
   public :: job_fork
   public :: checkstop
   public :: checktime
   public :: njobs

   integer :: njobs = 1

contains

!!! returns CPU time in second
   function timer_local()
# ifndef MPIINC
      use mpi, only: mpi_wtime
# else
      include "mpif.h" ! CMR following Michele Weiland's advice
# endif
      real :: timer_local

      timer_local = 0.

      timer_local = mpi_wtime()

   end function timer_local

   subroutine time_message(lprint, targ, chmessage)
      !
      ! this routine counts elapse time between two calls
      !
      character(len=*), intent(in) :: chmessage
      logical, intent(in) :: lprint
      real, intent(in out) :: targ(2) ! tsum and told
      real :: tnew

      tnew = timer_local()

      if (abs(targ(2)) < epsilon(0.)) then
         targ(2) = tnew
      else
         targ(1) = targ(1) + tnew - targ(2)
         if (lprint) write (*, '(A24,F7.2,A8)') chmessage, tnew - targ(2), ' seconds'
         targ(2) = 0.
      end if

   end subroutine time_message

   subroutine job_fork(n_ensembles)

      use file_utils, only: get_unused_unit, list_name, run_name, init_job_name
! MAB> -- moved init_error_unit and init_input_unit calls here from file_utils
! because they were being called there on all procs when they should be called
! only on proc0
      use file_utils, only: init_error_unit, init_input_unit, list_name
! <MAB
      use mp, only: job, scope, min_proc
      use mp, only: proc0, nproc
      use mp, only: init_job_topology, broadcast, finish_mp
      implicit none
      integer, intent(in), optional :: n_ensembles
      integer, dimension(:), allocatable :: group0
      integer :: i, l

      character(10) :: ext
      character(len=500), dimension(:), allocatable :: job_list

      integer :: list_unit, ierr
      logical :: err = .true., inp = .true.

      ! open file containing list of input files to run and read total
      ! number of input files from first line
      if (.not. present(n_ensembles)) then
         if (proc0) then
            call get_unused_unit(list_unit)
            open (unit=list_unit, file=trim(list_name))
            read (list_unit, *) njobs
         end if
      else
         njobs = n_ensembles
      end if
      call broadcast(njobs)

      if (nproc < njobs) then
         if (proc0) then
            write (*, *)
            write (*, *) 'Number of jobs = ', njobs, ' and number of processors = ', nproc
            write (*, *) 'Number of processors must not be less than the number of jobs'
            write (*, *) 'Stopping'
            write (*, *)
         end if
         call finish_mp
         stop
      end if

      if (mod(nproc, njobs) /= 0) then
         if (proc0) then
            write (*, *)
            write (*, *) 'Number of jobs = ', njobs, ' and number of processors = ', nproc
            write (*, *) 'Number of jobs must evenly divide the number of processors.'
            write (*, *) 'Stopping'
            write (*, *)
         end if
         call finish_mp
         stop
      end if

      allocate (job_list(0:njobs - 1))

      if (proc0) then
         if (.not. present(n_ensembles)) then
            do i = 0, njobs - 1
               read (list_unit, fmt="(a)") job_list(i)
            end do
            close (list_unit)
         else
            l = len_trim(list_name)
            do i = 0, njobs - 1
               write (ext, '(i9)') i + 1
               ext = adjustl(ext)
               job_list(i) = trim(list_name(1:l - 3))//"_"//trim(ext)
            end do
         end if
      end if

      do i = 0, njobs - 1
         call broadcast(job_list(i))
      end do

      allocate (group0(0:njobs - 1))

      call init_job_topology(njobs, group0, ierr)
      ! TT> brought up one line [call scope(subprocs)] from file_utils.fpp
      !     to init_jobs
      !    call init_job_name (njobs, group0, job_list)
      call init_job_name(job_list(job))
      ! <TT

      ! MAB> moved from file_utils because had to be within proc0,
      ! which is undefined there
      if (proc0) then
         call init_error_unit(err)
         call init_input_unit(inp)
      end if
      ! <MAB

      if (nproc > 1 .and. proc0) &
           & write (*, *) 'Job ', job, ' is called ', trim(run_name),&
           & ' and is running on ', nproc, ' processors with a minimum of', &
           & min_proc, ' processors on a node'
      if (nproc == 1) write (*, *) 'Job ', job, ' is called ', trim(run_name),&
           & ' and is running on ', nproc, ' processors with a minimum of', &
           & min_proc, ' processors on a node'

      deallocate (group0, job_list) ! MAB

   end subroutine job_fork

   subroutine checkstop(exit, list)

      use mp, only: proc0, broadcast
      use file_utils, only: run_name, list_name
      logical, intent(in), optional :: list
      logical, intent(in out) :: exit
      character(len=300) :: filename

      logical :: exit_local

      ! If .stop file has appeared, set exit flag
      filename = trim(run_name)//".stop"
      if (present(list)) then
         if (list) filename = list_name(:len_trim(list_name) - 5)//".stop"
      end if

      if (proc0) then
         inquire (file=filename, exist=exit_local)
         exit = exit .or. exit_local
      end if

      call broadcast(exit)

   end subroutine checkstop

   subroutine checktime(avail_time, exit)
      use mp, only: proc0, broadcast
      use file_utils, only: error_unit

      ! available time in second
      real, intent(in) :: avail_time
      ! true if elapse time exceed available time
      logical, intent(in out) :: exit
      logical, save :: initialized = .false.
      real :: elapse_time = 0.
      real :: initial_time = 0.
      real :: margin = 300. ! 5 minutes

      if (.not. initialized) then
         initial_time = timer_local()  ! timer_local() returns #seconds from fixed time in past
         initialized = .true.
         return
      end if

      elapse_time = timer_local() - initial_time

      if (proc0) then
         if (elapse_time >= avail_time - margin) then
            write (error_unit(), '(a,f12.4,a,f12.4)') &
                 & 'Elapse time ', elapse_time, &
                 & ' exceeds available time', avail_time - margin
            write (error_unit(), '(a,f12.4,a,f12.4,a)') &
                 & '  (Given CPU time: ', avail_time, &
                 & '  Margin: ', margin, ')'
            exit = .true.
         end if
      end if

      call broadcast(exit)

   end subroutine checktime

end module job_manage