! Molecular Orbital PACkage (MOPAC)
! Copyright 2021 Virginia Polytechnic Institute and State University
!
! Licensed under the Apache License, Version 2.0 (the "License");
! you may not use this file except in compliance with the License.
! You may obtain a copy of the License at
!
!    http://www.apache.org/licenses/LICENSE-2.0
!
! Unless required by applicable law or agreed to in writing, software
! distributed under the License is distributed on an "AS IS" BASIS,
! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
! See the License for the specific language governing permissions and
! limitations under the License.

subroutine diag_for_GPU (fao, vector, nocc, eig, norbs, mpack)
#ifdef MKL
    use molkst_C, only: num_threads
#endif
#ifdef GPU
    Use mod_vars_cuda, only: lgpu, prec, ngpus
    Use iso_c_binding
    use call_rot_cuda
    Use call_gemm_cublas
#endif
    implicit none
    integer, intent (in) :: nocc, mpack, norbs
    double precision, dimension (mpack), intent (in) :: fao
    double precision, dimension (norbs, norbs), intent (inout) :: vector
    double precision, dimension (norbs), intent (in) :: eig
    double precision, allocatable, dimension (:) :: fmo
    integer :: lumo, mdim, n, nvirt, i, ij, in, j,  norbs2
    double precision :: bigeps = 1.5d-007
    double precision :: a, alpha, b, beta, d, e, tiny, x
    double precision, allocatable, dimension(:,:)  :: fck
#ifdef GPU
    double precision, allocatable, dimension(:,:)  :: ci0,ca0 ! alp,bet
#endif
    integer :: kk
!
    integer,external :: idamax

!-----------------------------------------------
!***********************************************************************
!
!   "FAST" DIAGONALISATION PROCEDURE.
!
!    ON INPUT FAO CONTAINS THE LOWER HALF TRIANGLE OF THE MATRIX TO BE
!                         DIAGONALISED, PACKED.
!             VECTOR  CONTAINS THE OLD EIGENVECTORS ON INPUT, THE NEW
!             VECTORS ON EXITING.
!             NOCC = NUMBER OF OCCUPIED MOLECULAR ORBITALS.
!             EIG  = EIGENVALUES FROM AN EXACT DIAGONALISATION
!             MDIM = DECLARED SIZE OF MATRIX "C".
!             N = NUMBER OF ATOMIC ORBITALS IN BASIS SET
!
!  DIAG IS A PSEUDO-DIAGONALISATION PROCEDURE, IN THAT THE VECTORS THAT
!       ARE GENERATED BY IT ARE MORE NEARLY ABLE TO BLOCK-DIAGONALISE
!       THE FOCK MATRIX OVER MOLECULAR ORBITALS THAN THE STARTING
!       VECTORS. IT MUST BE CONSIDERED PSEUDO FOR SEVERAL REASONS:
!       (A) IT DOES NOT GENERATE EIGENVECTORS - THE SECULAR DETERMINANT
!           IS NOT DIAGONALISED, ONLY THE OCCUPIED-VIRTUAL INTERSECTION.
!       (B) MANY SMALL ELEMENTS IN THE SEC.DET. ARE IGNORED AS BEING TOO
!           SMALL COMPARED WITH THE LARGEST ELEMENT.
!       (C) WHEN ELEMENTS ARE ELIMINATED BY ROTATION, THE REST OF THE
!           SEC. DET. IS ASSUMED NOT TO CHANGE, I.E. ELEMENTS CREATED
!           ARE IGNORED.
!       (D) THE ROTATION REQUIRED TO ELIMINATE THOSE ELEMENTS CONSIDERED
!           SIGNIFICANT IS APPROXIMATED TO USING THE EIGENVALUES OF THE
!           EXACT DIAGONALISATION THROUGHOUT THE REST OF THE ITERATIVE
!           PROCEDURE.
!
!  (NOTE:- IN AN ITERATIVE PROCEDURE ALL THE APPROXIMATIONS PRESENT IN
!          DIAG BECOME VALID AT SELF-CONSISTENCY, SELF-CONSISTENCY IS
!          NOT SLOWED DOWN BY USE OF THESE APPROXIMATIONS)
!
!    REFERENCE:
!             "FAST SEMIEMPIRICAL CALCULATIONS",
!             STEWART. J.J.P., CSASZAR, P., PULAY, P., J. COMP. CHEM.,
!             3, 227, (1982)
!
!***********************************************************************
!
!  FIRST, CONSTRUCT THAT PART OF A SECULAR DETERMINANT OVER MOLECULAR
!  ORBITALS WHICH CONNECTS THE OCCUPIED AND VIRTUAL SETS.
!
!***********************************************************************
!
!  Quick check:  does an occupied-virtual intersection exist
!
   if (nocc == norbs .or. nocc == 0) return
    n = norbs
    mdim = norbs
    lumo = nocc + 1
    nvirt = n - nocc
    norbs2 = norbs*norbs

    allocate(fmo(norbs2), fck(norbs,norbs))
    !fck = 0.d0
    forall (i=1:norbs,j=1:norbs) fck(i,j) = 0.d0
    do i = 1,n
      do j = 1,n

        if (j > i) then  ! here is for columns > rows
          in = ((j*(j-1))/2) + i  ! location in upper triangle
        else
          in = ((i*(i-1))/2) + j  ! location in lower triangle
        end if

        ij = j + n*(i-1)
        fmo(ij) = fao(in)
      end do
    end do
! here, performs matrix multiplications to form FMO
#ifdef GPU
    if (lgpu) then
  !  PERFORMS BOTH MULTIPLICATIONS USING CUBLAS	
      if (nocc < nvirt) then
        call gemm_cublas('N', 'N', n, nocc, n, 1.0d0, fmo, n,vector , mdim, 0.0d0, fck, &
        & norbs)
        call gemm_cublas('T', 'N', nvirt, nocc, n, 1.0d0, vector(1:n, lumo:n), mdim, fck, &
        & norbs, 0.0d0, fmo, nvirt)
      else
        call gemm_cublas ('N', 'N', n, nvirt, n, 1.0d0, fmo, n, vector(1:n, lumo:n), mdim, &
        & 0.0d0, fck, norbs)
        call gemm_cublas ('T', 'N', nvirt, nocc, n, 1.0d0, fck, norbs, vector, mdim, &
        & 0.0d0, fmo, nvirt)
      end if
    else
#endif
!
!  PERFORMS BOTH MULTIPLICATIONS USING LAPACK
!
      if (nocc < nvirt) then
        call dgemm ("N", "N", n, nocc, n, 1.d0, fmo, n, vector, mdim, 0.d0, fck, &
        & norbs)
        call dgemm ("T", "N", nvirt, nocc, n, 1.d0, vector(1:n, lumo:n), mdim, fck, &
        & norbs, 0.d0, fmo, nvirt)
      else
        call dgemm ("N", "N", n, nvirt, n, 1.d0, fmo, n, vector(1:n, lumo:n), mdim, &
        & 0.d0, fck, norbs)
        call dgemm ("T", "N", nvirt, nocc, n, 1.d0, fck, norbs, vector, mdim, &
        & 0.d0, fmo, nvirt)
      end if
#ifdef GPU
    end if
#endif
    i = idamax (nocc*nvirt, fmo, 1)
    tiny = dabs (fmo(i)) * 0.05d0
!***********************************************************************
!
!   NOW DO A CRUDE 2 BY 2 ROTATION TO "ELIMINATE" SIGNIFICANT ELEMENTS
!
!***********************************************************************
    kk = 1
#ifdef GPU
    if (lgpu) then
      kk = 2
    end if
#endif

    Select case (kk)

       case (1)

          ! The correct one
          ij = 1
#ifdef MKL
          call mkl_set_num_threads(ij)
#endif
          ij = 0 ; in = 0

          do i = 1, nocc
            do j = lumo, n
               ij = ij + 1

               !ij = i + n*(j-1)
               !ij = nvirt*(i-1) - (lumo - j) + 1

               x = fmo(ij)
               if (dabs(x) < tiny) cycle
!
!      BEGIN 2 X 2 ROTATIONS
!
               a = eig(i)
               b = eig(j)
               d = a - b !eig(i) - eig(j)
!
!    USE BIGEPS TO DETERMINE WHETHER TO DO A 2 BY 2 ROTATION
!
               if (dabs (x/d) < bigeps) cycle
!
!  AT THIS POINT WE KNOW THAT
!
               !in = in + 1
               e = sign (dsqrt(4.0d0*x*x + d*d), d)
               alpha = dsqrt (0.5d0*(1.0d0 + d/e))
               beta = -sign(dsqrt(1.0d0 - alpha*alpha), x)
               call drot(n,vector(1:n,i),1,vector(1:n,j),1,alpha,beta)

!              continue
            end do
          end do

          !write(6,*) 'IN e IJ = ',in,ij

          continue

       case (2)

#ifdef GPU
         allocate (ci0(n,nocc),ca0(n,nvirt),stat=i)

         ci0 = vector(1:n,1:nocc)
         ca0 = vector(1:n,lumo:n)

         if (ngpus > 1) then
            call rot_cuda_2gpu(fmo,eig,vector,ci0,ca0,nocc,lumo,n,bigeps,tiny)
         else
            call rot_cuda(fmo,eig,vector,ci0,ca0,nocc,lumo,n,bigeps,tiny)
         end if
         deallocate (ci0,ca0,stat=i)
#endif
    End select
    deallocate (fmo,fck,stat=i)
    continue
#ifdef MKL
    call mkl_set_num_threads(num_threads)
#endif
    return
end subroutine diag_for_GPU
