From 6043063d2b7156e05d9abc253035fcbcb5027a28 Mon Sep 17 00:00:00 2001
From: Johnathan Rhyne <rhynejohnathan@gmail.com>
Date: Wed, 5 Feb 2025 10:21:10 -0700
Subject: [PATCH 01/10] double tests pass. TODO: Run single tests for slarfb0c2

---
 SRC/CMakeLists.txt |   2 +-
 SRC/Makefile       |   2 +-
 SRC/dlarfb0c2.f    | 263 +++++++++++++++++++++++++++++++++++++++++++++
 SRC/dorglq.f       |  25 ++---
 SRC/dorgql.f       |  14 +--
 SRC/dorgqr.f       | 172 +++++++++++++++++------------
 SRC/dorgrq.f       |  17 +--
 SRC/slarfb0c2.f    | 263 +++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 648 insertions(+), 110 deletions(-)
 create mode 100644 SRC/dlarfb0c2.f
 create mode 100644 SRC/slarfb0c2.f

diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index be426cecd4..88e503efea 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -307,7 +307,7 @@ set(DLASRC
    dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqp2rk.f dlaqp3rk.f dlaqsb.f dlaqsp.f dlaqsy.f
    dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f
    dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f
-   dlarf.f  dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f dlarf1f.f dlarf1l.f
+   dlarf.f  dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f dlarf1f.f dlarf1l.f dlarfb0c2.f
    dlargv.f dlarmm.f dlarrv.f dlartv.f
    dlarz.f  dlarzb.f dlarzt.f dlaswp.f dlasy2.f
    dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f
diff --git a/SRC/Makefile b/SRC/Makefile
index 0191626f0e..dc8f472092 100644
--- a/SRC/Makefile
+++ b/SRC/Makefile
@@ -339,7 +339,7 @@ DLASRC = \
    dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqp2rk.o dlaqp3rk.o dlaqsb.o dlaqsp.o dlaqsy.o \
    dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \
    dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \
-   dlarf.o  dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o dlarf1f.o dlarf1l.o\
+   dlarf.o  dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o dlarf1f.o dlarf1l.o dlarfb0c2.o\
    dlargv.o dlarmm.o dlarrv.o dlartv.o \
    dlarz.o  dlarzb.o dlarzt.o dlaswp.o dlasy2.o \
    dlasyf.o dlasyf_rook.o dlasyf_rk.o \
diff --git a/SRC/dlarfb0c2.f b/SRC/dlarfb0c2.f
new file mode 100644
index 0000000000..cb7f9d3873
--- /dev/null
+++ b/SRC/dlarfb0c2.f
@@ -0,0 +1,263 @@
+      SUBROUTINE DLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, 
+     $                     LDV, T, LDT, C, LDC)
+         ! Scalar arguments
+         INTEGER           M, N, K, LDV, LDC, LDT
+         ! Note: We should probably get rid of SIDE and TRANS as these flags
+         ! are not used as we are only using this routine inside
+         ! x{OR,UN}G{QR,LQ,QR,LQ}, which have values that never change
+         CHARACTER         SIDE, TRANS, DIRECT, STOREV 
+         ! Array arguments
+         DOUBLE PRECISION  V(LDV,*), C(LDC,*), T(LDT,*)
+         ! Local scalars
+         LOGICAL           QR, LQ, QL, DIRF, COLV
+         ! External subroutines
+         EXTERNAL          DGEMM, DTRMM
+         ! External functions
+         LOGICAL           LSAME
+         EXTERNAL          LSAME
+         ! Parameters
+         DOUBLE PRECISION ONE, ZERO, NEG_ONE
+         PARAMETER(ONE=1.0D+0, ZERO = 0.0D+0, NEG_ONE = -1.0D+0)
+
+         ! Beginning of executable statements
+         ! Convert our character flags to logical values
+         DIRF = LSAME(DIRECT,'F')
+         COLV = LSAME(STOREV,'C')
+
+         ! Determine which of the 4 modes are using.
+         ! QR is when we store the reflectors column by column and have the
+         ! 'first' reflector stored in the first column
+         QR = DIRF.AND.COLV
+
+         ! LQ is when we store the reflectors row by row and have the
+         ! 'first' reflector stored in the first row
+         LQ = DIRF.AND.(.NOT.COLV)
+
+         ! QL is when we store the reflectors column by column and have the
+         ! 'first' reflector stored in the last column
+         QL = (.NOT.DIRF).AND.COLV
+
+         ! RQ is when we store the reflectors row by row and have the
+         ! 'first' reflector stored in the last row
+         ! RQ = (.NOT.DIRF).AND.(.NOT.COLV)
+         ! Since we have exactly one of these 4 modes, we don't need to actually
+         ! store the value of RQ, instead we assume this is the case if we fail
+         ! the above 3 checks.
+
+         IF (QR) THEN
+            ! We are computing C = HC = (I - VTV')C
+            ! Where: V = [ V1 ] and C = [ C1 ]
+            !            [ V2 ]         [ C2 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{M-K\times K}
+            !     C1=0\in\R^{K\times N}
+            !     C2\in\R^{M-K\times N}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0  ]
+            !               [ V2 ]             [ C2 ]
+            !   = [ 0  ] - [ V1 ]T*V2'*C2
+            !     [ C2 ]   [ V2 ]
+            !
+            !   = [ 0  ] - [ V1*T*V2'*C2 ]
+            !     [ C2 ]   [ V2*T*V2'*C2 ]
+            !
+            !   = [      V1*T*V2'*C2 ]
+            !     [ C2 - V2*T*V2'*C2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = V2'*C2
+            ! C1 = T*C1
+            ! C2 = C2 - V2*C1
+            ! C1 = -V1*C1
+            !
+            ! To achieve the same end result
+            !
+            ! C1 = V2'*C2
+            !
+            CALL DGEMM('Transpose', 'No Transpose', K, N, M - K,
+     $                  ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO, 
+     $                  C, LDC)
+            !
+            ! C1 = T*C1
+            !
+            CALL DTRMM('Left', 'Upper', 'No Transpose', 'Non-unit', 
+     $                  K, N, ONE, T, LDT, C, LDC)
+            !
+            ! C2 = C2 - V2*C1 = -V2*C1 + C2
+            !
+            CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $                  NEG_ONE, V(K+1,1), LDV, C, LDC, ONE, 
+     $                  C(K+1,1), LDC)
+            !
+            ! C1 = -V1*C1
+            !
+            CALL DTRMM('Left', 'Lower', 'No Transpose', 'Unit', 
+     $                  K, N, NEG_ONE, V, LDV, C, LDC)
+         ELSE IF (LQ) THEN
+            ! We are computing C = CH' = C(I-V'T'V)
+            ! Where: V = [ V1 V2 ] and C = [ C1 C2 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{K\times N-K}
+            !     C1=0\in\R^{M\times K}
+            !     C2\in\R^{M\times N-K}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ])
+            !                         [ V2' ]
+            !
+            !   = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ]
+            !                          [ V2' ]
+            !
+            !   = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ]
+            !
+            !   = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = C2*V2'
+            ! C1 = C1*T'
+            ! C2 = C2 - C1*V2
+            ! C1 = -C1*V1
+            !
+            ! To achieve the same end result
+            !
+            ! C1 = C2*V2'
+            !
+            CALL DGEMM('No Transpose', 'Transpose', M, K, N-K, 
+     $            ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C, LDC)
+            !
+            ! C1 = C1*T'
+            !
+            CALL DTRMM('Right', 'Upper', 'Transpose', 'Non-unit',
+     $            M, K, ONE, T, LDT, C, LDC)
+            !
+            ! C2 = C2 - C1*V2 = -C1*V2 + C2
+            !
+            CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $            NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, 
+     $            C(1,K+1), LDC)
+            !
+            ! C1 = -C1*V1
+            !
+            CALL DTRMM('Right', 'Upper', 'No Transpose', 'Unit',
+     $            M, K, NEG_ONE, V, LDV, C, LDC)
+         ELSE IF (QL) THEN
+            ! We are computing C = HC = (I - VTV')C
+            ! Where: V = [ V2 ] and C = [ C2 ]
+            !            [ V1 ]         [ C1 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{M-K\times K}
+            !     C1=0\in\R^{K\times N}
+            !     C2\in\R^{M-K\times N}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ]
+            !             [ V1 ]             [ 0  ]
+            !
+            !   = [ C2 ] - [ V2 ]T*V2'*C2
+            !     [ 0  ]   [ V1 ]
+            !
+            !   = [ C2 ] - [ V2*T*V2'*C2 ]
+            !     [ 0  ]   [ V1*T*V2'*C2 ]
+            !
+            !   = [ C2 - V2*T*V2'*C2 ]
+            !     [    - V1*T*V2'*C2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = V2'*C2
+            ! C1 = T*C1
+            ! C2 = C2 - V2*C1
+            ! C1 = -V1*C1
+            !
+            ! To achieve the same end result
+            !
+            ! C1 = V2'*C2
+            !
+            CALL DGEMM('Transpose', 'No Transpose', K, N, M-K, 
+     $         ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC)
+            !
+            ! C1 = T*C1
+            !
+            CALL DTRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+     $         K, N, ONE, T, LDT, C(M-K+1,1), LDC)
+            !
+            ! C2 = C2 - V2*C1 = -V2*C1 + C2
+            !
+            CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K, 
+     $         NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC)
+            !
+            ! C1 = -V1*C1
+            !
+            CALL DTRMM('Left', 'Upper', 'No Transpose', 'Unit', 
+     $         K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC)
+         ELSE ! IF (RQ) THEN
+            ! We are computing C = CH' = C(I-V'T'V)
+            ! Where: V = [ V2 V1] and C = [ C2 C1 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{K\times N-K}
+            !     C1=0\in\R^{M\times K}
+            !     C2\in\R^{M\times N-K}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ]
+            !                          [ V1' ]
+            !
+            !   = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ]
+            !                           [ V1' ]
+            !
+            !   = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ]
+            !
+            !   = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ] 
+            !
+            !   = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = C2*V2'
+            ! C1 = C1*T'
+            ! C2 = C2 - C1*V2
+            ! C1 = -C1*V1
+            ! 
+            !
+            ! To achieve the same end result
+            !
+            ! C1 = C2*V2'
+            !
+            CALL DGEMM('No Transpose', 'Transpose', M, K, N-K,
+     $         ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC)
+            !
+            ! C1 = C1*T'
+            !
+            CALL DTRMM('Right', 'Lower', 'Transpose', 'Non-unit', 
+     $         M, K, ONE, T, LDT, C(1, N-K+1), LDC)
+            !
+            ! C2 = C2 - C1*V2 = -C1*V2 + C2
+            !
+            CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K, 
+     $         NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC)
+            !
+            ! C1 = -C1*V1
+            !
+            CALL DTRMM('Right', 'Lower', 'No Transpose', 'Unit', 
+     $         M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC)
+         END IF
+      END SUBROUTINE
diff --git a/SRC/dorglq.f b/SRC/dorglq.f
index c41367ced4..eeb73e4b27 100644
--- a/SRC/dorglq.f
+++ b/SRC/dorglq.f
@@ -148,7 +148,7 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
      $                   LWKOPT, NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLARFB, DLARFT, DORGL2, XERBLA
+      EXTERNAL           DLARFB0C2, DLARFT, DORGL2, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -163,7 +163,8 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       INFO = 0
       NB = ILAENV( 1, 'DORGLQ', ' ', M, N, K, -1 )
-      LWKOPT = MAX( 1, M )*NB
+      ! DLARFB0C2 means we only need a workspace for calls to dorgl2
+      LWKOPT = MAX( 1, M )
       WORK( 1 ) = LWKOPT
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
@@ -227,11 +228,6 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *        Set A(kk+1:m,1:kk) to zero.
 *
-         DO 20 J = 1, KK
-            DO 10 I = KK + 1, M
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
       ELSE
          KK = 0
       END IF
@@ -259,26 +255,19 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *              Apply H**T to A(i+ib:m,i:n) from the right
 *
-               CALL DLARFB( 'Right', 'Transpose', 'Forward',
-     $                      'Rowwise',
-     $                      M-I-IB+1, N-I+1, IB, A( I, I ), LDA, WORK,
-     $                      LDWORK, A( I+IB, I ), LDA, WORK( IB+1 ),
-     $                      LDWORK )
+               CALL DLARFB0C2('A', 'A', 'Forward', 'Rowwise', 
+     $               M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, LDWORK,
+     $               A(I+IB,I), LDA)
             END IF
 *
 *           Apply H**T to columns i:n of current block
-*
+
             CALL DORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
      $                   WORK,
      $                   IINFO )
 *
 *           Set columns 1:i-1 of current block to zero
 *
-            DO 40 J = 1, I - 1
-               DO 30 L = I, I + IB - 1
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
    50    CONTINUE
       END IF
 *
diff --git a/SRC/dorgql.f b/SRC/dorgql.f
index f931f5a9c8..089dff77bb 100644
--- a/SRC/dorgql.f
+++ b/SRC/dorgql.f
@@ -149,7 +149,7 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
      $                   NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLARFB, DLARFT, DORG2L, XERBLA
+      EXTERNAL           DLARFB0C2, DLARFT, DORG2L, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -265,10 +265,9 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *              Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
 *
-               CALL DLARFB( 'Left', 'No transpose', 'Backward',
-     $                      'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
-     $                      A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA,
-     $                      WORK( IB+1 ), LDWORK )
+               CALL DLARFB0C2('A', 'A', 'Backward', 'Columnwise',
+     $               M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), LDA, 
+     $               WORK, LDWORK, A, LDA)
             END IF
 *
 *           Apply H to rows 1:m-k+i+ib-1 of current block
@@ -278,11 +277,6 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *           Set rows m-k+i+ib:m of current block to zero
 *
-            DO 40 J = N - K + I, N - K + I + IB - 1
-               DO 30 L = M - K + I + IB, M
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
    50    CONTINUE
       END IF
 *
diff --git a/SRC/dorgqr.f b/SRC/dorgqr.f
index fd88519871..5e5851ff44 100644
--- a/SRC/dorgqr.f
+++ b/SRC/dorgqr.f
@@ -1,3 +1,5 @@
+c
+c
 *> \brief \b DORGQR
 *
 *  =========== DOCUMENTATION ===========
@@ -121,10 +123,11 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup ungqr
+*> \ingroup doubleOTHERcomputational
 *
 *  =====================================================================
       SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
+      IMPLICIT NONE
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -140,16 +143,16 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *  =====================================================================
 *
 *     .. Parameters ..
-      DOUBLE PRECISION   ZERO
-      PARAMETER          ( ZERO = 0.0D+0 )
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
+      INTEGER            I,II, IB, IINFO, IWS, J,JJ, KI, KK, L, LDWORK,
      $                   LWKOPT, NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLARFB, DLARFT, DORG2R, XERBLA
+      EXTERNAL             DLARFB0C2, DLARFT, DORG2R, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -164,7 +167,9 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       INFO = 0
       NB = ILAENV( 1, 'DORGQR', ' ', M, N, K, -1 )
-      LWKOPT = MAX( 1, N )*NB
+      ! Only need a workspace for dorg2r in case of bailout and 
+      ! for the panel factorization
+      LWKOPT = MAX( 1, N )
       WORK( 1 ) = LWKOPT
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
@@ -192,96 +197,129 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
          RETURN
       END IF
 *
+      ! Probably not needed anymore
       NBMIN = 2
+      ! Parameter that controls when we cross from blocked to
+      ! unblocked
       NX = 0
-      IWS = N
-      IF( NB.GT.1 .AND. NB.LT.K ) THEN
 *
-*        Determine when to cross over from blocked to unblocked code.
-*
-         NX = MAX( 0, ILAENV( 3, 'DORGQR', ' ', M, N, K, -1 ) )
-         IF( NX.LT.K ) THEN
+      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*           Determine if workspace is large enough for blocked code.
+*        Use blocked code after the last block.
+*        The first kk columns are handled by the block method.
 *
-            LDWORK = N
-            IWS = LDWORK*NB
-            IF( LWORK.LT.IWS ) THEN
+         KI = K - 2 * NB
+         KK = K - NB
+      ELSE
+         KK = 0
+      END IF
 *
-*              Not enough workspace to use optimal NB:  reduce NB and
-*              determine the minimum value of NB.
+*     Use unblocked code for the only block.
 *
-               NB = LWORK / LDWORK
-               NBMIN = MAX( 2, ILAENV( 2, 'DORGQR', ' ', M, N, K,
-     $                      -1 ) )
-            END IF
-         END IF
+      IF( KK.EQ.0 ) THEN
+            CALL DORG2R( M, N, K, A, LDA, TAU, WORK, IINFO ) 
       END IF
 *
-      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+      IF( KK.GT.0 ) THEN
+         I = KK + 1
+         IB = NB
+         ! This is a specialized form of our loop below. We could make this its
+         ! own function, however this is a specialized step, so currently we
+         ! don't do that.
 *
-*        Use blocked code after the last block.
-*        The first kk columns are handled by the block method.
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-         KI = ( ( K-NX-1 ) / NB )*NB
-         KK = MIN( K, KI+NB )
+         CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I), 
+     $                     LDA, TAU(I), A(I,I), LDA)
 *
-*        Set A(1:kk,kk+1:n) to zero.
+*           Apply H to A(i:m,i+ib:n) from the left
 *
-         DO 20 J = KK + 1, N
-            DO 10 I = 1, KK
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
-      ELSE
-         KK = 0
-      END IF
 *
-*     Use unblocked code for the last or only block.
+**        W := V2
+*        C1 := V2**T
 *
-      IF( KK.LT.N )
-     $   CALL DORG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
-     $                TAU( KK+1 ), WORK, IINFO )
+*         Since C1 starts as 0, we are using this instead of WORK(IB+1).
+*         This helps us reduce the memory footprint by lowering WORK to
+*         be of only size IB
+*         CALL DLACPY('All', N-K, IB, A(I+IB,I), LDA,WORK(IB+1),LDWORK)
+         DO JJ = K - NB + 1, K
+           DO II = K + 1, N
+              A( JJ, II ) = A( II, JJ)
+           END DO
+         END DO
 *
-      IF( KK.GT.0 ) THEN
+*              C1 := T * C1
+*
+         CALL DTRMM( 'Left', 'Upper', 'No transpose', 'Non-unit', IB,
+     $               N-K,ONE, A(I,I), LDA, A(I,I+IB),LDA )
+*
+*                 C2 := C2 - V2 * C1
+*
+         CALL DGEMM( 'No transpose', 'No transpose', M-IB-KK, N-K, IB,
+     $               -ONE, A( I+IB, I ), LDA, A(I,I+IB),LDA, ZERO,
+     $               A( I+IB, I+IB ), LDA )
+         DO JJ = 1, N-K
+            A(I+IB+JJ-1,I+IB+JJ-1) = 1 + A(I+IB+JJ-1,I+IB+JJ-1)
+         END DO
+*
+*              C1 := -V1 * C1 
+*
+         CALL DTRMM( 'Left', 'Lower', 'No transpose', 'Unit', IB, N-K,
+     $               -ONE, A(I,I), LDA, A(I,I+IB),LDA )
 *
-*        Use blocked code
+*        Apply H to rows i:m of current block
 *
-         DO 50 I = KI + 1, 1, -NB
-            IB = MIN( NB, K-I+1 )
-            IF( I+IB.LE.N ) THEN
+         CALL DORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, IINFO)
+         DO I = KI + 1, 1, -NB
+            IB = NB
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i) H(i+1) . . . H(i+ib-1)
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-               CALL DLARFT( 'Forward', 'Columnwise', M-I+1, IB,
-     $                      A( I, I ), LDA, TAU( I ), WORK, LDWORK )
+            CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I), 
+     $         LDA, TAU(I), A(I,I), LDA)
 *
-*              Apply H to A(i:m,i+ib:n) from the left
+*           Apply H to A(i:m,i+ib:n) from the left
 *
-               CALL DLARFB( 'Left', 'No transpose', 'Forward',
-     $                      'Columnwise', M-I+1, N-I-IB+1, IB,
-     $                      A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ),
-     $                      LDA, WORK( IB+1 ), LDWORK )
-            END IF
+            CALL DLARFB0C2('A', 'A', 'Forward', 'Column', M-I+1, 
+     $         N-(I+IB)+1, IB, A(I,I), LDA, A(I,I), LDA, 
+     $         A(I,I+IB), LDA)
+
 *
 *           Apply H to rows i:m of current block
 *
-            CALL DORG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ),
-     $                   WORK,
-     $                   IINFO )
+            CALL DORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, 
+     $         IINFO)
+         END DO
+*        This checks for if K was a perfect multiple of NB
+*        so that we only have a special case for the last block when
+*        necessary
+         IF(I.LT.1) THEN
+            IB = I + NB - 1
+            I = 1
 *
-*           Set rows 1:i-1 of current block to zero
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-            DO 40 J = I, I + IB - 1
-               DO 30 L = 1, I - 1
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+            CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I), 
+     $         LDA, TAU(I), A(I,I), LDA)
+*
+*           Apply H to A(i:m,i+ib:n) from the left
+*
+            CALL DLARFB0C2('A', 'A', 'Forward', 'Column', M-I+1, 
+     $         N-(I+IB)+1, IB, A(I,I), LDA, A(I,I), LDA, 
+     $         A(I,I+IB),LDA)
+
+*
+*           Apply H to rows i:m of current block
+*
+            CALL DORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, 
+     $         IINFO)
+         END IF
       END IF
 *
-      WORK( 1 ) = IWS
+*      WORK( 1 ) = IWS
       RETURN
 *
 *     End of DORGQR
diff --git a/SRC/dorgrq.f b/SRC/dorgrq.f
index c805484578..114891a7c1 100644
--- a/SRC/dorgrq.f
+++ b/SRC/dorgrq.f
@@ -149,7 +149,7 @@ SUBROUTINE DORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
      $                   LWKOPT, NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLARFB, DLARFT, DORGR2, XERBLA
+      EXTERNAL           DLARFB0C2, DLARFT, DORGR2, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -266,10 +266,9 @@ SUBROUTINE DORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *              Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
 *
-               CALL DLARFB( 'Right', 'Transpose', 'Backward',
-     $                      'Rowwise',
-     $                      II-1, N-K+I+IB-1, IB, A( II, 1 ), LDA, WORK,
-     $                      LDWORK, A, LDA, WORK( IB+1 ), LDWORK )
+               CALL DLARFB0C2( 'A', 'A', 'Backward', 'Rowwise', 
+     $               II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK, LDWORK,
+     $               A, LDA)
             END IF
 *
 *           Apply H**T to columns 1:n-k+i+ib-1 of current block
@@ -277,14 +276,6 @@ SUBROUTINE DORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
             CALL DORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
      $                   TAU( I ),
      $                   WORK, IINFO )
-*
-*           Set columns n-k+i+ib:n of current block to zero
-*
-            DO 40 L = N - K + I + IB, N
-               DO 30 J = II, II + IB - 1
-                  A( J, L ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
    50    CONTINUE
       END IF
 *
diff --git a/SRC/slarfb0c2.f b/SRC/slarfb0c2.f
new file mode 100644
index 0000000000..5d235bc011
--- /dev/null
+++ b/SRC/slarfb0c2.f
@@ -0,0 +1,263 @@
+      SUBROUTINE SLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, 
+     $                     LDV, T, LDT, C, LDC)
+         ! Scalar arguments
+         INTEGER           M, N, K, LDV, LDC, LDT
+         ! Note: We should probably get rid of SIDE and TRANS as these flags
+         ! are not used as we are only using this routine inside
+         ! x{OR,UN}G{QR,LQ,QR,LQ}, which have values that never change
+         CHARACTER         SIDE, TRANS, DIRECT, STOREV 
+         ! Array arguments
+         REAL              V(LDV,*), C(LDC,*), T(LDT,*)
+         ! Local scalars
+         LOGICAL           QR, LQ, QL, DIRF, COLV
+         ! External subroutines
+         EXTERNAL          SGEMM, STRMM
+         ! External functions
+         LOGICAL           LSAME
+         EXTERNAL          LSAME
+         ! Parameters
+         REAL              ONE, ZERO, NEG_ONE
+         PARAMETER(ONE=1.0E+0, ZERO = 0.0E+0, NEG_ONE = -1.0E+0)
+
+         ! Beginning of executable statements
+         ! Convert our character flags to logical values
+         DIRF = LSAME(DIRECT,'F')
+         COLV = LSAME(STOREV,'C')
+
+         ! Determine which of the 4 modes are using.
+         ! QR is when we store the reflectors column by column and have the
+         ! 'first' reflector stored in the first column
+         QR = DIRF.AND.COLV
+
+         ! LQ is when we store the reflectors row by row and have the
+         ! 'first' reflector stored in the first row
+         LQ = DIRF.AND.(.NOT.COLV)
+
+         ! QL is when we store the reflectors column by column and have the
+         ! 'first' reflector stored in the last column
+         QL = (.NOT.DIRF).AND.COLV
+
+         ! RQ is when we store the reflectors row by row and have the
+         ! 'first' reflector stored in the last row
+         ! RQ = (.NOT.DIRF).AND.(.NOT.COLV)
+         ! Since we have exactly one of these 4 modes, we don't need to actually
+         ! store the value of RQ, instead we assume this is the case if we fail
+         ! the above 3 checks.
+
+         IF (QR) THEN
+            ! We are computing C = HC = (I - VTV')C
+            ! Where: V = [ V1 ] and C = [ C1 ]
+            !            [ V2 ]         [ C2 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{M-K\times K}
+            !     C1=0\in\R^{K\times N}
+            !     C2\in\R^{M-K\times N}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0  ]
+            !               [ V2 ]             [ C2 ]
+            !   = [ 0  ] - [ V1 ]T*V2'*C2
+            !     [ C2 ]   [ V2 ]
+            !
+            !   = [ 0  ] - [ V1*T*V2'*C2 ]
+            !     [ C2 ]   [ V2*T*V2'*C2 ]
+            !
+            !   = [      V1*T*V2'*C2 ]
+            !     [ C2 - V2*T*V2'*C2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = V2'*C2
+            ! C1 = T*C1
+            ! C2 = C2 - V2*C1
+            ! C1 = -V1*C1
+            !
+            ! To achieve the same end result
+            !
+            ! C1 = V2'*C2
+            !
+            CALL SGEMM('Transpose', 'No Transpose', K, N, M - K,
+     $                  ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO, 
+     $                  C, LDC)
+            !
+            ! C1 = T*C1
+            !
+            CALL STRMM('Left', 'Upper', 'No Transpose', 'Non-unit', 
+     $                  K, N, ONE, T, LDT, C, LDC)
+            !
+            ! C2 = C2 - V2*C1 = -V2*C1 + C2
+            !
+            CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $                  NEG_ONE, V(K+1,1), LDV, C, LDC, ONE, 
+     $                  C(K+1,1), LDC)
+            !
+            ! C1 = -V1*C1
+            !
+            CALL STRMM('Left', 'Lower', 'No Transpose', 'Unit', 
+     $                  K, N, NEG_ONE, V, LDV, C, LDC)
+         ELSE IF (LQ) THEN
+            ! We are computing C = CH' = C(I-V'T'V)
+            ! Where: V = [ V1 V2 ] and C = [ C1 C2 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{K\times N-K}
+            !     C1=0\in\R^{M\times K}
+            !     C2\in\R^{M\times N-K}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ])
+            !                         [ V2' ]
+            !
+            !   = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ]
+            !                          [ V2' ]
+            !
+            !   = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ]
+            !
+            !   = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = C2*V2'
+            ! C1 = C1*T'
+            ! C2 = C2 - C1*V2
+            ! C1 = -C1*V1
+            !
+            ! To achieve the same end result
+            !
+            ! C1 = C2*V2'
+            !
+            CALL SGEMM('No Transpose', 'Transpose', M, K, N-K, 
+     $            ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C, LDC)
+            !
+            ! C1 = C1*T'
+            !
+            CALL STRMM('Right', 'Upper', 'Transpose', 'Non-unit',
+     $            M, K, ONE, T, LDT, C, LDC)
+            !
+            ! C2 = C2 - C1*V2 = -C1*V2 + C2
+            !
+            CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $            NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, 
+     $            C(1,K+1), LDC)
+            !
+            ! C1 = -C1*V1
+            !
+            CALL STRMM('Right', 'Upper', 'No Transpose', 'Unit',
+     $            M, K, NEG_ONE, V, LDV, C, LDC)
+         ELSE IF (QL) THEN
+            ! We are computing C = HC = (I - VTV')C
+            ! Where: V = [ V2 ] and C = [ C2 ]
+            !            [ V1 ]         [ C1 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{M-K\times K}
+            !     C1=0\in\R^{K\times N}
+            !     C2\in\R^{M-K\times N}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ]
+            !             [ V1 ]             [ 0  ]
+            !
+            !   = [ C2 ] - [ V2 ]T*V2'*C2
+            !     [ 0  ]   [ V1 ]
+            !
+            !   = [ C2 ] - [ V2*T*V2'*C2 ]
+            !     [ 0  ]   [ V1*T*V2'*C2 ]
+            !
+            !   = [ C2 - V2*T*V2'*C2 ]
+            !     [    - V1*T*V2'*C2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = V2'*C2
+            ! C1 = T*C1
+            ! C2 = C2 - V2*C1
+            ! C1 = -V1*C1
+            !
+            ! To achieve the same end result
+            !
+            ! C1 = V2'*C2
+            !
+            CALL SGEMM('Transpose', 'No Transpose', K, N, M-K, 
+     $         ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC)
+            !
+            ! C1 = T*C1
+            !
+            CALL STRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+     $         K, N, ONE, T, LDT, C(M-K+1,1), LDC)
+            !
+            ! C2 = C2 - V2*C1 = -V2*C1 + C2
+            !
+            CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K, 
+     $         NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC)
+            !
+            ! C1 = -V1*C1
+            !
+            CALL STRMM('Left', 'Upper', 'No Transpose', 'Unit', 
+     $         K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC)
+         ELSE ! IF (RQ) THEN
+            ! We are computing C = CH' = C(I-V'T'V)
+            ! Where: V = [ V2 V1] and C = [ C2 C1 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{K\times N-K}
+            !     C1=0\in\R^{M\times K}
+            !     C2\in\R^{M\times N-K}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ]
+            !                          [ V1' ]
+            !
+            !   = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ]
+            !                           [ V1' ]
+            !
+            !   = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ]
+            !
+            !   = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ] 
+            !
+            !   = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = C2*V2'
+            ! C1 = C1*T'
+            ! C2 = C2 - C1*V2
+            ! C1 = -C1*V1
+            ! 
+            !
+            ! To achieve the same end result
+            !
+            ! C1 = C2*V2'
+            !
+            CALL SGEMM('No Transpose', 'Transpose', M, K, N-K,
+     $         ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC)
+            !
+            ! C1 = C1*T'
+            !
+            CALL STRMM('Right', 'Lower', 'Transpose', 'Non-unit', 
+     $         M, K, ONE, T, LDT, C(1, N-K+1), LDC)
+            !
+            ! C2 = C2 - C1*V2 = -C1*V2 + C2
+            !
+            CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K, 
+     $         NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC)
+            !
+            ! C1 = -C1*V1
+            !
+            CALL STRMM('Right', 'Lower', 'No Transpose', 'Unit', 
+     $         M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC)
+         END IF
+      END SUBROUTINE

From 09a04d7931557c921f906d1784844fa8ddc30738 Mon Sep 17 00:00:00 2001
From: Johnathan Rhyne <rhynejohnathan@gmail.com>
Date: Sun, 9 Feb 2025 10:51:51 -0700
Subject: [PATCH 02/10] Real version of xorgqr using new larfb implemented and
 passing tests locally

---
 SRC/CMakeLists.txt |   2 +-
 SRC/Makefile       |   2 +-
 SRC/dlarfb0c2.f    | 182 ++++++++++++++++++++++++++++++++++++---------
 SRC/dorglq.f       |  96 ++++++++++++++++--------
 SRC/dorgql.f       | 103 +++++++++++++------------
 SRC/dorgqr.f       |  93 ++++++++---------------
 SRC/dorgrq.f       |  76 +++++++++++--------
 SRC/slarfb0c2.f    | 182 ++++++++++++++++++++++++++++++++++++---------
 SRC/sorglq.f       | 107 +++++++++++++++-----------
 SRC/sorgql.f       | 108 ++++++++++++---------------
 SRC/sorgqr.f       | 126 +++++++++++++++----------------
 SRC/sorgrq.f       |  85 +++++++++++----------
 12 files changed, 713 insertions(+), 449 deletions(-)

diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index 88e503efea..12cfebca5f 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -106,7 +106,7 @@ set(SLASRC
    slaqgb.f slaqge.f slaqp2.f slaqps.f slaqp2rk.f slaqp3rk.f slaqsb.f slaqsp.f slaqsy.f
    slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f
    slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f
-   slarf.f  slarf1f.f slarf1l.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f
+   slarf.f  slarf1f.f slarf1l.f slarfb.f slarfb0c2.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f
    slargv.f slarmm.f slarrv.f slartv.f
    slarz.f  slarzb.f slarzt.f slasy2.f
    slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f
diff --git a/SRC/Makefile b/SRC/Makefile
index dc8f472092..5fb5db20a6 100644
--- a/SRC/Makefile
+++ b/SRC/Makefile
@@ -137,7 +137,7 @@ SLASRC = \
    slaqgb.o slaqge.o slaqp2.o slaqps.o slaqp2rk.o slaqp3rk.o slaqsb.o slaqsp.o slaqsy.o \
    slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \
    slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \
-   slarf.o  slarf1f.o slarf1l.o slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o \
+   slarf.o  slarf1f.o slarf1l.o slarfb.o slarfb0c2.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o \
    slargv.o slarmm.o slarrv.o slartv.o \
    slarz.o  slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o slasyf_rook.o \
    slasyf_rk.o \
diff --git a/SRC/dlarfb0c2.f b/SRC/dlarfb0c2.f
index cb7f9d3873..4651cdbea4 100644
--- a/SRC/dlarfb0c2.f
+++ b/SRC/dlarfb0c2.f
@@ -1,20 +1,24 @@
-      SUBROUTINE DLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, 
-     $                     LDV, T, LDT, C, LDC)
+      SUBROUTINE DLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+     $                     K, V, LDV, T, LDT, C, LDC)
          ! Scalar arguments
          INTEGER           M, N, K, LDV, LDC, LDT
-         ! Note: We should probably get rid of SIDE and TRANS as these flags
-         ! are not used as we are only using this routine inside
-         ! x{OR,UN}G{QR,LQ,QR,LQ}, which have values that never change
          CHARACTER         SIDE, TRANS, DIRECT, STOREV 
+         ! True means that we are assuming C2 is the identity matrix
+         !     and thus don't reference whatever is present in C2 
+         !     at the beginning.
+         LOGICAL           C2I
+
          ! Array arguments
          DOUBLE PRECISION  V(LDV,*), C(LDC,*), T(LDT,*)
          ! Local scalars
-         LOGICAL           QR, LQ, QL, DIRF, COLV
-         ! External subroutines
-         EXTERNAL          DGEMM, DTRMM
+         LOGICAL           QR, LQ, QL, DIRF, COLV, SIDEL, SIDER,
+     $                     TRANST
+         INTEGER           I, J
          ! External functions
          LOGICAL           LSAME
          EXTERNAL          LSAME
+         ! External subroutines
+         EXTERNAL          DGEMM, DTRMM, XERBLA
          ! Parameters
          DOUBLE PRECISION ONE, ZERO, NEG_ONE
          PARAMETER(ONE=1.0D+0, ZERO = 0.0D+0, NEG_ONE = -1.0D+0)
@@ -23,6 +27,9 @@ SUBROUTINE DLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V,
          ! Convert our character flags to logical values
          DIRF = LSAME(DIRECT,'F')
          COLV = LSAME(STOREV,'C')
+         SIDEL = LSAME(SIDE,'L')
+         SIDER = LSAME(SIDE,'R')
+         TRANST = LSAME(TRANS,'T')
 
          ! Determine which of the 4 modes are using.
          ! QR is when we store the reflectors column by column and have the
@@ -78,26 +85,53 @@ SUBROUTINE DLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V,
             !
             ! To achieve the same end result
             !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDEL ) THEN
+               CALL XERBLA('DLARFB0C2', 2)
+               RETURN
+            ELSE IF(TRANST) THEN
+               CALL XERBLA('DLARFB0C2', 3)
+               RETURN
+            END IF
+            !
             ! C1 = V2'*C2
             !
-            CALL DGEMM('Transpose', 'No Transpose', K, N, M - K,
-     $                  ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO, 
-     $                  C, LDC)
+            IF (C2I) THEN
+               DO J = 1, N
+                  DO I = 1, K
+                     C(I,J) = V(K+J,I)
+                  END DO
+               END DO
+            ELSE
+               CALL DGEMM('Transpose', 'No Transpose', K, N, M - K,
+     $                     ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO,
+     $                     C, LDC)
+            END IF
             !
             ! C1 = T*C1
             !
-            CALL DTRMM('Left', 'Upper', 'No Transpose', 'Non-unit', 
+            CALL DTRMM('Left', 'Upper', 'No Transpose', 'Non-unit',
      $                  K, N, ONE, T, LDT, C, LDC)
             !
             ! C2 = C2 - V2*C1 = -V2*C1 + C2
             !
-            CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K,
-     $                  NEG_ONE, V(K+1,1), LDV, C, LDC, ONE, 
-     $                  C(K+1,1), LDC)
+            IF (C2I) THEN
+               CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $                     NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO,
+     $                     C(K+1,1), LDC)
+               DO I = 1, N
+                  C(K+I,I) = C(K+I,I) + ONE
+               END DO
+            ELSE
+               CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $                     NEG_ONE, V(K+1,1), LDV, C, LDC, ONE,
+     $                     C(K+1,1), LDC)
+            END IF
             !
             ! C1 = -V1*C1
             !
-            CALL DTRMM('Left', 'Lower', 'No Transpose', 'Unit', 
+            CALL DTRMM('Left', 'Lower', 'No Transpose', 'Unit',
      $                  K, N, NEG_ONE, V, LDV, C, LDC)
          ELSE IF (LQ) THEN
             ! We are computing C = CH' = C(I-V'T'V)
@@ -131,10 +165,29 @@ SUBROUTINE DLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V,
             !
             ! To achieve the same end result
             !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDER ) THEN
+               CALL XERBLA('DLARFB0C2', 2)
+               RETURN
+            ELSE IF(.NOT.TRANST) THEN
+               CALL XERBLA('DLARFB0C2', 3)
+               RETURN
+            END IF
+            !
             ! C1 = C2*V2'
             !
-            CALL DGEMM('No Transpose', 'Transpose', M, K, N-K, 
-     $            ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C, LDC)
+            IF( C2I ) THEN
+               DO J = 1, K
+                  DO I = 1, M
+                     C(I,J) = V(J,K+I)
+                  END DO
+               END DO
+            ELSE
+               CALL DGEMM('No Transpose', 'Transpose', M, K, N-K,
+     $               ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C,
+     $               LDC)
+            END IF
             !
             ! C1 = C1*T'
             !
@@ -143,9 +196,18 @@ SUBROUTINE DLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V,
             !
             ! C2 = C2 - C1*V2 = -C1*V2 + C2
             !
-            CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K,
-     $            NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, 
-     $            C(1,K+1), LDC)
+            IF( C2I ) THEN
+               CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $               NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1),
+     $               LDC)
+               DO I = 1, M
+                  C(I,K+I) = C(I,K+I) + ONE
+               END DO
+            ELSE
+               CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $               NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1),
+     $               LDC)
+            END IF
             !
             ! C1 = -C1*V1
             !
@@ -186,10 +248,28 @@ SUBROUTINE DLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V,
             !
             ! To achieve the same end result
             !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDEL ) THEN
+               CALL XERBLA('DLARFB0C2', 2)
+               RETURN
+            ELSE IF(TRANST) THEN
+               CALL XERBLA('DLARFB0C2', 3)
+               RETURN
+            END IF
+            !
             ! C1 = V2'*C2
             !
-            CALL DGEMM('Transpose', 'No Transpose', K, N, M-K, 
-     $         ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC)
+            IF( C2I ) THEN
+               DO J = 1, N
+                  DO I = 1, K
+                     C(M-K+I,J) = V(J,I)
+                  END DO
+               END DO
+            ELSE
+               CALL DGEMM('Transpose', 'No Transpose', K, N, M-K,
+     $            ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC)
+            END IF
             !
             ! C1 = T*C1
             !
@@ -198,12 +278,20 @@ SUBROUTINE DLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V,
             !
             ! C2 = C2 - V2*C1 = -V2*C1 + C2
             !
-            CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K, 
-     $         NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC)
+            IF( C2I ) THEN
+               CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $            NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC)
+               DO I = 1, N
+                  C(I,I) = C(I,I) + ONE
+               END DO
+            ELSE
+               CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $            NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC)
+            END IF
             !
             ! C1 = -V1*C1
             !
-            CALL DTRMM('Left', 'Upper', 'No Transpose', 'Unit', 
+            CALL DTRMM('Left', 'Upper', 'No Transpose', 'Unit',
      $         K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC)
          ELSE ! IF (RQ) THEN
             ! We are computing C = CH' = C(I-V'T'V)
@@ -226,7 +314,7 @@ SUBROUTINE DLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V,
             !
             !   = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ]
             !
-            !   = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ] 
+            !   = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ]
             !
             !   = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ]
             !
@@ -236,28 +324,54 @@ SUBROUTINE DLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V,
             ! C1 = C1*T'
             ! C2 = C2 - C1*V2
             ! C1 = -C1*V1
-            ! 
+            !
             !
             ! To achieve the same end result
             !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDER ) THEN
+               CALL XERBLA('DLARFB0C2', 2)
+               RETURN
+            ELSE IF(.NOT.TRANST) THEN
+               CALL XERBLA('DLARFB0C2', 3)
+               RETURN
+            END IF
+            !
             ! C1 = C2*V2'
             !
-            CALL DGEMM('No Transpose', 'Transpose', M, K, N-K,
-     $         ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC)
+            IF( C2I ) THEN
+               DO J = 1, K
+                  DO I = 1, M
+                     C(I,N-K+J) = V(J,I)
+                  END DO
+               END DO
+            ELSE
+               CALL DGEMM('No Transpose', 'Transpose', M, K, N-K,
+     $            ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC)
+            END IF
             !
             ! C1 = C1*T'
             !
-            CALL DTRMM('Right', 'Lower', 'Transpose', 'Non-unit', 
+            CALL DTRMM('Right', 'Lower', 'Transpose', 'Non-unit',
      $         M, K, ONE, T, LDT, C(1, N-K+1), LDC)
             !
             ! C2 = C2 - C1*V2 = -C1*V2 + C2
             !
-            CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K, 
-     $         NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC)
+            IF( C2I ) THEN
+               CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $            NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC)
+               DO I = 1, M
+                  C(I,I) = C(I,I) + ONE
+               END DO
+            ELSE
+               CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $            NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC)
+            END IF
             !
             ! C1 = -C1*V1
             !
-            CALL DTRMM('Right', 'Lower', 'No Transpose', 'Unit', 
+            CALL DTRMM('Right', 'Lower', 'No Transpose', 'Unit',
      $         M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC)
          END IF
       END SUBROUTINE
diff --git a/SRC/dorglq.f b/SRC/dorglq.f
index b686cc468a..6507f625b3 100644
--- a/SRC/dorglq.f
+++ b/SRC/dorglq.f
@@ -5,6 +5,7 @@
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
+*> \htmlonly
 *> Download DORGLQ + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dorglq.f">
 *> [TGZ]</a>
@@ -12,6 +13,7 @@
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dorglq.f">
 *> [TXT]</a>
+*> \endhtmlonly
 *
 *  Definition:
 *  ===========
@@ -161,8 +163,7 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       INFO = 0
       NB = ILAENV( 1, 'DORGLQ', ' ', M, N, K, -1 )
-      ! DLARFB0C2 means we only need a workspace for calls to dorgl2
-      LWKOPT = MAX( 1, M )
+      LWKOPT = MAX( 1, M ) * NB
       WORK( 1 ) = LWKOPT
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
@@ -218,55 +219,90 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*        Use blocked code after the last block.
-*        The first kk rows are handled by the block method.
-*
-         KI = ( ( K-NX-1 ) / NB )*NB
-         KK = MIN( K, KI+NB )
-*
-*        Set A(kk+1:m,1:kk) to zero.
+*        Handle the first block assuming we are applying to the
+*        identity, then resume regular blocking method after
 *
+         KI = K - 2 * NB
+         KK = K - NB
       ELSE
          KK = 0
       END IF
 *
-*     Use unblocked code for the last or only block.
+*     Potentially bail to the unblocked version
 *
-      IF( KK.LT.M )
-     $   CALL DORGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
-     $                TAU( KK+1 ), WORK, IINFO )
+      IF( KK.EQ.0 ) THEN
+         CALL DORGL2( M, N, K, A, LDA, TAU, WORK, IINFO )
+      END IF
 *
       IF( KK.GT.0 ) THEN
+         I = KK + 1
+         IB = NB
+*
+*        Form the triangular factor of the block reflector
+*        H = H(i) H(i+1) . . . H(i+ib-1)
+*
+         CALL DLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $               LDA, TAU( I ), WORK, LDWORK )
+*
+*        Apply H**T to A(i+ib:m,i:n) from the right
+*
+         CALL DLARFB0C2(.TRUE., 'Right', 'Transpose', 'Forward',
+     $         'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $         LDWORK, A(I+IB,I), LDA)
+*
+*        Apply H**T to columns i:n of current block
+
+         CALL DORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+     $                WORK, IINFO )
 *
 *        Use blocked code
 *
-         DO 50 I = KI + 1, 1, -NB
-            IB = MIN( NB, K-I+1 )
-            IF( I+IB.LE.M ) THEN
+         DO I = KI + 1, 1, -NB
+            IB = NB
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i) H(i+1) . . . H(i+ib-1)
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-               CALL DLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I,
-     $                      I ),
-     $                      LDA, TAU( I ), WORK, LDWORK )
+            CALL DLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $                  LDA, TAU( I ), WORK, LDWORK )
 *
-*              Apply H**T to A(i+ib:m,i:n) from the right
+*           Apply H**T to A(i+ib:m,i:n) from the right
 *
-               CALL DLARFB0C2('A', 'A', 'Forward', 'Rowwise', 
-     $               M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, LDWORK,
-     $               A(I+IB,I), LDA)
-            END IF
+            CALL DLARFB0C2(.FALSE., 'Right', 'Transpose', 'Forward',
+     $            'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $            LDWORK, A(I+IB,I), LDA)
 *
 *           Apply H**T to columns i:n of current block
 
             CALL DORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
-     $                   WORK,
-     $                   IINFO )
+     $                   WORK, IINFO )
+         END DO
 *
-*           Set columns 1:i-1 of current block to zero
+*        This checks for if K was a perfect multiple of NB
+*        so that we only have a special case for the last block when
+*        necessary
 *
-   50    CONTINUE
+         IF(I.LT.1) THEN
+            IB = I + NB - 1
+            I = 1
+*
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
+*
+            CALL DLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $                  LDA, TAU( I ), WORK, LDWORK )
+*
+*           Apply H**T to A(i+ib:m,i:n) from the right
+*
+            CALL DLARFB0C2(.FALSE., 'Right', 'Transpose', 'Forward',
+     $            'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $            LDWORK, A(I+IB,I), LDA)
+*
+*           Apply H**T to columns i:n of current block
+
+            CALL DORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+     $                   WORK, IINFO )
+         END IF
       END IF
 *
       WORK( 1 ) = IWS
diff --git a/SRC/dorgql.f b/SRC/dorgql.f
index 892c91b855..9f34c858f9 100644
--- a/SRC/dorgql.f
+++ b/SRC/dorgql.f
@@ -5,6 +5,7 @@
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
+*> \htmlonly
 *> Download DORGQL + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dorgql.f">
 *> [TGZ]</a>
@@ -12,6 +13,7 @@
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dorgql.f">
 *> [TXT]</a>
+*> \endhtmlonly
 *
 *  Definition:
 *  ===========
@@ -143,8 +145,7 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT,
-     $                   NB, NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, J, KK, LWKOPT, NB, NBMIN
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           DLARFB0C2, DLARFT, DORG2L, XERBLA
@@ -177,7 +178,8 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
             LWKOPT = 1
          ELSE
             NB = ILAENV( 1, 'DORGQL', ' ', M, N, K, -1 )
-            LWKOPT = N*NB
+            ! Only need a workspace for calls to dorg2l
+            LWKOPT = N
          END IF
          WORK( 1 ) = LWKOPT
 *
@@ -200,82 +202,77 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
       END IF
 *
       NBMIN = 2
-      NX = 0
       IWS = N
-      IF( NB.GT.1 .AND. NB.LT.K ) THEN
 *
-*        Determine when to cross over from blocked to unblocked code.
+      IF( NB.GE.NBMIN .AND. NB.LT.K ) THEN
 *
-         NX = MAX( 0, ILAENV( 3, 'DORGQL', ' ', M, N, K, -1 ) )
-         IF( NX.LT.K ) THEN
+*        We want to use the blocking method as long as our matrix is big enough
 *
-*           Determine if workspace is large enough for blocked code.
-*
-            LDWORK = N
-            IWS = LDWORK*NB
-            IF( LWORK.LT.IWS ) THEN
+         KK = K
+      ELSE
+         KK = 0
+      END IF
 *
-*              Not enough workspace to use optimal NB:  reduce NB and
-*              determine the minimum value of NB.
+*     Possibly bail to the unblocked code.
 *
-               NB = LWORK / LDWORK
-               NBMIN = MAX( 2, ILAENV( 2, 'DORGQL', ' ', M, N, K,
-     $                      -1 ) )
-            END IF
-         END IF
+      IF( KK.EQ.0 ) THEN
+         CALL DORG2L( M, N, K, A, LDA, TAU, WORK, IINFO )
       END IF
 *
-      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+*     Use our blocked code for everything
 *
-*        Use blocked code after the first block.
-*        The last kk columns are handled by the block method.
+      IF( KK.GT.0 ) THEN
 *
-         KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
+*        Factor the first block assuming that our first application
+*        will be on the Identity matrix
 *
-*        Set A(m-kk+1:m,1:n-kk) to zero.
+         I = 1
+         IB = NB
 *
-         DO 20 J = 1, N - KK
-            DO 10 I = M - KK + 1, M
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
-      ELSE
-         KK = 0
-      END IF
+*        Form the triangular factor of the block reflector
+*        H = H(i+ib-1) . . . H(i+1) H(i)
 *
-*     Use unblocked code for the first or only block.
+         CALL DLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+     $                  A( 1, N-K+I ), LDA, TAU( I ),
+     $                  A( M-K+I, N-K+I ), LDA)
 *
-      CALL DORG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+*        Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
 *
-      IF( KK.GT.0 ) THEN
+         CALL DLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward', 
+     $         'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), 
+     $         LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
+*
+*        Apply H to rows 1:m-k+i+ib-1 of current block
+*
+         CALL DORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
+     $                TAU( I ), WORK, IINFO )
+
+*        Use blocked code on the remaining blocks if there are any.
 *
-*        Use blocked code
+         DO I = NB+1, K, NB
 *
-         DO 50 I = K - KK + 1, K, NB
-            IB = MIN( NB, K-I+1 )
-            IF( N-K+I.GT.1 ) THEN
+*           The last block may be less than size NB
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i+ib-1) . . . H(i+1) H(i)
+            IB = MIN(NB, K-I+1)
 *
-               CALL DLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
-     $                      A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK )
+*           Form the triangular factor of the block reflector
+*           H = H(i+ib-1) . . . H(i+1) H(i)
 *
-*              Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+            CALL DLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+     $                  A( 1, N-K+I ), LDA, TAU( I ), 
+     $                  A( M-K+I, N-K+I ), LDA )
 *
-               CALL DLARFB0C2('A', 'A', 'Backward', 'Columnwise',
-     $               M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), LDA, 
-     $               WORK, LDWORK, A, LDA)
-            END IF
+*           Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+*
+            CALL DLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $            'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, 
+     $            A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
 *
 *           Apply H to rows 1:m-k+i+ib-1 of current block
 *
             CALL DORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
      $                   TAU( I ), WORK, IINFO )
-*
-*           Set rows m-k+i+ib:m of current block to zero
-*
-   50    CONTINUE
+         END DO
       END IF
 *
       WORK( 1 ) = IWS
diff --git a/SRC/dorgqr.f b/SRC/dorgqr.f
index 5fae768438..0ed278ffc0 100644
--- a/SRC/dorgqr.f
+++ b/SRC/dorgqr.f
@@ -1,5 +1,3 @@
-c
-c
 *> \brief \b DORGQR
 *
 *  =========== DOCUMENTATION ===========
@@ -7,6 +5,7 @@
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
+*> \htmlonly
 *> Download DORGQR + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dorgqr.f">
 *> [TGZ]</a>
@@ -14,6 +13,7 @@
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dorgqr.f">
 *> [TXT]</a>
+*> \endhtmlonly
 *
 *  Definition:
 *  ===========
@@ -125,7 +125,6 @@
 *
 *  =====================================================================
       SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
-      IMPLICIT NONE
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -141,13 +140,13 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *  =====================================================================
 *
 *     .. Parameters ..
-      DOUBLE PRECISION   ZERO, ONE
-      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+      DOUBLE PRECISION   ZERO
+      PARAMETER          ( ZERO = 0.0D+0 )
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I,II, IB, IINFO, IWS, J,JJ, KI, KK, L, LDWORK,
-     $                   LWKOPT, NB, NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, J, KI, KK, LWKOPT,
+     $                   NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
       EXTERNAL             DLARFB0C2, DLARFT, DORG2R, XERBLA
@@ -165,7 +164,7 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       INFO = 0
       NB = ILAENV( 1, 'DORGQR', ' ', M, N, K, -1 )
-      ! Only need a workspace for dorg2r in case of bailout and 
+      ! Only need a workspace for dorg2r in case of bailout and
       ! for the panel factorization
       LWKOPT = MAX( 1, N )
       WORK( 1 ) = LWKOPT
@@ -195,16 +194,14 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
          RETURN
       END IF
 *
-      ! Probably not needed anymore
       NBMIN = 2
-      ! Parameter that controls when we cross from blocked to
-      ! unblocked
-      NX = 0
+      NX = MAX(0, ILAENV(3, 'SORGQR', ' ', M, N, K, -1))
+      IWS = N
 *
       IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*        Use blocked code after the last block.
-*        The first kk columns are handled by the block method.
+*        Handle the first block assuming we are applying to the
+*        identity, then resume regular blocking method after
 *
          KI = K - 2 * NB
          KK = K - NB
@@ -212,59 +209,27 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
          KK = 0
       END IF
 *
-*     Use unblocked code for the only block.
+*     Potentially bail to the unblocked code.
 *
       IF( KK.EQ.0 ) THEN
-            CALL DORG2R( M, N, K, A, LDA, TAU, WORK, IINFO ) 
+            CALL DORG2R( M, N, K, A, LDA, TAU, WORK, IINFO )
       END IF
 *
       IF( KK.GT.0 ) THEN
          I = KK + 1
          IB = NB
-         ! This is a specialized form of our loop below. We could make this its
-         ! own function, however this is a specialized step, so currently we
-         ! don't do that.
 *
 *           Form the triangular factor of the block reflector
 *           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-         CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I), 
+         CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
      $                     LDA, TAU(I), A(I,I), LDA)
 *
 *           Apply H to A(i:m,i+ib:n) from the left
 *
-*
-**        W := V2
-*        C1 := V2**T
-*
-*         Since C1 starts as 0, we are using this instead of WORK(IB+1).
-*         This helps us reduce the memory footprint by lowering WORK to
-*         be of only size IB
-*         CALL DLACPY('All', N-K, IB, A(I+IB,I), LDA,WORK(IB+1),LDWORK)
-         DO JJ = K - NB + 1, K
-           DO II = K + 1, N
-              A( JJ, II ) = A( II, JJ)
-           END DO
-         END DO
-*
-*              C1 := T * C1
-*
-         CALL DTRMM( 'Left', 'Upper', 'No transpose', 'Non-unit', IB,
-     $               N-K,ONE, A(I,I), LDA, A(I,I+IB),LDA )
-*
-*                 C2 := C2 - V2 * C1
-*
-         CALL DGEMM( 'No transpose', 'No transpose', M-IB-KK, N-K, IB,
-     $               -ONE, A( I+IB, I ), LDA, A(I,I+IB),LDA, ZERO,
-     $               A( I+IB, I+IB ), LDA )
-         DO JJ = 1, N-K
-            A(I+IB+JJ-1,I+IB+JJ-1) = 1 + A(I+IB+JJ-1,I+IB+JJ-1)
-         END DO
-*
-*              C1 := -V1 * C1 
-*
-         CALL DTRMM( 'Left', 'Lower', 'No transpose', 'Unit', IB, N-K,
-     $               -ONE, A(I,I), LDA, A(I,I+IB),LDA )
+         CALL DLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward',
+     $      'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I),
+     $      LDA, A(I,I+IB), LDA)
 *
 *        Apply H to rows i:m of current block
 *
@@ -275,24 +240,26 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *           Form the triangular factor of the block reflector
 *           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-            CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I), 
+            CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
      $         LDA, TAU(I), A(I,I), LDA)
 *
 *           Apply H to A(i:m,i+ib:n) from the left
 *
-            CALL DLARFB0C2('A', 'A', 'Forward', 'Column', M-I+1, 
-     $         N-(I+IB)+1, IB, A(I,I), LDA, A(I,I), LDA, 
-     $         A(I,I+IB), LDA)
+            CALL DLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $         'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+     $         LDA, A(I,I), LDA, A(I,I+IB), LDA)
 
 *
 *           Apply H to rows i:m of current block
 *
-            CALL DORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, 
+            CALL DORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
      $         IINFO)
          END DO
+*
 *        This checks for if K was a perfect multiple of NB
 *        so that we only have a special case for the last block when
 *        necessary
+*
          IF(I.LT.1) THEN
             IB = I + NB - 1
             I = 1
@@ -300,24 +267,24 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *           Form the triangular factor of the block reflector
 *           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-            CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I), 
+            CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
      $         LDA, TAU(I), A(I,I), LDA)
 *
 *           Apply H to A(i:m,i+ib:n) from the left
 *
-            CALL DLARFB0C2('A', 'A', 'Forward', 'Column', M-I+1, 
-     $         N-(I+IB)+1, IB, A(I,I), LDA, A(I,I), LDA, 
-     $         A(I,I+IB),LDA)
+            CALL DLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $         'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+     $         LDA, A(I,I), LDA, A(I,I+IB), LDA)
 
 *
 *           Apply H to rows i:m of current block
 *
-            CALL DORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, 
+            CALL DORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
      $         IINFO)
          END IF
       END IF
 *
-*      WORK( 1 ) = IWS
+      WORK( 1 ) = IWS
       RETURN
 *
 *     End of DORGQR
diff --git a/SRC/dorgrq.f b/SRC/dorgrq.f
index ccaad3b433..ce61238776 100644
--- a/SRC/dorgrq.f
+++ b/SRC/dorgrq.f
@@ -5,6 +5,7 @@
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
+*> \htmlonly
 *> Download DORGRQ + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dorgrq.f">
 *> [TGZ]</a>
@@ -12,6 +13,7 @@
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dorgrq.f">
 *> [TXT]</a>
+*> \endhtmlonly
 *
 *  Definition:
 *  ===========
@@ -227,54 +229,70 @@ SUBROUTINE DORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*        Use blocked code after the first block.
-*        The last kk rows are handled by the block method.
+*        We want to use the blocking method as long as our matrix is big enough
+*        and it's deemed worthwhile with the extra memory allocations
 *
-         KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
-*
-*        Set A(1:m-kk,n-kk+1:n) to zero.
-*
-         DO 20 J = N - KK + 1, N
-            DO 10 I = 1, M - KK
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
+         KK = K
       ELSE
          KK = 0
       END IF
 *
-*     Use unblocked code for the first or only block.
+*     Potentially bail to the unblocked code
 *
-      CALL DORGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+      IF( KK.EQ.0 ) THEN
+         CALL DORGR2( M, N, K, A, LDA, TAU, WORK, IINFO )
+      END IF
 *
       IF( KK.GT.0 ) THEN
 *
-*        Use blocked code
+*        Factor the first block assuming that our first application
+*        will be on the Identity matrix
+*
+         I = 1
+         IB = NB
+         II = M - K + I
+*
+*        Form the triangular factor of the block reflector
+*        H = H(i+ib-1) . . . H(i+1) H(i)
+*
+         CALL DLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+     $                A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+*
+*        Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
 *
-         DO 50 I = K - KK + 1, K, NB
-            IB = MIN( NB, K-I+1 )
+         CALL DLARFB0C2(.TRUE., 'Right', 'Transpose', 'Backward', 
+     $         'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK,
+     $         LDWORK, A, LDA)
+*
+*           Apply H**T to columns 1:n-k+i+ib-1 of current block
+*
+         CALL DORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
+     $                TAU( I ), WORK, IINFO )
+
+         DO I = NB + 1, K, NB
+*
+*           The last block may be less than size NB
+*
+            IB = MIN(NB, K-I+1)
             II = M - K + I
-            IF( II.GT.1 ) THEN
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i+ib-1) . . . H(i+1) H(i)
+*           Form the triangular factor of the block reflector
+*           H = H(i+ib-1) . . . H(i+1) H(i)
 *
-               CALL DLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
-     $                      A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+            CALL DLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+     $                   A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
 *
-*              Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+*           Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
 *
-               CALL DLARFB0C2( 'A', 'A', 'Backward', 'Rowwise', 
-     $               II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK, LDWORK,
-     $               A, LDA)
-            END IF
+            CALL DLARFB0C2(.FALSE., 'Right', 'Transpose', 
+     $            'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, 
+     $            A(II,1), LDA, WORK, LDWORK, A, LDA)
 *
 *           Apply H**T to columns 1:n-k+i+ib-1 of current block
 *
             CALL DORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
-     $                   TAU( I ),
-     $                   WORK, IINFO )
-   50    CONTINUE
+     $                   TAU( I ), WORK, IINFO )
+         END DO
       END IF
 *
       WORK( 1 ) = IWS
diff --git a/SRC/slarfb0c2.f b/SRC/slarfb0c2.f
index 5d235bc011..74a31f6f76 100644
--- a/SRC/slarfb0c2.f
+++ b/SRC/slarfb0c2.f
@@ -1,20 +1,24 @@
-      SUBROUTINE SLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, 
-     $                     LDV, T, LDT, C, LDC)
+      SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+     $                     K, V, LDV, T, LDT, C, LDC)
          ! Scalar arguments
          INTEGER           M, N, K, LDV, LDC, LDT
-         ! Note: We should probably get rid of SIDE and TRANS as these flags
-         ! are not used as we are only using this routine inside
-         ! x{OR,UN}G{QR,LQ,QR,LQ}, which have values that never change
          CHARACTER         SIDE, TRANS, DIRECT, STOREV 
+         ! True means that we are assuming C2 is the identity matrix
+         !     and thus don't reference whatever is present in C2 
+         !     at the beginning.
+         LOGICAL           C2I
+
          ! Array arguments
          REAL              V(LDV,*), C(LDC,*), T(LDT,*)
          ! Local scalars
-         LOGICAL           QR, LQ, QL, DIRF, COLV
-         ! External subroutines
-         EXTERNAL          SGEMM, STRMM
+         LOGICAL           QR, LQ, QL, DIRF, COLV, SIDEL, SIDER,
+     $                     TRANST
+         INTEGER           I, J
          ! External functions
          LOGICAL           LSAME
          EXTERNAL          LSAME
+         ! External Subroutines
+         EXTERNAL          SGEMM, STRMM, XERBLA
          ! Parameters
          REAL              ONE, ZERO, NEG_ONE
          PARAMETER(ONE=1.0E+0, ZERO = 0.0E+0, NEG_ONE = -1.0E+0)
@@ -23,6 +27,9 @@ SUBROUTINE SLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V,
          ! Convert our character flags to logical values
          DIRF = LSAME(DIRECT,'F')
          COLV = LSAME(STOREV,'C')
+         SIDEL = LSAME(SIDE,'L')
+         SIDER = LSAME(SIDE,'R')
+         TRANST = LSAME(TRANS,'T')
 
          ! Determine which of the 4 modes are using.
          ! QR is when we store the reflectors column by column and have the
@@ -78,26 +85,53 @@ SUBROUTINE SLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V,
             !
             ! To achieve the same end result
             !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDEL ) THEN
+               CALL XERBLA('DLARFB0C2', 2)
+               RETURN
+            ELSE IF(TRANST) THEN
+               CALL XERBLA('DLARFB0C2', 3)
+               RETURN
+            END IF
+            !
             ! C1 = V2'*C2
             !
-            CALL SGEMM('Transpose', 'No Transpose', K, N, M - K,
-     $                  ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO, 
-     $                  C, LDC)
+            IF (C2I) THEN
+               DO J = 1, N
+                  DO I = 1, K
+                     C(I,J) = V(K+J,I)
+                  END DO
+               END DO
+            ELSE
+               CALL SGEMM('Transpose', 'No Transpose', K, N, M - K,
+     $                     ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO,
+     $                     C, LDC)
+            END IF
             !
             ! C1 = T*C1
             !
-            CALL STRMM('Left', 'Upper', 'No Transpose', 'Non-unit', 
+            CALL STRMM('Left', 'Upper', 'No Transpose', 'Non-unit',
      $                  K, N, ONE, T, LDT, C, LDC)
             !
             ! C2 = C2 - V2*C1 = -V2*C1 + C2
             !
-            CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K,
-     $                  NEG_ONE, V(K+1,1), LDV, C, LDC, ONE, 
-     $                  C(K+1,1), LDC)
+            IF (C2I) THEN
+               CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $                     NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO,
+     $                     C(K+1,1), LDC)
+               DO I = 1, N
+                  C(K+I,I) = C(K+I,I) + ONE
+               END DO
+            ELSE
+               CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $                     NEG_ONE, V(K+1,1), LDV, C, LDC, ONE,
+     $                     C(K+1,1), LDC)
+            END IF
             !
             ! C1 = -V1*C1
             !
-            CALL STRMM('Left', 'Lower', 'No Transpose', 'Unit', 
+            CALL STRMM('Left', 'Lower', 'No Transpose', 'Unit',
      $                  K, N, NEG_ONE, V, LDV, C, LDC)
          ELSE IF (LQ) THEN
             ! We are computing C = CH' = C(I-V'T'V)
@@ -131,10 +165,29 @@ SUBROUTINE SLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V,
             !
             ! To achieve the same end result
             !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDER ) THEN
+               CALL XERBLA('DLARFB0C2', 2)
+               RETURN
+            ELSE IF(.NOT.TRANST) THEN
+               CALL XERBLA('DLARFB0C2', 3)
+               RETURN
+            END IF
+            !
             ! C1 = C2*V2'
             !
-            CALL SGEMM('No Transpose', 'Transpose', M, K, N-K, 
-     $            ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C, LDC)
+            IF( C2I ) THEN
+               DO J = 1, K
+                  DO I = 1, M
+                     C(I,J) = V(J,K+I)
+                  END DO
+               END DO
+            ELSE
+               CALL SGEMM('No Transpose', 'Transpose', M, K, N-K,
+     $               ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C,
+     $               LDC)
+            END IF
             !
             ! C1 = C1*T'
             !
@@ -143,9 +196,18 @@ SUBROUTINE SLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V,
             !
             ! C2 = C2 - C1*V2 = -C1*V2 + C2
             !
-            CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K,
-     $            NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, 
-     $            C(1,K+1), LDC)
+            IF( C2I ) THEN
+               CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $               NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1),
+     $               LDC)
+               DO I = 1, M
+                  C(I,K+I) = C(I,K+I) + ONE
+               END DO
+            ELSE
+               CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $               NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1),
+     $               LDC)
+            END IF
             !
             ! C1 = -C1*V1
             !
@@ -186,10 +248,28 @@ SUBROUTINE SLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V,
             !
             ! To achieve the same end result
             !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDEL ) THEN
+               CALL XERBLA('DLARFB0C2', 2)
+               RETURN
+            ELSE IF(TRANST) THEN
+               CALL XERBLA('DLARFB0C2', 3)
+               RETURN
+            END IF
+            !
             ! C1 = V2'*C2
             !
-            CALL SGEMM('Transpose', 'No Transpose', K, N, M-K, 
-     $         ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC)
+            IF( C2I ) THEN
+               DO J = 1, N
+                  DO I = 1, K
+                     C(M-K+I,J) = V(J,I)
+                  END DO
+               END DO
+            ELSE
+               CALL SGEMM('Transpose', 'No Transpose', K, N, M-K,
+     $            ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC)
+            END IF
             !
             ! C1 = T*C1
             !
@@ -198,12 +278,20 @@ SUBROUTINE SLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V,
             !
             ! C2 = C2 - V2*C1 = -V2*C1 + C2
             !
-            CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K, 
-     $         NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC)
+            IF( C2I ) THEN
+               CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $            NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC)
+               DO I = 1, N
+                  C(I,I) = C(I,I) + ONE
+               END DO
+            ELSE
+               CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $            NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC)
+            END IF
             !
             ! C1 = -V1*C1
             !
-            CALL STRMM('Left', 'Upper', 'No Transpose', 'Unit', 
+            CALL STRMM('Left', 'Upper', 'No Transpose', 'Unit',
      $         K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC)
          ELSE ! IF (RQ) THEN
             ! We are computing C = CH' = C(I-V'T'V)
@@ -226,7 +314,7 @@ SUBROUTINE SLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V,
             !
             !   = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ]
             !
-            !   = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ] 
+            !   = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ]
             !
             !   = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ]
             !
@@ -236,28 +324,54 @@ SUBROUTINE SLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V,
             ! C1 = C1*T'
             ! C2 = C2 - C1*V2
             ! C1 = -C1*V1
-            ! 
+            !
             !
             ! To achieve the same end result
             !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDER ) THEN
+               CALL XERBLA('DLARFB0C2', 2)
+               RETURN
+            ELSE IF(.NOT.TRANST) THEN
+               CALL XERBLA('DLARFB0C2', 3)
+               RETURN
+            END IF
+            !
             ! C1 = C2*V2'
             !
-            CALL SGEMM('No Transpose', 'Transpose', M, K, N-K,
-     $         ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC)
+            IF( C2I ) THEN
+               DO J = 1, K
+                  DO I = 1, M
+                     C(I,N-K+J) = V(J,I)
+                  END DO
+               END DO
+            ELSE
+               CALL SGEMM('No Transpose', 'Transpose', M, K, N-K,
+     $            ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC)
+            END IF
             !
             ! C1 = C1*T'
             !
-            CALL STRMM('Right', 'Lower', 'Transpose', 'Non-unit', 
+            CALL STRMM('Right', 'Lower', 'Transpose', 'Non-unit',
      $         M, K, ONE, T, LDT, C(1, N-K+1), LDC)
             !
             ! C2 = C2 - C1*V2 = -C1*V2 + C2
             !
-            CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K, 
-     $         NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC)
+            IF( C2I ) THEN
+               CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $            NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC)
+               DO I = 1, M
+                  C(I,I) = C(I,I) + ONE
+               END DO
+            ELSE
+               CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $            NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC)
+            END IF
             !
             ! C1 = -C1*V1
             !
-            CALL STRMM('Right', 'Lower', 'No Transpose', 'Unit', 
+            CALL STRMM('Right', 'Lower', 'No Transpose', 'Unit',
      $         M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC)
          END IF
       END SUBROUTINE
diff --git a/SRC/sorglq.f b/SRC/sorglq.f
index fb5ef79cce..574b3ae2c1 100644
--- a/SRC/sorglq.f
+++ b/SRC/sorglq.f
@@ -146,7 +146,7 @@ SUBROUTINE SORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
      $                   LWKOPT, NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SLARFB, SLARFT, SORGL2, XERBLA
+      EXTERNAL           SLARFB0C2, SLARFT, SORGL2, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -218,67 +218,90 @@ SUBROUTINE SORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*        Use blocked code after the last block.
-*        The first kk rows are handled by the block method.
+*        Handle the first block assuming we are applying to the
+*        identity, then resume regular blocking method after
 *
-         KI = ( ( K-NX-1 ) / NB )*NB
-         KK = MIN( K, KI+NB )
-*
-*        Set A(kk+1:m,1:kk) to zero.
-*
-         DO 20 J = 1, KK
-            DO 10 I = KK + 1, M
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
+         KI = K - 2 * NB
+         KK = K - NB
       ELSE
          KK = 0
       END IF
 *
-*     Use unblocked code for the last or only block.
+*     Potentially bail to the unblocked version
 *
-      IF( KK.LT.M )
-     $   CALL SORGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
-     $                TAU( KK+1 ), WORK, IINFO )
+      IF( KK.EQ.0 ) THEN
+         CALL SORGL2( M, N, K, A, LDA, TAU, WORK, IINFO )
+      END IF
 *
       IF( KK.GT.0 ) THEN
+         I = KK + 1
+         IB = NB
+*
+*        Form the triangular factor of the block reflector
+*        H = H(i) H(i+1) . . . H(i+ib-1)
+*
+         CALL SLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $               LDA, TAU( I ), WORK, LDWORK )
+*
+*        Apply H**T to A(i+ib:m,i:n) from the right
+*
+         CALL SLARFB0C2(.TRUE., 'Right', 'Transpose', 'Forward',
+     $         'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $         LDWORK, A(I+IB,I), LDA)
+*
+*        Apply H**T to columns i:n of current block
+
+         CALL SORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+     $                WORK, IINFO )
 *
 *        Use blocked code
 *
-         DO 50 I = KI + 1, 1, -NB
-            IB = MIN( NB, K-I+1 )
-            IF( I+IB.LE.M ) THEN
+         DO I = KI + 1, 1, -NB
+            IB = NB
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i) H(i+1) . . . H(i+ib-1)
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-               CALL SLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I,
-     $                      I ),
-     $                      LDA, TAU( I ), WORK, LDWORK )
+            CALL SLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $                  LDA, TAU( I ), WORK, LDWORK )
 *
-*              Apply H**T to A(i+ib:m,i:n) from the right
+*           Apply H**T to A(i+ib:m,i:n) from the right
 *
-               CALL SLARFB( 'Right', 'Transpose', 'Forward',
-     $                      'Rowwise',
-     $                      M-I-IB+1, N-I+1, IB, A( I, I ), LDA, WORK,
-     $                      LDWORK, A( I+IB, I ), LDA, WORK( IB+1 ),
-     $                      LDWORK )
-            END IF
+            CALL SLARFB0C2(.FALSE., 'Right', 'Transpose', 'Forward',
+     $            'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $            LDWORK, A(I+IB,I), LDA)
 *
 *           Apply H**T to columns i:n of current block
-*
+
             CALL SORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
-     $                   WORK,
-     $                   IINFO )
+     $                   WORK, IINFO )
+         END DO
+*
+*        This checks for if K was a perfect multiple of NB
+*        so that we only have a special case for the last block when
+*        necessary
+*
+         IF(I.LT.1) THEN
+            IB = I + NB - 1
+            I = 1
+*
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
+*
+            CALL SLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $                  LDA, TAU( I ), WORK, LDWORK )
 *
-*           Set columns 1:i-1 of current block to zero
+*           Apply H**T to A(i+ib:m,i:n) from the right
 *
-            DO 40 J = 1, I - 1
-               DO 30 L = I, I + IB - 1
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+            CALL SLARFB0C2(.FALSE., 'Right', 'Transpose', 'Forward',
+     $            'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $            LDWORK, A(I+IB,I), LDA)
+*
+*           Apply H**T to columns i:n of current block
+
+            CALL SORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+     $                   WORK, IINFO )
+         END IF
       END IF
 *
       WORK( 1 ) = SROUNDUP_LWORK(IWS)
diff --git a/SRC/sorgql.f b/SRC/sorgql.f
index 7257ff2d95..882e7d4889 100644
--- a/SRC/sorgql.f
+++ b/SRC/sorgql.f
@@ -143,11 +143,10 @@ SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT,
-     $                   NB, NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, J, KK, LWKOPT, NB, NBMIN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SLARFB, SLARFT, SORG2L, XERBLA
+      EXTERNAL           SLARFB0C2, SLARFT, SORG2L, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -178,7 +177,8 @@ SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
             LWKOPT = 1
          ELSE
             NB = ILAENV( 1, 'SORGQL', ' ', M, N, K, -1 )
-            LWKOPT = N*NB
+            ! Only need a workspace for calls to dorg2l
+            LWKOPT = N
          END IF
          WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
@@ -201,88 +201,76 @@ SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
       END IF
 *
       NBMIN = 2
-      NX = 0
+      NX = MAX(0, ILAENV(3, 'SORGQL', ' ', M, N, K, -1))
       IWS = N
-      IF( NB.GT.1 .AND. NB.LT.K ) THEN
 *
-*        Determine when to cross over from blocked to unblocked code.
-*
-         NX = MAX( 0, ILAENV( 3, 'SORGQL', ' ', M, N, K, -1 ) )
-         IF( NX.LT.K ) THEN
+      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*           Determine if workspace is large enough for blocked code.
+*        We want to use the blocking method as long as our matrix is big enough
 *
-            LDWORK = N
-            IWS = LDWORK*NB
-            IF( LWORK.LT.IWS ) THEN
+         KK = K
+      ELSE
+         KK = 0
+      END IF
 *
-*              Not enough workspace to use optimal NB:  reduce NB and
-*              determine the minimum value of NB.
+*     Possibly bail to the unblocked code.
 *
-               NB = LWORK / LDWORK
-               NBMIN = MAX( 2, ILAENV( 2, 'SORGQL', ' ', M, N, K,
-     $                      -1 ) )
-            END IF
-         END IF
+      IF( KK.EQ.0 ) THEN
+         CALL SORG2L( M, N, K, A, LDA, TAU, WORK, IINFO )
       END IF
 *
-      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+      IF( KK.GT.0 ) THEN
 *
-*        Use blocked code after the first block.
-*        The last kk columns are handled by the block method.
+*        Factor the first block assuming that our first application
+*        will be on the Identity matrix
 *
-         KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
+         I = 1
+         IB = NB
 *
-*        Set A(m-kk+1:m,1:n-kk) to zero.
+*        Form the triangular factor of the block reflector
+*        H = H(i+ib-1) . . . H(i+1) H(i)
 *
-         DO 20 J = 1, N - KK
-            DO 10 I = M - KK + 1, M
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
-      ELSE
-         KK = 0
-      END IF
+         CALL SLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+     $                  A( 1, N-K+I ), LDA, TAU( I ),
+     $                  A( M-K+I, N-K+I ), LDA)
 *
-*     Use unblocked code for the first or only block.
+*        Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
 *
-      CALL SORG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+         CALL SLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward', 
+     $         'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), 
+     $         LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
 *
-      IF( KK.GT.0 ) THEN
+*        Apply H to rows 1:m-k+i+ib-1 of current block
 *
-*        Use blocked code
+         CALL SORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
+     $                TAU( I ), WORK, IINFO )
+
+*        Use blocked code on the remaining blocks if there are any.
 *
-         DO 50 I = K - KK + 1, K, NB
-            IB = MIN( NB, K-I+1 )
-            IF( N-K+I.GT.1 ) THEN
+         DO I = NB+1, K, NB
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i+ib-1) . . . H(i+1) H(i)
+*           The last block may be less than size NB
 *
-               CALL SLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
-     $                      A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK )
+            IB = MIN(NB, K-I+1)
 *
-*              Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+*           Form the triangular factor of the block reflector
+*           H = H(i+ib-1) . . . H(i+1) H(i)
 *
-               CALL SLARFB( 'Left', 'No transpose', 'Backward',
-     $                      'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
-     $                      A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA,
-     $                      WORK( IB+1 ), LDWORK )
-            END IF
+            CALL SLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+     $                  A( 1, N-K+I ), LDA, TAU( I ), 
+     $                  A( M-K+I, N-K+I ), LDA )
+*
+*           Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+*
+            CALL SLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $            'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, 
+     $            A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
 *
 *           Apply H to rows 1:m-k+i+ib-1 of current block
 *
             CALL SORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
      $                   TAU( I ), WORK, IINFO )
-*
-*           Set rows m-k+i+ib:m of current block to zero
-*
-            DO 40 J = N - K + I, N - K + I + IB - 1
-               DO 30 L = M - K + I + IB, M
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+         END DO
       END IF
 *
       WORK( 1 ) = SROUNDUP_LWORK(IWS)
diff --git a/SRC/sorgqr.f b/SRC/sorgqr.f
index 47d1fd248c..65d397de36 100644
--- a/SRC/sorgqr.f
+++ b/SRC/sorgqr.f
@@ -147,7 +147,7 @@ SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
      $                   LWKOPT, NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SLARFB, SLARFT, SORG2R, XERBLA
+      EXTERNAL           SLARFB0C2, SLARFT, SORG2R, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -163,7 +163,7 @@ SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       INFO = 0
       NB = ILAENV( 1, 'SORGQR', ' ', M, N, K, -1 )
-      LWKOPT = MAX( 1, N )*NB
+      LWKOPT = MAX( 1, N )
       WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
@@ -194,92 +194,92 @@ SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
       NBMIN = 2
       NX = 0
       IWS = N
-      IF( NB.GT.1 .AND. NB.LT.K ) THEN
 *
-*        Determine when to cross over from blocked to unblocked code.
+      IF( NB.GE.NBMIN .AND. NB.LT.K ) THEN
 *
-         NX = MAX( 0, ILAENV( 3, 'SORGQR', ' ', M, N, K, -1 ) )
-         IF( NX.LT.K ) THEN
+*        Handle the first block assuming we are applying to the
+*        identity, then resume regular blocking method after
 *
-*           Determine if workspace is large enough for blocked code.
-*
-            LDWORK = N
-            IWS = LDWORK*NB
-            IF( LWORK.LT.IWS ) THEN
+         KI = K - 2 * NB
+         KK = K - NB
+      ELSE
+         KK = 0
+      END IF
 *
-*              Not enough workspace to use optimal NB:  reduce NB and
-*              determine the minimum value of NB.
+*     Potentially bail to the unblocked code.
 *
-               NB = LWORK / LDWORK
-               NBMIN = MAX( 2, ILAENV( 2, 'SORGQR', ' ', M, N, K,
-     $                      -1 ) )
-            END IF
-         END IF
+      IF( KK.EQ.0 ) THEN
+         CALL SORG2R( M, N, K, A, LDA, TAU, WORK, IINFO )
       END IF
 *
-      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+      IF( KK.GT.0 ) THEN
+         I = KK + 1
+         IB = NB
 *
-*        Use blocked code after the last block.
-*        The first kk columns are handled by the block method.
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-         KI = ( ( K-NX-1 ) / NB )*NB
-         KK = MIN( K, KI+NB )
+         CALL SLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+     $                     LDA, TAU(I), A(I,I), LDA)
 *
-*        Set A(1:kk,kk+1:n) to zero.
+*           Apply H to A(i:m,i+ib:n) from the left
 *
-         DO 20 J = KK + 1, N
-            DO 10 I = 1, KK
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
-      ELSE
-         KK = 0
-      END IF
+         CALL SLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward',
+     $      'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I),
+     $      LDA, A(I,I+IB), LDA)
 *
-*     Use unblocked code for the last or only block.
+*        Apply H to rows i:m of current block
 *
-      IF( KK.LT.N )
-     $   CALL SORG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
-     $                TAU( KK+1 ), WORK, IINFO )
+         CALL SORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, IINFO)
+         DO I = KI + 1, 1, -NB
+            IB = NB
 *
-      IF( KK.GT.0 ) THEN
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-*        Use blocked code
+            CALL SLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+     $         LDA, TAU(I), A(I,I), LDA)
 *
-         DO 50 I = KI + 1, 1, -NB
-            IB = MIN( NB, K-I+1 )
-            IF( I+IB.LE.N ) THEN
+*           Apply H to A(i:m,i+ib:n) from the left
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i) H(i+1) . . . H(i+ib-1)
+            CALL SLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $         'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+     $         LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
 *
-               CALL SLARFT( 'Forward', 'Columnwise', M-I+1, IB,
-     $                      A( I, I ), LDA, TAU( I ), WORK, LDWORK )
+*           Apply H to rows i:m of current block
 *
-*              Apply H to A(i:m,i+ib:n) from the left
+            CALL SORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
+     $         IINFO)
+         END DO
 *
-               CALL SLARFB( 'Left', 'No transpose', 'Forward',
-     $                      'Columnwise', M-I+1, N-I-IB+1, IB,
-     $                      A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ),
-     $                      LDA, WORK( IB+1 ), LDWORK )
-            END IF
+*        This checks for if K was a perfect multiple of NB
+*        so that we only have a special case for the last block when
+*        necessary
 *
-*           Apply H to rows i:m of current block
+         IF(I.LT.1) THEN
+            IB = I + NB - 1
+            I = 1
 *
-            CALL SORG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ),
-     $                   WORK,
-     $                   IINFO )
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-*           Set rows 1:i-1 of current block to zero
+            CALL SLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+     $         LDA, TAU(I), A(I,I), LDA)
 *
-            DO 40 J = I, I + IB - 1
-               DO 30 L = 1, I - 1
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
-      END IF
+*           Apply H to A(i:m,i+ib:n) from the left
+*
+            CALL SLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $         'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+     $         LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
 *
+*           Apply H to rows i:m of current block
+*
+            CALL SORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
+     $         IINFO)
+         END IF
+      END IF
       WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
diff --git a/SRC/sorgrq.f b/SRC/sorgrq.f
index 5fe685c1d8..ea1a4e73da 100644
--- a/SRC/sorgrq.f
+++ b/SRC/sorgrq.f
@@ -147,7 +147,7 @@ SUBROUTINE SORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
      $                   LWKOPT, NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SLARFB, SLARFT, SORGR2, XERBLA
+      EXTERNAL           SLARFB0C2, SLARFT, SORGR2, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -228,63 +228,70 @@ SUBROUTINE SORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*        Use blocked code after the first block.
-*        The last kk rows are handled by the block method.
+*        We want to use the blocking method as long as our matrix is big enough
+*        and it's deemed worthwhile with the extra memory allocations
 *
-         KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
-*
-*        Set A(1:m-kk,n-kk+1:n) to zero.
-*
-         DO 20 J = N - KK + 1, N
-            DO 10 I = 1, M - KK
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
+         KK = K
       ELSE
          KK = 0
       END IF
 *
-*     Use unblocked code for the first or only block.
+*     Potentially bail to the unblocked code
 *
-      CALL SORGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+      IF( KK.EQ.0 ) THEN
+         CALL SORGR2( M, N, K, A, LDA, TAU, WORK, IINFO )
+      END IF
 *
       IF( KK.GT.0 ) THEN
 *
-*        Use blocked code
+*        Factor the first block assuming that our first application
+*        will be on the Identity matrix
 *
-         DO 50 I = K - KK + 1, K, NB
-            IB = MIN( NB, K-I+1 )
-            II = M - K + I
-            IF( II.GT.1 ) THEN
+         I = 1
+         IB = NB
+         II = M - K + I
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i+ib-1) . . . H(i+1) H(i)
+*        Form the triangular factor of the block reflector
+*        H = H(i+ib-1) . . . H(i+1) H(i)
 *
-               CALL SLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
-     $                      A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+         CALL SLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+     $                A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
 *
-*              Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+*        Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
 *
-               CALL SLARFB( 'Right', 'Transpose', 'Backward',
-     $                      'Rowwise',
-     $                      II-1, N-K+I+IB-1, IB, A( II, 1 ), LDA, WORK,
-     $                      LDWORK, A, LDA, WORK( IB+1 ), LDWORK )
-            END IF
+         CALL SLARFB0C2(.TRUE., 'Right', 'Transpose', 'Backward', 
+     $         'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK,
+     $         LDWORK, A, LDA)
 *
 *           Apply H**T to columns 1:n-k+i+ib-1 of current block
 *
-            CALL SORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
-     $                   TAU( I ),
-     $                   WORK, IINFO )
+         CALL SORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
+     $                TAU( I ), WORK, IINFO )
+
+         DO I = NB + 1, K, NB
 *
-*           Set columns n-k+i+ib:n of current block to zero
+*           The last block may be less than size NB
+*
+            IB = MIN(NB, K-I+1)
+            II = M - K + I
 *
-            DO 40 L = N - K + I + IB, N
-               DO 30 J = II, II + IB - 1
-                  A( J, L ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+*           Form the triangular factor of the block reflector
+*           H = H(i+ib-1) . . . H(i+1) H(i)
+*
+            CALL SLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+     $                   A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+*
+*           Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+*
+            CALL SLARFB0C2(.FALSE., 'Right', 'Transpose', 
+     $            'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, 
+     $            A(II,1), LDA, WORK, LDWORK, A, LDA)
+*
+*           Apply H**T to columns 1:n-k+i+ib-1 of current block
+*
+            CALL SORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
+     $                   TAU( I ), WORK, IINFO )
+         END DO
       END IF
 *
       WORK( 1 ) = SROUNDUP_LWORK(IWS)

From 338bae27fcb4bf507ceaeeb1387f0f72705213ca Mon Sep 17 00:00:00 2001
From: Johnathan Rhyne <rhynejohnathan@gmail.com>
Date: Tue, 11 Feb 2025 16:32:05 -0700
Subject: [PATCH 03/10] double complex tests pass locally

---
 SRC/CMakeLists.txt |   2 +-
 SRC/Makefile       |   2 +-
 SRC/dorglq.f       |   8 +-
 SRC/dorgql.f       |   6 +-
 SRC/dorgqr.f       |   6 +-
 SRC/dorgrq.f       |   6 +-
 SRC/sorgql.f       |   3 +-
 SRC/zlarfb0c2.f    | 381 +++++++++++++++++++++++++++++++++++++++++++++
 SRC/zunglq.f       | 124 +++++++++------
 SRC/zungql.f       | 121 +++++++-------
 SRC/zungqr.f       | 148 ++++++++++--------
 SRC/zungrq.f       |  90 +++++------
 12 files changed, 644 insertions(+), 253 deletions(-)
 create mode 100644 SRC/zlarfb0c2.f

diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index 12cfebca5f..16156b72a2 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -418,7 +418,7 @@ set(ZLASRC
    zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqp2rk.f zlaqp3rk.f zlaqsb.f
    zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f
    zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f
-   zlarcm.f zlarf.f  zlarfb.f zlarfb_gett.f zlarf1f.f zlarf1l.f
+   zlarcm.f zlarf.f  zlarfb.f zlarfb0c2.f zlarfb_gett.f zlarf1f.f zlarf1l.f
    zlarfg.f zlarfgp.f zlarft.f
    zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f90 zlartv.f
    zlarz.f  zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
diff --git a/SRC/Makefile b/SRC/Makefile
index 5fb5db20a6..44ba06be9d 100644
--- a/SRC/Makefile
+++ b/SRC/Makefile
@@ -453,7 +453,7 @@ ZLASRC = \
    zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqp2rk.o zlaqp3rk.o zlaqsb.o \
    zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \
    zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \
-   zlarcm.o zlarf.o  zlarfb.o zlarfb_gett.o zlarf1f.o zlarf1l.o \
+   zlarcm.o zlarf.o  zlarfb.o zlarfb0c2.o zlarfb_gett.o zlarf1f.o zlarf1l.o \
    zlarfg.o zlarft.o zlarfgp.o \
    zlarfx.o zlarfy.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \
    zlarz.o  zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \
diff --git a/SRC/dorglq.f b/SRC/dorglq.f
index 6507f625b3..c7f32afb8f 100644
--- a/SRC/dorglq.f
+++ b/SRC/dorglq.f
@@ -138,14 +138,10 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  =====================================================================
 *
-*     .. Parameters ..
-      DOUBLE PRECISION   ZERO
-      PARAMETER          ( ZERO = 0.0D+0 )
-*     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
-     $                   LWKOPT, NB, NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, KI, KK, LDWORK, LWKOPT,
+     $                   NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           DLARFB0C2, DLARFT, DORGL2, XERBLA
diff --git a/SRC/dorgql.f b/SRC/dorgql.f
index 9f34c858f9..13146142ed 100644
--- a/SRC/dorgql.f
+++ b/SRC/dorgql.f
@@ -139,13 +139,9 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  =====================================================================
 *
-*     .. Parameters ..
-      DOUBLE PRECISION   ZERO
-      PARAMETER          ( ZERO = 0.0D+0 )
-*     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, J, KK, LWKOPT, NB, NBMIN
+      INTEGER            I, IB, IINFO, IWS, KK, LWKOPT, NB, NBMIN
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           DLARFB0C2, DLARFT, DORG2L, XERBLA
diff --git a/SRC/dorgqr.f b/SRC/dorgqr.f
index 0ed278ffc0..4b323f68fb 100644
--- a/SRC/dorgqr.f
+++ b/SRC/dorgqr.f
@@ -139,13 +139,9 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  =====================================================================
 *
-*     .. Parameters ..
-      DOUBLE PRECISION   ZERO
-      PARAMETER          ( ZERO = 0.0D+0 )
-*     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, J, KI, KK, LWKOPT,
+      INTEGER            I, IB, IINFO, IWS, KI, KK, LWKOPT,
      $                   NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
diff --git a/SRC/dorgrq.f b/SRC/dorgrq.f
index ce61238776..811c2cdb5e 100644
--- a/SRC/dorgrq.f
+++ b/SRC/dorgrq.f
@@ -139,13 +139,9 @@ SUBROUTINE DORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  =====================================================================
 *
-*     .. Parameters ..
-      DOUBLE PRECISION   ZERO
-      PARAMETER          ( ZERO = 0.0D+0 )
-*     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, II, IINFO, IWS, J, KK, L, LDWORK,
+      INTEGER            I, IB, II, IINFO, IWS, KK, LDWORK,
      $                   LWKOPT, NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
diff --git a/SRC/sorgql.f b/SRC/sorgql.f
index 882e7d4889..69484d2c36 100644
--- a/SRC/sorgql.f
+++ b/SRC/sorgql.f
@@ -143,7 +143,8 @@ SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, J, KK, LWKOPT, NB, NBMIN
+      INTEGER            I, IB, IINFO, IWS, J, KK, LWKOPT, NB,
+     $                   NBMIN, NX
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLARFB0C2, SLARFT, SORG2L, XERBLA
diff --git a/SRC/zlarfb0c2.f b/SRC/zlarfb0c2.f
new file mode 100644
index 0000000000..3bbf11e268
--- /dev/null
+++ b/SRC/zlarfb0c2.f
@@ -0,0 +1,381 @@
+      SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+     $                     K, V, LDV, T, LDT, C, LDC)
+         ! Scalar arguments
+         INTEGER           M, N, K, LDV, LDC, LDT
+         CHARACTER         SIDE, TRANS, DIRECT, STOREV 
+         ! True means that we are assuming C2 is the identity matrix
+         !     and thus don't reference whatever is present in C2 
+         !     at the beginning.
+         LOGICAL           C2I
+
+         ! Array arguments
+         COMPLEX*16        V(LDV,*), C(LDC,*), T(LDT,*)
+         ! Local scalars
+         LOGICAL           QR, LQ, QL, DIRF, COLV, SIDEL, SIDER,
+     $                     TRANST
+         INTEGER           I, J
+         ! Intrinsic Functions
+         INTRINSIC         CONJG
+         ! External functions
+         LOGICAL           LSAME
+         EXTERNAL          LSAME
+         ! External Subroutines
+         EXTERNAL          ZGEMM, ZTRMM
+         ! Parameters
+         COMPLEX*16        ONE, ZERO, NEG_ONE
+         PARAMETER(ONE=(1.0D+0, 0.0D+0),
+     $            ZERO = (0.0D+0, 0.0D+0), 
+     $            NEG_ONE = (-1.0D+0, 0.0D+0))
+
+         ! Beginning of executable statements
+         ! Convert our character flags to logical values
+         DIRF = LSAME(DIRECT,'F')
+         COLV = LSAME(STOREV,'C')
+         SIDEL = LSAME(SIDE,'L')
+         SIDER = LSAME(SIDE,'R')
+         TRANST = LSAME(TRANS,'C')
+
+         ! Determine which of the 4 modes are using.
+         ! QR is when we store the reflectors column by column and have the
+         ! 'first' reflector stored in the first column
+         QR = DIRF.AND.COLV
+
+         ! LQ is when we store the reflectors row by row and have the
+         ! 'first' reflector stored in the first row
+         LQ = DIRF.AND.(.NOT.COLV)
+
+         ! QL is when we store the reflectors column by column and have the
+         ! 'first' reflector stored in the last column
+         QL = (.NOT.DIRF).AND.COLV
+
+         ! RQ is when we store the reflectors row by row and have the
+         ! 'first' reflector stored in the last row
+         ! RQ = (.NOT.DIRF).AND.(.NOT.COLV)
+         ! Since we have exactly one of these 4 modes, we don't need to actually
+         ! store the value of RQ, instead we assume this is the case if we fail
+         ! the above 3 checks.
+
+         IF (QR) THEN
+            ! We are computing C = HC = (I - VTV')C
+            ! Where: V = [ V1 ] and C = [ C1 ]
+            !            [ V2 ]         [ C2 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{M-K\times K}
+            !     C1=0\in\R^{K\times N}
+            !     C2\in\R^{M-K\times N}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0  ]
+            !               [ V2 ]             [ C2 ]
+            !   = [ 0  ] - [ V1 ]T*V2'*C2
+            !     [ C2 ]   [ V2 ]
+            !
+            !   = [ 0  ] - [ V1*T*V2'*C2 ]
+            !     [ C2 ]   [ V2*T*V2'*C2 ]
+            !
+            !   = [      V1*T*V2'*C2 ]
+            !     [ C2 - V2*T*V2'*C2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = V2'*C2
+            ! C1 = T*C1
+            ! C2 = C2 - V2*C1
+            ! C1 = -V1*C1
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDEL ) THEN
+               CALL XERBLA('DLARFB0C2', 2)
+               RETURN
+            ELSE IF(TRANST) THEN
+               CALL XERBLA('DLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = V2'*C2
+            !
+            IF (C2I) THEN
+               DO J = 1, N
+                  DO I = 1, K
+                     C(I,J) = CONJG(V(K+J,I))
+                  END DO
+               END DO
+            ELSE
+               CALL ZGEMM('Conjugate', 'No Transpose', K, N, M - K,
+     $                     ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO,
+     $                     C, LDC)
+            END IF
+            !
+            ! C1 = T*C1
+            !
+            CALL ZTRMM('Left', 'Upper', 'No Transpose', 'Non-unit',
+     $                  K, N, ONE, T, LDT, C, LDC)
+            !
+            ! C2 = C2 - V2*C1 = -V2*C1 + C2
+            !
+            IF (C2I) THEN
+               CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $                     NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO,
+     $                     C(K+1,1), LDC)
+               DO I = 1, N
+                  C(K+I,I) = C(K+I,I) + ONE
+               END DO
+            ELSE
+               CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $                     NEG_ONE, V(K+1,1), LDV, C, LDC, ONE,
+     $                     C(K+1,1), LDC)
+            END IF
+            !
+            ! C1 = -V1*C1
+            !
+            CALL ZTRMM('Left', 'Lower', 'No Transpose', 'Unit',
+     $                  K, N, NEG_ONE, V, LDV, C, LDC)
+         ELSE IF (LQ) THEN
+            ! We are computing C = CH' = C(I-V'T'V)
+            ! Where: V = [ V1 V2 ] and C = [ C1 C2 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{K\times N-K}
+            !     C1=0\in\R^{M\times K}
+            !     C2\in\R^{M\times N-K}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ])
+            !                         [ V2' ]
+            !
+            !   = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ]
+            !                          [ V2' ]
+            !
+            !   = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ]
+            !
+            !   = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = C2*V2'
+            ! C1 = C1*T'
+            ! C2 = C2 - C1*V2
+            ! C1 = -C1*V1
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDER ) THEN
+               CALL XERBLA('DLARFB0C2', 2)
+               RETURN
+            ELSE IF(.NOT.TRANST) THEN
+               CALL XERBLA('DLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = C2*V2'
+            !
+            IF( C2I ) THEN
+               DO J = 1, K
+                  DO I = 1, M
+                     C(I,J) = CONJG(V(J,K+I))
+                  END DO
+               END DO
+            ELSE
+               CALL ZGEMM('No Transpose', 'Conjugate', M, K, N-K,
+     $               ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C,
+     $               LDC)
+            END IF
+            !
+            ! C1 = C1*T'
+            !
+            CALL ZTRMM('Right', 'Upper', 'Conjugate', 'Non-unit',
+     $            M, K, ONE, T, LDT, C, LDC)
+            !
+            ! C2 = C2 - C1*V2 = -C1*V2 + C2
+            !
+            IF( C2I ) THEN
+               CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $               NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1),
+     $               LDC)
+               DO I = 1, M
+                  C(I,K+I) = C(I,K+I) + ONE
+               END DO
+            ELSE
+               CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $               NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1),
+     $               LDC)
+            END IF
+            !
+            ! C1 = -C1*V1
+            !
+            CALL ZTRMM('Right', 'Upper', 'No Transpose', 'Unit',
+     $            M, K, NEG_ONE, V, LDV, C, LDC)
+         ELSE IF (QL) THEN
+            ! We are computing C = HC = (I - VTV')C
+            ! Where: V = [ V2 ] and C = [ C2 ]
+            !            [ V1 ]         [ C1 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{M-K\times K}
+            !     C1=0\in\R^{K\times N}
+            !     C2\in\R^{M-K\times N}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ]
+            !             [ V1 ]             [ 0  ]
+            !
+            !   = [ C2 ] - [ V2 ]T*V2'*C2
+            !     [ 0  ]   [ V1 ]
+            !
+            !   = [ C2 ] - [ V2*T*V2'*C2 ]
+            !     [ 0  ]   [ V1*T*V2'*C2 ]
+            !
+            !   = [ C2 - V2*T*V2'*C2 ]
+            !     [    - V1*T*V2'*C2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = V2'*C2
+            ! C1 = T*C1
+            ! C2 = C2 - V2*C1
+            ! C1 = -V1*C1
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDEL ) THEN
+               CALL XERBLA('DLARFB0C2', 2)
+               RETURN
+            ELSE IF(TRANST) THEN
+               CALL XERBLA('DLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = V2'*C2
+            !
+            IF( C2I ) THEN
+               DO J = 1, N
+                  DO I = 1, K
+                     C(M-K+I,J) = CONJG(V(J,I))
+                  END DO
+               END DO
+            ELSE
+               CALL ZGEMM('Conjugate', 'No Transpose', K, N, M-K,
+     $            ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC)
+            END IF
+            !
+            ! C1 = T*C1
+            !
+            CALL ZTRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+     $         K, N, ONE, T, LDT, C(M-K+1,1), LDC)
+            !
+            ! C2 = C2 - V2*C1 = -V2*C1 + C2
+            !
+            IF( C2I ) THEN
+               CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $            NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC)
+               DO I = 1, N
+                  C(I,I) = C(I,I) + ONE
+               END DO
+            ELSE
+               CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $            NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC)
+            END IF
+            !
+            ! C1 = -V1*C1
+            !
+            CALL ZTRMM('Left', 'Upper', 'No Transpose', 'Unit',
+     $         K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC)
+         ELSE ! IF (RQ) THEN
+            ! We are computing C = CH' = C(I-V'T'V)
+            ! Where: V = [ V2 V1] and C = [ C2 C1 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{K\times N-K}
+            !     C1=0\in\R^{M\times K}
+            !     C2\in\R^{M\times N-K}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ]
+            !                          [ V1' ]
+            !
+            !   = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ]
+            !                           [ V1' ]
+            !
+            !   = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ]
+            !
+            !   = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ]
+            !
+            !   = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = C2*V2'
+            ! C1 = C1*T'
+            ! C2 = C2 - C1*V2
+            ! C1 = -C1*V1
+            !
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDER ) THEN
+               CALL XERBLA('DLARFB0C2', 2)
+               RETURN
+            ELSE IF(.NOT.TRANST) THEN
+               CALL XERBLA('DLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = C2*V2'
+            !
+            IF( C2I ) THEN
+               DO J = 1, K
+                  DO I = 1, M
+                     C(I,N-K+J) = CONJG(V(J,I))
+                  END DO
+               END DO
+            ELSE
+               CALL ZGEMM('No Transpose', 'Conjugate', M, K, N-K,
+     $            ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC)
+            END IF
+            !
+            ! C1 = C1*T'
+            !
+            CALL ZTRMM('Right', 'Lower', 'Conjugate', 'Non-unit',
+     $         M, K, ONE, T, LDT, C(1, N-K+1), LDC)
+            !
+            ! C2 = C2 - C1*V2 = -C1*V2 + C2
+            !
+            IF( C2I ) THEN
+               CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $            NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC)
+               DO I = 1, M
+                  C(I,I) = C(I,I) + ONE
+               END DO
+            ELSE
+               CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $            NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC)
+            END IF
+            !
+            ! C1 = -C1*V1
+            !
+            CALL ZTRMM('Right', 'Lower', 'No Transpose', 'Unit',
+     $         M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC)
+         END IF
+      END SUBROUTINE
diff --git a/SRC/zunglq.f b/SRC/zunglq.f
index 3cc107560d..9da595f491 100644
--- a/SRC/zunglq.f
+++ b/SRC/zunglq.f
@@ -121,7 +121,7 @@
 *> \ingroup unglq
 *
 *  =====================================================================
-      SUBROUTINE ZUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
+      SUBROUTINE ZUNGLQ(M, N, K, A, LDA, TAU, WORK, LWORK, INFO)
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -136,17 +136,13 @@ SUBROUTINE ZUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  =====================================================================
 *
-*     .. Parameters ..
-      COMPLEX*16         ZERO
-      PARAMETER          ( ZERO = ( 0.0D+0, 0.0D+0 ) )
-*     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
-     $                   LWKOPT, NB, NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, KI, KK, LWKOPT, LDWORK,
+     $                   NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           XERBLA, ZLARFB, ZLARFT, ZUNGL2
+      EXTERNAL           XERBLA, ZLARFB0C2, ZLARFT, ZUNGL2
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -217,67 +213,95 @@ SUBROUTINE ZUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*        Use blocked code after the last block.
-*        The first kk rows are handled by the block method.
-*
-         KI = ( ( K-NX-1 ) / NB )*NB
-         KK = MIN( K, KI+NB )
-*
-*        Set A(kk+1:m,1:kk) to zero.
+*        Treat the last NB block starting at KK+1 specially then use our blocking
+*        method from the block starting at KI+1 to 1 
 *
-         DO 20 J = 1, KK
-            DO 10 I = KK + 1, M
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
+         KI = K - 2 * NB
+         KK = K - NB
       ELSE
          KK = 0
       END IF
 *
-*     Use unblocked code for the last or only block.
+*     Potentially bail to the unblocked version
 *
-      IF( KK.LT.M )
-     $   CALL ZUNGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
-     $                TAU( KK+1 ), WORK, IINFO )
+      IF( KK.EQ.0 ) THEN
+         CALL ZUNGL2( M, N, K, A, LDA, TAU, WORK, IINFO )
+      END IF
 *
       IF( KK.GT.0 ) THEN
 *
-*        Use blocked code
+*        Factor the last block assuming that our first application
+*        will be on the Identity matrix
 *
-         DO 50 I = KI + 1, 1, -NB
-            IB = MIN( NB, K-I+1 )
-            IF( I+IB.LE.M ) THEN
+         I = KK + 1
+         IB = NB
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i) H(i+1) . . . H(i+ib-1)
+*        Form the triangular factor of the block reflector
+*        H = H(i) H(i+1) . . . H(i+ib-1)
 *
-               CALL ZLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I,
-     $                      I ),
-     $                      LDA, TAU( I ), WORK, LDWORK )
+         CALL ZLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $               LDA, TAU( I ), WORK, LDWORK )
 *
-*              Apply H**H to A(i+ib:m,i:n) from the right
+*        Apply H**T to A(i+ib:m,i:n) from the right
+*        Exploit the fact that we are applying to an identity 
 *
-               CALL ZLARFB( 'Right', 'Conjugate transpose',
-     $                      'Forward',
-     $                      'Rowwise', M-I-IB+1, N-I+1, IB, A( I, I ),
-     $                      LDA, WORK, LDWORK, A( I+IB, I ), LDA,
-     $                      WORK( IB+1 ), LDWORK )
-            END IF
+         CALL ZLARFB0C2(.TRUE., 'Right', 'Conjugate', 'Forward',
+     $         'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $         LDWORK, A(I+IB,I), LDA)
+*
+*        Apply H**T to columns i:n of current block
+
+         CALL ZUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+     $                WORK, IINFO )
+*
+*        Use our standard blocking method after the last block
+*
+         DO I = KI + 1, 1, -NB
+            IB = NB
+*
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-*           Apply H**H to columns i:n of current block
+            CALL ZLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $                  LDA, TAU( I ), WORK, LDWORK )
 *
+*           Apply H**T to A(i+ib:m,i:n) from the right
+*
+            CALL ZLARFB0C2(.FALSE., 'Right', 'Conjugate', 'Forward',
+     $            'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $            LDWORK, A(I+IB,I), LDA)
+*
+*           Apply H**T to columns i:n of current block
+
             CALL ZUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
-     $                   WORK,
-     $                   IINFO )
+     $                   WORK, IINFO )
+         END DO
+*
+*        This checks for if K was a perfect multiple of NB
+*        so that we only have a special case for the last block when
+*        necessary
+*
+         IF(I.LT.1) THEN
+            IB = I + NB - 1
+            I = 1
+*
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-*           Set columns 1:i-1 of current block to zero
+            CALL ZLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $                  LDA, TAU( I ), WORK, LDWORK )
 *
-            DO 40 J = 1, I - 1
-               DO 30 L = I, I + IB - 1
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+*           Apply H**T to A(i+ib:m,i:n) from the right
+*
+            CALL ZLARFB0C2(.FALSE., 'Right', 'Conjugate', 'Forward',
+     $            'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $            LDWORK, A(I+IB,I), LDA)
+*
+*           Apply H**T to columns i:n of current block
+
+            CALL ZUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+     $                   WORK, IINFO )
+         END IF
       END IF
 *
       WORK( 1 ) = IWS
diff --git a/SRC/zungql.f b/SRC/zungql.f
index a64de501d3..e3e604e791 100644
--- a/SRC/zungql.f
+++ b/SRC/zungql.f
@@ -122,7 +122,7 @@
 *> \ingroup ungql
 *
 *  =====================================================================
-      SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
+      SUBROUTINE ZUNGQL(M, N, K, A, LDA, TAU, WORK, LWORK, INFO)
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -136,18 +136,14 @@ SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *     ..
 *
 *  =====================================================================
-*
-*     .. Parameters ..
-      COMPLEX*16         ZERO
-      PARAMETER          ( ZERO = ( 0.0D+0, 0.0D+0 ) )
-*     ..
+*     
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT,
-     $                   NB, NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, KK, LWKOPT, NB, NBMIN,
+     $                   NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           XERBLA, ZLARFB, ZLARFT, ZUNG2L
+      EXTERNAL           XERBLA, ZLARFB0C2, ZLARFT, ZUNG2L
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -177,7 +173,11 @@ SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
             LWKOPT = 1
          ELSE
             NB = ILAENV( 1, 'ZUNGQL', ' ', M, N, K, -1 )
-            LWKOPT = N*NB
+*
+*           Only need a workspace for zung2l in case of bailout
+*           and for the panel factorization
+*
+            LWKOPT = N
          END IF
          WORK( 1 ) = LWKOPT
 *
@@ -200,88 +200,77 @@ SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
       END IF
 *
       NBMIN = 2
-      NX = 0
+      NX = MAX( 0, ILAENV( 3, 'ZUNGQL', ' ', M, N, K, -1 ))
       IWS = N
-      IF( NB.GT.1 .AND. NB.LT.K ) THEN
-*
-*        Determine when to cross over from blocked to unblocked code.
 *
-         NX = MAX( 0, ILAENV( 3, 'ZUNGQL', ' ', M, N, K, -1 ) )
-         IF( NX.LT.K ) THEN
+      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*           Determine if workspace is large enough for blocked code.
+*        We use blocked code for the entire construction
 *
-            LDWORK = N
-            IWS = LDWORK*NB
-            IF( LWORK.LT.IWS ) THEN
+         KK = K
+      ELSE
+         KK = 0
+      END IF
 *
-*              Not enough workspace to use optimal NB:  reduce NB and
-*              determine the minimum value of NB.
+*     Possibly bail to the unblocked code.
 *
-               NB = LWORK / LDWORK
-               NBMIN = MAX( 2, ILAENV( 2, 'ZUNGQL', ' ', M, N, K,
-     $                      -1 ) )
-            END IF
-         END IF
+      IF( KK.EQ.0 ) THEN
+         CALL ZUNG2L( M, N, K, A, LDA, TAU, WORK, IINFO )
       END IF
 *
-      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+      IF( KK.GT.0 ) THEN
 *
-*        Use blocked code after the first block.
-*        The last kk columns are handled by the block method.
+*        Factor the first block assuming that our first application
+*        will be on the Identity matrix
 *
-         KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
+         I = 1
+         IB = NB
 *
-*        Set A(m-kk+1:m,1:n-kk) to zero.
+*        Form the triangular factor of the block reflector
+*        H = H(i+ib-1) . . . H(i+1) H(i)
 *
-         DO 20 J = 1, N - KK
-            DO 10 I = M - KK + 1, M
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
-      ELSE
-         KK = 0
-      END IF
+         CALL ZLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+     $                  A( 1, N-K+I ), LDA, TAU( I ),
+     $                  A( M-K+I, N-K+I ), LDA)
 *
-*     Use unblocked code for the first or only block.
+*        Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+*        Exploit the fact that we are applying to an identity 
 *
-      CALL ZUNG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+         CALL ZLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward', 
+     $         'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), 
+     $         LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
 *
-      IF( KK.GT.0 ) THEN
+*        Apply H to rows 1:m-k+i+ib-1 of current block
+*
+         CALL ZUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
+     $                TAU( I ), WORK, IINFO )
+
+*        Use blocked code on the remaining blocks if there are any.
+*
+         DO I = NB+1, K, NB
 *
-*        Use blocked code
+*           The last block may be less than size NB
 *
-         DO 50 I = K - KK + 1, K, NB
-            IB = MIN( NB, K-I+1 )
-            IF( N-K+I.GT.1 ) THEN
+            IB = MIN(NB, K-I+1)
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i+ib-1) . . . H(i+1) H(i)
+*           Form the triangular factor of the block reflector
+*           H = H(i+ib-1) . . . H(i+1) H(i)
 *
-               CALL ZLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
-     $                      A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK )
+            CALL ZLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+     $                  A( 1, N-K+I ), LDA, TAU( I ), 
+     $                  A( M-K+I, N-K+I ), LDA )
 *
-*              Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+*           Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
 *
-               CALL ZLARFB( 'Left', 'No transpose', 'Backward',
-     $                      'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
-     $                      A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA,
-     $                      WORK( IB+1 ), LDWORK )
-            END IF
+            CALL ZLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $            'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, 
+     $            A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
 *
 *           Apply H to rows 1:m-k+i+ib-1 of current block
 *
             CALL ZUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
      $                   TAU( I ), WORK, IINFO )
-*
-*           Set rows m-k+i+ib:m of current block to zero
-*
-            DO 40 J = N - K + I, N - K + I + IB - 1
-               DO 30 L = M - K + I + IB, M
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+         END DO
       END IF
 *
       WORK( 1 ) = IWS
diff --git a/SRC/zungqr.f b/SRC/zungqr.f
index 6c9f2e3ff5..bb5dd9ee03 100644
--- a/SRC/zungqr.f
+++ b/SRC/zungqr.f
@@ -122,7 +122,7 @@
 *> \ingroup ungqr
 *
 *  =====================================================================
-      SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
+      SUBROUTINE ZUNGQR(M, N, K, A, LDA, TAU, WORK, LWORK, INFO)
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -137,17 +137,13 @@ SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  =====================================================================
 *
-*     .. Parameters ..
-      COMPLEX*16         ZERO
-      PARAMETER          ( ZERO = ( 0.0D+0, 0.0D+0 ) )
-*     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
-     $                   LWKOPT, NB, NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, KI, KK, LWKOPT, NB,
+     $                   NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           XERBLA, ZLARFB, ZLARFT, ZUNG2R
+      EXTERNAL           XERBLA, ZLARFB0C2, ZLARFT, ZUNG2R
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -162,7 +158,11 @@ SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       INFO = 0
       NB = ILAENV( 1, 'ZUNGQR', ' ', M, N, K, -1 )
-      LWKOPT = MAX( 1, N )*NB
+*
+*     Only need a workspace for zung2r in case of bailout
+*     and for the panel factorization
+*
+      LWKOPT = MAX( 1, N )
       WORK( 1 ) = LWKOPT
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
@@ -191,92 +191,102 @@ SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
       END IF
 *
       NBMIN = 2
-      NX = 0
+*     Determine when to cross over from unblocked to blocked
+      NX = MAX( 0, ILAENV( 3, 'ZUNGQR', ' ', M, N, K, -1 ) )
       IWS = N
-      IF( NB.GT.1 .AND. NB.LT.K ) THEN
 *
-*        Determine when to cross over from blocked to unblocked code.
-*
-         NX = MAX( 0, ILAENV( 3, 'ZUNGQR', ' ', M, N, K, -1 ) )
-         IF( NX.LT.K ) THEN
+      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*           Determine if workspace is large enough for blocked code.
+*        Treat the last NB block starting at KK+1 specially then use our blocking
+*        method from the block starting at KI+1 to 1 
 *
-            LDWORK = N
-            IWS = LDWORK*NB
-            IF( LWORK.LT.IWS ) THEN
+         KI = K - 2 * NB
+         KK = K - NB
+      ELSE
+         KK = 0
+      END IF
 *
-*              Not enough workspace to use optimal NB:  reduce NB and
-*              determine the minimum value of NB.
+*     Potentially bail to the unblocked code.
 *
-               NB = LWORK / LDWORK
-               NBMIN = MAX( 2, ILAENV( 2, 'ZUNGQR', ' ', M, N, K,
-     $                      -1 ) )
-            END IF
-         END IF
+      IF( KK.EQ.0 ) THEN
+         CALL ZUNG2R( M, N, K, A, LDA, TAU, WORK, IINFO )
       END IF
 *
-      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+      IF( KK.GT.0 ) THEN
 *
-*        Use blocked code after the last block.
-*        The first kk columns are handled by the block method.
+*        Factor the last block assuming that our first application
+*        will be on the Identity matrix
 *
-         KI = ( ( K-NX-1 ) / NB )*NB
-         KK = MIN( K, KI+NB )
+         I = KK + 1
+         IB = NB
 *
-*        Set A(1:kk,kk+1:n) to zero.
+*        Form the triangular factor of the block reflector
+*        H = H(i) H(i+1) . . . H(i+ib-1)
 *
-         DO 20 J = KK + 1, N
-            DO 10 I = 1, KK
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
-      ELSE
-         KK = 0
-      END IF
+         CALL ZLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+     $                     LDA, TAU(I), A(I,I), LDA)
 *
-*     Use unblocked code for the last or only block.
+*        Apply H to A(i:m,i+ib:n) from the left
+*        Exploit the fact that we are applying to an identity 
 *
-      IF( KK.LT.N )
-     $   CALL ZUNG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
-     $                TAU( KK+1 ), WORK, IINFO )
+         CALL ZLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward',
+     $      'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I),
+     $      LDA, A(I,I+IB), LDA)
 *
-      IF( KK.GT.0 ) THEN
+*        Apply H to rows i:m of current block
 *
-*        Use blocked code
+         CALL ZUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, IINFO)
 *
-         DO 50 I = KI + 1, 1, -NB
-            IB = MIN( NB, K-I+1 )
-            IF( I+IB.LE.N ) THEN
+*        Use our standard blocking method after the last block
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i) H(i+1) . . . H(i+ib-1)
+         DO I = KI + 1, 1, -NB
+            IB = NB
 *
-               CALL ZLARFT( 'Forward', 'Columnwise', M-I+1, IB,
-     $                      A( I, I ), LDA, TAU( I ), WORK, LDWORK )
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-*              Apply H to A(i:m,i+ib:n) from the left
+            CALL ZLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+     $         LDA, TAU(I), A(I,I), LDA)
 *
-               CALL ZLARFB( 'Left', 'No transpose', 'Forward',
-     $                      'Columnwise', M-I+1, N-I-IB+1, IB,
-     $                      A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ),
-     $                      LDA, WORK( IB+1 ), LDWORK )
-            END IF
+*           Apply H to A(i:m,i+ib:n) from the left
+*
+            CALL ZLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $         'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+     $         LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
 *
 *           Apply H to rows i:m of current block
 *
-            CALL ZUNG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ),
-     $                   WORK,
-     $                   IINFO )
+            CALL ZUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
+     $         IINFO)
+         END DO
+*
+*        This checks for if K was a perfect multiple of NB
+*        so that we only have a special case for the last block when
+*        necessary
+*
+         IF(I.LT.1) THEN
+            IB = I + NB - 1
+            I = 1
 *
-*           Set rows 1:i-1 of current block to zero
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-            DO 40 J = I, I + IB - 1
-               DO 30 L = 1, I - 1
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+            CALL ZLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+     $         LDA, TAU(I), A(I,I), LDA)
+*
+*           Apply H to A(i:m,i+ib:n) from the left
+*
+            CALL ZLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $         'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+     $         LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
+*
+*           Apply H to rows i:m of current block
+*
+            CALL ZUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
+     $         IINFO)
+         END IF
       END IF
 *
       WORK( 1 ) = IWS
diff --git a/SRC/zungrq.f b/SRC/zungrq.f
index dcc2772d67..1a7640d102 100644
--- a/SRC/zungrq.f
+++ b/SRC/zungrq.f
@@ -122,7 +122,7 @@
 *> \ingroup ungrq
 *
 *  =====================================================================
-      SUBROUTINE ZUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
+      SUBROUTINE ZUNGRQ(M, N, K, A, LDA, TAU, WORK, LWORK, INFO)
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -137,13 +137,9 @@ SUBROUTINE ZUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  =====================================================================
 *
-*     .. Parameters ..
-      COMPLEX*16         ZERO
-      PARAMETER          ( ZERO = ( 0.0D+0, 0.0D+0 ) )
-*     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, II, IINFO, IWS, J, KK, L, LDWORK,
+      INTEGER            I, IB, II, IINFO, IWS, KK, LDWORK,
      $                   LWKOPT, NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
@@ -230,61 +226,67 @@ SUBROUTINE ZUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *        Use blocked code after the first block.
 *        The last kk rows are handled by the block method.
 *
-         KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
-*
-*        Set A(1:m-kk,n-kk+1:n) to zero.
-*
-         DO 20 J = N - KK + 1, N
-            DO 10 I = 1, M - KK
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
+         KK = K
       ELSE
          KK = 0
       END IF
 *
-*     Use unblocked code for the first or only block.
+*     Potentially bail to the unblocked code
 *
-      CALL ZUNGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+      IF( KK.EQ.0 ) THEN
+         CALL ZUNGR2( M, N, K, A, LDA, TAU, WORK, IINFO )
+      END IF
 *
       IF( KK.GT.0 ) THEN
 *
-*        Use blocked code
+*        Factor the first block assuming that our first application
+*        will be on the Identity matrix
 *
-         DO 50 I = K - KK + 1, K, NB
-            IB = MIN( NB, K-I+1 )
-            II = M - K + I
-            IF( II.GT.1 ) THEN
+         I = 1
+         IB = NB
+         II = M - K + I
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i+ib-1) . . . H(i+1) H(i)
+*        Form the triangular factor of the block reflector
+*        H = H(i+ib-1) . . . H(i+1) H(i)
 *
-               CALL ZLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
-     $                      A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+         CALL ZLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+     $                A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
 *
-*              Apply H**H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+*        Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
 *
-               CALL ZLARFB( 'Right', 'Conjugate transpose',
-     $                      'Backward',
-     $                      'Rowwise', II-1, N-K+I+IB-1, IB, A( II, 1 ),
-     $                      LDA, WORK, LDWORK, A, LDA, WORK( IB+1 ),
-     $                      LDWORK )
-            END IF
+         CALL ZLARFB0C2(.TRUE., 'Right', 'Conjugate', 'Backward', 
+     $         'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK,
+     $         LDWORK, A, LDA)
 *
-*           Apply H**H to columns 1:n-k+i+ib-1 of current block
+*           Apply H**T to columns 1:n-k+i+ib-1 of current block
 *
-            CALL ZUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
-     $                   TAU( I ),
-     $                   WORK, IINFO )
+         CALL ZUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
+     $                TAU( I ), WORK, IINFO )
+
+         DO I = NB+1, K, NB
 *
-*           Set columns n-k+i+ib:n of current block to zero
+*           The last block may be less than size NB
 *
-            DO 40 L = N - K + I + IB, N
-               DO 30 J = II, II + IB - 1
-                  A( J, L ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+            IB = MIN(NB, K-I+1)
+            II = M - K + I
+*
+*           Form the triangular factor of the block reflector
+*           H = H(i+ib-1) . . . H(i+1) H(i)
+*
+            CALL ZLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+     $                   A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+*
+*           Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+*
+            CALL ZLARFB0C2(.FALSE., 'Right', 'Conjugate', 
+     $            'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, 
+     $            A(II,1), LDA, WORK, LDWORK, A, LDA)
+*
+*           Apply H**T to columns 1:n-k+i+ib-1 of current block
+*
+            CALL ZUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
+     $                   TAU( I ), WORK, IINFO )
+         END DO
       END IF
 *
       WORK( 1 ) = IWS

From 2b21ac7bceadaeb7f6022d0c00d38aedd74068d6 Mon Sep 17 00:00:00 2001
From: Johnathan Rhyne <rhynejohnathan@gmail.com>
Date: Tue, 11 Feb 2025 17:05:34 -0700
Subject: [PATCH 04/10] single complex implementation also pass local tests.
 TODO: check github tests

---
 SRC/CMakeLists.txt |   2 +-
 SRC/Makefile       |   2 +-
 SRC/clarfb0c2.f    | 381 +++++++++++++++++++++++++++++++++++++++++++++
 SRC/cunglq.f       | 122 +++++++++------
 SRC/cungql.f       | 117 +++++++-------
 SRC/cungqr.f       | 146 +++++++++--------
 SRC/cungrq.f       |  90 +++++------
 SRC/slarfb0c2.f    |  16 +-
 SRC/zlarfb0c2.f    |  16 +-
 9 files changed, 649 insertions(+), 243 deletions(-)
 create mode 100644 SRC/clarfb0c2.f

diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index 16156b72a2..24740835b8 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -218,7 +218,7 @@ set(CLASRC
    claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqp2rk.f claqp3rk.f claqsb.f
    claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f
    claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f
-   clarf.f  clarf1f.f clarf1l.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f
+   clarf.f  clarf1f.f clarf1l.f clarfb.f clarfb0c2.f clarfb_gett.f clarfg.f clarfgp.f clarft.f
    clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f90 clartv.f
    clarz.f  clarzb.f clarzt.f clascl.f claset.f clasr.f  classq.f90
    claswp.f clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f
diff --git a/SRC/Makefile b/SRC/Makefile
index 44ba06be9d..b9eaf1bca3 100644
--- a/SRC/Makefile
+++ b/SRC/Makefile
@@ -249,7 +249,7 @@ CLASRC = \
    claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqp2rk.o claqp3rk.o claqsb.o \
    claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \
    claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \
-   clarf.o  clarf1f.o clarf1l.o clarfb.o clarfb_gett.o clarfg.o clarft.o clarfgp.o \
+   clarf.o  clarf1f.o clarf1l.o clarfb.o clarfb0c2.o clarfb_gett.o clarfg.o clarft.o clarfgp.o \
    clarfx.o clarfy.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \
    clarz.o  clarzb.o clarzt.o clascl.o claset.o clasr.o  classq.o \
    claswp.o clasyf.o clasyf_rook.o clasyf_rk.o clasyf_aa.o \
diff --git a/SRC/clarfb0c2.f b/SRC/clarfb0c2.f
new file mode 100644
index 0000000000..4ab8f47aea
--- /dev/null
+++ b/SRC/clarfb0c2.f
@@ -0,0 +1,381 @@
+      SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+     $                     K, V, LDV, T, LDT, C, LDC)
+         ! Scalar arguments
+         INTEGER           M, N, K, LDV, LDC, LDT
+         CHARACTER         SIDE, TRANS, DIRECT, STOREV 
+         ! True means that we are assuming C2 is the identity matrix
+         !     and thus don't reference whatever is present in C2 
+         !     at the beginning.
+         LOGICAL           C2I
+
+         ! Array arguments
+         COMPLEX*8         V(LDV,*), C(LDC,*), T(LDT,*)
+         ! Local scalars
+         LOGICAL           QR, LQ, QL, DIRF, COLV, SIDEL, SIDER,
+     $                     TRANST
+         INTEGER           I, J
+         ! Intrinsic Functions
+         INTRINSIC         CONJG
+         ! External functions
+         LOGICAL           LSAME
+         EXTERNAL          LSAME
+         ! External Subroutines
+         EXTERNAL          CGEMM, CTRMM
+         ! Parameters
+         COMPLEX*8         ONE, ZERO, NEG_ONE
+         PARAMETER(ONE=(1.0E+0, 0.0E+0),
+     $            ZERO = (0.0E+0, 0.0E+0), 
+     $            NEG_ONE = (-1.0E+0, 0.0E+0))
+
+         ! Beginning of executable statements
+         ! Convert our character flags to logical values
+         DIRF = LSAME(DIRECT,'F')
+         COLV = LSAME(STOREV,'C')
+         SIDEL = LSAME(SIDE,'L')
+         SIDER = LSAME(SIDE,'R')
+         TRANST = LSAME(TRANS,'C')
+
+         ! Determine which of the 4 modes are using.
+         ! QR is when we store the reflectors column by column and have the
+         ! 'first' reflector stored in the first column
+         QR = DIRF.AND.COLV
+
+         ! LQ is when we store the reflectors row by row and have the
+         ! 'first' reflector stored in the first row
+         LQ = DIRF.AND.(.NOT.COLV)
+
+         ! QL is when we store the reflectors column by column and have the
+         ! 'first' reflector stored in the last column
+         QL = (.NOT.DIRF).AND.COLV
+
+         ! RQ is when we store the reflectors row by row and have the
+         ! 'first' reflector stored in the last row
+         ! RQ = (.NOT.DIRF).AND.(.NOT.COLV)
+         ! Since we have exactly one of these 4 modes, we don't need to actually
+         ! store the value of RQ, instead we assume this is the case if we fail
+         ! the above 3 checks.
+
+         IF (QR) THEN
+            ! We are computing C = HC = (I - VTV')C
+            ! Where: V = [ V1 ] and C = [ C1 ]
+            !            [ V2 ]         [ C2 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{M-K\times K}
+            !     C1=0\in\R^{K\times N}
+            !     C2\in\R^{M-K\times N}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0  ]
+            !               [ V2 ]             [ C2 ]
+            !   = [ 0  ] - [ V1 ]T*V2'*C2
+            !     [ C2 ]   [ V2 ]
+            !
+            !   = [ 0  ] - [ V1*T*V2'*C2 ]
+            !     [ C2 ]   [ V2*T*V2'*C2 ]
+            !
+            !   = [      V1*T*V2'*C2 ]
+            !     [ C2 - V2*T*V2'*C2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = V2'*C2
+            ! C1 = T*C1
+            ! C2 = C2 - V2*C1
+            ! C1 = -V1*C1
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDEL ) THEN
+               CALL XERBLA('CLARFB0C2', 2)
+               RETURN
+            ELSE IF(TRANST) THEN
+               CALL XERBLA('CLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = V2'*C2
+            !
+            IF (C2I) THEN
+               DO J = 1, N
+                  DO I = 1, K
+                     C(I,J) = CONJG(V(K+J,I))
+                  END DO
+               END DO
+            ELSE
+               CALL CGEMM('Conjugate', 'No Transpose', K, N, M - K,
+     $                     ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO,
+     $                     C, LDC)
+            END IF
+            !
+            ! C1 = T*C1
+            !
+            CALL CTRMM('Left', 'Upper', 'No Transpose', 'Non-unit',
+     $                  K, N, ONE, T, LDT, C, LDC)
+            !
+            ! C2 = C2 - V2*C1 = -V2*C1 + C2
+            !
+            IF (C2I) THEN
+               CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $                     NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO,
+     $                     C(K+1,1), LDC)
+               DO I = 1, N
+                  C(K+I,I) = C(K+I,I) + ONE
+               END DO
+            ELSE
+               CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $                     NEG_ONE, V(K+1,1), LDV, C, LDC, ONE,
+     $                     C(K+1,1), LDC)
+            END IF
+            !
+            ! C1 = -V1*C1
+            !
+            CALL CTRMM('Left', 'Lower', 'No Transpose', 'Unit',
+     $                  K, N, NEG_ONE, V, LDV, C, LDC)
+         ELSE IF (LQ) THEN
+            ! We are computing C = CH' = C(I-V'T'V)
+            ! Where: V = [ V1 V2 ] and C = [ C1 C2 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{K\times N-K}
+            !     C1=0\in\R^{M\times K}
+            !     C2\in\R^{M\times N-K}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ])
+            !                         [ V2' ]
+            !
+            !   = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ]
+            !                          [ V2' ]
+            !
+            !   = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ]
+            !
+            !   = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = C2*V2'
+            ! C1 = C1*T'
+            ! C2 = C2 - C1*V2
+            ! C1 = -C1*V1
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDER ) THEN
+               CALL XERBLA('CLARFB0C2', 2)
+               RETURN
+            ELSE IF(.NOT.TRANST) THEN
+               CALL XERBLA('CLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = C2*V2'
+            !
+            IF( C2I ) THEN
+               DO J = 1, K
+                  DO I = 1, M
+                     C(I,J) = CONJG(V(J,K+I))
+                  END DO
+               END DO
+            ELSE
+               CALL CGEMM('No Transpose', 'Conjugate', M, K, N-K,
+     $               ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C,
+     $               LDC)
+            END IF
+            !
+            ! C1 = C1*T'
+            !
+            CALL CTRMM('Right', 'Upper', 'Conjugate', 'Non-unit',
+     $            M, K, ONE, T, LDT, C, LDC)
+            !
+            ! C2 = C2 - C1*V2 = -C1*V2 + C2
+            !
+            IF( C2I ) THEN
+               CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $               NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1),
+     $               LDC)
+               DO I = 1, M
+                  C(I,K+I) = C(I,K+I) + ONE
+               END DO
+            ELSE
+               CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $               NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1),
+     $               LDC)
+            END IF
+            !
+            ! C1 = -C1*V1
+            !
+            CALL CTRMM('Right', 'Upper', 'No Transpose', 'Unit',
+     $            M, K, NEG_ONE, V, LDV, C, LDC)
+         ELSE IF (QL) THEN
+            ! We are computing C = HC = (I - VTV')C
+            ! Where: V = [ V2 ] and C = [ C2 ]
+            !            [ V1 ]         [ C1 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{M-K\times K}
+            !     C1=0\in\R^{K\times N}
+            !     C2\in\R^{M-K\times N}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ]
+            !             [ V1 ]             [ 0  ]
+            !
+            !   = [ C2 ] - [ V2 ]T*V2'*C2
+            !     [ 0  ]   [ V1 ]
+            !
+            !   = [ C2 ] - [ V2*T*V2'*C2 ]
+            !     [ 0  ]   [ V1*T*V2'*C2 ]
+            !
+            !   = [ C2 - V2*T*V2'*C2 ]
+            !     [    - V1*T*V2'*C2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = V2'*C2
+            ! C1 = T*C1
+            ! C2 = C2 - V2*C1
+            ! C1 = -V1*C1
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDEL ) THEN
+               CALL XERBLA('CLARFB0C2', 2)
+               RETURN
+            ELSE IF(TRANST) THEN
+               CALL XERBLA('CLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = V2'*C2
+            !
+            IF( C2I ) THEN
+               DO J = 1, N
+                  DO I = 1, K
+                     C(M-K+I,J) = CONJG(V(J,I))
+                  END DO
+               END DO
+            ELSE
+               CALL CGEMM('Conjugate', 'No Transpose', K, N, M-K,
+     $            ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC)
+            END IF
+            !
+            ! C1 = T*C1
+            !
+            CALL CTRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+     $         K, N, ONE, T, LDT, C(M-K+1,1), LDC)
+            !
+            ! C2 = C2 - V2*C1 = -V2*C1 + C2
+            !
+            IF( C2I ) THEN
+               CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $            NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC)
+               DO I = 1, N
+                  C(I,I) = C(I,I) + ONE
+               END DO
+            ELSE
+               CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $            NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC)
+            END IF
+            !
+            ! C1 = -V1*C1
+            !
+            CALL CTRMM('Left', 'Upper', 'No Transpose', 'Unit',
+     $         K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC)
+         ELSE ! IF (RQ) THEN
+            ! We are computing C = CH' = C(I-V'T'V)
+            ! Where: V = [ V2 V1] and C = [ C2 C1 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{K\times N-K}
+            !     C1=0\in\R^{M\times K}
+            !     C2\in\R^{M\times N-K}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ]
+            !                          [ V1' ]
+            !
+            !   = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ]
+            !                           [ V1' ]
+            !
+            !   = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ]
+            !
+            !   = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ]
+            !
+            !   = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = C2*V2'
+            ! C1 = C1*T'
+            ! C2 = C2 - C1*V2
+            ! C1 = -C1*V1
+            !
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDER ) THEN
+               CALL XERBLA('CLARFB0C2', 2)
+               RETURN
+            ELSE IF(.NOT.TRANST) THEN
+               CALL XERBLA('CLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = C2*V2'
+            !
+            IF( C2I ) THEN
+               DO J = 1, K
+                  DO I = 1, M
+                     C(I,N-K+J) = CONJG(V(J,I))
+                  END DO
+               END DO
+            ELSE
+               CALL CGEMM('No Transpose', 'Conjugate', M, K, N-K,
+     $            ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC)
+            END IF
+            !
+            ! C1 = C1*T'
+            !
+            CALL CTRMM('Right', 'Lower', 'Conjugate', 'Non-unit',
+     $         M, K, ONE, T, LDT, C(1, N-K+1), LDC)
+            !
+            ! C2 = C2 - C1*V2 = -C1*V2 + C2
+            !
+            IF( C2I ) THEN
+               CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $            NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC)
+               DO I = 1, M
+                  C(I,I) = C(I,I) + ONE
+               END DO
+            ELSE
+               CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $            NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC)
+            END IF
+            !
+            ! C1 = -C1*V1
+            !
+            CALL CTRMM('Right', 'Lower', 'No Transpose', 'Unit',
+     $         M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC)
+         END IF
+      END SUBROUTINE
diff --git a/SRC/cunglq.f b/SRC/cunglq.f
index 7f42c3dcb0..a57f5a037d 100644
--- a/SRC/cunglq.f
+++ b/SRC/cunglq.f
@@ -136,17 +136,13 @@ SUBROUTINE CUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  =====================================================================
 *
-*     .. Parameters ..
-      COMPLEX            ZERO
-      PARAMETER          ( ZERO = ( 0.0E+0, 0.0E+0 ) )
-*     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
-     $                   LWKOPT, NB, NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, KI, KK, LWKOPT, LDWORK,
+     $                   NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CLARFB, CLARFT, CUNGL2, XERBLA
+      EXTERNAL           CLARFB0C2, CLARFT, CUNGL2, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -218,67 +214,95 @@ SUBROUTINE CUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*        Use blocked code after the last block.
-*        The first kk rows are handled by the block method.
-*
-         KI = ( ( K-NX-1 ) / NB )*NB
-         KK = MIN( K, KI+NB )
-*
-*        Set A(kk+1:m,1:kk) to zero.
+*        Treat the last NB block starting at KK+1 specially then use our blocking
+*        method from the block starting at KI+1 to 1 
 *
-         DO 20 J = 1, KK
-            DO 10 I = KK + 1, M
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
+         KI = K - 2 * NB
+         KK = K - NB
       ELSE
          KK = 0
       END IF
 *
-*     Use unblocked code for the last or only block.
+*     Potentially bail to the unblocked version
 *
-      IF( KK.LT.M )
-     $   CALL CUNGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
-     $                TAU( KK+1 ), WORK, IINFO )
+      IF( KK.EQ.0 ) THEN
+         CALL CUNGL2( M, N, K, A, LDA, TAU, WORK, IINFO )
+      END IF
 *
       IF( KK.GT.0 ) THEN
 *
-*        Use blocked code
+*        Factor the last block assuming that our first application
+*        will be on the Identity matrix
 *
-         DO 50 I = KI + 1, 1, -NB
-            IB = MIN( NB, K-I+1 )
-            IF( I+IB.LE.M ) THEN
+         I = KK + 1
+         IB = NB
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i) H(i+1) . . . H(i+ib-1)
+*        Form the triangular factor of the block reflector
+*        H = H(i) H(i+1) . . . H(i+ib-1)
 *
-               CALL CLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I,
-     $                      I ),
-     $                      LDA, TAU( I ), WORK, LDWORK )
+         CALL CLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $               LDA, TAU( I ), WORK, LDWORK )
 *
-*              Apply H**H to A(i+ib:m,i:n) from the right
+*        Apply H**T to A(i+ib:m,i:n) from the right
+*        Exploit the fact that we are applying to an identity 
 *
-               CALL CLARFB( 'Right', 'Conjugate transpose',
-     $                      'Forward',
-     $                      'Rowwise', M-I-IB+1, N-I+1, IB, A( I, I ),
-     $                      LDA, WORK, LDWORK, A( I+IB, I ), LDA,
-     $                      WORK( IB+1 ), LDWORK )
-            END IF
+         CALL CLARFB0C2(.TRUE., 'Right', 'Conjugate', 'Forward',
+     $         'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $         LDWORK, A(I+IB,I), LDA)
+*
+*        Apply H**T to columns i:n of current block
+
+         CALL CUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+     $                WORK, IINFO )
+*
+*        Use our standard blocking method after the last block
+*
+         DO I = KI + 1, 1, -NB
+            IB = NB
+*
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-*           Apply H**H to columns i:n of current block
+            CALL CLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $                  LDA, TAU( I ), WORK, LDWORK )
 *
+*           Apply H**T to A(i+ib:m,i:n) from the right
+*
+            CALL CLARFB0C2(.FALSE., 'Right', 'Conjugate', 'Forward',
+     $            'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $            LDWORK, A(I+IB,I), LDA)
+*
+*           Apply H**T to columns i:n of current block
+
             CALL CUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
-     $                   WORK,
-     $                   IINFO )
+     $                   WORK, IINFO )
+         END DO
+*
+*        This checks for if K was a perfect multiple of NB
+*        so that we only have a special case for the last block when
+*        necessary
+*
+         IF(I.LT.1) THEN
+            IB = I + NB - 1
+            I = 1
+*
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-*           Set columns 1:i-1 of current block to zero
+            CALL CLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $                  LDA, TAU( I ), WORK, LDWORK )
 *
-            DO 40 J = 1, I - 1
-               DO 30 L = I, I + IB - 1
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+*           Apply H**T to A(i+ib:m,i:n) from the right
+*
+            CALL CLARFB0C2(.FALSE., 'Right', 'Conjugate', 'Forward',
+     $            'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $            LDWORK, A(I+IB,I), LDA)
+*
+*           Apply H**T to columns i:n of current block
+
+            CALL CUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+     $                   WORK, IINFO )
+         END IF
       END IF
 *
       WORK( 1 ) = SROUNDUP_LWORK(IWS)
diff --git a/SRC/cungql.f b/SRC/cungql.f
index 3da2702d8e..893303b4e1 100644
--- a/SRC/cungql.f
+++ b/SRC/cungql.f
@@ -137,17 +137,13 @@ SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  =====================================================================
 *
-*     .. Parameters ..
-      COMPLEX            ZERO
-      PARAMETER          ( ZERO = ( 0.0E+0, 0.0E+0 ) )
-*     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT,
-     $                   NB, NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, KK, LWKOPT, NB, NBMIN,
+     $                   NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CLARFB, CLARFT, CUNG2L, XERBLA
+      EXTERNAL           CLARFB0C2, CLARFT, CUNG2L, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -178,7 +174,11 @@ SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
             LWKOPT = 1
          ELSE
             NB = ILAENV( 1, 'CUNGQL', ' ', M, N, K, -1 )
-            LWKOPT = N*NB
+*
+*           Only need a workspace for cung2l in case of bailout
+*           and for the panel factorization
+*
+            LWKOPT = N
          END IF
          WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
@@ -201,88 +201,77 @@ SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
       END IF
 *
       NBMIN = 2
-      NX = 0
+      NX = MAX( 0, ILAENV( 3, 'CUNGQL', ' ', M, N, K, -1 ))
       IWS = N
-      IF( NB.GT.1 .AND. NB.LT.K ) THEN
 *
-*        Determine when to cross over from blocked to unblocked code.
-*
-         NX = MAX( 0, ILAENV( 3, 'CUNGQL', ' ', M, N, K, -1 ) )
-         IF( NX.LT.K ) THEN
+      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*           Determine if workspace is large enough for blocked code.
+*        We use blocked code for the entire construction
 *
-            LDWORK = N
-            IWS = LDWORK*NB
-            IF( LWORK.LT.IWS ) THEN
+         KK = K
+      ELSE
+         KK = 0
+      END IF
 *
-*              Not enough workspace to use optimal NB:  reduce NB and
-*              determine the minimum value of NB.
+*     Possibly bail to the unblocked code.
 *
-               NB = LWORK / LDWORK
-               NBMIN = MAX( 2, ILAENV( 2, 'CUNGQL', ' ', M, N, K,
-     $                      -1 ) )
-            END IF
-         END IF
+      IF( KK.EQ.0 ) THEN
+         CALL CUNG2L( M, N, K, A, LDA, TAU, WORK, IINFO )
       END IF
 *
-      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+      IF( KK.GT.0 ) THEN
 *
-*        Use blocked code after the first block.
-*        The last kk columns are handled by the block method.
+*        Factor the first block assuming that our first application
+*        will be on the Identity matrix
 *
-         KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
+         I = 1
+         IB = NB
 *
-*        Set A(m-kk+1:m,1:n-kk) to zero.
+*        Form the triangular factor of the block reflector
+*        H = H(i+ib-1) . . . H(i+1) H(i)
 *
-         DO 20 J = 1, N - KK
-            DO 10 I = M - KK + 1, M
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
-      ELSE
-         KK = 0
-      END IF
+         CALL CLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+     $                  A( 1, N-K+I ), LDA, TAU( I ),
+     $                  A( M-K+I, N-K+I ), LDA)
 *
-*     Use unblocked code for the first or only block.
+*        Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+*        Exploit the fact that we are applying to an identity 
 *
-      CALL CUNG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+         CALL CLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward', 
+     $         'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), 
+     $         LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
 *
-      IF( KK.GT.0 ) THEN
+*        Apply H to rows 1:m-k+i+ib-1 of current block
 *
-*        Use blocked code
+         CALL CUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
+     $                TAU( I ), WORK, IINFO )
+
+*        Use blocked code on the remaining blocks if there are any.
 *
-         DO 50 I = K - KK + 1, K, NB
-            IB = MIN( NB, K-I+1 )
-            IF( N-K+I.GT.1 ) THEN
+         DO I = NB+1, K, NB
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i+ib-1) . . . H(i+1) H(i)
+*           The last block may be less than size NB
 *
-               CALL CLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
-     $                      A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK )
+            IB = MIN(NB, K-I+1)
 *
-*              Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+*           Form the triangular factor of the block reflector
+*           H = H(i+ib-1) . . . H(i+1) H(i)
 *
-               CALL CLARFB( 'Left', 'No transpose', 'Backward',
-     $                      'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
-     $                      A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA,
-     $                      WORK( IB+1 ), LDWORK )
-            END IF
+            CALL CLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+     $                  A( 1, N-K+I ), LDA, TAU( I ), 
+     $                  A( M-K+I, N-K+I ), LDA )
+*
+*           Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+*
+            CALL CLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $            'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, 
+     $            A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
 *
 *           Apply H to rows 1:m-k+i+ib-1 of current block
 *
             CALL CUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
      $                   TAU( I ), WORK, IINFO )
-*
-*           Set rows m-k+i+ib:m of current block to zero
-*
-            DO 40 J = N - K + I, N - K + I + IB - 1
-               DO 30 L = M - K + I + IB, M
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+         END DO
       END IF
 *
       WORK( 1 ) = CMPLX( IWS )
diff --git a/SRC/cungqr.f b/SRC/cungqr.f
index eb49d2fed5..2544b67a99 100644
--- a/SRC/cungqr.f
+++ b/SRC/cungqr.f
@@ -137,17 +137,13 @@ SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  =====================================================================
 *
-*     .. Parameters ..
-      COMPLEX            ZERO
-      PARAMETER          ( ZERO = ( 0.0E+0, 0.0E+0 ) )
-*     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
-     $                   LWKOPT, NB, NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, KI, KK, LWKOPT, NB,
+     $                   NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CLARFB, CLARFT, CUNG2R, XERBLA
+      EXTERNAL           CLARFB0C2, CLARFT, CUNG2R, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -163,7 +159,11 @@ SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       INFO = 0
       NB = ILAENV( 1, 'CUNGQR', ' ', M, N, K, -1 )
-      LWKOPT = MAX( 1, N )*NB
+*
+*     Only need a workspace for zung2r in case of bailout
+*     and for the panel factorization
+*
+      LWKOPT = MAX( 1, N )
       WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
@@ -192,92 +192,102 @@ SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
       END IF
 *
       NBMIN = 2
-      NX = 0
+*     Determine when to cross over from unblocked to blocked
+      NX = MAX( 0, ILAENV( 3, 'CUNGQR', ' ', M, N, K, -1 ) )
       IWS = N
-      IF( NB.GT.1 .AND. NB.LT.K ) THEN
 *
-*        Determine when to cross over from blocked to unblocked code.
-*
-         NX = MAX( 0, ILAENV( 3, 'CUNGQR', ' ', M, N, K, -1 ) )
-         IF( NX.LT.K ) THEN
+      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*           Determine if workspace is large enough for blocked code.
+*        Treat the last NB block starting at KK+1 specially then use our blocking
+*        method from the block starting at KI+1 to 1 
 *
-            LDWORK = N
-            IWS = LDWORK*NB
-            IF( LWORK.LT.IWS ) THEN
+         KI = K - 2 * NB
+         KK = K - NB
+      ELSE
+         KK = 0
+      END IF
 *
-*              Not enough workspace to use optimal NB:  reduce NB and
-*              determine the minimum value of NB.
+*     Potentially bail to the unblocked code.
 *
-               NB = LWORK / LDWORK
-               NBMIN = MAX( 2, ILAENV( 2, 'CUNGQR', ' ', M, N, K,
-     $                      -1 ) )
-            END IF
-         END IF
+      IF( KK.EQ.0 ) THEN
+         CALL CUNG2R( M, N, K, A, LDA, TAU, WORK, IINFO )
       END IF
 *
-      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+      IF( KK.GT.0 ) THEN
 *
-*        Use blocked code after the last block.
-*        The first kk columns are handled by the block method.
+*        Factor the last block assuming that our first application
+*        will be on the Identity matrix
 *
-         KI = ( ( K-NX-1 ) / NB )*NB
-         KK = MIN( K, KI+NB )
+         I = KK + 1
+         IB = NB
 *
-*        Set A(1:kk,kk+1:n) to zero.
+*        Form the triangular factor of the block reflector
+*        H = H(i) H(i+1) . . . H(i+ib-1)
 *
-         DO 20 J = KK + 1, N
-            DO 10 I = 1, KK
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
-      ELSE
-         KK = 0
-      END IF
+         CALL CLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+     $                     LDA, TAU(I), A(I,I), LDA)
 *
-*     Use unblocked code for the last or only block.
+*        Apply H to A(i:m,i+ib:n) from the left
+*        Exploit the fact that we are applying to an identity 
 *
-      IF( KK.LT.N )
-     $   CALL CUNG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
-     $                TAU( KK+1 ), WORK, IINFO )
+         CALL CLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward',
+     $      'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I),
+     $      LDA, A(I,I+IB), LDA)
 *
-      IF( KK.GT.0 ) THEN
+*        Apply H to rows i:m of current block
 *
-*        Use blocked code
+         CALL CUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, IINFO)
 *
-         DO 50 I = KI + 1, 1, -NB
-            IB = MIN( NB, K-I+1 )
-            IF( I+IB.LE.N ) THEN
+*        Use our standard blocking method after the last block
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i) H(i+1) . . . H(i+ib-1)
+         DO I = KI + 1, 1, -NB
+            IB = NB
 *
-               CALL CLARFT( 'Forward', 'Columnwise', M-I+1, IB,
-     $                      A( I, I ), LDA, TAU( I ), WORK, LDWORK )
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-*              Apply H to A(i:m,i+ib:n) from the left
+            CALL CLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+     $         LDA, TAU(I), A(I,I), LDA)
 *
-               CALL CLARFB( 'Left', 'No transpose', 'Forward',
-     $                      'Columnwise', M-I+1, N-I-IB+1, IB,
-     $                      A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ),
-     $                      LDA, WORK( IB+1 ), LDWORK )
-            END IF
+*           Apply H to A(i:m,i+ib:n) from the left
+*
+            CALL CLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $         'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+     $         LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
 *
 *           Apply H to rows i:m of current block
 *
-            CALL CUNG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ),
-     $                   WORK,
-     $                   IINFO )
+            CALL CUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
+     $         IINFO)
+         END DO
+*
+*        This checks for if K was a perfect multiple of NB
+*        so that we only have a special case for the last block when
+*        necessary
+*
+         IF(I.LT.1) THEN
+            IB = I + NB - 1
+            I = 1
 *
-*           Set rows 1:i-1 of current block to zero
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-            DO 40 J = I, I + IB - 1
-               DO 30 L = 1, I - 1
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+            CALL CLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+     $         LDA, TAU(I), A(I,I), LDA)
+*
+*           Apply H to A(i:m,i+ib:n) from the left
+*
+            CALL CLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $         'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+     $         LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
+*
+*           Apply H to rows i:m of current block
+*
+            CALL CUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
+     $         IINFO)
+         END IF
       END IF
 *
       WORK( 1 ) = SROUNDUP_LWORK(IWS)
diff --git a/SRC/cungrq.f b/SRC/cungrq.f
index 7e812b839f..4a9b2fe512 100644
--- a/SRC/cungrq.f
+++ b/SRC/cungrq.f
@@ -137,17 +137,13 @@ SUBROUTINE CUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  =====================================================================
 *
-*     .. Parameters ..
-      COMPLEX            ZERO
-      PARAMETER          ( ZERO = ( 0.0E+0, 0.0E+0 ) )
-*     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, II, IINFO, IWS, J, KK, L, LDWORK,
+      INTEGER            I, IB, II, IINFO, IWS, KK, LDWORK,
      $                   LWKOPT, NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CLARFB, CLARFT, CUNGR2, XERBLA
+      EXTERNAL           CLARFB0C2, CLARFT, CUNGR2, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -231,61 +227,67 @@ SUBROUTINE CUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *        Use blocked code after the first block.
 *        The last kk rows are handled by the block method.
 *
-         KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
-*
-*        Set A(1:m-kk,n-kk+1:n) to zero.
-*
-         DO 20 J = N - KK + 1, N
-            DO 10 I = 1, M - KK
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
+         KK = K
       ELSE
          KK = 0
       END IF
 *
-*     Use unblocked code for the first or only block.
+*     Potentially bail to the unblocked code
 *
-      CALL CUNGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+      IF( KK.EQ.0 ) THEN
+         CALL CUNGR2( M, N, K, A, LDA, TAU, WORK, IINFO )
+      END IF
 *
       IF( KK.GT.0 ) THEN
 *
-*        Use blocked code
+*        Factor the first block assuming that our first application
+*        will be on the Identity matrix
 *
-         DO 50 I = K - KK + 1, K, NB
-            IB = MIN( NB, K-I+1 )
-            II = M - K + I
-            IF( II.GT.1 ) THEN
+         I = 1
+         IB = NB
+         II = M - K + I
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i+ib-1) . . . H(i+1) H(i)
+*        Form the triangular factor of the block reflector
+*        H = H(i+ib-1) . . . H(i+1) H(i)
 *
-               CALL CLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
-     $                      A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+         CALL CLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+     $                A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
 *
-*              Apply H**H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+*        Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
 *
-               CALL CLARFB( 'Right', 'Conjugate transpose',
-     $                      'Backward',
-     $                      'Rowwise', II-1, N-K+I+IB-1, IB, A( II, 1 ),
-     $                      LDA, WORK, LDWORK, A, LDA, WORK( IB+1 ),
-     $                      LDWORK )
-            END IF
+         CALL CLARFB0C2(.TRUE., 'Right', 'Conjugate', 'Backward', 
+     $         'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK,
+     $         LDWORK, A, LDA)
 *
-*           Apply H**H to columns 1:n-k+i+ib-1 of current block
+*           Apply H**T to columns 1:n-k+i+ib-1 of current block
 *
-            CALL CUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
-     $                   TAU( I ),
-     $                   WORK, IINFO )
+         CALL CUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
+     $                TAU( I ), WORK, IINFO )
+
+         DO I = NB+1, K, NB
 *
-*           Set columns n-k+i+ib:n of current block to zero
+*           The last block may be less than size NB
 *
-            DO 40 L = N - K + I + IB, N
-               DO 30 J = II, II + IB - 1
-                  A( J, L ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+            IB = MIN(NB, K-I+1)
+            II = M - K + I
+*
+*           Form the triangular factor of the block reflector
+*           H = H(i+ib-1) . . . H(i+1) H(i)
+*
+            CALL CLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+     $                   A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+*
+*           Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+*
+            CALL CLARFB0C2(.FALSE., 'Right', 'Conjugate', 
+     $            'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, 
+     $            A(II,1), LDA, WORK, LDWORK, A, LDA)
+*
+*           Apply H**T to columns 1:n-k+i+ib-1 of current block
+*
+            CALL CUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
+     $                   TAU( I ), WORK, IINFO )
+         END DO
       END IF
 *
       WORK( 1 ) = SROUNDUP_LWORK(IWS)
diff --git a/SRC/slarfb0c2.f b/SRC/slarfb0c2.f
index 74a31f6f76..6200e448c5 100644
--- a/SRC/slarfb0c2.f
+++ b/SRC/slarfb0c2.f
@@ -88,10 +88,10 @@ SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
             ! Check to ensure side and trans are the expected values 
             !
             IF( .NOT.SIDEL ) THEN
-               CALL XERBLA('DLARFB0C2', 2)
+               CALL XERBLA('SLARFB0C2', 2)
                RETURN
             ELSE IF(TRANST) THEN
-               CALL XERBLA('DLARFB0C2', 3)
+               CALL XERBLA('SLARFB0C2', 3)
                RETURN
             END IF
             !
@@ -168,10 +168,10 @@ SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
             ! Check to ensure side and trans are the expected values 
             !
             IF( .NOT.SIDER ) THEN
-               CALL XERBLA('DLARFB0C2', 2)
+               CALL XERBLA('SLARFB0C2', 2)
                RETURN
             ELSE IF(.NOT.TRANST) THEN
-               CALL XERBLA('DLARFB0C2', 3)
+               CALL XERBLA('SLARFB0C2', 3)
                RETURN
             END IF
             !
@@ -251,10 +251,10 @@ SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
             ! Check to ensure side and trans are the expected values 
             !
             IF( .NOT.SIDEL ) THEN
-               CALL XERBLA('DLARFB0C2', 2)
+               CALL XERBLA('SLARFB0C2', 2)
                RETURN
             ELSE IF(TRANST) THEN
-               CALL XERBLA('DLARFB0C2', 3)
+               CALL XERBLA('SLARFB0C2', 3)
                RETURN
             END IF
             !
@@ -331,10 +331,10 @@ SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
             ! Check to ensure side and trans are the expected values 
             !
             IF( .NOT.SIDER ) THEN
-               CALL XERBLA('DLARFB0C2', 2)
+               CALL XERBLA('SLARFB0C2', 2)
                RETURN
             ELSE IF(.NOT.TRANST) THEN
-               CALL XERBLA('DLARFB0C2', 3)
+               CALL XERBLA('SLARFB0C2', 3)
                RETURN
             END IF
             !
diff --git a/SRC/zlarfb0c2.f b/SRC/zlarfb0c2.f
index 3bbf11e268..fab2ca66a9 100644
--- a/SRC/zlarfb0c2.f
+++ b/SRC/zlarfb0c2.f
@@ -92,10 +92,10 @@ SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
             ! Check to ensure side and trans are the expected values 
             !
             IF( .NOT.SIDEL ) THEN
-               CALL XERBLA('DLARFB0C2', 2)
+               CALL XERBLA('ZLARFB0C2', 2)
                RETURN
             ELSE IF(TRANST) THEN
-               CALL XERBLA('DLARFB0C2', 3)
+               CALL XERBLA('ZLARFB0C2', 3)
                RETURN
             END IF
             !
@@ -172,10 +172,10 @@ SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
             ! Check to ensure side and trans are the expected values 
             !
             IF( .NOT.SIDER ) THEN
-               CALL XERBLA('DLARFB0C2', 2)
+               CALL XERBLA('ZLARFB0C2', 2)
                RETURN
             ELSE IF(.NOT.TRANST) THEN
-               CALL XERBLA('DLARFB0C2', 3)
+               CALL XERBLA('ZLARFB0C2', 3)
                RETURN
             END IF
             !
@@ -255,10 +255,10 @@ SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
             ! Check to ensure side and trans are the expected values 
             !
             IF( .NOT.SIDEL ) THEN
-               CALL XERBLA('DLARFB0C2', 2)
+               CALL XERBLA('ZLARFB0C2', 2)
                RETURN
             ELSE IF(TRANST) THEN
-               CALL XERBLA('DLARFB0C2', 3)
+               CALL XERBLA('ZLARFB0C2', 3)
                RETURN
             END IF
             !
@@ -335,10 +335,10 @@ SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
             ! Check to ensure side and trans are the expected values 
             !
             IF( .NOT.SIDER ) THEN
-               CALL XERBLA('DLARFB0C2', 2)
+               CALL XERBLA('ZLARFB0C2', 2)
                RETURN
             ELSE IF(.NOT.TRANST) THEN
-               CALL XERBLA('DLARFB0C2', 3)
+               CALL XERBLA('ZLARFB0C2', 3)
                RETURN
             END IF
             !

From 46dd9bd05ae29917bff6ac0306fde6bd20d0ef14 Mon Sep 17 00:00:00 2001
From: Johnathan Rhyne <rhynejohnathan@gmail.com>
Date: Fri, 7 Mar 2025 15:23:37 -0700
Subject: [PATCH 05/10] Adding documentation to my new functions

---
 SRC/clarfb0c2.f | 184 +++++++++++++++++++++++++++++++++++++++++++++++-
 SRC/dlarfb0c2.f | 180 +++++++++++++++++++++++++++++++++++++++++++++-
 SRC/slarfb0c2.f | 179 ++++++++++++++++++++++++++++++++++++++++++++++
 SRC/zlarfb0c2.f | 181 ++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 720 insertions(+), 4 deletions(-)

diff --git a/SRC/clarfb0c2.f b/SRC/clarfb0c2.f
index 4ab8f47aea..4a7a2018d3 100644
--- a/SRC/clarfb0c2.f
+++ b/SRC/clarfb0c2.f
@@ -1,3 +1,183 @@
+*> \brief \b CLARFB0C2 applies a block reflector or its conjugate-transpose 
+* to a rectangular matrix with a 0 block while constructing the explicit Q
+* factor
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*
+*  Definition:
+*  ===========
+*
+*     SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+*    $                     K, V, LDV, T, LDT, C, LDC)
+*        ! Scalar arguments
+*        INTEGER           M, N, K, LDV, LDC, LDT
+*        CHARACTER         SIDE, TRANS, DIRECT, STOREV 
+*        ! True means that we are assuming C2 is the identity matrix
+*        !     and thus don't reference whatever is present in C2 
+*        !     at the beginning.
+*        LOGICAL           C2I
+*        ! Array arguments
+*        COMPLEX           V(LDV,*), C(LDC,*), T(LDT,*)
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CLARFB0C2 applies a real block reflector H or its transpose H**H to a
+*> complex m by n matrix C with a 0 block, while computing the explicit Q factor
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] C2I
+*> \verbatim
+*>          C2I is LOGICAL
+*>          = .TRUE.: Assume the nonzero block of C is the identity matrix
+*>          = .FALSE.: Use existing data in the nonzero block of C
+*> \endverbatim
+*>
+*> \param[in] SIDE
+*> \verbatim
+*>          SIDE is CHARACTER*1
+*>          = 'L': apply H or H**H from the Left
+*>          = 'R': apply H or H**H from the Right
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          = 'N': apply H (No transpose)
+*>          = 'C': apply H**H (Conjugate transpose)
+*> \endverbatim
+*>
+*> \param[in] DIRECT
+*> \verbatim
+*>          DIRECT is CHARACTER*1
+*>          Indicates how H is formed from a product of elementary
+*>          reflectors
+*>          = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*>          = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*>          STOREV is CHARACTER*1
+*>          Indicates how the vectors which define the elementary
+*>          reflectors are stored:
+*>          = 'C': Columnwise
+*>          = 'R': Rowwise
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>          The order of the matrix T (= the number of elementary
+*>          reflectors whose product defines the block reflector).
+*>          If SIDE = 'L', M >= K >= 0;
+*>          if SIDE = 'R', N >= K >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is COMPLEX array, dimension
+*>                                (LDV,K) if STOREV = 'C'
+*>                                (LDV,M) if STOREV = 'R' and SIDE = 'L'
+*>                                (LDV,N) if STOREV = 'R' and SIDE = 'R'
+*>          See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V.
+*>          If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+*>          if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+*>          if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*>          T is COMPLEX array, dimension (LDT,K)
+*>          The triangular K-by-K matrix T in the representation of the
+*>          block reflector.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is COMPLEX array, dimension (LDC,N)
+*>          On entry, the M-by-N matrix C.
+*>          On exit, C is overwritten by H*C or H**H*C or C*H or C*H**H.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>          The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup larfb
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  The shape of the matrix V and the storage of the vectors which define
+*>  the H(i) is best illustrated by the following example with n = 5 and
+*>  k = 3. The triangular part of V (including its diagonal) is not
+*>  referenced.
+*>
+*>  DIRECT = 'F' and STOREV = 'C':         DIRECT = 'F' and STOREV = 'R':
+*>
+*>               V = (  1       )                 V = (  1 v1 v1 v1 v1 )
+*>                   ( v1  1    )                     (     1 v2 v2 v2 )
+*>                   ( v1 v2  1 )                     (        1 v3 v3 )
+*>                   ( v1 v2 v3 )
+*>                   ( v1 v2 v3 )
+*>
+*>  DIRECT = 'B' and STOREV = 'C':         DIRECT = 'B' and STOREV = 'R':
+*>
+*>               V = ( v1 v2 v3 )                 V = ( v1 v1  1       )
+*>                   ( v1 v2 v3 )                     ( v2 v2 v2  1    )
+*>                   (  1 v2 v3 )                     ( v3 v3 v3 v3  1 )
+*>                   (     1 v3 )
+*>                   (        1 )
+*> \endverbatim
+*>
+*  =====================================================================
       SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
      $                     K, V, LDV, T, LDT, C, LDC)
          ! Scalar arguments
@@ -9,7 +189,7 @@ SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
          LOGICAL           C2I
 
          ! Array arguments
-         COMPLEX*8         V(LDV,*), C(LDC,*), T(LDT,*)
+         COMPLEX           V(LDV,*), C(LDC,*), T(LDT,*)
          ! Local scalars
          LOGICAL           QR, LQ, QL, DIRF, COLV, SIDEL, SIDER,
      $                     TRANST
@@ -22,7 +202,7 @@ SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
          ! External Subroutines
          EXTERNAL          CGEMM, CTRMM
          ! Parameters
-         COMPLEX*8         ONE, ZERO, NEG_ONE
+         COMPLEX           ONE, ZERO, NEG_ONE
          PARAMETER(ONE=(1.0E+0, 0.0E+0),
      $            ZERO = (0.0E+0, 0.0E+0), 
      $            NEG_ONE = (-1.0E+0, 0.0E+0))
diff --git a/SRC/dlarfb0c2.f b/SRC/dlarfb0c2.f
index 4651cdbea4..493580e090 100644
--- a/SRC/dlarfb0c2.f
+++ b/SRC/dlarfb0c2.f
@@ -1,3 +1,182 @@
+*> \brief \b DLARFB0C2 applies a block reflector or its transpose to a
+* rectangular matrix with a 0 block while constructing the explicit Q factor
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*
+*  Definition:
+*  ===========
+*
+*     SUBROUTINE DLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+*    $                     K, V, LDV, T, LDT, C, LDC)
+*        ! Scalar arguments
+*        INTEGER           M, N, K, LDV, LDC, LDT
+*        CHARACTER         SIDE, TRANS, DIRECT, STOREV 
+*        ! True means that we are assuming C2 is the identity matrix
+*        !     and thus don't reference whatever is present in C2 
+*        !     at the beginning.
+*        LOGICAL           C2I
+*        ! Array arguments
+*        DOUBLE PRECISION  V(LDV,*), C(LDC,*), T(LDT,*)
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DLARFB0C2 applies a real block reflector H or its transpose H**T to a
+*> real m by n matrix C with a 0 block, while computing the explicit Q factor
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] C2I
+*> \verbatim
+*>          C2I is LOGICAL
+*>          = .TRUE.: Assume the nonzero block of C is the identity matrix
+*>          = .FALSE.: Use existing data in the nonzero block of C
+*> \endverbatim
+*>
+*> \param[in] SIDE
+*> \verbatim
+*>          SIDE is CHARACTER*1
+*>          = 'L': apply H or H**T from the Left
+*>          = 'R': apply H or H**T from the Right
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          = 'N': apply H (No transpose)
+*>          = 'T': apply H**T (Transpose)
+*> \endverbatim
+*>
+*> \param[in] DIRECT
+*> \verbatim
+*>          DIRECT is CHARACTER*1
+*>          Indicates how H is formed from a product of elementary
+*>          reflectors
+*>          = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*>          = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*>          STOREV is CHARACTER*1
+*>          Indicates how the vectors which define the elementary
+*>          reflectors are stored:
+*>          = 'C': Columnwise
+*>          = 'R': Rowwise
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>          The order of the matrix T (= the number of elementary
+*>          reflectors whose product defines the block reflector).
+*>          If SIDE = 'L', M >= K >= 0;
+*>          if SIDE = 'R', N >= K >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is DOUBLE PRECISION array, dimension
+*>                                (LDV,K) if STOREV = 'C'
+*>                                (LDV,M) if STOREV = 'R' and SIDE = 'L'
+*>                                (LDV,N) if STOREV = 'R' and SIDE = 'R'
+*>          The matrix V. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V.
+*>          If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+*>          if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+*>          if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*>          T is DOUBLE PRECISION array, dimension (LDT,K)
+*>          The triangular k by k matrix T in the representation of the
+*>          block reflector.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is DOUBLE PRECISION array, dimension (LDC,N)
+*>          On entry, the m by n matrix C.
+*>          On exit, C is overwritten by H*C or H**T*C or C*H or C*H**T.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>          The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup larfb
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  The shape of the matrix V and the storage of the vectors which define
+*>  the H(i) is best illustrated by the following example with n = 5 and
+*>  k = 3. The triangular part of V (including its diagonal) is not
+*>  referenced.
+*>
+*>  DIRECT = 'F' and STOREV = 'C':         DIRECT = 'F' and STOREV = 'R':
+*>
+*>               V = (  1       )                 V = (  1 v1 v1 v1 v1 )
+*>                   ( v1  1    )                     (     1 v2 v2 v2 )
+*>                   ( v1 v2  1 )                     (        1 v3 v3 )
+*>                   ( v1 v2 v3 )
+*>                   ( v1 v2 v3 )
+*>
+*>  DIRECT = 'B' and STOREV = 'C':         DIRECT = 'B' and STOREV = 'R':
+*>
+*>               V = ( v1 v2 v3 )                 V = ( v1 v1  1       )
+*>                   ( v1 v2 v3 )                     ( v2 v2 v2  1    )
+*>                   (  1 v2 v3 )                     ( v3 v3 v3 v3  1 )
+*>                   (     1 v3 )
+*>                   (        1 )
+*> \endverbatim
+*>
+*  =====================================================================
       SUBROUTINE DLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
      $                     K, V, LDV, T, LDT, C, LDC)
          ! Scalar arguments
@@ -7,7 +186,6 @@ SUBROUTINE DLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
          !     and thus don't reference whatever is present in C2 
          !     at the beginning.
          LOGICAL           C2I
-
          ! Array arguments
          DOUBLE PRECISION  V(LDV,*), C(LDC,*), T(LDT,*)
          ! Local scalars
diff --git a/SRC/slarfb0c2.f b/SRC/slarfb0c2.f
index 6200e448c5..e3c2229045 100644
--- a/SRC/slarfb0c2.f
+++ b/SRC/slarfb0c2.f
@@ -1,3 +1,182 @@
+*> \brief \b SLARFB0C2 applies a block reflector or its transpose to a
+* rectangular matrix with a 0 block while constructing the explicit Q factor
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*
+*  Definition:
+*  ===========
+*
+*     SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+*    $                     K, V, LDV, T, LDT, C, LDC)
+*        ! Scalar arguments
+*        INTEGER           M, N, K, LDV, LDC, LDT
+*        CHARACTER         SIDE, TRANS, DIRECT, STOREV 
+*        ! True means that we are assuming C2 is the identity matrix
+*        !     and thus don't reference whatever is present in C2 
+*        !     at the beginning.
+*        LOGICAL           C2I
+*        ! Array arguments
+*        REAL              V(LDV,*), C(LDC,*), T(LDT,*)
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SLARFB0C2 applies a real block reflector H or its transpose H**T to a
+*> real m by n matrix C with a 0 block, while computing the explicit Q factor
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] C2I
+*> \verbatim
+*>          C2I is LOGICAL
+*>          = .TRUE.: Assume the nonzero block of C is the identity matrix
+*>          = .FALSE.: Use existing data in the nonzero block of C
+*> \endverbatim
+*>
+*> \param[in] SIDE
+*> \verbatim
+*>          SIDE is CHARACTER*1
+*>          = 'L': apply H or H**T from the Left
+*>          = 'R': apply H or H**T from the Right
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          = 'N': apply H (No transpose)
+*>          = 'T': apply H**T (Transpose)
+*> \endverbatim
+*>
+*> \param[in] DIRECT
+*> \verbatim
+*>          DIRECT is CHARACTER*1
+*>          Indicates how H is formed from a product of elementary
+*>          reflectors
+*>          = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*>          = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*>          STOREV is CHARACTER*1
+*>          Indicates how the vectors which define the elementary
+*>          reflectors are stored:
+*>          = 'C': Columnwise
+*>          = 'R': Rowwise
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>          The order of the matrix T (= the number of elementary
+*>          reflectors whose product defines the block reflector).
+*>          If SIDE = 'L', M >= K >= 0;
+*>          if SIDE = 'R', N >= K >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is REAL array, dimension
+*>                                (LDV,K) if STOREV = 'C'
+*>                                (LDV,M) if STOREV = 'R' and SIDE = 'L'
+*>                                (LDV,N) if STOREV = 'R' and SIDE = 'R'
+*>          The matrix V. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V.
+*>          If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+*>          if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+*>          if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*>          T is REAL array, dimension (LDT,K)
+*>          The triangular k by k matrix T in the representation of the
+*>          block reflector.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is REAL array, dimension (LDC,N)
+*>          On entry, the m by n matrix C.
+*>          On exit, C is overwritten by H*C or H**T*C or C*H or C*H**T.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>          The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup larfb
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  The shape of the matrix V and the storage of the vectors which define
+*>  the H(i) is best illustrated by the following example with n = 5 and
+*>  k = 3. The triangular part of V (including its diagonal) is not
+*>  referenced.
+*>
+*>  DIRECT = 'F' and STOREV = 'C':         DIRECT = 'F' and STOREV = 'R':
+*>
+*>               V = (  1       )                 V = (  1 v1 v1 v1 v1 )
+*>                   ( v1  1    )                     (     1 v2 v2 v2 )
+*>                   ( v1 v2  1 )                     (        1 v3 v3 )
+*>                   ( v1 v2 v3 )
+*>                   ( v1 v2 v3 )
+*>
+*>  DIRECT = 'B' and STOREV = 'C':         DIRECT = 'B' and STOREV = 'R':
+*>
+*>               V = ( v1 v2 v3 )                 V = ( v1 v1  1       )
+*>                   ( v1 v2 v3 )                     ( v2 v2 v2  1    )
+*>                   (  1 v2 v3 )                     ( v3 v3 v3 v3  1 )
+*>                   (     1 v3 )
+*>                   (        1 )
+*> \endverbatim
+*>
+*  =====================================================================
       SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
      $                     K, V, LDV, T, LDT, C, LDC)
          ! Scalar arguments
diff --git a/SRC/zlarfb0c2.f b/SRC/zlarfb0c2.f
index fab2ca66a9..1f43b4b141 100644
--- a/SRC/zlarfb0c2.f
+++ b/SRC/zlarfb0c2.f
@@ -1,3 +1,183 @@
+*> \brief \b ZLARFB0C2 applies a block reflector or its conjugate-transpose 
+* to a rectangular matrix with a 0 block while constructing the explicit Q
+* factor
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*
+*  Definition:
+*  ===========
+*
+*     SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+*    $                     K, V, LDV, T, LDT, C, LDC)
+*        ! Scalar arguments
+*        INTEGER           M, N, K, LDV, LDC, LDT
+*        CHARACTER         SIDE, TRANS, DIRECT, STOREV 
+*        ! True means that we are assuming C2 is the identity matrix
+*        !     and thus don't reference whatever is present in C2 
+*        !     at the beginning.
+*        LOGICAL           C2I
+*        ! Array arguments
+*        COMPLEX*16        V(LDV,*), C(LDC,*), T(LDT,*)
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZLARFB0C2 applies a real block reflector H or its transpose H**H to a
+*> complex m by n matrix C with a 0 block, while computing the explicit Q factor
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] C2I
+*> \verbatim
+*>          C2I is LOGICAL
+*>          = .TRUE.: Assume the nonzero block of C is the identity matrix
+*>          = .FALSE.: Use existing data in the nonzero block of C
+*> \endverbatim
+*>
+*> \param[in] SIDE
+*> \verbatim
+*>          SIDE is CHARACTER*1
+*>          = 'L': apply H or H**H from the Left
+*>          = 'R': apply H or H**H from the Right
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          = 'N': apply H (No transpose)
+*>          = 'C': apply H**H (Conjugate transpose)
+*> \endverbatim
+*>
+*> \param[in] DIRECT
+*> \verbatim
+*>          DIRECT is CHARACTER*1
+*>          Indicates how H is formed from a product of elementary
+*>          reflectors
+*>          = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*>          = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*>          STOREV is CHARACTER*1
+*>          Indicates how the vectors which define the elementary
+*>          reflectors are stored:
+*>          = 'C': Columnwise
+*>          = 'R': Rowwise
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>          The order of the matrix T (= the number of elementary
+*>          reflectors whose product defines the block reflector).
+*>          If SIDE = 'L', M >= K >= 0;
+*>          if SIDE = 'R', N >= K >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is COMPLEX*16 array, dimension
+*>                                (LDV,K) if STOREV = 'C'
+*>                                (LDV,M) if STOREV = 'R' and SIDE = 'L'
+*>                                (LDV,N) if STOREV = 'R' and SIDE = 'R'
+*>          See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V.
+*>          If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+*>          if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+*>          if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*>          T is COMPLEX*16 array, dimension (LDT,K)
+*>          The triangular K-by-K matrix T in the representation of the
+*>          block reflector.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is COMPLEX*16 array, dimension (LDC,N)
+*>          On entry, the M-by-N matrix C.
+*>          On exit, C is overwritten by H*C or H**H*C or C*H or C*H**H.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>          The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup larfb
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  The shape of the matrix V and the storage of the vectors which define
+*>  the H(i) is best illustrated by the following example with n = 5 and
+*>  k = 3. The triangular part of V (including its diagonal) is not
+*>  referenced.
+*>
+*>  DIRECT = 'F' and STOREV = 'C':         DIRECT = 'F' and STOREV = 'R':
+*>
+*>               V = (  1       )                 V = (  1 v1 v1 v1 v1 )
+*>                   ( v1  1    )                     (     1 v2 v2 v2 )
+*>                   ( v1 v2  1 )                     (        1 v3 v3 )
+*>                   ( v1 v2 v3 )
+*>                   ( v1 v2 v3 )
+*>
+*>  DIRECT = 'B' and STOREV = 'C':         DIRECT = 'B' and STOREV = 'R':
+*>
+*>               V = ( v1 v2 v3 )                 V = ( v1 v1  1       )
+*>                   ( v1 v2 v3 )                     ( v2 v2 v2  1    )
+*>                   (  1 v2 v3 )                     ( v3 v3 v3 v3  1 )
+*>                   (     1 v3 )
+*>                   (        1 )
+*> \endverbatim
+*>
+*  =====================================================================
       SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
      $                     K, V, LDV, T, LDT, C, LDC)
          ! Scalar arguments
@@ -7,7 +187,6 @@ SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
          !     and thus don't reference whatever is present in C2 
          !     at the beginning.
          LOGICAL           C2I
-
          ! Array arguments
          COMPLEX*16        V(LDV,*), C(LDC,*), T(LDT,*)
          ! Local scalars

From 87c19d1add0735788da978f4fa16a9807076aec2 Mon Sep 17 00:00:00 2001
From: Johnathan Rhyne <rhynejohnathan@gmail.com>
Date: Fri, 7 Mar 2025 17:47:08 -0700
Subject: [PATCH 06/10] Updating documentation for x{or,un}gy to account for
 workspace changes

---
 SRC/cungql.f | 2 --
 SRC/cungqr.f | 2 --
 SRC/dorgql.f | 2 --
 SRC/dorgqr.f | 2 --
 SRC/sorgql.f | 2 --
 SRC/sorgqr.f | 2 --
 SRC/zungql.f | 2 --
 SRC/zungqr.f | 2 --
 8 files changed, 16 deletions(-)

diff --git a/SRC/cungql.f b/SRC/cungql.f
index 893303b4e1..9022d37acc 100644
--- a/SRC/cungql.f
+++ b/SRC/cungql.f
@@ -95,8 +95,6 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK. LWORK >= max(1,N).
-*>          For optimum performance LWORK >= N*NB, where NB is the
-*>          optimal blocksize.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
diff --git a/SRC/cungqr.f b/SRC/cungqr.f
index 2544b67a99..3d7aaceb96 100644
--- a/SRC/cungqr.f
+++ b/SRC/cungqr.f
@@ -95,8 +95,6 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK. LWORK >= max(1,N).
-*>          For optimum performance LWORK >= N*NB, where NB is the
-*>          optimal blocksize.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
diff --git a/SRC/dorgql.f b/SRC/dorgql.f
index 13146142ed..378147844d 100644
--- a/SRC/dorgql.f
+++ b/SRC/dorgql.f
@@ -97,8 +97,6 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK. LWORK >= max(1,N).
-*>          For optimum performance LWORK >= N*NB, where NB is the
-*>          optimal blocksize.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
diff --git a/SRC/dorgqr.f b/SRC/dorgqr.f
index 4b323f68fb..8f3c442783 100644
--- a/SRC/dorgqr.f
+++ b/SRC/dorgqr.f
@@ -97,8 +97,6 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK. LWORK >= max(1,N).
-*>          For optimum performance LWORK >= N*NB, where NB is the
-*>          optimal blocksize.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
diff --git a/SRC/sorgql.f b/SRC/sorgql.f
index 69484d2c36..c9b2918b71 100644
--- a/SRC/sorgql.f
+++ b/SRC/sorgql.f
@@ -95,8 +95,6 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK. LWORK >= max(1,N).
-*>          For optimum performance LWORK >= N*NB, where NB is the
-*>          optimal blocksize.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
diff --git a/SRC/sorgqr.f b/SRC/sorgqr.f
index 65d397de36..070257c46f 100644
--- a/SRC/sorgqr.f
+++ b/SRC/sorgqr.f
@@ -95,8 +95,6 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK. LWORK >= max(1,N).
-*>          For optimum performance LWORK >= N*NB, where NB is the
-*>          optimal blocksize.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
diff --git a/SRC/zungql.f b/SRC/zungql.f
index e3e604e791..2aa5295564 100644
--- a/SRC/zungql.f
+++ b/SRC/zungql.f
@@ -95,8 +95,6 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK. LWORK >= max(1,N).
-*>          For optimum performance LWORK >= N*NB, where NB is the
-*>          optimal blocksize.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
diff --git a/SRC/zungqr.f b/SRC/zungqr.f
index bb5dd9ee03..e3d31a7442 100644
--- a/SRC/zungqr.f
+++ b/SRC/zungqr.f
@@ -95,8 +95,6 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK. LWORK >= max(1,N).
-*>          For optimum performance LWORK >= N*NB, where NB is the
-*>          optimal blocksize.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns

From 318a43d20bc343d7af499dec99960b83ea0819db Mon Sep 17 00:00:00 2001
From: Johnathan Rhyne <rhynejohnathan@gmail.com>
Date: Fri, 7 Mar 2025 17:51:50 -0700
Subject: [PATCH 07/10] fixed whitespace inconsistencies in subroutine
 definitions

---
 SRC/zunglq.f | 2 +-
 SRC/zungql.f | 2 +-
 SRC/zungqr.f | 2 +-
 SRC/zungrq.f | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/SRC/zunglq.f b/SRC/zunglq.f
index 9da595f491..54aaf7686e 100644
--- a/SRC/zunglq.f
+++ b/SRC/zunglq.f
@@ -121,7 +121,7 @@
 *> \ingroup unglq
 *
 *  =====================================================================
-      SUBROUTINE ZUNGLQ(M, N, K, A, LDA, TAU, WORK, LWORK, INFO)
+      SUBROUTINE ZUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
diff --git a/SRC/zungql.f b/SRC/zungql.f
index 2aa5295564..74c5ec8e65 100644
--- a/SRC/zungql.f
+++ b/SRC/zungql.f
@@ -120,7 +120,7 @@
 *> \ingroup ungql
 *
 *  =====================================================================
-      SUBROUTINE ZUNGQL(M, N, K, A, LDA, TAU, WORK, LWORK, INFO)
+      SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
diff --git a/SRC/zungqr.f b/SRC/zungqr.f
index e3d31a7442..50d5d91a2a 100644
--- a/SRC/zungqr.f
+++ b/SRC/zungqr.f
@@ -120,7 +120,7 @@
 *> \ingroup ungqr
 *
 *  =====================================================================
-      SUBROUTINE ZUNGQR(M, N, K, A, LDA, TAU, WORK, LWORK, INFO)
+      SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
diff --git a/SRC/zungrq.f b/SRC/zungrq.f
index 1a7640d102..939fff3fcf 100644
--- a/SRC/zungrq.f
+++ b/SRC/zungrq.f
@@ -122,7 +122,7 @@
 *> \ingroup ungrq
 *
 *  =====================================================================
-      SUBROUTINE ZUNGRQ(M, N, K, A, LDA, TAU, WORK, LWORK, INFO)
+      SUBROUTINE ZUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --

From bf84397ea9cb17508e068a2401a36b256beeb62b Mon Sep 17 00:00:00 2001
From: Johnathan Rhyne <rhynejohnathan@gmail.com>
Date: Sat, 8 Mar 2025 15:19:06 -0700
Subject: [PATCH 08/10] added definitions to build with _64 using cmake

---
 SRC/clarfb0c2.f | 4 ++--
 SRC/lapack_64.h | 4 ++++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/SRC/clarfb0c2.f b/SRC/clarfb0c2.f
index 4a7a2018d3..9b5b739174 100644
--- a/SRC/clarfb0c2.f
+++ b/SRC/clarfb0c2.f
@@ -178,8 +178,8 @@
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
-     $                     K, V, LDV, T, LDT, C, LDC)
+      SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV,
+     $                     M, N, K, V, LDV, T, LDT, C, LDC)
          ! Scalar arguments
          INTEGER           M, N, K, LDV, LDC, LDT
          CHARACTER         SIDE, TRANS, DIRECT, STOREV 
diff --git a/SRC/lapack_64.h b/SRC/lapack_64.h
index e8000bf2c4..e8d7920946 100644
--- a/SRC/lapack_64.h
+++ b/SRC/lapack_64.h
@@ -331,6 +331,7 @@
 #define CLARF1F CLARF1F_64
 #define CLARF1L CLARF1L_64
 #define CLARFB CLARFB_64
+#define CLARFB0C2 CLARFB0C2_64
 #define CLARFB_GETT CLARFB_GETT_64
 #define CLARFG CLARFG_64
 #define CLARFGP CLARFGP_64
@@ -805,6 +806,7 @@
 #define DLARF1F DLARF1F_64
 #define DLARF1L DLARF1L_64
 #define DLARFB DLARFB_64
+#define DLARFB0C2 DLARFB0C2_64
 #define DLARFB_GETT DLARFB_GETT_64
 #define DLARFG DLARFG_64
 #define DLARFGP DLARFGP_64
@@ -1400,6 +1402,7 @@
 #define SLARF1F SLARF1F_64
 #define SLARF1L SLARF1L_64
 #define SLARFB SLARFB_64
+#define SLARFB0C2 SLARFB0C2_64
 #define SLARFB_GETT SLARFB_GETT_64
 #define SLARFG SLARFG_64
 #define SLARFGP SLARFGP_64
@@ -2046,6 +2049,7 @@
 #define ZLARF1F ZLARF1F_64
 #define ZLARF1L ZLARF1L_64
 #define ZLARFB ZLARFB_64
+#define ZLARFB0C2 ZLARFB0C2_64
 #define ZLARFB_GETT ZLARFB_GETT_64
 #define ZLARFG ZLARFG_64
 #define ZLARFGP ZLARFGP_64

From 181b0cc9265ad0a2604e3bb00c2bece7f7972896 Mon Sep 17 00:00:00 2001
From: Johnathan Rhyne <rhynejohnathan@gmail.com>
Date: Sat, 8 Mar 2025 15:20:17 -0700
Subject: [PATCH 09/10] fixed inconsistent function definitiom of CLARFB0C2

---
 SRC/clarfb0c2.f | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/SRC/clarfb0c2.f b/SRC/clarfb0c2.f
index 9b5b739174..9d788df707 100644
--- a/SRC/clarfb0c2.f
+++ b/SRC/clarfb0c2.f
@@ -178,8 +178,8 @@
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV,
-     $                     M, N, K, V, LDV, T, LDT, C, LDC)
+      SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N
+     $                     K, V, LDV, T, LDT, C, LDC)
          ! Scalar arguments
          INTEGER           M, N, K, LDV, LDC, LDT
          CHARACTER         SIDE, TRANS, DIRECT, STOREV 

From 4e3bad0aed6f67e04098c24698c9c61e094aeadd Mon Sep 17 00:00:00 2001
From: Johnathan Rhyne <rhynejohnathan@gmail.com>
Date: Sat, 8 Mar 2025 15:22:14 -0700
Subject: [PATCH 10/10] adding a dropped comma

---
 SRC/clarfb0c2.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SRC/clarfb0c2.f b/SRC/clarfb0c2.f
index 9d788df707..4a7a2018d3 100644
--- a/SRC/clarfb0c2.f
+++ b/SRC/clarfb0c2.f
@@ -178,7 +178,7 @@
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N
+      SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
      $                     K, V, LDV, T, LDT, C, LDC)
          ! Scalar arguments
          INTEGER           M, N, K, LDV, LDC, LDT