diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index be426cecd..24740835b 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -106,7 +106,7 @@ set(SLASRC
    slaqgb.f slaqge.f slaqp2.f slaqps.f slaqp2rk.f slaqp3rk.f slaqsb.f slaqsp.f slaqsy.f
    slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f
    slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f
-   slarf.f  slarf1f.f slarf1l.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f
+   slarf.f  slarf1f.f slarf1l.f slarfb.f slarfb0c2.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f
    slargv.f slarmm.f slarrv.f slartv.f
    slarz.f  slarzb.f slarzt.f slasy2.f
    slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f
@@ -218,7 +218,7 @@ set(CLASRC
    claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqp2rk.f claqp3rk.f claqsb.f
    claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f
    claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f
-   clarf.f  clarf1f.f clarf1l.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f
+   clarf.f  clarf1f.f clarf1l.f clarfb.f clarfb0c2.f clarfb_gett.f clarfg.f clarfgp.f clarft.f
    clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f90 clartv.f
    clarz.f  clarzb.f clarzt.f clascl.f claset.f clasr.f  classq.f90
    claswp.f clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f
@@ -307,7 +307,7 @@ set(DLASRC
    dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqp2rk.f dlaqp3rk.f dlaqsb.f dlaqsp.f dlaqsy.f
    dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f
    dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f
-   dlarf.f  dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f dlarf1f.f dlarf1l.f
+   dlarf.f  dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f dlarf1f.f dlarf1l.f dlarfb0c2.f
    dlargv.f dlarmm.f dlarrv.f dlartv.f
    dlarz.f  dlarzb.f dlarzt.f dlaswp.f dlasy2.f
    dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f
@@ -418,7 +418,7 @@ set(ZLASRC
    zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqp2rk.f zlaqp3rk.f zlaqsb.f
    zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f
    zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f
-   zlarcm.f zlarf.f  zlarfb.f zlarfb_gett.f zlarf1f.f zlarf1l.f
+   zlarcm.f zlarf.f  zlarfb.f zlarfb0c2.f zlarfb_gett.f zlarf1f.f zlarf1l.f
    zlarfg.f zlarfgp.f zlarft.f
    zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f90 zlartv.f
    zlarz.f  zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
diff --git a/SRC/Makefile b/SRC/Makefile
index 0191626f0..b9eaf1bca 100644
--- a/SRC/Makefile
+++ b/SRC/Makefile
@@ -137,7 +137,7 @@ SLASRC = \
    slaqgb.o slaqge.o slaqp2.o slaqps.o slaqp2rk.o slaqp3rk.o slaqsb.o slaqsp.o slaqsy.o \
    slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \
    slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \
-   slarf.o  slarf1f.o slarf1l.o slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o \
+   slarf.o  slarf1f.o slarf1l.o slarfb.o slarfb0c2.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o \
    slargv.o slarmm.o slarrv.o slartv.o \
    slarz.o  slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o slasyf_rook.o \
    slasyf_rk.o \
@@ -249,7 +249,7 @@ CLASRC = \
    claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqp2rk.o claqp3rk.o claqsb.o \
    claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \
    claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \
-   clarf.o  clarf1f.o clarf1l.o clarfb.o clarfb_gett.o clarfg.o clarft.o clarfgp.o \
+   clarf.o  clarf1f.o clarf1l.o clarfb.o clarfb0c2.o clarfb_gett.o clarfg.o clarft.o clarfgp.o \
    clarfx.o clarfy.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \
    clarz.o  clarzb.o clarzt.o clascl.o claset.o clasr.o  classq.o \
    claswp.o clasyf.o clasyf_rook.o clasyf_rk.o clasyf_aa.o \
@@ -339,7 +339,7 @@ DLASRC = \
    dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqp2rk.o dlaqp3rk.o dlaqsb.o dlaqsp.o dlaqsy.o \
    dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \
    dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \
-   dlarf.o  dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o dlarf1f.o dlarf1l.o\
+   dlarf.o  dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o dlarf1f.o dlarf1l.o dlarfb0c2.o\
    dlargv.o dlarmm.o dlarrv.o dlartv.o \
    dlarz.o  dlarzb.o dlarzt.o dlaswp.o dlasy2.o \
    dlasyf.o dlasyf_rook.o dlasyf_rk.o \
@@ -453,7 +453,7 @@ ZLASRC = \
    zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqp2rk.o zlaqp3rk.o zlaqsb.o \
    zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \
    zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \
-   zlarcm.o zlarf.o  zlarfb.o zlarfb_gett.o zlarf1f.o zlarf1l.o \
+   zlarcm.o zlarf.o  zlarfb.o zlarfb0c2.o zlarfb_gett.o zlarf1f.o zlarf1l.o \
    zlarfg.o zlarft.o zlarfgp.o \
    zlarfx.o zlarfy.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \
    zlarz.o  zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \
diff --git a/SRC/clarfb0c2.f b/SRC/clarfb0c2.f
new file mode 100644
index 000000000..4a7a2018d
--- /dev/null
+++ b/SRC/clarfb0c2.f
@@ -0,0 +1,561 @@
+*> \brief \b CLARFB0C2 applies a block reflector or its conjugate-transpose 
+* to a rectangular matrix with a 0 block while constructing the explicit Q
+* factor
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*
+*  Definition:
+*  ===========
+*
+*     SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+*    $                     K, V, LDV, T, LDT, C, LDC)
+*        ! Scalar arguments
+*        INTEGER           M, N, K, LDV, LDC, LDT
+*        CHARACTER         SIDE, TRANS, DIRECT, STOREV 
+*        ! True means that we are assuming C2 is the identity matrix
+*        !     and thus don't reference whatever is present in C2 
+*        !     at the beginning.
+*        LOGICAL           C2I
+*        ! Array arguments
+*        COMPLEX           V(LDV,*), C(LDC,*), T(LDT,*)
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CLARFB0C2 applies a real block reflector H or its transpose H**H to a
+*> complex m by n matrix C with a 0 block, while computing the explicit Q factor
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] C2I
+*> \verbatim
+*>          C2I is LOGICAL
+*>          = .TRUE.: Assume the nonzero block of C is the identity matrix
+*>          = .FALSE.: Use existing data in the nonzero block of C
+*> \endverbatim
+*>
+*> \param[in] SIDE
+*> \verbatim
+*>          SIDE is CHARACTER*1
+*>          = 'L': apply H or H**H from the Left
+*>          = 'R': apply H or H**H from the Right
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          = 'N': apply H (No transpose)
+*>          = 'C': apply H**H (Conjugate transpose)
+*> \endverbatim
+*>
+*> \param[in] DIRECT
+*> \verbatim
+*>          DIRECT is CHARACTER*1
+*>          Indicates how H is formed from a product of elementary
+*>          reflectors
+*>          = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*>          = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*>          STOREV is CHARACTER*1
+*>          Indicates how the vectors which define the elementary
+*>          reflectors are stored:
+*>          = 'C': Columnwise
+*>          = 'R': Rowwise
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>          The order of the matrix T (= the number of elementary
+*>          reflectors whose product defines the block reflector).
+*>          If SIDE = 'L', M >= K >= 0;
+*>          if SIDE = 'R', N >= K >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is COMPLEX array, dimension
+*>                                (LDV,K) if STOREV = 'C'
+*>                                (LDV,M) if STOREV = 'R' and SIDE = 'L'
+*>                                (LDV,N) if STOREV = 'R' and SIDE = 'R'
+*>          See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V.
+*>          If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+*>          if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+*>          if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*>          T is COMPLEX array, dimension (LDT,K)
+*>          The triangular K-by-K matrix T in the representation of the
+*>          block reflector.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is COMPLEX array, dimension (LDC,N)
+*>          On entry, the M-by-N matrix C.
+*>          On exit, C is overwritten by H*C or H**H*C or C*H or C*H**H.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>          The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup larfb
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  The shape of the matrix V and the storage of the vectors which define
+*>  the H(i) is best illustrated by the following example with n = 5 and
+*>  k = 3. The triangular part of V (including its diagonal) is not
+*>  referenced.
+*>
+*>  DIRECT = 'F' and STOREV = 'C':         DIRECT = 'F' and STOREV = 'R':
+*>
+*>               V = (  1       )                 V = (  1 v1 v1 v1 v1 )
+*>                   ( v1  1    )                     (     1 v2 v2 v2 )
+*>                   ( v1 v2  1 )                     (        1 v3 v3 )
+*>                   ( v1 v2 v3 )
+*>                   ( v1 v2 v3 )
+*>
+*>  DIRECT = 'B' and STOREV = 'C':         DIRECT = 'B' and STOREV = 'R':
+*>
+*>               V = ( v1 v2 v3 )                 V = ( v1 v1  1       )
+*>                   ( v1 v2 v3 )                     ( v2 v2 v2  1    )
+*>                   (  1 v2 v3 )                     ( v3 v3 v3 v3  1 )
+*>                   (     1 v3 )
+*>                   (        1 )
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+     $                     K, V, LDV, T, LDT, C, LDC)
+         ! Scalar arguments
+         INTEGER           M, N, K, LDV, LDC, LDT
+         CHARACTER         SIDE, TRANS, DIRECT, STOREV 
+         ! True means that we are assuming C2 is the identity matrix
+         !     and thus don't reference whatever is present in C2 
+         !     at the beginning.
+         LOGICAL           C2I
+
+         ! Array arguments
+         COMPLEX           V(LDV,*), C(LDC,*), T(LDT,*)
+         ! Local scalars
+         LOGICAL           QR, LQ, QL, DIRF, COLV, SIDEL, SIDER,
+     $                     TRANST
+         INTEGER           I, J
+         ! Intrinsic Functions
+         INTRINSIC         CONJG
+         ! External functions
+         LOGICAL           LSAME
+         EXTERNAL          LSAME
+         ! External Subroutines
+         EXTERNAL          CGEMM, CTRMM
+         ! Parameters
+         COMPLEX           ONE, ZERO, NEG_ONE
+         PARAMETER(ONE=(1.0E+0, 0.0E+0),
+     $            ZERO = (0.0E+0, 0.0E+0), 
+     $            NEG_ONE = (-1.0E+0, 0.0E+0))
+
+         ! Beginning of executable statements
+         ! Convert our character flags to logical values
+         DIRF = LSAME(DIRECT,'F')
+         COLV = LSAME(STOREV,'C')
+         SIDEL = LSAME(SIDE,'L')
+         SIDER = LSAME(SIDE,'R')
+         TRANST = LSAME(TRANS,'C')
+
+         ! Determine which of the 4 modes are using.
+         ! QR is when we store the reflectors column by column and have the
+         ! 'first' reflector stored in the first column
+         QR = DIRF.AND.COLV
+
+         ! LQ is when we store the reflectors row by row and have the
+         ! 'first' reflector stored in the first row
+         LQ = DIRF.AND.(.NOT.COLV)
+
+         ! QL is when we store the reflectors column by column and have the
+         ! 'first' reflector stored in the last column
+         QL = (.NOT.DIRF).AND.COLV
+
+         ! RQ is when we store the reflectors row by row and have the
+         ! 'first' reflector stored in the last row
+         ! RQ = (.NOT.DIRF).AND.(.NOT.COLV)
+         ! Since we have exactly one of these 4 modes, we don't need to actually
+         ! store the value of RQ, instead we assume this is the case if we fail
+         ! the above 3 checks.
+
+         IF (QR) THEN
+            ! We are computing C = HC = (I - VTV')C
+            ! Where: V = [ V1 ] and C = [ C1 ]
+            !            [ V2 ]         [ C2 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{M-K\times K}
+            !     C1=0\in\R^{K\times N}
+            !     C2\in\R^{M-K\times N}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0  ]
+            !               [ V2 ]             [ C2 ]
+            !   = [ 0  ] - [ V1 ]T*V2'*C2
+            !     [ C2 ]   [ V2 ]
+            !
+            !   = [ 0  ] - [ V1*T*V2'*C2 ]
+            !     [ C2 ]   [ V2*T*V2'*C2 ]
+            !
+            !   = [      V1*T*V2'*C2 ]
+            !     [ C2 - V2*T*V2'*C2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = V2'*C2
+            ! C1 = T*C1
+            ! C2 = C2 - V2*C1
+            ! C1 = -V1*C1
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDEL ) THEN
+               CALL XERBLA('CLARFB0C2', 2)
+               RETURN
+            ELSE IF(TRANST) THEN
+               CALL XERBLA('CLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = V2'*C2
+            !
+            IF (C2I) THEN
+               DO J = 1, N
+                  DO I = 1, K
+                     C(I,J) = CONJG(V(K+J,I))
+                  END DO
+               END DO
+            ELSE
+               CALL CGEMM('Conjugate', 'No Transpose', K, N, M - K,
+     $                     ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO,
+     $                     C, LDC)
+            END IF
+            !
+            ! C1 = T*C1
+            !
+            CALL CTRMM('Left', 'Upper', 'No Transpose', 'Non-unit',
+     $                  K, N, ONE, T, LDT, C, LDC)
+            !
+            ! C2 = C2 - V2*C1 = -V2*C1 + C2
+            !
+            IF (C2I) THEN
+               CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $                     NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO,
+     $                     C(K+1,1), LDC)
+               DO I = 1, N
+                  C(K+I,I) = C(K+I,I) + ONE
+               END DO
+            ELSE
+               CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $                     NEG_ONE, V(K+1,1), LDV, C, LDC, ONE,
+     $                     C(K+1,1), LDC)
+            END IF
+            !
+            ! C1 = -V1*C1
+            !
+            CALL CTRMM('Left', 'Lower', 'No Transpose', 'Unit',
+     $                  K, N, NEG_ONE, V, LDV, C, LDC)
+         ELSE IF (LQ) THEN
+            ! We are computing C = CH' = C(I-V'T'V)
+            ! Where: V = [ V1 V2 ] and C = [ C1 C2 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{K\times N-K}
+            !     C1=0\in\R^{M\times K}
+            !     C2\in\R^{M\times N-K}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ])
+            !                         [ V2' ]
+            !
+            !   = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ]
+            !                          [ V2' ]
+            !
+            !   = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ]
+            !
+            !   = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = C2*V2'
+            ! C1 = C1*T'
+            ! C2 = C2 - C1*V2
+            ! C1 = -C1*V1
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDER ) THEN
+               CALL XERBLA('CLARFB0C2', 2)
+               RETURN
+            ELSE IF(.NOT.TRANST) THEN
+               CALL XERBLA('CLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = C2*V2'
+            !
+            IF( C2I ) THEN
+               DO J = 1, K
+                  DO I = 1, M
+                     C(I,J) = CONJG(V(J,K+I))
+                  END DO
+               END DO
+            ELSE
+               CALL CGEMM('No Transpose', 'Conjugate', M, K, N-K,
+     $               ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C,
+     $               LDC)
+            END IF
+            !
+            ! C1 = C1*T'
+            !
+            CALL CTRMM('Right', 'Upper', 'Conjugate', 'Non-unit',
+     $            M, K, ONE, T, LDT, C, LDC)
+            !
+            ! C2 = C2 - C1*V2 = -C1*V2 + C2
+            !
+            IF( C2I ) THEN
+               CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $               NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1),
+     $               LDC)
+               DO I = 1, M
+                  C(I,K+I) = C(I,K+I) + ONE
+               END DO
+            ELSE
+               CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $               NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1),
+     $               LDC)
+            END IF
+            !
+            ! C1 = -C1*V1
+            !
+            CALL CTRMM('Right', 'Upper', 'No Transpose', 'Unit',
+     $            M, K, NEG_ONE, V, LDV, C, LDC)
+         ELSE IF (QL) THEN
+            ! We are computing C = HC = (I - VTV')C
+            ! Where: V = [ V2 ] and C = [ C2 ]
+            !            [ V1 ]         [ C1 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{M-K\times K}
+            !     C1=0\in\R^{K\times N}
+            !     C2\in\R^{M-K\times N}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ]
+            !             [ V1 ]             [ 0  ]
+            !
+            !   = [ C2 ] - [ V2 ]T*V2'*C2
+            !     [ 0  ]   [ V1 ]
+            !
+            !   = [ C2 ] - [ V2*T*V2'*C2 ]
+            !     [ 0  ]   [ V1*T*V2'*C2 ]
+            !
+            !   = [ C2 - V2*T*V2'*C2 ]
+            !     [    - V1*T*V2'*C2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = V2'*C2
+            ! C1 = T*C1
+            ! C2 = C2 - V2*C1
+            ! C1 = -V1*C1
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDEL ) THEN
+               CALL XERBLA('CLARFB0C2', 2)
+               RETURN
+            ELSE IF(TRANST) THEN
+               CALL XERBLA('CLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = V2'*C2
+            !
+            IF( C2I ) THEN
+               DO J = 1, N
+                  DO I = 1, K
+                     C(M-K+I,J) = CONJG(V(J,I))
+                  END DO
+               END DO
+            ELSE
+               CALL CGEMM('Conjugate', 'No Transpose', K, N, M-K,
+     $            ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC)
+            END IF
+            !
+            ! C1 = T*C1
+            !
+            CALL CTRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+     $         K, N, ONE, T, LDT, C(M-K+1,1), LDC)
+            !
+            ! C2 = C2 - V2*C1 = -V2*C1 + C2
+            !
+            IF( C2I ) THEN
+               CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $            NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC)
+               DO I = 1, N
+                  C(I,I) = C(I,I) + ONE
+               END DO
+            ELSE
+               CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $            NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC)
+            END IF
+            !
+            ! C1 = -V1*C1
+            !
+            CALL CTRMM('Left', 'Upper', 'No Transpose', 'Unit',
+     $         K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC)
+         ELSE ! IF (RQ) THEN
+            ! We are computing C = CH' = C(I-V'T'V)
+            ! Where: V = [ V2 V1] and C = [ C2 C1 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{K\times N-K}
+            !     C1=0\in\R^{M\times K}
+            !     C2\in\R^{M\times N-K}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ]
+            !                          [ V1' ]
+            !
+            !   = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ]
+            !                           [ V1' ]
+            !
+            !   = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ]
+            !
+            !   = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ]
+            !
+            !   = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = C2*V2'
+            ! C1 = C1*T'
+            ! C2 = C2 - C1*V2
+            ! C1 = -C1*V1
+            !
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDER ) THEN
+               CALL XERBLA('CLARFB0C2', 2)
+               RETURN
+            ELSE IF(.NOT.TRANST) THEN
+               CALL XERBLA('CLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = C2*V2'
+            !
+            IF( C2I ) THEN
+               DO J = 1, K
+                  DO I = 1, M
+                     C(I,N-K+J) = CONJG(V(J,I))
+                  END DO
+               END DO
+            ELSE
+               CALL CGEMM('No Transpose', 'Conjugate', M, K, N-K,
+     $            ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC)
+            END IF
+            !
+            ! C1 = C1*T'
+            !
+            CALL CTRMM('Right', 'Lower', 'Conjugate', 'Non-unit',
+     $         M, K, ONE, T, LDT, C(1, N-K+1), LDC)
+            !
+            ! C2 = C2 - C1*V2 = -C1*V2 + C2
+            !
+            IF( C2I ) THEN
+               CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $            NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC)
+               DO I = 1, M
+                  C(I,I) = C(I,I) + ONE
+               END DO
+            ELSE
+               CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $            NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC)
+            END IF
+            !
+            ! C1 = -C1*V1
+            !
+            CALL CTRMM('Right', 'Lower', 'No Transpose', 'Unit',
+     $         M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC)
+         END IF
+      END SUBROUTINE
diff --git a/SRC/cunglq.f b/SRC/cunglq.f
index 7f42c3dcb..a57f5a037 100644
--- a/SRC/cunglq.f
+++ b/SRC/cunglq.f
@@ -136,17 +136,13 @@ SUBROUTINE CUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  =====================================================================
 *
-*     .. Parameters ..
-      COMPLEX            ZERO
-      PARAMETER          ( ZERO = ( 0.0E+0, 0.0E+0 ) )
-*     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
-     $                   LWKOPT, NB, NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, KI, KK, LWKOPT, LDWORK,
+     $                   NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CLARFB, CLARFT, CUNGL2, XERBLA
+      EXTERNAL           CLARFB0C2, CLARFT, CUNGL2, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -218,67 +214,95 @@ SUBROUTINE CUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*        Use blocked code after the last block.
-*        The first kk rows are handled by the block method.
-*
-         KI = ( ( K-NX-1 ) / NB )*NB
-         KK = MIN( K, KI+NB )
-*
-*        Set A(kk+1:m,1:kk) to zero.
+*        Treat the last NB block starting at KK+1 specially then use our blocking
+*        method from the block starting at KI+1 to 1 
 *
-         DO 20 J = 1, KK
-            DO 10 I = KK + 1, M
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
+         KI = K - 2 * NB
+         KK = K - NB
       ELSE
          KK = 0
       END IF
 *
-*     Use unblocked code for the last or only block.
+*     Potentially bail to the unblocked version
 *
-      IF( KK.LT.M )
-     $   CALL CUNGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
-     $                TAU( KK+1 ), WORK, IINFO )
+      IF( KK.EQ.0 ) THEN
+         CALL CUNGL2( M, N, K, A, LDA, TAU, WORK, IINFO )
+      END IF
 *
       IF( KK.GT.0 ) THEN
 *
-*        Use blocked code
+*        Factor the last block assuming that our first application
+*        will be on the Identity matrix
 *
-         DO 50 I = KI + 1, 1, -NB
-            IB = MIN( NB, K-I+1 )
-            IF( I+IB.LE.M ) THEN
+         I = KK + 1
+         IB = NB
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i) H(i+1) . . . H(i+ib-1)
+*        Form the triangular factor of the block reflector
+*        H = H(i) H(i+1) . . . H(i+ib-1)
 *
-               CALL CLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I,
-     $                      I ),
-     $                      LDA, TAU( I ), WORK, LDWORK )
+         CALL CLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $               LDA, TAU( I ), WORK, LDWORK )
 *
-*              Apply H**H to A(i+ib:m,i:n) from the right
+*        Apply H**T to A(i+ib:m,i:n) from the right
+*        Exploit the fact that we are applying to an identity 
 *
-               CALL CLARFB( 'Right', 'Conjugate transpose',
-     $                      'Forward',
-     $                      'Rowwise', M-I-IB+1, N-I+1, IB, A( I, I ),
-     $                      LDA, WORK, LDWORK, A( I+IB, I ), LDA,
-     $                      WORK( IB+1 ), LDWORK )
-            END IF
+         CALL CLARFB0C2(.TRUE., 'Right', 'Conjugate', 'Forward',
+     $         'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $         LDWORK, A(I+IB,I), LDA)
+*
+*        Apply H**T to columns i:n of current block
+
+         CALL CUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+     $                WORK, IINFO )
+*
+*        Use our standard blocking method after the last block
+*
+         DO I = KI + 1, 1, -NB
+            IB = NB
+*
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-*           Apply H**H to columns i:n of current block
+            CALL CLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $                  LDA, TAU( I ), WORK, LDWORK )
 *
+*           Apply H**T to A(i+ib:m,i:n) from the right
+*
+            CALL CLARFB0C2(.FALSE., 'Right', 'Conjugate', 'Forward',
+     $            'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $            LDWORK, A(I+IB,I), LDA)
+*
+*           Apply H**T to columns i:n of current block
+
             CALL CUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
-     $                   WORK,
-     $                   IINFO )
+     $                   WORK, IINFO )
+         END DO
+*
+*        This checks for if K was a perfect multiple of NB
+*        so that we only have a special case for the last block when
+*        necessary
+*
+         IF(I.LT.1) THEN
+            IB = I + NB - 1
+            I = 1
+*
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-*           Set columns 1:i-1 of current block to zero
+            CALL CLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $                  LDA, TAU( I ), WORK, LDWORK )
 *
-            DO 40 J = 1, I - 1
-               DO 30 L = I, I + IB - 1
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+*           Apply H**T to A(i+ib:m,i:n) from the right
+*
+            CALL CLARFB0C2(.FALSE., 'Right', 'Conjugate', 'Forward',
+     $            'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $            LDWORK, A(I+IB,I), LDA)
+*
+*           Apply H**T to columns i:n of current block
+
+            CALL CUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+     $                   WORK, IINFO )
+         END IF
       END IF
 *
       WORK( 1 ) = SROUNDUP_LWORK(IWS)
diff --git a/SRC/cungql.f b/SRC/cungql.f
index 3da2702d8..9022d37ac 100644
--- a/SRC/cungql.f
+++ b/SRC/cungql.f
@@ -95,8 +95,6 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK. LWORK >= max(1,N).
-*>          For optimum performance LWORK >= N*NB, where NB is the
-*>          optimal blocksize.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -137,17 +135,13 @@ SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  =====================================================================
 *
-*     .. Parameters ..
-      COMPLEX            ZERO
-      PARAMETER          ( ZERO = ( 0.0E+0, 0.0E+0 ) )
-*     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT,
-     $                   NB, NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, KK, LWKOPT, NB, NBMIN,
+     $                   NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CLARFB, CLARFT, CUNG2L, XERBLA
+      EXTERNAL           CLARFB0C2, CLARFT, CUNG2L, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -178,7 +172,11 @@ SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
             LWKOPT = 1
          ELSE
             NB = ILAENV( 1, 'CUNGQL', ' ', M, N, K, -1 )
-            LWKOPT = N*NB
+*
+*           Only need a workspace for cung2l in case of bailout
+*           and for the panel factorization
+*
+            LWKOPT = N
          END IF
          WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
@@ -201,88 +199,77 @@ SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
       END IF
 *
       NBMIN = 2
-      NX = 0
+      NX = MAX( 0, ILAENV( 3, 'CUNGQL', ' ', M, N, K, -1 ))
       IWS = N
-      IF( NB.GT.1 .AND. NB.LT.K ) THEN
 *
-*        Determine when to cross over from blocked to unblocked code.
-*
-         NX = MAX( 0, ILAENV( 3, 'CUNGQL', ' ', M, N, K, -1 ) )
-         IF( NX.LT.K ) THEN
+      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*           Determine if workspace is large enough for blocked code.
+*        We use blocked code for the entire construction
 *
-            LDWORK = N
-            IWS = LDWORK*NB
-            IF( LWORK.LT.IWS ) THEN
+         KK = K
+      ELSE
+         KK = 0
+      END IF
 *
-*              Not enough workspace to use optimal NB:  reduce NB and
-*              determine the minimum value of NB.
+*     Possibly bail to the unblocked code.
 *
-               NB = LWORK / LDWORK
-               NBMIN = MAX( 2, ILAENV( 2, 'CUNGQL', ' ', M, N, K,
-     $                      -1 ) )
-            END IF
-         END IF
+      IF( KK.EQ.0 ) THEN
+         CALL CUNG2L( M, N, K, A, LDA, TAU, WORK, IINFO )
       END IF
 *
-      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+      IF( KK.GT.0 ) THEN
 *
-*        Use blocked code after the first block.
-*        The last kk columns are handled by the block method.
+*        Factor the first block assuming that our first application
+*        will be on the Identity matrix
 *
-         KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
+         I = 1
+         IB = NB
 *
-*        Set A(m-kk+1:m,1:n-kk) to zero.
+*        Form the triangular factor of the block reflector
+*        H = H(i+ib-1) . . . H(i+1) H(i)
 *
-         DO 20 J = 1, N - KK
-            DO 10 I = M - KK + 1, M
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
-      ELSE
-         KK = 0
-      END IF
+         CALL CLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+     $                  A( 1, N-K+I ), LDA, TAU( I ),
+     $                  A( M-K+I, N-K+I ), LDA)
 *
-*     Use unblocked code for the first or only block.
+*        Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+*        Exploit the fact that we are applying to an identity 
 *
-      CALL CUNG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+         CALL CLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward', 
+     $         'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), 
+     $         LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
 *
-      IF( KK.GT.0 ) THEN
+*        Apply H to rows 1:m-k+i+ib-1 of current block
 *
-*        Use blocked code
+         CALL CUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
+     $                TAU( I ), WORK, IINFO )
+
+*        Use blocked code on the remaining blocks if there are any.
 *
-         DO 50 I = K - KK + 1, K, NB
-            IB = MIN( NB, K-I+1 )
-            IF( N-K+I.GT.1 ) THEN
+         DO I = NB+1, K, NB
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i+ib-1) . . . H(i+1) H(i)
+*           The last block may be less than size NB
 *
-               CALL CLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
-     $                      A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK )
+            IB = MIN(NB, K-I+1)
 *
-*              Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+*           Form the triangular factor of the block reflector
+*           H = H(i+ib-1) . . . H(i+1) H(i)
 *
-               CALL CLARFB( 'Left', 'No transpose', 'Backward',
-     $                      'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
-     $                      A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA,
-     $                      WORK( IB+1 ), LDWORK )
-            END IF
+            CALL CLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+     $                  A( 1, N-K+I ), LDA, TAU( I ), 
+     $                  A( M-K+I, N-K+I ), LDA )
+*
+*           Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+*
+            CALL CLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $            'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, 
+     $            A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
 *
 *           Apply H to rows 1:m-k+i+ib-1 of current block
 *
             CALL CUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
      $                   TAU( I ), WORK, IINFO )
-*
-*           Set rows m-k+i+ib:m of current block to zero
-*
-            DO 40 J = N - K + I, N - K + I + IB - 1
-               DO 30 L = M - K + I + IB, M
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+         END DO
       END IF
 *
       WORK( 1 ) = CMPLX( IWS )
diff --git a/SRC/cungqr.f b/SRC/cungqr.f
index eb49d2fed..3d7aaceb9 100644
--- a/SRC/cungqr.f
+++ b/SRC/cungqr.f
@@ -95,8 +95,6 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK. LWORK >= max(1,N).
-*>          For optimum performance LWORK >= N*NB, where NB is the
-*>          optimal blocksize.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -137,17 +135,13 @@ SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  =====================================================================
 *
-*     .. Parameters ..
-      COMPLEX            ZERO
-      PARAMETER          ( ZERO = ( 0.0E+0, 0.0E+0 ) )
-*     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
-     $                   LWKOPT, NB, NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, KI, KK, LWKOPT, NB,
+     $                   NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CLARFB, CLARFT, CUNG2R, XERBLA
+      EXTERNAL           CLARFB0C2, CLARFT, CUNG2R, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -163,7 +157,11 @@ SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       INFO = 0
       NB = ILAENV( 1, 'CUNGQR', ' ', M, N, K, -1 )
-      LWKOPT = MAX( 1, N )*NB
+*
+*     Only need a workspace for zung2r in case of bailout
+*     and for the panel factorization
+*
+      LWKOPT = MAX( 1, N )
       WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
@@ -192,92 +190,102 @@ SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
       END IF
 *
       NBMIN = 2
-      NX = 0
+*     Determine when to cross over from unblocked to blocked
+      NX = MAX( 0, ILAENV( 3, 'CUNGQR', ' ', M, N, K, -1 ) )
       IWS = N
-      IF( NB.GT.1 .AND. NB.LT.K ) THEN
 *
-*        Determine when to cross over from blocked to unblocked code.
-*
-         NX = MAX( 0, ILAENV( 3, 'CUNGQR', ' ', M, N, K, -1 ) )
-         IF( NX.LT.K ) THEN
+      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*           Determine if workspace is large enough for blocked code.
+*        Treat the last NB block starting at KK+1 specially then use our blocking
+*        method from the block starting at KI+1 to 1 
 *
-            LDWORK = N
-            IWS = LDWORK*NB
-            IF( LWORK.LT.IWS ) THEN
+         KI = K - 2 * NB
+         KK = K - NB
+      ELSE
+         KK = 0
+      END IF
 *
-*              Not enough workspace to use optimal NB:  reduce NB and
-*              determine the minimum value of NB.
+*     Potentially bail to the unblocked code.
 *
-               NB = LWORK / LDWORK
-               NBMIN = MAX( 2, ILAENV( 2, 'CUNGQR', ' ', M, N, K,
-     $                      -1 ) )
-            END IF
-         END IF
+      IF( KK.EQ.0 ) THEN
+         CALL CUNG2R( M, N, K, A, LDA, TAU, WORK, IINFO )
       END IF
 *
-      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+      IF( KK.GT.0 ) THEN
 *
-*        Use blocked code after the last block.
-*        The first kk columns are handled by the block method.
+*        Factor the last block assuming that our first application
+*        will be on the Identity matrix
 *
-         KI = ( ( K-NX-1 ) / NB )*NB
-         KK = MIN( K, KI+NB )
+         I = KK + 1
+         IB = NB
 *
-*        Set A(1:kk,kk+1:n) to zero.
+*        Form the triangular factor of the block reflector
+*        H = H(i) H(i+1) . . . H(i+ib-1)
 *
-         DO 20 J = KK + 1, N
-            DO 10 I = 1, KK
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
-      ELSE
-         KK = 0
-      END IF
+         CALL CLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+     $                     LDA, TAU(I), A(I,I), LDA)
 *
-*     Use unblocked code for the last or only block.
+*        Apply H to A(i:m,i+ib:n) from the left
+*        Exploit the fact that we are applying to an identity 
 *
-      IF( KK.LT.N )
-     $   CALL CUNG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
-     $                TAU( KK+1 ), WORK, IINFO )
+         CALL CLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward',
+     $      'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I),
+     $      LDA, A(I,I+IB), LDA)
 *
-      IF( KK.GT.0 ) THEN
+*        Apply H to rows i:m of current block
 *
-*        Use blocked code
+         CALL CUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, IINFO)
 *
-         DO 50 I = KI + 1, 1, -NB
-            IB = MIN( NB, K-I+1 )
-            IF( I+IB.LE.N ) THEN
+*        Use our standard blocking method after the last block
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i) H(i+1) . . . H(i+ib-1)
+         DO I = KI + 1, 1, -NB
+            IB = NB
 *
-               CALL CLARFT( 'Forward', 'Columnwise', M-I+1, IB,
-     $                      A( I, I ), LDA, TAU( I ), WORK, LDWORK )
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-*              Apply H to A(i:m,i+ib:n) from the left
+            CALL CLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+     $         LDA, TAU(I), A(I,I), LDA)
 *
-               CALL CLARFB( 'Left', 'No transpose', 'Forward',
-     $                      'Columnwise', M-I+1, N-I-IB+1, IB,
-     $                      A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ),
-     $                      LDA, WORK( IB+1 ), LDWORK )
-            END IF
+*           Apply H to A(i:m,i+ib:n) from the left
+*
+            CALL CLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $         'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+     $         LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
 *
 *           Apply H to rows i:m of current block
 *
-            CALL CUNG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ),
-     $                   WORK,
-     $                   IINFO )
+            CALL CUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
+     $         IINFO)
+         END DO
+*
+*        This checks for if K was a perfect multiple of NB
+*        so that we only have a special case for the last block when
+*        necessary
+*
+         IF(I.LT.1) THEN
+            IB = I + NB - 1
+            I = 1
 *
-*           Set rows 1:i-1 of current block to zero
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-            DO 40 J = I, I + IB - 1
-               DO 30 L = 1, I - 1
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+            CALL CLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+     $         LDA, TAU(I), A(I,I), LDA)
+*
+*           Apply H to A(i:m,i+ib:n) from the left
+*
+            CALL CLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $         'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+     $         LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
+*
+*           Apply H to rows i:m of current block
+*
+            CALL CUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
+     $         IINFO)
+         END IF
       END IF
 *
       WORK( 1 ) = SROUNDUP_LWORK(IWS)
diff --git a/SRC/cungrq.f b/SRC/cungrq.f
index 7e812b839..4a9b2fe51 100644
--- a/SRC/cungrq.f
+++ b/SRC/cungrq.f
@@ -137,17 +137,13 @@ SUBROUTINE CUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  =====================================================================
 *
-*     .. Parameters ..
-      COMPLEX            ZERO
-      PARAMETER          ( ZERO = ( 0.0E+0, 0.0E+0 ) )
-*     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, II, IINFO, IWS, J, KK, L, LDWORK,
+      INTEGER            I, IB, II, IINFO, IWS, KK, LDWORK,
      $                   LWKOPT, NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CLARFB, CLARFT, CUNGR2, XERBLA
+      EXTERNAL           CLARFB0C2, CLARFT, CUNGR2, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -231,61 +227,67 @@ SUBROUTINE CUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *        Use blocked code after the first block.
 *        The last kk rows are handled by the block method.
 *
-         KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
-*
-*        Set A(1:m-kk,n-kk+1:n) to zero.
-*
-         DO 20 J = N - KK + 1, N
-            DO 10 I = 1, M - KK
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
+         KK = K
       ELSE
          KK = 0
       END IF
 *
-*     Use unblocked code for the first or only block.
+*     Potentially bail to the unblocked code
 *
-      CALL CUNGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+      IF( KK.EQ.0 ) THEN
+         CALL CUNGR2( M, N, K, A, LDA, TAU, WORK, IINFO )
+      END IF
 *
       IF( KK.GT.0 ) THEN
 *
-*        Use blocked code
+*        Factor the first block assuming that our first application
+*        will be on the Identity matrix
 *
-         DO 50 I = K - KK + 1, K, NB
-            IB = MIN( NB, K-I+1 )
-            II = M - K + I
-            IF( II.GT.1 ) THEN
+         I = 1
+         IB = NB
+         II = M - K + I
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i+ib-1) . . . H(i+1) H(i)
+*        Form the triangular factor of the block reflector
+*        H = H(i+ib-1) . . . H(i+1) H(i)
 *
-               CALL CLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
-     $                      A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+         CALL CLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+     $                A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
 *
-*              Apply H**H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+*        Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
 *
-               CALL CLARFB( 'Right', 'Conjugate transpose',
-     $                      'Backward',
-     $                      'Rowwise', II-1, N-K+I+IB-1, IB, A( II, 1 ),
-     $                      LDA, WORK, LDWORK, A, LDA, WORK( IB+1 ),
-     $                      LDWORK )
-            END IF
+         CALL CLARFB0C2(.TRUE., 'Right', 'Conjugate', 'Backward', 
+     $         'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK,
+     $         LDWORK, A, LDA)
 *
-*           Apply H**H to columns 1:n-k+i+ib-1 of current block
+*           Apply H**T to columns 1:n-k+i+ib-1 of current block
 *
-            CALL CUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
-     $                   TAU( I ),
-     $                   WORK, IINFO )
+         CALL CUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
+     $                TAU( I ), WORK, IINFO )
+
+         DO I = NB+1, K, NB
 *
-*           Set columns n-k+i+ib:n of current block to zero
+*           The last block may be less than size NB
 *
-            DO 40 L = N - K + I + IB, N
-               DO 30 J = II, II + IB - 1
-                  A( J, L ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+            IB = MIN(NB, K-I+1)
+            II = M - K + I
+*
+*           Form the triangular factor of the block reflector
+*           H = H(i+ib-1) . . . H(i+1) H(i)
+*
+            CALL CLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+     $                   A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+*
+*           Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+*
+            CALL CLARFB0C2(.FALSE., 'Right', 'Conjugate', 
+     $            'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, 
+     $            A(II,1), LDA, WORK, LDWORK, A, LDA)
+*
+*           Apply H**T to columns 1:n-k+i+ib-1 of current block
+*
+            CALL CUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
+     $                   TAU( I ), WORK, IINFO )
+         END DO
       END IF
 *
       WORK( 1 ) = SROUNDUP_LWORK(IWS)
diff --git a/SRC/dlarfb0c2.f b/SRC/dlarfb0c2.f
new file mode 100644
index 000000000..493580e09
--- /dev/null
+++ b/SRC/dlarfb0c2.f
@@ -0,0 +1,555 @@
+*> \brief \b DLARFB0C2 applies a block reflector or its transpose to a
+* rectangular matrix with a 0 block while constructing the explicit Q factor
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*
+*  Definition:
+*  ===========
+*
+*     SUBROUTINE DLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+*    $                     K, V, LDV, T, LDT, C, LDC)
+*        ! Scalar arguments
+*        INTEGER           M, N, K, LDV, LDC, LDT
+*        CHARACTER         SIDE, TRANS, DIRECT, STOREV 
+*        ! True means that we are assuming C2 is the identity matrix
+*        !     and thus don't reference whatever is present in C2 
+*        !     at the beginning.
+*        LOGICAL           C2I
+*        ! Array arguments
+*        DOUBLE PRECISION  V(LDV,*), C(LDC,*), T(LDT,*)
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DLARFB0C2 applies a real block reflector H or its transpose H**T to a
+*> real m by n matrix C with a 0 block, while computing the explicit Q factor
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] C2I
+*> \verbatim
+*>          C2I is LOGICAL
+*>          = .TRUE.: Assume the nonzero block of C is the identity matrix
+*>          = .FALSE.: Use existing data in the nonzero block of C
+*> \endverbatim
+*>
+*> \param[in] SIDE
+*> \verbatim
+*>          SIDE is CHARACTER*1
+*>          = 'L': apply H or H**T from the Left
+*>          = 'R': apply H or H**T from the Right
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          = 'N': apply H (No transpose)
+*>          = 'T': apply H**T (Transpose)
+*> \endverbatim
+*>
+*> \param[in] DIRECT
+*> \verbatim
+*>          DIRECT is CHARACTER*1
+*>          Indicates how H is formed from a product of elementary
+*>          reflectors
+*>          = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*>          = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*>          STOREV is CHARACTER*1
+*>          Indicates how the vectors which define the elementary
+*>          reflectors are stored:
+*>          = 'C': Columnwise
+*>          = 'R': Rowwise
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>          The order of the matrix T (= the number of elementary
+*>          reflectors whose product defines the block reflector).
+*>          If SIDE = 'L', M >= K >= 0;
+*>          if SIDE = 'R', N >= K >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is DOUBLE PRECISION array, dimension
+*>                                (LDV,K) if STOREV = 'C'
+*>                                (LDV,M) if STOREV = 'R' and SIDE = 'L'
+*>                                (LDV,N) if STOREV = 'R' and SIDE = 'R'
+*>          The matrix V. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V.
+*>          If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+*>          if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+*>          if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*>          T is DOUBLE PRECISION array, dimension (LDT,K)
+*>          The triangular k by k matrix T in the representation of the
+*>          block reflector.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is DOUBLE PRECISION array, dimension (LDC,N)
+*>          On entry, the m by n matrix C.
+*>          On exit, C is overwritten by H*C or H**T*C or C*H or C*H**T.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>          The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup larfb
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  The shape of the matrix V and the storage of the vectors which define
+*>  the H(i) is best illustrated by the following example with n = 5 and
+*>  k = 3. The triangular part of V (including its diagonal) is not
+*>  referenced.
+*>
+*>  DIRECT = 'F' and STOREV = 'C':         DIRECT = 'F' and STOREV = 'R':
+*>
+*>               V = (  1       )                 V = (  1 v1 v1 v1 v1 )
+*>                   ( v1  1    )                     (     1 v2 v2 v2 )
+*>                   ( v1 v2  1 )                     (        1 v3 v3 )
+*>                   ( v1 v2 v3 )
+*>                   ( v1 v2 v3 )
+*>
+*>  DIRECT = 'B' and STOREV = 'C':         DIRECT = 'B' and STOREV = 'R':
+*>
+*>               V = ( v1 v2 v3 )                 V = ( v1 v1  1       )
+*>                   ( v1 v2 v3 )                     ( v2 v2 v2  1    )
+*>                   (  1 v2 v3 )                     ( v3 v3 v3 v3  1 )
+*>                   (     1 v3 )
+*>                   (        1 )
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE DLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+     $                     K, V, LDV, T, LDT, C, LDC)
+         ! Scalar arguments
+         INTEGER           M, N, K, LDV, LDC, LDT
+         CHARACTER         SIDE, TRANS, DIRECT, STOREV 
+         ! True means that we are assuming C2 is the identity matrix
+         !     and thus don't reference whatever is present in C2 
+         !     at the beginning.
+         LOGICAL           C2I
+         ! Array arguments
+         DOUBLE PRECISION  V(LDV,*), C(LDC,*), T(LDT,*)
+         ! Local scalars
+         LOGICAL           QR, LQ, QL, DIRF, COLV, SIDEL, SIDER,
+     $                     TRANST
+         INTEGER           I, J
+         ! External functions
+         LOGICAL           LSAME
+         EXTERNAL          LSAME
+         ! External subroutines
+         EXTERNAL          DGEMM, DTRMM, XERBLA
+         ! Parameters
+         DOUBLE PRECISION ONE, ZERO, NEG_ONE
+         PARAMETER(ONE=1.0D+0, ZERO = 0.0D+0, NEG_ONE = -1.0D+0)
+
+         ! Beginning of executable statements
+         ! Convert our character flags to logical values
+         DIRF = LSAME(DIRECT,'F')
+         COLV = LSAME(STOREV,'C')
+         SIDEL = LSAME(SIDE,'L')
+         SIDER = LSAME(SIDE,'R')
+         TRANST = LSAME(TRANS,'T')
+
+         ! Determine which of the 4 modes are using.
+         ! QR is when we store the reflectors column by column and have the
+         ! 'first' reflector stored in the first column
+         QR = DIRF.AND.COLV
+
+         ! LQ is when we store the reflectors row by row and have the
+         ! 'first' reflector stored in the first row
+         LQ = DIRF.AND.(.NOT.COLV)
+
+         ! QL is when we store the reflectors column by column and have the
+         ! 'first' reflector stored in the last column
+         QL = (.NOT.DIRF).AND.COLV
+
+         ! RQ is when we store the reflectors row by row and have the
+         ! 'first' reflector stored in the last row
+         ! RQ = (.NOT.DIRF).AND.(.NOT.COLV)
+         ! Since we have exactly one of these 4 modes, we don't need to actually
+         ! store the value of RQ, instead we assume this is the case if we fail
+         ! the above 3 checks.
+
+         IF (QR) THEN
+            ! We are computing C = HC = (I - VTV')C
+            ! Where: V = [ V1 ] and C = [ C1 ]
+            !            [ V2 ]         [ C2 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{M-K\times K}
+            !     C1=0\in\R^{K\times N}
+            !     C2\in\R^{M-K\times N}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0  ]
+            !               [ V2 ]             [ C2 ]
+            !   = [ 0  ] - [ V1 ]T*V2'*C2
+            !     [ C2 ]   [ V2 ]
+            !
+            !   = [ 0  ] - [ V1*T*V2'*C2 ]
+            !     [ C2 ]   [ V2*T*V2'*C2 ]
+            !
+            !   = [      V1*T*V2'*C2 ]
+            !     [ C2 - V2*T*V2'*C2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = V2'*C2
+            ! C1 = T*C1
+            ! C2 = C2 - V2*C1
+            ! C1 = -V1*C1
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDEL ) THEN
+               CALL XERBLA('DLARFB0C2', 2)
+               RETURN
+            ELSE IF(TRANST) THEN
+               CALL XERBLA('DLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = V2'*C2
+            !
+            IF (C2I) THEN
+               DO J = 1, N
+                  DO I = 1, K
+                     C(I,J) = V(K+J,I)
+                  END DO
+               END DO
+            ELSE
+               CALL DGEMM('Transpose', 'No Transpose', K, N, M - K,
+     $                     ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO,
+     $                     C, LDC)
+            END IF
+            !
+            ! C1 = T*C1
+            !
+            CALL DTRMM('Left', 'Upper', 'No Transpose', 'Non-unit',
+     $                  K, N, ONE, T, LDT, C, LDC)
+            !
+            ! C2 = C2 - V2*C1 = -V2*C1 + C2
+            !
+            IF (C2I) THEN
+               CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $                     NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO,
+     $                     C(K+1,1), LDC)
+               DO I = 1, N
+                  C(K+I,I) = C(K+I,I) + ONE
+               END DO
+            ELSE
+               CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $                     NEG_ONE, V(K+1,1), LDV, C, LDC, ONE,
+     $                     C(K+1,1), LDC)
+            END IF
+            !
+            ! C1 = -V1*C1
+            !
+            CALL DTRMM('Left', 'Lower', 'No Transpose', 'Unit',
+     $                  K, N, NEG_ONE, V, LDV, C, LDC)
+         ELSE IF (LQ) THEN
+            ! We are computing C = CH' = C(I-V'T'V)
+            ! Where: V = [ V1 V2 ] and C = [ C1 C2 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{K\times N-K}
+            !     C1=0\in\R^{M\times K}
+            !     C2\in\R^{M\times N-K}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ])
+            !                         [ V2' ]
+            !
+            !   = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ]
+            !                          [ V2' ]
+            !
+            !   = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ]
+            !
+            !   = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = C2*V2'
+            ! C1 = C1*T'
+            ! C2 = C2 - C1*V2
+            ! C1 = -C1*V1
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDER ) THEN
+               CALL XERBLA('DLARFB0C2', 2)
+               RETURN
+            ELSE IF(.NOT.TRANST) THEN
+               CALL XERBLA('DLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = C2*V2'
+            !
+            IF( C2I ) THEN
+               DO J = 1, K
+                  DO I = 1, M
+                     C(I,J) = V(J,K+I)
+                  END DO
+               END DO
+            ELSE
+               CALL DGEMM('No Transpose', 'Transpose', M, K, N-K,
+     $               ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C,
+     $               LDC)
+            END IF
+            !
+            ! C1 = C1*T'
+            !
+            CALL DTRMM('Right', 'Upper', 'Transpose', 'Non-unit',
+     $            M, K, ONE, T, LDT, C, LDC)
+            !
+            ! C2 = C2 - C1*V2 = -C1*V2 + C2
+            !
+            IF( C2I ) THEN
+               CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $               NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1),
+     $               LDC)
+               DO I = 1, M
+                  C(I,K+I) = C(I,K+I) + ONE
+               END DO
+            ELSE
+               CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $               NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1),
+     $               LDC)
+            END IF
+            !
+            ! C1 = -C1*V1
+            !
+            CALL DTRMM('Right', 'Upper', 'No Transpose', 'Unit',
+     $            M, K, NEG_ONE, V, LDV, C, LDC)
+         ELSE IF (QL) THEN
+            ! We are computing C = HC = (I - VTV')C
+            ! Where: V = [ V2 ] and C = [ C2 ]
+            !            [ V1 ]         [ C1 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{M-K\times K}
+            !     C1=0\in\R^{K\times N}
+            !     C2\in\R^{M-K\times N}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ]
+            !             [ V1 ]             [ 0  ]
+            !
+            !   = [ C2 ] - [ V2 ]T*V2'*C2
+            !     [ 0  ]   [ V1 ]
+            !
+            !   = [ C2 ] - [ V2*T*V2'*C2 ]
+            !     [ 0  ]   [ V1*T*V2'*C2 ]
+            !
+            !   = [ C2 - V2*T*V2'*C2 ]
+            !     [    - V1*T*V2'*C2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = V2'*C2
+            ! C1 = T*C1
+            ! C2 = C2 - V2*C1
+            ! C1 = -V1*C1
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDEL ) THEN
+               CALL XERBLA('DLARFB0C2', 2)
+               RETURN
+            ELSE IF(TRANST) THEN
+               CALL XERBLA('DLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = V2'*C2
+            !
+            IF( C2I ) THEN
+               DO J = 1, N
+                  DO I = 1, K
+                     C(M-K+I,J) = V(J,I)
+                  END DO
+               END DO
+            ELSE
+               CALL DGEMM('Transpose', 'No Transpose', K, N, M-K,
+     $            ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC)
+            END IF
+            !
+            ! C1 = T*C1
+            !
+            CALL DTRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+     $         K, N, ONE, T, LDT, C(M-K+1,1), LDC)
+            !
+            ! C2 = C2 - V2*C1 = -V2*C1 + C2
+            !
+            IF( C2I ) THEN
+               CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $            NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC)
+               DO I = 1, N
+                  C(I,I) = C(I,I) + ONE
+               END DO
+            ELSE
+               CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $            NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC)
+            END IF
+            !
+            ! C1 = -V1*C1
+            !
+            CALL DTRMM('Left', 'Upper', 'No Transpose', 'Unit',
+     $         K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC)
+         ELSE ! IF (RQ) THEN
+            ! We are computing C = CH' = C(I-V'T'V)
+            ! Where: V = [ V2 V1] and C = [ C2 C1 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{K\times N-K}
+            !     C1=0\in\R^{M\times K}
+            !     C2\in\R^{M\times N-K}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ]
+            !                          [ V1' ]
+            !
+            !   = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ]
+            !                           [ V1' ]
+            !
+            !   = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ]
+            !
+            !   = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ]
+            !
+            !   = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = C2*V2'
+            ! C1 = C1*T'
+            ! C2 = C2 - C1*V2
+            ! C1 = -C1*V1
+            !
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDER ) THEN
+               CALL XERBLA('DLARFB0C2', 2)
+               RETURN
+            ELSE IF(.NOT.TRANST) THEN
+               CALL XERBLA('DLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = C2*V2'
+            !
+            IF( C2I ) THEN
+               DO J = 1, K
+                  DO I = 1, M
+                     C(I,N-K+J) = V(J,I)
+                  END DO
+               END DO
+            ELSE
+               CALL DGEMM('No Transpose', 'Transpose', M, K, N-K,
+     $            ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC)
+            END IF
+            !
+            ! C1 = C1*T'
+            !
+            CALL DTRMM('Right', 'Lower', 'Transpose', 'Non-unit',
+     $         M, K, ONE, T, LDT, C(1, N-K+1), LDC)
+            !
+            ! C2 = C2 - C1*V2 = -C1*V2 + C2
+            !
+            IF( C2I ) THEN
+               CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $            NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC)
+               DO I = 1, M
+                  C(I,I) = C(I,I) + ONE
+               END DO
+            ELSE
+               CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $            NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC)
+            END IF
+            !
+            ! C1 = -C1*V1
+            !
+            CALL DTRMM('Right', 'Lower', 'No Transpose', 'Unit',
+     $         M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC)
+         END IF
+      END SUBROUTINE
diff --git a/SRC/dorglq.f b/SRC/dorglq.f
index 71ff93fa6..c7f32afb8 100644
--- a/SRC/dorglq.f
+++ b/SRC/dorglq.f
@@ -5,6 +5,7 @@
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
+*> \htmlonly
 *> Download DORGLQ + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dorglq.f">
 *> [TGZ]</a>
@@ -12,6 +13,7 @@
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dorglq.f">
 *> [TXT]</a>
+*> \endhtmlonly
 *
 *  Definition:
 *  ===========
@@ -136,17 +138,13 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  =====================================================================
 *
-*     .. Parameters ..
-      DOUBLE PRECISION   ZERO
-      PARAMETER          ( ZERO = 0.0D+0 )
-*     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
-     $                   LWKOPT, NB, NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, KI, KK, LDWORK, LWKOPT,
+     $                   NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLARFB, DLARFT, DORGL2, XERBLA
+      EXTERNAL           DLARFB0C2, DLARFT, DORGL2, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -161,7 +159,7 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       INFO = 0
       NB = ILAENV( 1, 'DORGLQ', ' ', M, N, K, -1 )
-      LWKOPT = MAX( 1, M )*NB
+      LWKOPT = MAX( 1, M ) * NB
       WORK( 1 ) = LWKOPT
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
@@ -217,67 +215,90 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*        Use blocked code after the last block.
-*        The first kk rows are handled by the block method.
-*
-         KI = ( ( K-NX-1 ) / NB )*NB
-         KK = MIN( K, KI+NB )
-*
-*        Set A(kk+1:m,1:kk) to zero.
+*        Handle the first block assuming we are applying to the
+*        identity, then resume regular blocking method after
 *
-         DO 20 J = 1, KK
-            DO 10 I = KK + 1, M
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
+         KI = K - 2 * NB
+         KK = K - NB
       ELSE
          KK = 0
       END IF
 *
-*     Use unblocked code for the last or only block.
+*     Potentially bail to the unblocked version
 *
-      IF( KK.LT.M )
-     $   CALL DORGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
-     $                TAU( KK+1 ), WORK, IINFO )
+      IF( KK.EQ.0 ) THEN
+         CALL DORGL2( M, N, K, A, LDA, TAU, WORK, IINFO )
+      END IF
 *
       IF( KK.GT.0 ) THEN
+         I = KK + 1
+         IB = NB
+*
+*        Form the triangular factor of the block reflector
+*        H = H(i) H(i+1) . . . H(i+ib-1)
+*
+         CALL DLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $               LDA, TAU( I ), WORK, LDWORK )
+*
+*        Apply H**T to A(i+ib:m,i:n) from the right
+*
+         CALL DLARFB0C2(.TRUE., 'Right', 'Transpose', 'Forward',
+     $         'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $         LDWORK, A(I+IB,I), LDA)
+*
+*        Apply H**T to columns i:n of current block
+
+         CALL DORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+     $                WORK, IINFO )
 *
 *        Use blocked code
 *
-         DO 50 I = KI + 1, 1, -NB
-            IB = MIN( NB, K-I+1 )
-            IF( I+IB.LE.M ) THEN
+         DO I = KI + 1, 1, -NB
+            IB = NB
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i) H(i+1) . . . H(i+ib-1)
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-               CALL DLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I,
-     $                      I ),
-     $                      LDA, TAU( I ), WORK, LDWORK )
+            CALL DLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $                  LDA, TAU( I ), WORK, LDWORK )
 *
-*              Apply H**T to A(i+ib:m,i:n) from the right
+*           Apply H**T to A(i+ib:m,i:n) from the right
 *
-               CALL DLARFB( 'Right', 'Transpose', 'Forward',
-     $                      'Rowwise',
-     $                      M-I-IB+1, N-I+1, IB, A( I, I ), LDA, WORK,
-     $                      LDWORK, A( I+IB, I ), LDA, WORK( IB+1 ),
-     $                      LDWORK )
-            END IF
+            CALL DLARFB0C2(.FALSE., 'Right', 'Transpose', 'Forward',
+     $            'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $            LDWORK, A(I+IB,I), LDA)
 *
 *           Apply H**T to columns i:n of current block
-*
+
             CALL DORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
-     $                   WORK,
-     $                   IINFO )
+     $                   WORK, IINFO )
+         END DO
+*
+*        This checks for if K was a perfect multiple of NB
+*        so that we only have a special case for the last block when
+*        necessary
+*
+         IF(I.LT.1) THEN
+            IB = I + NB - 1
+            I = 1
+*
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
+*
+            CALL DLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $                  LDA, TAU( I ), WORK, LDWORK )
 *
-*           Set columns 1:i-1 of current block to zero
+*           Apply H**T to A(i+ib:m,i:n) from the right
 *
-            DO 40 J = 1, I - 1
-               DO 30 L = I, I + IB - 1
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+            CALL DLARFB0C2(.FALSE., 'Right', 'Transpose', 'Forward',
+     $            'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $            LDWORK, A(I+IB,I), LDA)
+*
+*           Apply H**T to columns i:n of current block
+
+            CALL DORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+     $                   WORK, IINFO )
+         END IF
       END IF
 *
       WORK( 1 ) = IWS
diff --git a/SRC/dorgql.f b/SRC/dorgql.f
index c7bba63c2..378147844 100644
--- a/SRC/dorgql.f
+++ b/SRC/dorgql.f
@@ -5,6 +5,7 @@
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
+*> \htmlonly
 *> Download DORGQL + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dorgql.f">
 *> [TGZ]</a>
@@ -12,6 +13,7 @@
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dorgql.f">
 *> [TXT]</a>
+*> \endhtmlonly
 *
 *  Definition:
 *  ===========
@@ -95,8 +97,6 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK. LWORK >= max(1,N).
-*>          For optimum performance LWORK >= N*NB, where NB is the
-*>          optimal blocksize.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -137,17 +137,12 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  =====================================================================
 *
-*     .. Parameters ..
-      DOUBLE PRECISION   ZERO
-      PARAMETER          ( ZERO = 0.0D+0 )
-*     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT,
-     $                   NB, NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, KK, LWKOPT, NB, NBMIN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLARFB, DLARFT, DORG2L, XERBLA
+      EXTERNAL           DLARFB0C2, DLARFT, DORG2L, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -177,7 +172,8 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
             LWKOPT = 1
          ELSE
             NB = ILAENV( 1, 'DORGQL', ' ', M, N, K, -1 )
-            LWKOPT = N*NB
+            ! Only need a workspace for calls to dorg2l
+            LWKOPT = N
          END IF
          WORK( 1 ) = LWKOPT
 *
@@ -200,88 +196,77 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
       END IF
 *
       NBMIN = 2
-      NX = 0
       IWS = N
-      IF( NB.GT.1 .AND. NB.LT.K ) THEN
-*
-*        Determine when to cross over from blocked to unblocked code.
 *
-         NX = MAX( 0, ILAENV( 3, 'DORGQL', ' ', M, N, K, -1 ) )
-         IF( NX.LT.K ) THEN
+      IF( NB.GE.NBMIN .AND. NB.LT.K ) THEN
 *
-*           Determine if workspace is large enough for blocked code.
+*        We want to use the blocking method as long as our matrix is big enough
 *
-            LDWORK = N
-            IWS = LDWORK*NB
-            IF( LWORK.LT.IWS ) THEN
+         KK = K
+      ELSE
+         KK = 0
+      END IF
 *
-*              Not enough workspace to use optimal NB:  reduce NB and
-*              determine the minimum value of NB.
+*     Possibly bail to the unblocked code.
 *
-               NB = LWORK / LDWORK
-               NBMIN = MAX( 2, ILAENV( 2, 'DORGQL', ' ', M, N, K,
-     $                      -1 ) )
-            END IF
-         END IF
+      IF( KK.EQ.0 ) THEN
+         CALL DORG2L( M, N, K, A, LDA, TAU, WORK, IINFO )
       END IF
 *
-      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+*     Use our blocked code for everything
 *
-*        Use blocked code after the first block.
-*        The last kk columns are handled by the block method.
+      IF( KK.GT.0 ) THEN
 *
-         KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
+*        Factor the first block assuming that our first application
+*        will be on the Identity matrix
 *
-*        Set A(m-kk+1:m,1:n-kk) to zero.
+         I = 1
+         IB = NB
 *
-         DO 20 J = 1, N - KK
-            DO 10 I = M - KK + 1, M
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
-      ELSE
-         KK = 0
-      END IF
+*        Form the triangular factor of the block reflector
+*        H = H(i+ib-1) . . . H(i+1) H(i)
 *
-*     Use unblocked code for the first or only block.
+         CALL DLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+     $                  A( 1, N-K+I ), LDA, TAU( I ),
+     $                  A( M-K+I, N-K+I ), LDA)
 *
-      CALL DORG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+*        Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
 *
-      IF( KK.GT.0 ) THEN
+         CALL DLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward', 
+     $         'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), 
+     $         LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
+*
+*        Apply H to rows 1:m-k+i+ib-1 of current block
 *
-*        Use blocked code
+         CALL DORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
+     $                TAU( I ), WORK, IINFO )
+
+*        Use blocked code on the remaining blocks if there are any.
 *
-         DO 50 I = K - KK + 1, K, NB
-            IB = MIN( NB, K-I+1 )
-            IF( N-K+I.GT.1 ) THEN
+         DO I = NB+1, K, NB
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i+ib-1) . . . H(i+1) H(i)
+*           The last block may be less than size NB
 *
-               CALL DLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
-     $                      A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK )
+            IB = MIN(NB, K-I+1)
 *
-*              Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+*           Form the triangular factor of the block reflector
+*           H = H(i+ib-1) . . . H(i+1) H(i)
 *
-               CALL DLARFB( 'Left', 'No transpose', 'Backward',
-     $                      'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
-     $                      A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA,
-     $                      WORK( IB+1 ), LDWORK )
-            END IF
+            CALL DLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+     $                  A( 1, N-K+I ), LDA, TAU( I ), 
+     $                  A( M-K+I, N-K+I ), LDA )
+*
+*           Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+*
+            CALL DLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $            'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, 
+     $            A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
 *
 *           Apply H to rows 1:m-k+i+ib-1 of current block
 *
             CALL DORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
      $                   TAU( I ), WORK, IINFO )
-*
-*           Set rows m-k+i+ib:m of current block to zero
-*
-            DO 40 J = N - K + I, N - K + I + IB - 1
-               DO 30 L = M - K + I + IB, M
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+         END DO
       END IF
 *
       WORK( 1 ) = IWS
diff --git a/SRC/dorgqr.f b/SRC/dorgqr.f
index 83e26588e..8f3c44278 100644
--- a/SRC/dorgqr.f
+++ b/SRC/dorgqr.f
@@ -5,6 +5,7 @@
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
+*> \htmlonly
 *> Download DORGQR + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dorgqr.f">
 *> [TGZ]</a>
@@ -12,6 +13,7 @@
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dorgqr.f">
 *> [TXT]</a>
+*> \endhtmlonly
 *
 *  Definition:
 *  ===========
@@ -95,8 +97,6 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK. LWORK >= max(1,N).
-*>          For optimum performance LWORK >= N*NB, where NB is the
-*>          optimal blocksize.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -119,7 +119,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup ungqr
+*> \ingroup doubleOTHERcomputational
 *
 *  =====================================================================
       SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
@@ -137,17 +137,13 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  =====================================================================
 *
-*     .. Parameters ..
-      DOUBLE PRECISION   ZERO
-      PARAMETER          ( ZERO = 0.0D+0 )
-*     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
-     $                   LWKOPT, NB, NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, KI, KK, LWKOPT,
+     $                   NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLARFB, DLARFT, DORG2R, XERBLA
+      EXTERNAL             DLARFB0C2, DLARFT, DORG2R, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -162,7 +158,9 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       INFO = 0
       NB = ILAENV( 1, 'DORGQR', ' ', M, N, K, -1 )
-      LWKOPT = MAX( 1, N )*NB
+      ! Only need a workspace for dorg2r in case of bailout and
+      ! for the panel factorization
+      LWKOPT = MAX( 1, N )
       WORK( 1 ) = LWKOPT
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
@@ -191,92 +189,93 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
       END IF
 *
       NBMIN = 2
-      NX = 0
+      NX = MAX(0, ILAENV(3, 'SORGQR', ' ', M, N, K, -1))
       IWS = N
-      IF( NB.GT.1 .AND. NB.LT.K ) THEN
-*
-*        Determine when to cross over from blocked to unblocked code.
 *
-         NX = MAX( 0, ILAENV( 3, 'DORGQR', ' ', M, N, K, -1 ) )
-         IF( NX.LT.K ) THEN
+      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*           Determine if workspace is large enough for blocked code.
+*        Handle the first block assuming we are applying to the
+*        identity, then resume regular blocking method after
 *
-            LDWORK = N
-            IWS = LDWORK*NB
-            IF( LWORK.LT.IWS ) THEN
+         KI = K - 2 * NB
+         KK = K - NB
+      ELSE
+         KK = 0
+      END IF
 *
-*              Not enough workspace to use optimal NB:  reduce NB and
-*              determine the minimum value of NB.
+*     Potentially bail to the unblocked code.
 *
-               NB = LWORK / LDWORK
-               NBMIN = MAX( 2, ILAENV( 2, 'DORGQR', ' ', M, N, K,
-     $                      -1 ) )
-            END IF
-         END IF
+      IF( KK.EQ.0 ) THEN
+            CALL DORG2R( M, N, K, A, LDA, TAU, WORK, IINFO )
       END IF
 *
-      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+      IF( KK.GT.0 ) THEN
+         I = KK + 1
+         IB = NB
 *
-*        Use blocked code after the last block.
-*        The first kk columns are handled by the block method.
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-         KI = ( ( K-NX-1 ) / NB )*NB
-         KK = MIN( K, KI+NB )
+         CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+     $                     LDA, TAU(I), A(I,I), LDA)
 *
-*        Set A(1:kk,kk+1:n) to zero.
+*           Apply H to A(i:m,i+ib:n) from the left
 *
-         DO 20 J = KK + 1, N
-            DO 10 I = 1, KK
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
-      ELSE
-         KK = 0
-      END IF
+         CALL DLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward',
+     $      'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I),
+     $      LDA, A(I,I+IB), LDA)
 *
-*     Use unblocked code for the last or only block.
+*        Apply H to rows i:m of current block
 *
-      IF( KK.LT.N )
-     $   CALL DORG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
-     $                TAU( KK+1 ), WORK, IINFO )
+         CALL DORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, IINFO)
+         DO I = KI + 1, 1, -NB
+            IB = NB
 *
-      IF( KK.GT.0 ) THEN
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-*        Use blocked code
+            CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+     $         LDA, TAU(I), A(I,I), LDA)
 *
-         DO 50 I = KI + 1, 1, -NB
-            IB = MIN( NB, K-I+1 )
-            IF( I+IB.LE.N ) THEN
+*           Apply H to A(i:m,i+ib:n) from the left
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i) H(i+1) . . . H(i+ib-1)
+            CALL DLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $         'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+     $         LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
 *
-               CALL DLARFT( 'Forward', 'Columnwise', M-I+1, IB,
-     $                      A( I, I ), LDA, TAU( I ), WORK, LDWORK )
+*           Apply H to rows i:m of current block
 *
-*              Apply H to A(i:m,i+ib:n) from the left
+            CALL DORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
+     $         IINFO)
+         END DO
 *
-               CALL DLARFB( 'Left', 'No transpose', 'Forward',
-     $                      'Columnwise', M-I+1, N-I-IB+1, IB,
-     $                      A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ),
-     $                      LDA, WORK( IB+1 ), LDWORK )
-            END IF
+*        This checks for if K was a perfect multiple of NB
+*        so that we only have a special case for the last block when
+*        necessary
 *
-*           Apply H to rows i:m of current block
+         IF(I.LT.1) THEN
+            IB = I + NB - 1
+            I = 1
 *
-            CALL DORG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ),
-     $                   WORK,
-     $                   IINFO )
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-*           Set rows 1:i-1 of current block to zero
+            CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+     $         LDA, TAU(I), A(I,I), LDA)
 *
-            DO 40 J = I, I + IB - 1
-               DO 30 L = 1, I - 1
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+*           Apply H to A(i:m,i+ib:n) from the left
+*
+            CALL DLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $         'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+     $         LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
+*
+*           Apply H to rows i:m of current block
+*
+            CALL DORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
+     $         IINFO)
+         END IF
       END IF
 *
       WORK( 1 ) = IWS
diff --git a/SRC/dorgrq.f b/SRC/dorgrq.f
index 126489bbe..811c2cdb5 100644
--- a/SRC/dorgrq.f
+++ b/SRC/dorgrq.f
@@ -5,6 +5,7 @@
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
+*> \htmlonly
 *> Download DORGRQ + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dorgrq.f">
 *> [TGZ]</a>
@@ -12,6 +13,7 @@
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dorgrq.f">
 *> [TXT]</a>
+*> \endhtmlonly
 *
 *  Definition:
 *  ===========
@@ -137,17 +139,13 @@ SUBROUTINE DORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  =====================================================================
 *
-*     .. Parameters ..
-      DOUBLE PRECISION   ZERO
-      PARAMETER          ( ZERO = 0.0D+0 )
-*     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, II, IINFO, IWS, J, KK, L, LDWORK,
+      INTEGER            I, IB, II, IINFO, IWS, KK, LDWORK,
      $                   LWKOPT, NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLARFB, DLARFT, DORGR2, XERBLA
+      EXTERNAL           DLARFB0C2, DLARFT, DORGR2, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -227,63 +225,70 @@ SUBROUTINE DORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*        Use blocked code after the first block.
-*        The last kk rows are handled by the block method.
-*
-         KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
+*        We want to use the blocking method as long as our matrix is big enough
+*        and it's deemed worthwhile with the extra memory allocations
 *
-*        Set A(1:m-kk,n-kk+1:n) to zero.
-*
-         DO 20 J = N - KK + 1, N
-            DO 10 I = 1, M - KK
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
+         KK = K
       ELSE
          KK = 0
       END IF
 *
-*     Use unblocked code for the first or only block.
+*     Potentially bail to the unblocked code
 *
-      CALL DORGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+      IF( KK.EQ.0 ) THEN
+         CALL DORGR2( M, N, K, A, LDA, TAU, WORK, IINFO )
+      END IF
 *
       IF( KK.GT.0 ) THEN
 *
-*        Use blocked code
+*        Factor the first block assuming that our first application
+*        will be on the Identity matrix
 *
-         DO 50 I = K - KK + 1, K, NB
-            IB = MIN( NB, K-I+1 )
-            II = M - K + I
-            IF( II.GT.1 ) THEN
+         I = 1
+         IB = NB
+         II = M - K + I
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i+ib-1) . . . H(i+1) H(i)
+*        Form the triangular factor of the block reflector
+*        H = H(i+ib-1) . . . H(i+1) H(i)
 *
-               CALL DLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
-     $                      A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+         CALL DLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+     $                A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
 *
-*              Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+*        Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
 *
-               CALL DLARFB( 'Right', 'Transpose', 'Backward',
-     $                      'Rowwise',
-     $                      II-1, N-K+I+IB-1, IB, A( II, 1 ), LDA, WORK,
-     $                      LDWORK, A, LDA, WORK( IB+1 ), LDWORK )
-            END IF
+         CALL DLARFB0C2(.TRUE., 'Right', 'Transpose', 'Backward', 
+     $         'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK,
+     $         LDWORK, A, LDA)
 *
 *           Apply H**T to columns 1:n-k+i+ib-1 of current block
 *
-            CALL DORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
-     $                   TAU( I ),
-     $                   WORK, IINFO )
+         CALL DORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
+     $                TAU( I ), WORK, IINFO )
+
+         DO I = NB + 1, K, NB
+*
+*           The last block may be less than size NB
 *
-*           Set columns n-k+i+ib:n of current block to zero
+            IB = MIN(NB, K-I+1)
+            II = M - K + I
+*
+*           Form the triangular factor of the block reflector
+*           H = H(i+ib-1) . . . H(i+1) H(i)
+*
+            CALL DLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+     $                   A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+*
+*           Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
 *
-            DO 40 L = N - K + I + IB, N
-               DO 30 J = II, II + IB - 1
-                  A( J, L ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+            CALL DLARFB0C2(.FALSE., 'Right', 'Transpose', 
+     $            'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, 
+     $            A(II,1), LDA, WORK, LDWORK, A, LDA)
+*
+*           Apply H**T to columns 1:n-k+i+ib-1 of current block
+*
+            CALL DORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
+     $                   TAU( I ), WORK, IINFO )
+         END DO
       END IF
 *
       WORK( 1 ) = IWS
diff --git a/SRC/lapack_64.h b/SRC/lapack_64.h
index e8000bf2c..e8d792094 100644
--- a/SRC/lapack_64.h
+++ b/SRC/lapack_64.h
@@ -331,6 +331,7 @@
 #define CLARF1F CLARF1F_64
 #define CLARF1L CLARF1L_64
 #define CLARFB CLARFB_64
+#define CLARFB0C2 CLARFB0C2_64
 #define CLARFB_GETT CLARFB_GETT_64
 #define CLARFG CLARFG_64
 #define CLARFGP CLARFGP_64
@@ -805,6 +806,7 @@
 #define DLARF1F DLARF1F_64
 #define DLARF1L DLARF1L_64
 #define DLARFB DLARFB_64
+#define DLARFB0C2 DLARFB0C2_64
 #define DLARFB_GETT DLARFB_GETT_64
 #define DLARFG DLARFG_64
 #define DLARFGP DLARFGP_64
@@ -1400,6 +1402,7 @@
 #define SLARF1F SLARF1F_64
 #define SLARF1L SLARF1L_64
 #define SLARFB SLARFB_64
+#define SLARFB0C2 SLARFB0C2_64
 #define SLARFB_GETT SLARFB_GETT_64
 #define SLARFG SLARFG_64
 #define SLARFGP SLARFGP_64
@@ -2046,6 +2049,7 @@
 #define ZLARF1F ZLARF1F_64
 #define ZLARF1L ZLARF1L_64
 #define ZLARFB ZLARFB_64
+#define ZLARFB0C2 ZLARFB0C2_64
 #define ZLARFB_GETT ZLARFB_GETT_64
 #define ZLARFG ZLARFG_64
 #define ZLARFGP ZLARFGP_64
diff --git a/SRC/slarfb0c2.f b/SRC/slarfb0c2.f
new file mode 100644
index 000000000..e3c222904
--- /dev/null
+++ b/SRC/slarfb0c2.f
@@ -0,0 +1,556 @@
+*> \brief \b SLARFB0C2 applies a block reflector or its transpose to a
+* rectangular matrix with a 0 block while constructing the explicit Q factor
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*
+*  Definition:
+*  ===========
+*
+*     SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+*    $                     K, V, LDV, T, LDT, C, LDC)
+*        ! Scalar arguments
+*        INTEGER           M, N, K, LDV, LDC, LDT
+*        CHARACTER         SIDE, TRANS, DIRECT, STOREV 
+*        ! True means that we are assuming C2 is the identity matrix
+*        !     and thus don't reference whatever is present in C2 
+*        !     at the beginning.
+*        LOGICAL           C2I
+*        ! Array arguments
+*        REAL              V(LDV,*), C(LDC,*), T(LDT,*)
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SLARFB0C2 applies a real block reflector H or its transpose H**T to a
+*> real m by n matrix C with a 0 block, while computing the explicit Q factor
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] C2I
+*> \verbatim
+*>          C2I is LOGICAL
+*>          = .TRUE.: Assume the nonzero block of C is the identity matrix
+*>          = .FALSE.: Use existing data in the nonzero block of C
+*> \endverbatim
+*>
+*> \param[in] SIDE
+*> \verbatim
+*>          SIDE is CHARACTER*1
+*>          = 'L': apply H or H**T from the Left
+*>          = 'R': apply H or H**T from the Right
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          = 'N': apply H (No transpose)
+*>          = 'T': apply H**T (Transpose)
+*> \endverbatim
+*>
+*> \param[in] DIRECT
+*> \verbatim
+*>          DIRECT is CHARACTER*1
+*>          Indicates how H is formed from a product of elementary
+*>          reflectors
+*>          = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*>          = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*>          STOREV is CHARACTER*1
+*>          Indicates how the vectors which define the elementary
+*>          reflectors are stored:
+*>          = 'C': Columnwise
+*>          = 'R': Rowwise
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>          The order of the matrix T (= the number of elementary
+*>          reflectors whose product defines the block reflector).
+*>          If SIDE = 'L', M >= K >= 0;
+*>          if SIDE = 'R', N >= K >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is REAL array, dimension
+*>                                (LDV,K) if STOREV = 'C'
+*>                                (LDV,M) if STOREV = 'R' and SIDE = 'L'
+*>                                (LDV,N) if STOREV = 'R' and SIDE = 'R'
+*>          The matrix V. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V.
+*>          If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+*>          if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+*>          if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*>          T is REAL array, dimension (LDT,K)
+*>          The triangular k by k matrix T in the representation of the
+*>          block reflector.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is REAL array, dimension (LDC,N)
+*>          On entry, the m by n matrix C.
+*>          On exit, C is overwritten by H*C or H**T*C or C*H or C*H**T.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>          The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup larfb
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  The shape of the matrix V and the storage of the vectors which define
+*>  the H(i) is best illustrated by the following example with n = 5 and
+*>  k = 3. The triangular part of V (including its diagonal) is not
+*>  referenced.
+*>
+*>  DIRECT = 'F' and STOREV = 'C':         DIRECT = 'F' and STOREV = 'R':
+*>
+*>               V = (  1       )                 V = (  1 v1 v1 v1 v1 )
+*>                   ( v1  1    )                     (     1 v2 v2 v2 )
+*>                   ( v1 v2  1 )                     (        1 v3 v3 )
+*>                   ( v1 v2 v3 )
+*>                   ( v1 v2 v3 )
+*>
+*>  DIRECT = 'B' and STOREV = 'C':         DIRECT = 'B' and STOREV = 'R':
+*>
+*>               V = ( v1 v2 v3 )                 V = ( v1 v1  1       )
+*>                   ( v1 v2 v3 )                     ( v2 v2 v2  1    )
+*>                   (  1 v2 v3 )                     ( v3 v3 v3 v3  1 )
+*>                   (     1 v3 )
+*>                   (        1 )
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+     $                     K, V, LDV, T, LDT, C, LDC)
+         ! Scalar arguments
+         INTEGER           M, N, K, LDV, LDC, LDT
+         CHARACTER         SIDE, TRANS, DIRECT, STOREV 
+         ! True means that we are assuming C2 is the identity matrix
+         !     and thus don't reference whatever is present in C2 
+         !     at the beginning.
+         LOGICAL           C2I
+
+         ! Array arguments
+         REAL              V(LDV,*), C(LDC,*), T(LDT,*)
+         ! Local scalars
+         LOGICAL           QR, LQ, QL, DIRF, COLV, SIDEL, SIDER,
+     $                     TRANST
+         INTEGER           I, J
+         ! External functions
+         LOGICAL           LSAME
+         EXTERNAL          LSAME
+         ! External Subroutines
+         EXTERNAL          SGEMM, STRMM, XERBLA
+         ! Parameters
+         REAL              ONE, ZERO, NEG_ONE
+         PARAMETER(ONE=1.0E+0, ZERO = 0.0E+0, NEG_ONE = -1.0E+0)
+
+         ! Beginning of executable statements
+         ! Convert our character flags to logical values
+         DIRF = LSAME(DIRECT,'F')
+         COLV = LSAME(STOREV,'C')
+         SIDEL = LSAME(SIDE,'L')
+         SIDER = LSAME(SIDE,'R')
+         TRANST = LSAME(TRANS,'T')
+
+         ! Determine which of the 4 modes are using.
+         ! QR is when we store the reflectors column by column and have the
+         ! 'first' reflector stored in the first column
+         QR = DIRF.AND.COLV
+
+         ! LQ is when we store the reflectors row by row and have the
+         ! 'first' reflector stored in the first row
+         LQ = DIRF.AND.(.NOT.COLV)
+
+         ! QL is when we store the reflectors column by column and have the
+         ! 'first' reflector stored in the last column
+         QL = (.NOT.DIRF).AND.COLV
+
+         ! RQ is when we store the reflectors row by row and have the
+         ! 'first' reflector stored in the last row
+         ! RQ = (.NOT.DIRF).AND.(.NOT.COLV)
+         ! Since we have exactly one of these 4 modes, we don't need to actually
+         ! store the value of RQ, instead we assume this is the case if we fail
+         ! the above 3 checks.
+
+         IF (QR) THEN
+            ! We are computing C = HC = (I - VTV')C
+            ! Where: V = [ V1 ] and C = [ C1 ]
+            !            [ V2 ]         [ C2 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{M-K\times K}
+            !     C1=0\in\R^{K\times N}
+            !     C2\in\R^{M-K\times N}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0  ]
+            !               [ V2 ]             [ C2 ]
+            !   = [ 0  ] - [ V1 ]T*V2'*C2
+            !     [ C2 ]   [ V2 ]
+            !
+            !   = [ 0  ] - [ V1*T*V2'*C2 ]
+            !     [ C2 ]   [ V2*T*V2'*C2 ]
+            !
+            !   = [      V1*T*V2'*C2 ]
+            !     [ C2 - V2*T*V2'*C2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = V2'*C2
+            ! C1 = T*C1
+            ! C2 = C2 - V2*C1
+            ! C1 = -V1*C1
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDEL ) THEN
+               CALL XERBLA('SLARFB0C2', 2)
+               RETURN
+            ELSE IF(TRANST) THEN
+               CALL XERBLA('SLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = V2'*C2
+            !
+            IF (C2I) THEN
+               DO J = 1, N
+                  DO I = 1, K
+                     C(I,J) = V(K+J,I)
+                  END DO
+               END DO
+            ELSE
+               CALL SGEMM('Transpose', 'No Transpose', K, N, M - K,
+     $                     ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO,
+     $                     C, LDC)
+            END IF
+            !
+            ! C1 = T*C1
+            !
+            CALL STRMM('Left', 'Upper', 'No Transpose', 'Non-unit',
+     $                  K, N, ONE, T, LDT, C, LDC)
+            !
+            ! C2 = C2 - V2*C1 = -V2*C1 + C2
+            !
+            IF (C2I) THEN
+               CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $                     NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO,
+     $                     C(K+1,1), LDC)
+               DO I = 1, N
+                  C(K+I,I) = C(K+I,I) + ONE
+               END DO
+            ELSE
+               CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $                     NEG_ONE, V(K+1,1), LDV, C, LDC, ONE,
+     $                     C(K+1,1), LDC)
+            END IF
+            !
+            ! C1 = -V1*C1
+            !
+            CALL STRMM('Left', 'Lower', 'No Transpose', 'Unit',
+     $                  K, N, NEG_ONE, V, LDV, C, LDC)
+         ELSE IF (LQ) THEN
+            ! We are computing C = CH' = C(I-V'T'V)
+            ! Where: V = [ V1 V2 ] and C = [ C1 C2 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{K\times N-K}
+            !     C1=0\in\R^{M\times K}
+            !     C2\in\R^{M\times N-K}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ])
+            !                         [ V2' ]
+            !
+            !   = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ]
+            !                          [ V2' ]
+            !
+            !   = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ]
+            !
+            !   = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = C2*V2'
+            ! C1 = C1*T'
+            ! C2 = C2 - C1*V2
+            ! C1 = -C1*V1
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDER ) THEN
+               CALL XERBLA('SLARFB0C2', 2)
+               RETURN
+            ELSE IF(.NOT.TRANST) THEN
+               CALL XERBLA('SLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = C2*V2'
+            !
+            IF( C2I ) THEN
+               DO J = 1, K
+                  DO I = 1, M
+                     C(I,J) = V(J,K+I)
+                  END DO
+               END DO
+            ELSE
+               CALL SGEMM('No Transpose', 'Transpose', M, K, N-K,
+     $               ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C,
+     $               LDC)
+            END IF
+            !
+            ! C1 = C1*T'
+            !
+            CALL STRMM('Right', 'Upper', 'Transpose', 'Non-unit',
+     $            M, K, ONE, T, LDT, C, LDC)
+            !
+            ! C2 = C2 - C1*V2 = -C1*V2 + C2
+            !
+            IF( C2I ) THEN
+               CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $               NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1),
+     $               LDC)
+               DO I = 1, M
+                  C(I,K+I) = C(I,K+I) + ONE
+               END DO
+            ELSE
+               CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $               NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1),
+     $               LDC)
+            END IF
+            !
+            ! C1 = -C1*V1
+            !
+            CALL STRMM('Right', 'Upper', 'No Transpose', 'Unit',
+     $            M, K, NEG_ONE, V, LDV, C, LDC)
+         ELSE IF (QL) THEN
+            ! We are computing C = HC = (I - VTV')C
+            ! Where: V = [ V2 ] and C = [ C2 ]
+            !            [ V1 ]         [ C1 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{M-K\times K}
+            !     C1=0\in\R^{K\times N}
+            !     C2\in\R^{M-K\times N}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ]
+            !             [ V1 ]             [ 0  ]
+            !
+            !   = [ C2 ] - [ V2 ]T*V2'*C2
+            !     [ 0  ]   [ V1 ]
+            !
+            !   = [ C2 ] - [ V2*T*V2'*C2 ]
+            !     [ 0  ]   [ V1*T*V2'*C2 ]
+            !
+            !   = [ C2 - V2*T*V2'*C2 ]
+            !     [    - V1*T*V2'*C2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = V2'*C2
+            ! C1 = T*C1
+            ! C2 = C2 - V2*C1
+            ! C1 = -V1*C1
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDEL ) THEN
+               CALL XERBLA('SLARFB0C2', 2)
+               RETURN
+            ELSE IF(TRANST) THEN
+               CALL XERBLA('SLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = V2'*C2
+            !
+            IF( C2I ) THEN
+               DO J = 1, N
+                  DO I = 1, K
+                     C(M-K+I,J) = V(J,I)
+                  END DO
+               END DO
+            ELSE
+               CALL SGEMM('Transpose', 'No Transpose', K, N, M-K,
+     $            ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC)
+            END IF
+            !
+            ! C1 = T*C1
+            !
+            CALL STRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+     $         K, N, ONE, T, LDT, C(M-K+1,1), LDC)
+            !
+            ! C2 = C2 - V2*C1 = -V2*C1 + C2
+            !
+            IF( C2I ) THEN
+               CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $            NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC)
+               DO I = 1, N
+                  C(I,I) = C(I,I) + ONE
+               END DO
+            ELSE
+               CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $            NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC)
+            END IF
+            !
+            ! C1 = -V1*C1
+            !
+            CALL STRMM('Left', 'Upper', 'No Transpose', 'Unit',
+     $         K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC)
+         ELSE ! IF (RQ) THEN
+            ! We are computing C = CH' = C(I-V'T'V)
+            ! Where: V = [ V2 V1] and C = [ C2 C1 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{K\times N-K}
+            !     C1=0\in\R^{M\times K}
+            !     C2\in\R^{M\times N-K}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ]
+            !                          [ V1' ]
+            !
+            !   = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ]
+            !                           [ V1' ]
+            !
+            !   = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ]
+            !
+            !   = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ]
+            !
+            !   = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = C2*V2'
+            ! C1 = C1*T'
+            ! C2 = C2 - C1*V2
+            ! C1 = -C1*V1
+            !
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDER ) THEN
+               CALL XERBLA('SLARFB0C2', 2)
+               RETURN
+            ELSE IF(.NOT.TRANST) THEN
+               CALL XERBLA('SLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = C2*V2'
+            !
+            IF( C2I ) THEN
+               DO J = 1, K
+                  DO I = 1, M
+                     C(I,N-K+J) = V(J,I)
+                  END DO
+               END DO
+            ELSE
+               CALL SGEMM('No Transpose', 'Transpose', M, K, N-K,
+     $            ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC)
+            END IF
+            !
+            ! C1 = C1*T'
+            !
+            CALL STRMM('Right', 'Lower', 'Transpose', 'Non-unit',
+     $         M, K, ONE, T, LDT, C(1, N-K+1), LDC)
+            !
+            ! C2 = C2 - C1*V2 = -C1*V2 + C2
+            !
+            IF( C2I ) THEN
+               CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $            NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC)
+               DO I = 1, M
+                  C(I,I) = C(I,I) + ONE
+               END DO
+            ELSE
+               CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $            NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC)
+            END IF
+            !
+            ! C1 = -C1*V1
+            !
+            CALL STRMM('Right', 'Lower', 'No Transpose', 'Unit',
+     $         M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC)
+         END IF
+      END SUBROUTINE
diff --git a/SRC/sorglq.f b/SRC/sorglq.f
index fb5ef79cc..574b3ae2c 100644
--- a/SRC/sorglq.f
+++ b/SRC/sorglq.f
@@ -146,7 +146,7 @@ SUBROUTINE SORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
      $                   LWKOPT, NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SLARFB, SLARFT, SORGL2, XERBLA
+      EXTERNAL           SLARFB0C2, SLARFT, SORGL2, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -218,67 +218,90 @@ SUBROUTINE SORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*        Use blocked code after the last block.
-*        The first kk rows are handled by the block method.
+*        Handle the first block assuming we are applying to the
+*        identity, then resume regular blocking method after
 *
-         KI = ( ( K-NX-1 ) / NB )*NB
-         KK = MIN( K, KI+NB )
-*
-*        Set A(kk+1:m,1:kk) to zero.
-*
-         DO 20 J = 1, KK
-            DO 10 I = KK + 1, M
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
+         KI = K - 2 * NB
+         KK = K - NB
       ELSE
          KK = 0
       END IF
 *
-*     Use unblocked code for the last or only block.
+*     Potentially bail to the unblocked version
 *
-      IF( KK.LT.M )
-     $   CALL SORGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
-     $                TAU( KK+1 ), WORK, IINFO )
+      IF( KK.EQ.0 ) THEN
+         CALL SORGL2( M, N, K, A, LDA, TAU, WORK, IINFO )
+      END IF
 *
       IF( KK.GT.0 ) THEN
+         I = KK + 1
+         IB = NB
+*
+*        Form the triangular factor of the block reflector
+*        H = H(i) H(i+1) . . . H(i+ib-1)
+*
+         CALL SLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $               LDA, TAU( I ), WORK, LDWORK )
+*
+*        Apply H**T to A(i+ib:m,i:n) from the right
+*
+         CALL SLARFB0C2(.TRUE., 'Right', 'Transpose', 'Forward',
+     $         'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $         LDWORK, A(I+IB,I), LDA)
+*
+*        Apply H**T to columns i:n of current block
+
+         CALL SORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+     $                WORK, IINFO )
 *
 *        Use blocked code
 *
-         DO 50 I = KI + 1, 1, -NB
-            IB = MIN( NB, K-I+1 )
-            IF( I+IB.LE.M ) THEN
+         DO I = KI + 1, 1, -NB
+            IB = NB
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i) H(i+1) . . . H(i+ib-1)
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-               CALL SLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I,
-     $                      I ),
-     $                      LDA, TAU( I ), WORK, LDWORK )
+            CALL SLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $                  LDA, TAU( I ), WORK, LDWORK )
 *
-*              Apply H**T to A(i+ib:m,i:n) from the right
+*           Apply H**T to A(i+ib:m,i:n) from the right
 *
-               CALL SLARFB( 'Right', 'Transpose', 'Forward',
-     $                      'Rowwise',
-     $                      M-I-IB+1, N-I+1, IB, A( I, I ), LDA, WORK,
-     $                      LDWORK, A( I+IB, I ), LDA, WORK( IB+1 ),
-     $                      LDWORK )
-            END IF
+            CALL SLARFB0C2(.FALSE., 'Right', 'Transpose', 'Forward',
+     $            'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $            LDWORK, A(I+IB,I), LDA)
 *
 *           Apply H**T to columns i:n of current block
-*
+
             CALL SORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
-     $                   WORK,
-     $                   IINFO )
+     $                   WORK, IINFO )
+         END DO
+*
+*        This checks for if K was a perfect multiple of NB
+*        so that we only have a special case for the last block when
+*        necessary
+*
+         IF(I.LT.1) THEN
+            IB = I + NB - 1
+            I = 1
+*
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
+*
+            CALL SLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $                  LDA, TAU( I ), WORK, LDWORK )
 *
-*           Set columns 1:i-1 of current block to zero
+*           Apply H**T to A(i+ib:m,i:n) from the right
 *
-            DO 40 J = 1, I - 1
-               DO 30 L = I, I + IB - 1
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+            CALL SLARFB0C2(.FALSE., 'Right', 'Transpose', 'Forward',
+     $            'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $            LDWORK, A(I+IB,I), LDA)
+*
+*           Apply H**T to columns i:n of current block
+
+            CALL SORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+     $                   WORK, IINFO )
+         END IF
       END IF
 *
       WORK( 1 ) = SROUNDUP_LWORK(IWS)
diff --git a/SRC/sorgql.f b/SRC/sorgql.f
index 7257ff2d9..c9b2918b7 100644
--- a/SRC/sorgql.f
+++ b/SRC/sorgql.f
@@ -95,8 +95,6 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK. LWORK >= max(1,N).
-*>          For optimum performance LWORK >= N*NB, where NB is the
-*>          optimal blocksize.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -143,11 +141,11 @@ SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT,
-     $                   NB, NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, J, KK, LWKOPT, NB,
+     $                   NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SLARFB, SLARFT, SORG2L, XERBLA
+      EXTERNAL           SLARFB0C2, SLARFT, SORG2L, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -178,7 +176,8 @@ SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
             LWKOPT = 1
          ELSE
             NB = ILAENV( 1, 'SORGQL', ' ', M, N, K, -1 )
-            LWKOPT = N*NB
+            ! Only need a workspace for calls to dorg2l
+            LWKOPT = N
          END IF
          WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
@@ -201,88 +200,76 @@ SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
       END IF
 *
       NBMIN = 2
-      NX = 0
+      NX = MAX(0, ILAENV(3, 'SORGQL', ' ', M, N, K, -1))
       IWS = N
-      IF( NB.GT.1 .AND. NB.LT.K ) THEN
 *
-*        Determine when to cross over from blocked to unblocked code.
-*
-         NX = MAX( 0, ILAENV( 3, 'SORGQL', ' ', M, N, K, -1 ) )
-         IF( NX.LT.K ) THEN
+      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*           Determine if workspace is large enough for blocked code.
+*        We want to use the blocking method as long as our matrix is big enough
 *
-            LDWORK = N
-            IWS = LDWORK*NB
-            IF( LWORK.LT.IWS ) THEN
+         KK = K
+      ELSE
+         KK = 0
+      END IF
 *
-*              Not enough workspace to use optimal NB:  reduce NB and
-*              determine the minimum value of NB.
+*     Possibly bail to the unblocked code.
 *
-               NB = LWORK / LDWORK
-               NBMIN = MAX( 2, ILAENV( 2, 'SORGQL', ' ', M, N, K,
-     $                      -1 ) )
-            END IF
-         END IF
+      IF( KK.EQ.0 ) THEN
+         CALL SORG2L( M, N, K, A, LDA, TAU, WORK, IINFO )
       END IF
 *
-      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+      IF( KK.GT.0 ) THEN
 *
-*        Use blocked code after the first block.
-*        The last kk columns are handled by the block method.
+*        Factor the first block assuming that our first application
+*        will be on the Identity matrix
 *
-         KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
+         I = 1
+         IB = NB
 *
-*        Set A(m-kk+1:m,1:n-kk) to zero.
+*        Form the triangular factor of the block reflector
+*        H = H(i+ib-1) . . . H(i+1) H(i)
 *
-         DO 20 J = 1, N - KK
-            DO 10 I = M - KK + 1, M
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
-      ELSE
-         KK = 0
-      END IF
+         CALL SLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+     $                  A( 1, N-K+I ), LDA, TAU( I ),
+     $                  A( M-K+I, N-K+I ), LDA)
 *
-*     Use unblocked code for the first or only block.
+*        Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
 *
-      CALL SORG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+         CALL SLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward', 
+     $         'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), 
+     $         LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
 *
-      IF( KK.GT.0 ) THEN
+*        Apply H to rows 1:m-k+i+ib-1 of current block
 *
-*        Use blocked code
+         CALL SORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
+     $                TAU( I ), WORK, IINFO )
+
+*        Use blocked code on the remaining blocks if there are any.
 *
-         DO 50 I = K - KK + 1, K, NB
-            IB = MIN( NB, K-I+1 )
-            IF( N-K+I.GT.1 ) THEN
+         DO I = NB+1, K, NB
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i+ib-1) . . . H(i+1) H(i)
+*           The last block may be less than size NB
 *
-               CALL SLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
-     $                      A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK )
+            IB = MIN(NB, K-I+1)
 *
-*              Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+*           Form the triangular factor of the block reflector
+*           H = H(i+ib-1) . . . H(i+1) H(i)
 *
-               CALL SLARFB( 'Left', 'No transpose', 'Backward',
-     $                      'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
-     $                      A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA,
-     $                      WORK( IB+1 ), LDWORK )
-            END IF
+            CALL SLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+     $                  A( 1, N-K+I ), LDA, TAU( I ), 
+     $                  A( M-K+I, N-K+I ), LDA )
+*
+*           Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+*
+            CALL SLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $            'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, 
+     $            A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
 *
 *           Apply H to rows 1:m-k+i+ib-1 of current block
 *
             CALL SORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
      $                   TAU( I ), WORK, IINFO )
-*
-*           Set rows m-k+i+ib:m of current block to zero
-*
-            DO 40 J = N - K + I, N - K + I + IB - 1
-               DO 30 L = M - K + I + IB, M
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+         END DO
       END IF
 *
       WORK( 1 ) = SROUNDUP_LWORK(IWS)
diff --git a/SRC/sorgqr.f b/SRC/sorgqr.f
index 47d1fd248..070257c46 100644
--- a/SRC/sorgqr.f
+++ b/SRC/sorgqr.f
@@ -95,8 +95,6 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK. LWORK >= max(1,N).
-*>          For optimum performance LWORK >= N*NB, where NB is the
-*>          optimal blocksize.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -147,7 +145,7 @@ SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
      $                   LWKOPT, NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SLARFB, SLARFT, SORG2R, XERBLA
+      EXTERNAL           SLARFB0C2, SLARFT, SORG2R, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -163,7 +161,7 @@ SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       INFO = 0
       NB = ILAENV( 1, 'SORGQR', ' ', M, N, K, -1 )
-      LWKOPT = MAX( 1, N )*NB
+      LWKOPT = MAX( 1, N )
       WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
@@ -194,92 +192,92 @@ SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
       NBMIN = 2
       NX = 0
       IWS = N
-      IF( NB.GT.1 .AND. NB.LT.K ) THEN
 *
-*        Determine when to cross over from blocked to unblocked code.
+      IF( NB.GE.NBMIN .AND. NB.LT.K ) THEN
 *
-         NX = MAX( 0, ILAENV( 3, 'SORGQR', ' ', M, N, K, -1 ) )
-         IF( NX.LT.K ) THEN
+*        Handle the first block assuming we are applying to the
+*        identity, then resume regular blocking method after
 *
-*           Determine if workspace is large enough for blocked code.
-*
-            LDWORK = N
-            IWS = LDWORK*NB
-            IF( LWORK.LT.IWS ) THEN
+         KI = K - 2 * NB
+         KK = K - NB
+      ELSE
+         KK = 0
+      END IF
 *
-*              Not enough workspace to use optimal NB:  reduce NB and
-*              determine the minimum value of NB.
+*     Potentially bail to the unblocked code.
 *
-               NB = LWORK / LDWORK
-               NBMIN = MAX( 2, ILAENV( 2, 'SORGQR', ' ', M, N, K,
-     $                      -1 ) )
-            END IF
-         END IF
+      IF( KK.EQ.0 ) THEN
+         CALL SORG2R( M, N, K, A, LDA, TAU, WORK, IINFO )
       END IF
 *
-      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+      IF( KK.GT.0 ) THEN
+         I = KK + 1
+         IB = NB
 *
-*        Use blocked code after the last block.
-*        The first kk columns are handled by the block method.
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-         KI = ( ( K-NX-1 ) / NB )*NB
-         KK = MIN( K, KI+NB )
+         CALL SLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+     $                     LDA, TAU(I), A(I,I), LDA)
 *
-*        Set A(1:kk,kk+1:n) to zero.
+*           Apply H to A(i:m,i+ib:n) from the left
 *
-         DO 20 J = KK + 1, N
-            DO 10 I = 1, KK
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
-      ELSE
-         KK = 0
-      END IF
+         CALL SLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward',
+     $      'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I),
+     $      LDA, A(I,I+IB), LDA)
 *
-*     Use unblocked code for the last or only block.
+*        Apply H to rows i:m of current block
 *
-      IF( KK.LT.N )
-     $   CALL SORG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
-     $                TAU( KK+1 ), WORK, IINFO )
+         CALL SORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, IINFO)
+         DO I = KI + 1, 1, -NB
+            IB = NB
 *
-      IF( KK.GT.0 ) THEN
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-*        Use blocked code
+            CALL SLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+     $         LDA, TAU(I), A(I,I), LDA)
 *
-         DO 50 I = KI + 1, 1, -NB
-            IB = MIN( NB, K-I+1 )
-            IF( I+IB.LE.N ) THEN
+*           Apply H to A(i:m,i+ib:n) from the left
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i) H(i+1) . . . H(i+ib-1)
+            CALL SLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $         'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+     $         LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
 *
-               CALL SLARFT( 'Forward', 'Columnwise', M-I+1, IB,
-     $                      A( I, I ), LDA, TAU( I ), WORK, LDWORK )
+*           Apply H to rows i:m of current block
 *
-*              Apply H to A(i:m,i+ib:n) from the left
+            CALL SORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
+     $         IINFO)
+         END DO
 *
-               CALL SLARFB( 'Left', 'No transpose', 'Forward',
-     $                      'Columnwise', M-I+1, N-I-IB+1, IB,
-     $                      A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ),
-     $                      LDA, WORK( IB+1 ), LDWORK )
-            END IF
+*        This checks for if K was a perfect multiple of NB
+*        so that we only have a special case for the last block when
+*        necessary
 *
-*           Apply H to rows i:m of current block
+         IF(I.LT.1) THEN
+            IB = I + NB - 1
+            I = 1
 *
-            CALL SORG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ),
-     $                   WORK,
-     $                   IINFO )
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-*           Set rows 1:i-1 of current block to zero
+            CALL SLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+     $         LDA, TAU(I), A(I,I), LDA)
 *
-            DO 40 J = I, I + IB - 1
-               DO 30 L = 1, I - 1
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
-      END IF
+*           Apply H to A(i:m,i+ib:n) from the left
+*
+            CALL SLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $         'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+     $         LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
 *
+*           Apply H to rows i:m of current block
+*
+            CALL SORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
+     $         IINFO)
+         END IF
+      END IF
       WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
diff --git a/SRC/sorgrq.f b/SRC/sorgrq.f
index 5fe685c1d..ea1a4e73d 100644
--- a/SRC/sorgrq.f
+++ b/SRC/sorgrq.f
@@ -147,7 +147,7 @@ SUBROUTINE SORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
      $                   LWKOPT, NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SLARFB, SLARFT, SORGR2, XERBLA
+      EXTERNAL           SLARFB0C2, SLARFT, SORGR2, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -228,63 +228,70 @@ SUBROUTINE SORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*        Use blocked code after the first block.
-*        The last kk rows are handled by the block method.
+*        We want to use the blocking method as long as our matrix is big enough
+*        and it's deemed worthwhile with the extra memory allocations
 *
-         KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
-*
-*        Set A(1:m-kk,n-kk+1:n) to zero.
-*
-         DO 20 J = N - KK + 1, N
-            DO 10 I = 1, M - KK
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
+         KK = K
       ELSE
          KK = 0
       END IF
 *
-*     Use unblocked code for the first or only block.
+*     Potentially bail to the unblocked code
 *
-      CALL SORGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+      IF( KK.EQ.0 ) THEN
+         CALL SORGR2( M, N, K, A, LDA, TAU, WORK, IINFO )
+      END IF
 *
       IF( KK.GT.0 ) THEN
 *
-*        Use blocked code
+*        Factor the first block assuming that our first application
+*        will be on the Identity matrix
 *
-         DO 50 I = K - KK + 1, K, NB
-            IB = MIN( NB, K-I+1 )
-            II = M - K + I
-            IF( II.GT.1 ) THEN
+         I = 1
+         IB = NB
+         II = M - K + I
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i+ib-1) . . . H(i+1) H(i)
+*        Form the triangular factor of the block reflector
+*        H = H(i+ib-1) . . . H(i+1) H(i)
 *
-               CALL SLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
-     $                      A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+         CALL SLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+     $                A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
 *
-*              Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+*        Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
 *
-               CALL SLARFB( 'Right', 'Transpose', 'Backward',
-     $                      'Rowwise',
-     $                      II-1, N-K+I+IB-1, IB, A( II, 1 ), LDA, WORK,
-     $                      LDWORK, A, LDA, WORK( IB+1 ), LDWORK )
-            END IF
+         CALL SLARFB0C2(.TRUE., 'Right', 'Transpose', 'Backward', 
+     $         'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK,
+     $         LDWORK, A, LDA)
 *
 *           Apply H**T to columns 1:n-k+i+ib-1 of current block
 *
-            CALL SORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
-     $                   TAU( I ),
-     $                   WORK, IINFO )
+         CALL SORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
+     $                TAU( I ), WORK, IINFO )
+
+         DO I = NB + 1, K, NB
 *
-*           Set columns n-k+i+ib:n of current block to zero
+*           The last block may be less than size NB
+*
+            IB = MIN(NB, K-I+1)
+            II = M - K + I
 *
-            DO 40 L = N - K + I + IB, N
-               DO 30 J = II, II + IB - 1
-                  A( J, L ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+*           Form the triangular factor of the block reflector
+*           H = H(i+ib-1) . . . H(i+1) H(i)
+*
+            CALL SLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+     $                   A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+*
+*           Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+*
+            CALL SLARFB0C2(.FALSE., 'Right', 'Transpose', 
+     $            'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, 
+     $            A(II,1), LDA, WORK, LDWORK, A, LDA)
+*
+*           Apply H**T to columns 1:n-k+i+ib-1 of current block
+*
+            CALL SORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
+     $                   TAU( I ), WORK, IINFO )
+         END DO
       END IF
 *
       WORK( 1 ) = SROUNDUP_LWORK(IWS)
diff --git a/SRC/zlarfb0c2.f b/SRC/zlarfb0c2.f
new file mode 100644
index 000000000..1f43b4b14
--- /dev/null
+++ b/SRC/zlarfb0c2.f
@@ -0,0 +1,560 @@
+*> \brief \b ZLARFB0C2 applies a block reflector or its conjugate-transpose 
+* to a rectangular matrix with a 0 block while constructing the explicit Q
+* factor
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*
+*  Definition:
+*  ===========
+*
+*     SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+*    $                     K, V, LDV, T, LDT, C, LDC)
+*        ! Scalar arguments
+*        INTEGER           M, N, K, LDV, LDC, LDT
+*        CHARACTER         SIDE, TRANS, DIRECT, STOREV 
+*        ! True means that we are assuming C2 is the identity matrix
+*        !     and thus don't reference whatever is present in C2 
+*        !     at the beginning.
+*        LOGICAL           C2I
+*        ! Array arguments
+*        COMPLEX*16        V(LDV,*), C(LDC,*), T(LDT,*)
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZLARFB0C2 applies a real block reflector H or its transpose H**H to a
+*> complex m by n matrix C with a 0 block, while computing the explicit Q factor
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] C2I
+*> \verbatim
+*>          C2I is LOGICAL
+*>          = .TRUE.: Assume the nonzero block of C is the identity matrix
+*>          = .FALSE.: Use existing data in the nonzero block of C
+*> \endverbatim
+*>
+*> \param[in] SIDE
+*> \verbatim
+*>          SIDE is CHARACTER*1
+*>          = 'L': apply H or H**H from the Left
+*>          = 'R': apply H or H**H from the Right
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          = 'N': apply H (No transpose)
+*>          = 'C': apply H**H (Conjugate transpose)
+*> \endverbatim
+*>
+*> \param[in] DIRECT
+*> \verbatim
+*>          DIRECT is CHARACTER*1
+*>          Indicates how H is formed from a product of elementary
+*>          reflectors
+*>          = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*>          = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*>          STOREV is CHARACTER*1
+*>          Indicates how the vectors which define the elementary
+*>          reflectors are stored:
+*>          = 'C': Columnwise
+*>          = 'R': Rowwise
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>          The order of the matrix T (= the number of elementary
+*>          reflectors whose product defines the block reflector).
+*>          If SIDE = 'L', M >= K >= 0;
+*>          if SIDE = 'R', N >= K >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is COMPLEX*16 array, dimension
+*>                                (LDV,K) if STOREV = 'C'
+*>                                (LDV,M) if STOREV = 'R' and SIDE = 'L'
+*>                                (LDV,N) if STOREV = 'R' and SIDE = 'R'
+*>          See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V.
+*>          If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+*>          if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+*>          if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*>          T is COMPLEX*16 array, dimension (LDT,K)
+*>          The triangular K-by-K matrix T in the representation of the
+*>          block reflector.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is COMPLEX*16 array, dimension (LDC,N)
+*>          On entry, the M-by-N matrix C.
+*>          On exit, C is overwritten by H*C or H**H*C or C*H or C*H**H.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>          The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup larfb
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  The shape of the matrix V and the storage of the vectors which define
+*>  the H(i) is best illustrated by the following example with n = 5 and
+*>  k = 3. The triangular part of V (including its diagonal) is not
+*>  referenced.
+*>
+*>  DIRECT = 'F' and STOREV = 'C':         DIRECT = 'F' and STOREV = 'R':
+*>
+*>               V = (  1       )                 V = (  1 v1 v1 v1 v1 )
+*>                   ( v1  1    )                     (     1 v2 v2 v2 )
+*>                   ( v1 v2  1 )                     (        1 v3 v3 )
+*>                   ( v1 v2 v3 )
+*>                   ( v1 v2 v3 )
+*>
+*>  DIRECT = 'B' and STOREV = 'C':         DIRECT = 'B' and STOREV = 'R':
+*>
+*>               V = ( v1 v2 v3 )                 V = ( v1 v1  1       )
+*>                   ( v1 v2 v3 )                     ( v2 v2 v2  1    )
+*>                   (  1 v2 v3 )                     ( v3 v3 v3 v3  1 )
+*>                   (     1 v3 )
+*>                   (        1 )
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+     $                     K, V, LDV, T, LDT, C, LDC)
+         ! Scalar arguments
+         INTEGER           M, N, K, LDV, LDC, LDT
+         CHARACTER         SIDE, TRANS, DIRECT, STOREV 
+         ! True means that we are assuming C2 is the identity matrix
+         !     and thus don't reference whatever is present in C2 
+         !     at the beginning.
+         LOGICAL           C2I
+         ! Array arguments
+         COMPLEX*16        V(LDV,*), C(LDC,*), T(LDT,*)
+         ! Local scalars
+         LOGICAL           QR, LQ, QL, DIRF, COLV, SIDEL, SIDER,
+     $                     TRANST
+         INTEGER           I, J
+         ! Intrinsic Functions
+         INTRINSIC         CONJG
+         ! External functions
+         LOGICAL           LSAME
+         EXTERNAL          LSAME
+         ! External Subroutines
+         EXTERNAL          ZGEMM, ZTRMM
+         ! Parameters
+         COMPLEX*16        ONE, ZERO, NEG_ONE
+         PARAMETER(ONE=(1.0D+0, 0.0D+0),
+     $            ZERO = (0.0D+0, 0.0D+0), 
+     $            NEG_ONE = (-1.0D+0, 0.0D+0))
+
+         ! Beginning of executable statements
+         ! Convert our character flags to logical values
+         DIRF = LSAME(DIRECT,'F')
+         COLV = LSAME(STOREV,'C')
+         SIDEL = LSAME(SIDE,'L')
+         SIDER = LSAME(SIDE,'R')
+         TRANST = LSAME(TRANS,'C')
+
+         ! Determine which of the 4 modes are using.
+         ! QR is when we store the reflectors column by column and have the
+         ! 'first' reflector stored in the first column
+         QR = DIRF.AND.COLV
+
+         ! LQ is when we store the reflectors row by row and have the
+         ! 'first' reflector stored in the first row
+         LQ = DIRF.AND.(.NOT.COLV)
+
+         ! QL is when we store the reflectors column by column and have the
+         ! 'first' reflector stored in the last column
+         QL = (.NOT.DIRF).AND.COLV
+
+         ! RQ is when we store the reflectors row by row and have the
+         ! 'first' reflector stored in the last row
+         ! RQ = (.NOT.DIRF).AND.(.NOT.COLV)
+         ! Since we have exactly one of these 4 modes, we don't need to actually
+         ! store the value of RQ, instead we assume this is the case if we fail
+         ! the above 3 checks.
+
+         IF (QR) THEN
+            ! We are computing C = HC = (I - VTV')C
+            ! Where: V = [ V1 ] and C = [ C1 ]
+            !            [ V2 ]         [ C2 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{M-K\times K}
+            !     C1=0\in\R^{K\times N}
+            !     C2\in\R^{M-K\times N}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0  ]
+            !               [ V2 ]             [ C2 ]
+            !   = [ 0  ] - [ V1 ]T*V2'*C2
+            !     [ C2 ]   [ V2 ]
+            !
+            !   = [ 0  ] - [ V1*T*V2'*C2 ]
+            !     [ C2 ]   [ V2*T*V2'*C2 ]
+            !
+            !   = [      V1*T*V2'*C2 ]
+            !     [ C2 - V2*T*V2'*C2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = V2'*C2
+            ! C1 = T*C1
+            ! C2 = C2 - V2*C1
+            ! C1 = -V1*C1
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDEL ) THEN
+               CALL XERBLA('ZLARFB0C2', 2)
+               RETURN
+            ELSE IF(TRANST) THEN
+               CALL XERBLA('ZLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = V2'*C2
+            !
+            IF (C2I) THEN
+               DO J = 1, N
+                  DO I = 1, K
+                     C(I,J) = CONJG(V(K+J,I))
+                  END DO
+               END DO
+            ELSE
+               CALL ZGEMM('Conjugate', 'No Transpose', K, N, M - K,
+     $                     ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO,
+     $                     C, LDC)
+            END IF
+            !
+            ! C1 = T*C1
+            !
+            CALL ZTRMM('Left', 'Upper', 'No Transpose', 'Non-unit',
+     $                  K, N, ONE, T, LDT, C, LDC)
+            !
+            ! C2 = C2 - V2*C1 = -V2*C1 + C2
+            !
+            IF (C2I) THEN
+               CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $                     NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO,
+     $                     C(K+1,1), LDC)
+               DO I = 1, N
+                  C(K+I,I) = C(K+I,I) + ONE
+               END DO
+            ELSE
+               CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $                     NEG_ONE, V(K+1,1), LDV, C, LDC, ONE,
+     $                     C(K+1,1), LDC)
+            END IF
+            !
+            ! C1 = -V1*C1
+            !
+            CALL ZTRMM('Left', 'Lower', 'No Transpose', 'Unit',
+     $                  K, N, NEG_ONE, V, LDV, C, LDC)
+         ELSE IF (LQ) THEN
+            ! We are computing C = CH' = C(I-V'T'V)
+            ! Where: V = [ V1 V2 ] and C = [ C1 C2 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{K\times N-K}
+            !     C1=0\in\R^{M\times K}
+            !     C2\in\R^{M\times N-K}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ])
+            !                         [ V2' ]
+            !
+            !   = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ]
+            !                          [ V2' ]
+            !
+            !   = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ]
+            !
+            !   = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = C2*V2'
+            ! C1 = C1*T'
+            ! C2 = C2 - C1*V2
+            ! C1 = -C1*V1
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDER ) THEN
+               CALL XERBLA('ZLARFB0C2', 2)
+               RETURN
+            ELSE IF(.NOT.TRANST) THEN
+               CALL XERBLA('ZLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = C2*V2'
+            !
+            IF( C2I ) THEN
+               DO J = 1, K
+                  DO I = 1, M
+                     C(I,J) = CONJG(V(J,K+I))
+                  END DO
+               END DO
+            ELSE
+               CALL ZGEMM('No Transpose', 'Conjugate', M, K, N-K,
+     $               ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C,
+     $               LDC)
+            END IF
+            !
+            ! C1 = C1*T'
+            !
+            CALL ZTRMM('Right', 'Upper', 'Conjugate', 'Non-unit',
+     $            M, K, ONE, T, LDT, C, LDC)
+            !
+            ! C2 = C2 - C1*V2 = -C1*V2 + C2
+            !
+            IF( C2I ) THEN
+               CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $               NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1),
+     $               LDC)
+               DO I = 1, M
+                  C(I,K+I) = C(I,K+I) + ONE
+               END DO
+            ELSE
+               CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $               NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1),
+     $               LDC)
+            END IF
+            !
+            ! C1 = -C1*V1
+            !
+            CALL ZTRMM('Right', 'Upper', 'No Transpose', 'Unit',
+     $            M, K, NEG_ONE, V, LDV, C, LDC)
+         ELSE IF (QL) THEN
+            ! We are computing C = HC = (I - VTV')C
+            ! Where: V = [ V2 ] and C = [ C2 ]
+            !            [ V1 ]         [ C1 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{M-K\times K}
+            !     C1=0\in\R^{K\times N}
+            !     C2\in\R^{M-K\times N}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ]
+            !             [ V1 ]             [ 0  ]
+            !
+            !   = [ C2 ] - [ V2 ]T*V2'*C2
+            !     [ 0  ]   [ V1 ]
+            !
+            !   = [ C2 ] - [ V2*T*V2'*C2 ]
+            !     [ 0  ]   [ V1*T*V2'*C2 ]
+            !
+            !   = [ C2 - V2*T*V2'*C2 ]
+            !     [    - V1*T*V2'*C2 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = V2'*C2
+            ! C1 = T*C1
+            ! C2 = C2 - V2*C1
+            ! C1 = -V1*C1
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDEL ) THEN
+               CALL XERBLA('ZLARFB0C2', 2)
+               RETURN
+            ELSE IF(TRANST) THEN
+               CALL XERBLA('ZLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = V2'*C2
+            !
+            IF( C2I ) THEN
+               DO J = 1, N
+                  DO I = 1, K
+                     C(M-K+I,J) = CONJG(V(J,I))
+                  END DO
+               END DO
+            ELSE
+               CALL ZGEMM('Conjugate', 'No Transpose', K, N, M-K,
+     $            ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC)
+            END IF
+            !
+            ! C1 = T*C1
+            !
+            CALL ZTRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+     $         K, N, ONE, T, LDT, C(M-K+1,1), LDC)
+            !
+            ! C2 = C2 - V2*C1 = -V2*C1 + C2
+            !
+            IF( C2I ) THEN
+               CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $            NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC)
+               DO I = 1, N
+                  C(I,I) = C(I,I) + ONE
+               END DO
+            ELSE
+               CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K,
+     $            NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC)
+            END IF
+            !
+            ! C1 = -V1*C1
+            !
+            CALL ZTRMM('Left', 'Upper', 'No Transpose', 'Unit',
+     $         K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC)
+         ELSE ! IF (RQ) THEN
+            ! We are computing C = CH' = C(I-V'T'V)
+            ! Where: V = [ V2 V1] and C = [ C2 C1 ]
+            ! with the following dimensions:
+            !     V1\in\R^{K\times K}
+            !     V2\in\R^{K\times N-K}
+            !     C1=0\in\R^{M\times K}
+            !     C2\in\R^{M\times N-K}
+            ! Since we are assuming that C1 is a zero matrix and it will be
+            ! overwritten on exit, we can use this spot as a temporary workspace
+            ! without having to allocate anything extra.
+            ! This lets us simplify our above equation to get
+            !
+            ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ]
+            !                          [ V1' ]
+            !
+            !   = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ]
+            !                           [ V1' ]
+            !
+            !   = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ]
+            !
+            !   = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ]
+            !
+            !   = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ]
+            !
+            ! So, we can order our computations as follows:
+            !
+            ! C1 = C2*V2'
+            ! C1 = C1*T'
+            ! C2 = C2 - C1*V2
+            ! C1 = -C1*V1
+            !
+            !
+            ! To achieve the same end result
+            !
+            ! Check to ensure side and trans are the expected values 
+            !
+            IF( .NOT.SIDER ) THEN
+               CALL XERBLA('ZLARFB0C2', 2)
+               RETURN
+            ELSE IF(.NOT.TRANST) THEN
+               CALL XERBLA('ZLARFB0C2', 3)
+               RETURN
+            END IF
+            !
+            ! C1 = C2*V2'
+            !
+            IF( C2I ) THEN
+               DO J = 1, K
+                  DO I = 1, M
+                     C(I,N-K+J) = CONJG(V(J,I))
+                  END DO
+               END DO
+            ELSE
+               CALL ZGEMM('No Transpose', 'Conjugate', M, K, N-K,
+     $            ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC)
+            END IF
+            !
+            ! C1 = C1*T'
+            !
+            CALL ZTRMM('Right', 'Lower', 'Conjugate', 'Non-unit',
+     $         M, K, ONE, T, LDT, C(1, N-K+1), LDC)
+            !
+            ! C2 = C2 - C1*V2 = -C1*V2 + C2
+            !
+            IF( C2I ) THEN
+               CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $            NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC)
+               DO I = 1, M
+                  C(I,I) = C(I,I) + ONE
+               END DO
+            ELSE
+               CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K,
+     $            NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC)
+            END IF
+            !
+            ! C1 = -C1*V1
+            !
+            CALL ZTRMM('Right', 'Lower', 'No Transpose', 'Unit',
+     $         M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC)
+         END IF
+      END SUBROUTINE
diff --git a/SRC/zunglq.f b/SRC/zunglq.f
index 3cc107560..54aaf7686 100644
--- a/SRC/zunglq.f
+++ b/SRC/zunglq.f
@@ -136,17 +136,13 @@ SUBROUTINE ZUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  =====================================================================
 *
-*     .. Parameters ..
-      COMPLEX*16         ZERO
-      PARAMETER          ( ZERO = ( 0.0D+0, 0.0D+0 ) )
-*     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
-     $                   LWKOPT, NB, NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, KI, KK, LWKOPT, LDWORK,
+     $                   NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           XERBLA, ZLARFB, ZLARFT, ZUNGL2
+      EXTERNAL           XERBLA, ZLARFB0C2, ZLARFT, ZUNGL2
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -217,67 +213,95 @@ SUBROUTINE ZUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*        Use blocked code after the last block.
-*        The first kk rows are handled by the block method.
-*
-         KI = ( ( K-NX-1 ) / NB )*NB
-         KK = MIN( K, KI+NB )
-*
-*        Set A(kk+1:m,1:kk) to zero.
+*        Treat the last NB block starting at KK+1 specially then use our blocking
+*        method from the block starting at KI+1 to 1 
 *
-         DO 20 J = 1, KK
-            DO 10 I = KK + 1, M
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
+         KI = K - 2 * NB
+         KK = K - NB
       ELSE
          KK = 0
       END IF
 *
-*     Use unblocked code for the last or only block.
+*     Potentially bail to the unblocked version
 *
-      IF( KK.LT.M )
-     $   CALL ZUNGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
-     $                TAU( KK+1 ), WORK, IINFO )
+      IF( KK.EQ.0 ) THEN
+         CALL ZUNGL2( M, N, K, A, LDA, TAU, WORK, IINFO )
+      END IF
 *
       IF( KK.GT.0 ) THEN
 *
-*        Use blocked code
+*        Factor the last block assuming that our first application
+*        will be on the Identity matrix
 *
-         DO 50 I = KI + 1, 1, -NB
-            IB = MIN( NB, K-I+1 )
-            IF( I+IB.LE.M ) THEN
+         I = KK + 1
+         IB = NB
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i) H(i+1) . . . H(i+ib-1)
+*        Form the triangular factor of the block reflector
+*        H = H(i) H(i+1) . . . H(i+ib-1)
 *
-               CALL ZLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I,
-     $                      I ),
-     $                      LDA, TAU( I ), WORK, LDWORK )
+         CALL ZLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $               LDA, TAU( I ), WORK, LDWORK )
 *
-*              Apply H**H to A(i+ib:m,i:n) from the right
+*        Apply H**T to A(i+ib:m,i:n) from the right
+*        Exploit the fact that we are applying to an identity 
 *
-               CALL ZLARFB( 'Right', 'Conjugate transpose',
-     $                      'Forward',
-     $                      'Rowwise', M-I-IB+1, N-I+1, IB, A( I, I ),
-     $                      LDA, WORK, LDWORK, A( I+IB, I ), LDA,
-     $                      WORK( IB+1 ), LDWORK )
-            END IF
+         CALL ZLARFB0C2(.TRUE., 'Right', 'Conjugate', 'Forward',
+     $         'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $         LDWORK, A(I+IB,I), LDA)
+*
+*        Apply H**T to columns i:n of current block
+
+         CALL ZUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+     $                WORK, IINFO )
+*
+*        Use our standard blocking method after the last block
+*
+         DO I = KI + 1, 1, -NB
+            IB = NB
+*
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-*           Apply H**H to columns i:n of current block
+            CALL ZLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $                  LDA, TAU( I ), WORK, LDWORK )
 *
+*           Apply H**T to A(i+ib:m,i:n) from the right
+*
+            CALL ZLARFB0C2(.FALSE., 'Right', 'Conjugate', 'Forward',
+     $            'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $            LDWORK, A(I+IB,I), LDA)
+*
+*           Apply H**T to columns i:n of current block
+
             CALL ZUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
-     $                   WORK,
-     $                   IINFO )
+     $                   WORK, IINFO )
+         END DO
+*
+*        This checks for if K was a perfect multiple of NB
+*        so that we only have a special case for the last block when
+*        necessary
+*
+         IF(I.LT.1) THEN
+            IB = I + NB - 1
+            I = 1
+*
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-*           Set columns 1:i-1 of current block to zero
+            CALL ZLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+     $                  LDA, TAU( I ), WORK, LDWORK )
 *
-            DO 40 J = 1, I - 1
-               DO 30 L = I, I + IB - 1
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+*           Apply H**T to A(i+ib:m,i:n) from the right
+*
+            CALL ZLARFB0C2(.FALSE., 'Right', 'Conjugate', 'Forward',
+     $            'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, 
+     $            LDWORK, A(I+IB,I), LDA)
+*
+*           Apply H**T to columns i:n of current block
+
+            CALL ZUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+     $                   WORK, IINFO )
+         END IF
       END IF
 *
       WORK( 1 ) = IWS
diff --git a/SRC/zungql.f b/SRC/zungql.f
index a64de501d..74c5ec8e6 100644
--- a/SRC/zungql.f
+++ b/SRC/zungql.f
@@ -95,8 +95,6 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK. LWORK >= max(1,N).
-*>          For optimum performance LWORK >= N*NB, where NB is the
-*>          optimal blocksize.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -136,18 +134,14 @@ SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *     ..
 *
 *  =====================================================================
-*
-*     .. Parameters ..
-      COMPLEX*16         ZERO
-      PARAMETER          ( ZERO = ( 0.0D+0, 0.0D+0 ) )
-*     ..
+*     
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT,
-     $                   NB, NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, KK, LWKOPT, NB, NBMIN,
+     $                   NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           XERBLA, ZLARFB, ZLARFT, ZUNG2L
+      EXTERNAL           XERBLA, ZLARFB0C2, ZLARFT, ZUNG2L
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -177,7 +171,11 @@ SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
             LWKOPT = 1
          ELSE
             NB = ILAENV( 1, 'ZUNGQL', ' ', M, N, K, -1 )
-            LWKOPT = N*NB
+*
+*           Only need a workspace for zung2l in case of bailout
+*           and for the panel factorization
+*
+            LWKOPT = N
          END IF
          WORK( 1 ) = LWKOPT
 *
@@ -200,88 +198,77 @@ SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
       END IF
 *
       NBMIN = 2
-      NX = 0
+      NX = MAX( 0, ILAENV( 3, 'ZUNGQL', ' ', M, N, K, -1 ))
       IWS = N
-      IF( NB.GT.1 .AND. NB.LT.K ) THEN
-*
-*        Determine when to cross over from blocked to unblocked code.
 *
-         NX = MAX( 0, ILAENV( 3, 'ZUNGQL', ' ', M, N, K, -1 ) )
-         IF( NX.LT.K ) THEN
+      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*           Determine if workspace is large enough for blocked code.
+*        We use blocked code for the entire construction
 *
-            LDWORK = N
-            IWS = LDWORK*NB
-            IF( LWORK.LT.IWS ) THEN
+         KK = K
+      ELSE
+         KK = 0
+      END IF
 *
-*              Not enough workspace to use optimal NB:  reduce NB and
-*              determine the minimum value of NB.
+*     Possibly bail to the unblocked code.
 *
-               NB = LWORK / LDWORK
-               NBMIN = MAX( 2, ILAENV( 2, 'ZUNGQL', ' ', M, N, K,
-     $                      -1 ) )
-            END IF
-         END IF
+      IF( KK.EQ.0 ) THEN
+         CALL ZUNG2L( M, N, K, A, LDA, TAU, WORK, IINFO )
       END IF
 *
-      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+      IF( KK.GT.0 ) THEN
 *
-*        Use blocked code after the first block.
-*        The last kk columns are handled by the block method.
+*        Factor the first block assuming that our first application
+*        will be on the Identity matrix
 *
-         KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
+         I = 1
+         IB = NB
 *
-*        Set A(m-kk+1:m,1:n-kk) to zero.
+*        Form the triangular factor of the block reflector
+*        H = H(i+ib-1) . . . H(i+1) H(i)
 *
-         DO 20 J = 1, N - KK
-            DO 10 I = M - KK + 1, M
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
-      ELSE
-         KK = 0
-      END IF
+         CALL ZLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+     $                  A( 1, N-K+I ), LDA, TAU( I ),
+     $                  A( M-K+I, N-K+I ), LDA)
 *
-*     Use unblocked code for the first or only block.
+*        Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+*        Exploit the fact that we are applying to an identity 
 *
-      CALL ZUNG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+         CALL ZLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward', 
+     $         'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), 
+     $         LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
 *
-      IF( KK.GT.0 ) THEN
+*        Apply H to rows 1:m-k+i+ib-1 of current block
+*
+         CALL ZUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
+     $                TAU( I ), WORK, IINFO )
+
+*        Use blocked code on the remaining blocks if there are any.
+*
+         DO I = NB+1, K, NB
 *
-*        Use blocked code
+*           The last block may be less than size NB
 *
-         DO 50 I = K - KK + 1, K, NB
-            IB = MIN( NB, K-I+1 )
-            IF( N-K+I.GT.1 ) THEN
+            IB = MIN(NB, K-I+1)
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i+ib-1) . . . H(i+1) H(i)
+*           Form the triangular factor of the block reflector
+*           H = H(i+ib-1) . . . H(i+1) H(i)
 *
-               CALL ZLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
-     $                      A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK )
+            CALL ZLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+     $                  A( 1, N-K+I ), LDA, TAU( I ), 
+     $                  A( M-K+I, N-K+I ), LDA )
 *
-*              Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+*           Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
 *
-               CALL ZLARFB( 'Left', 'No transpose', 'Backward',
-     $                      'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
-     $                      A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA,
-     $                      WORK( IB+1 ), LDWORK )
-            END IF
+            CALL ZLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $            'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, 
+     $            A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
 *
 *           Apply H to rows 1:m-k+i+ib-1 of current block
 *
             CALL ZUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
      $                   TAU( I ), WORK, IINFO )
-*
-*           Set rows m-k+i+ib:m of current block to zero
-*
-            DO 40 J = N - K + I, N - K + I + IB - 1
-               DO 30 L = M - K + I + IB, M
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+         END DO
       END IF
 *
       WORK( 1 ) = IWS
diff --git a/SRC/zungqr.f b/SRC/zungqr.f
index 6c9f2e3ff..50d5d91a2 100644
--- a/SRC/zungqr.f
+++ b/SRC/zungqr.f
@@ -95,8 +95,6 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK. LWORK >= max(1,N).
-*>          For optimum performance LWORK >= N*NB, where NB is the
-*>          optimal blocksize.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -137,17 +135,13 @@ SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  =====================================================================
 *
-*     .. Parameters ..
-      COMPLEX*16         ZERO
-      PARAMETER          ( ZERO = ( 0.0D+0, 0.0D+0 ) )
-*     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
-     $                   LWKOPT, NB, NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, KI, KK, LWKOPT, NB,
+     $                   NBMIN, NX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           XERBLA, ZLARFB, ZLARFT, ZUNG2R
+      EXTERNAL           XERBLA, ZLARFB0C2, ZLARFT, ZUNG2R
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -162,7 +156,11 @@ SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
       INFO = 0
       NB = ILAENV( 1, 'ZUNGQR', ' ', M, N, K, -1 )
-      LWKOPT = MAX( 1, N )*NB
+*
+*     Only need a workspace for zung2r in case of bailout
+*     and for the panel factorization
+*
+      LWKOPT = MAX( 1, N )
       WORK( 1 ) = LWKOPT
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
@@ -191,92 +189,102 @@ SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
       END IF
 *
       NBMIN = 2
-      NX = 0
+*     Determine when to cross over from unblocked to blocked
+      NX = MAX( 0, ILAENV( 3, 'ZUNGQR', ' ', M, N, K, -1 ) )
       IWS = N
-      IF( NB.GT.1 .AND. NB.LT.K ) THEN
 *
-*        Determine when to cross over from blocked to unblocked code.
-*
-         NX = MAX( 0, ILAENV( 3, 'ZUNGQR', ' ', M, N, K, -1 ) )
-         IF( NX.LT.K ) THEN
+      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
 *
-*           Determine if workspace is large enough for blocked code.
+*        Treat the last NB block starting at KK+1 specially then use our blocking
+*        method from the block starting at KI+1 to 1 
 *
-            LDWORK = N
-            IWS = LDWORK*NB
-            IF( LWORK.LT.IWS ) THEN
+         KI = K - 2 * NB
+         KK = K - NB
+      ELSE
+         KK = 0
+      END IF
 *
-*              Not enough workspace to use optimal NB:  reduce NB and
-*              determine the minimum value of NB.
+*     Potentially bail to the unblocked code.
 *
-               NB = LWORK / LDWORK
-               NBMIN = MAX( 2, ILAENV( 2, 'ZUNGQR', ' ', M, N, K,
-     $                      -1 ) )
-            END IF
-         END IF
+      IF( KK.EQ.0 ) THEN
+         CALL ZUNG2R( M, N, K, A, LDA, TAU, WORK, IINFO )
       END IF
 *
-      IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+      IF( KK.GT.0 ) THEN
 *
-*        Use blocked code after the last block.
-*        The first kk columns are handled by the block method.
+*        Factor the last block assuming that our first application
+*        will be on the Identity matrix
 *
-         KI = ( ( K-NX-1 ) / NB )*NB
-         KK = MIN( K, KI+NB )
+         I = KK + 1
+         IB = NB
 *
-*        Set A(1:kk,kk+1:n) to zero.
+*        Form the triangular factor of the block reflector
+*        H = H(i) H(i+1) . . . H(i+ib-1)
 *
-         DO 20 J = KK + 1, N
-            DO 10 I = 1, KK
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
-      ELSE
-         KK = 0
-      END IF
+         CALL ZLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+     $                     LDA, TAU(I), A(I,I), LDA)
 *
-*     Use unblocked code for the last or only block.
+*        Apply H to A(i:m,i+ib:n) from the left
+*        Exploit the fact that we are applying to an identity 
 *
-      IF( KK.LT.N )
-     $   CALL ZUNG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
-     $                TAU( KK+1 ), WORK, IINFO )
+         CALL ZLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward',
+     $      'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I),
+     $      LDA, A(I,I+IB), LDA)
 *
-      IF( KK.GT.0 ) THEN
+*        Apply H to rows i:m of current block
 *
-*        Use blocked code
+         CALL ZUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, IINFO)
 *
-         DO 50 I = KI + 1, 1, -NB
-            IB = MIN( NB, K-I+1 )
-            IF( I+IB.LE.N ) THEN
+*        Use our standard blocking method after the last block
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i) H(i+1) . . . H(i+ib-1)
+         DO I = KI + 1, 1, -NB
+            IB = NB
 *
-               CALL ZLARFT( 'Forward', 'Columnwise', M-I+1, IB,
-     $                      A( I, I ), LDA, TAU( I ), WORK, LDWORK )
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-*              Apply H to A(i:m,i+ib:n) from the left
+            CALL ZLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+     $         LDA, TAU(I), A(I,I), LDA)
 *
-               CALL ZLARFB( 'Left', 'No transpose', 'Forward',
-     $                      'Columnwise', M-I+1, N-I-IB+1, IB,
-     $                      A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ),
-     $                      LDA, WORK( IB+1 ), LDWORK )
-            END IF
+*           Apply H to A(i:m,i+ib:n) from the left
+*
+            CALL ZLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $         'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+     $         LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
 *
 *           Apply H to rows i:m of current block
 *
-            CALL ZUNG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ),
-     $                   WORK,
-     $                   IINFO )
+            CALL ZUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
+     $         IINFO)
+         END DO
+*
+*        This checks for if K was a perfect multiple of NB
+*        so that we only have a special case for the last block when
+*        necessary
+*
+         IF(I.LT.1) THEN
+            IB = I + NB - 1
+            I = 1
 *
-*           Set rows 1:i-1 of current block to zero
+*           Form the triangular factor of the block reflector
+*           H = H(i) H(i+1) . . . H(i+ib-1)
 *
-            DO 40 J = I, I + IB - 1
-               DO 30 L = 1, I - 1
-                  A( L, J ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+            CALL ZLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+     $         LDA, TAU(I), A(I,I), LDA)
+*
+*           Apply H to A(i:m,i+ib:n) from the left
+*
+            CALL ZLARFB0C2(.FALSE., 'Left', 'No Transpose',
+     $         'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+     $         LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
+*
+*           Apply H to rows i:m of current block
+*
+            CALL ZUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
+     $         IINFO)
+         END IF
       END IF
 *
       WORK( 1 ) = IWS
diff --git a/SRC/zungrq.f b/SRC/zungrq.f
index dcc2772d6..939fff3fc 100644
--- a/SRC/zungrq.f
+++ b/SRC/zungrq.f
@@ -137,13 +137,9 @@ SUBROUTINE ZUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *
 *  =====================================================================
 *
-*     .. Parameters ..
-      COMPLEX*16         ZERO
-      PARAMETER          ( ZERO = ( 0.0D+0, 0.0D+0 ) )
-*     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, II, IINFO, IWS, J, KK, L, LDWORK,
+      INTEGER            I, IB, II, IINFO, IWS, KK, LDWORK,
      $                   LWKOPT, NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
@@ -230,61 +226,67 @@ SUBROUTINE ZUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
 *        Use blocked code after the first block.
 *        The last kk rows are handled by the block method.
 *
-         KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
-*
-*        Set A(1:m-kk,n-kk+1:n) to zero.
-*
-         DO 20 J = N - KK + 1, N
-            DO 10 I = 1, M - KK
-               A( I, J ) = ZERO
-   10       CONTINUE
-   20    CONTINUE
+         KK = K
       ELSE
          KK = 0
       END IF
 *
-*     Use unblocked code for the first or only block.
+*     Potentially bail to the unblocked code
 *
-      CALL ZUNGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+      IF( KK.EQ.0 ) THEN
+         CALL ZUNGR2( M, N, K, A, LDA, TAU, WORK, IINFO )
+      END IF
 *
       IF( KK.GT.0 ) THEN
 *
-*        Use blocked code
+*        Factor the first block assuming that our first application
+*        will be on the Identity matrix
 *
-         DO 50 I = K - KK + 1, K, NB
-            IB = MIN( NB, K-I+1 )
-            II = M - K + I
-            IF( II.GT.1 ) THEN
+         I = 1
+         IB = NB
+         II = M - K + I
 *
-*              Form the triangular factor of the block reflector
-*              H = H(i+ib-1) . . . H(i+1) H(i)
+*        Form the triangular factor of the block reflector
+*        H = H(i+ib-1) . . . H(i+1) H(i)
 *
-               CALL ZLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
-     $                      A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+         CALL ZLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+     $                A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
 *
-*              Apply H**H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+*        Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
 *
-               CALL ZLARFB( 'Right', 'Conjugate transpose',
-     $                      'Backward',
-     $                      'Rowwise', II-1, N-K+I+IB-1, IB, A( II, 1 ),
-     $                      LDA, WORK, LDWORK, A, LDA, WORK( IB+1 ),
-     $                      LDWORK )
-            END IF
+         CALL ZLARFB0C2(.TRUE., 'Right', 'Conjugate', 'Backward', 
+     $         'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK,
+     $         LDWORK, A, LDA)
 *
-*           Apply H**H to columns 1:n-k+i+ib-1 of current block
+*           Apply H**T to columns 1:n-k+i+ib-1 of current block
 *
-            CALL ZUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
-     $                   TAU( I ),
-     $                   WORK, IINFO )
+         CALL ZUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
+     $                TAU( I ), WORK, IINFO )
+
+         DO I = NB+1, K, NB
 *
-*           Set columns n-k+i+ib:n of current block to zero
+*           The last block may be less than size NB
 *
-            DO 40 L = N - K + I + IB, N
-               DO 30 J = II, II + IB - 1
-                  A( J, L ) = ZERO
-   30          CONTINUE
-   40       CONTINUE
-   50    CONTINUE
+            IB = MIN(NB, K-I+1)
+            II = M - K + I
+*
+*           Form the triangular factor of the block reflector
+*           H = H(i+ib-1) . . . H(i+1) H(i)
+*
+            CALL ZLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+     $                   A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+*
+*           Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+*
+            CALL ZLARFB0C2(.FALSE., 'Right', 'Conjugate', 
+     $            'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, 
+     $            A(II,1), LDA, WORK, LDWORK, A, LDA)
+*
+*           Apply H**T to columns 1:n-k+i+ib-1 of current block
+*
+            CALL ZUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
+     $                   TAU( I ), WORK, IINFO )
+         END DO
       END IF
 *
       WORK( 1 ) = IWS