diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index be426cecd..24740835b 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -106,7 +106,7 @@ set(SLASRC
slaqgb.f slaqge.f slaqp2.f slaqps.f slaqp2rk.f slaqp3rk.f slaqsb.f slaqsp.f slaqsy.f
slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f
slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f
- slarf.f slarf1f.f slarf1l.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f
+ slarf.f slarf1f.f slarf1l.f slarfb.f slarfb0c2.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f
slargv.f slarmm.f slarrv.f slartv.f
slarz.f slarzb.f slarzt.f slasy2.f
slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f
@@ -218,7 +218,7 @@ set(CLASRC
claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqp2rk.f claqp3rk.f claqsb.f
claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f
claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f
- clarf.f clarf1f.f clarf1l.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f
+ clarf.f clarf1f.f clarf1l.f clarfb.f clarfb0c2.f clarfb_gett.f clarfg.f clarfgp.f clarft.f
clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f90 clartv.f
clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f90
claswp.f clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f
@@ -307,7 +307,7 @@ set(DLASRC
dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqp2rk.f dlaqp3rk.f dlaqsb.f dlaqsp.f dlaqsy.f
dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f
dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f
- dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f dlarf1f.f dlarf1l.f
+ dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f dlarf1f.f dlarf1l.f dlarfb0c2.f
dlargv.f dlarmm.f dlarrv.f dlartv.f
dlarz.f dlarzb.f dlarzt.f dlaswp.f dlasy2.f
dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f
@@ -418,7 +418,7 @@ set(ZLASRC
zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqp2rk.f zlaqp3rk.f zlaqsb.f
zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f
zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f
- zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f zlarf1f.f zlarf1l.f
+ zlarcm.f zlarf.f zlarfb.f zlarfb0c2.f zlarfb_gett.f zlarf1f.f zlarf1l.f
zlarfg.f zlarfgp.f zlarft.f
zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f90 zlartv.f
zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
diff --git a/SRC/Makefile b/SRC/Makefile
index 0191626f0..b9eaf1bca 100644
--- a/SRC/Makefile
+++ b/SRC/Makefile
@@ -137,7 +137,7 @@ SLASRC = \
slaqgb.o slaqge.o slaqp2.o slaqps.o slaqp2rk.o slaqp3rk.o slaqsb.o slaqsp.o slaqsy.o \
slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \
slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \
- slarf.o slarf1f.o slarf1l.o slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o \
+ slarf.o slarf1f.o slarf1l.o slarfb.o slarfb0c2.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o \
slargv.o slarmm.o slarrv.o slartv.o \
slarz.o slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o slasyf_rook.o \
slasyf_rk.o \
@@ -249,7 +249,7 @@ CLASRC = \
claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqp2rk.o claqp3rk.o claqsb.o \
claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \
claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \
- clarf.o clarf1f.o clarf1l.o clarfb.o clarfb_gett.o clarfg.o clarft.o clarfgp.o \
+ clarf.o clarf1f.o clarf1l.o clarfb.o clarfb0c2.o clarfb_gett.o clarfg.o clarft.o clarfgp.o \
clarfx.o clarfy.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \
clarz.o clarzb.o clarzt.o clascl.o claset.o clasr.o classq.o \
claswp.o clasyf.o clasyf_rook.o clasyf_rk.o clasyf_aa.o \
@@ -339,7 +339,7 @@ DLASRC = \
dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqp2rk.o dlaqp3rk.o dlaqsb.o dlaqsp.o dlaqsy.o \
dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \
dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \
- dlarf.o dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o dlarf1f.o dlarf1l.o\
+ dlarf.o dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o dlarf1f.o dlarf1l.o dlarfb0c2.o\
dlargv.o dlarmm.o dlarrv.o dlartv.o \
dlarz.o dlarzb.o dlarzt.o dlaswp.o dlasy2.o \
dlasyf.o dlasyf_rook.o dlasyf_rk.o \
@@ -453,7 +453,7 @@ ZLASRC = \
zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqp2rk.o zlaqp3rk.o zlaqsb.o \
zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \
zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \
- zlarcm.o zlarf.o zlarfb.o zlarfb_gett.o zlarf1f.o zlarf1l.o \
+ zlarcm.o zlarf.o zlarfb.o zlarfb0c2.o zlarfb_gett.o zlarf1f.o zlarf1l.o \
zlarfg.o zlarft.o zlarfgp.o \
zlarfx.o zlarfy.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \
zlarz.o zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \
diff --git a/SRC/clarfb0c2.f b/SRC/clarfb0c2.f
new file mode 100644
index 000000000..4a7a2018d
--- /dev/null
+++ b/SRC/clarfb0c2.f
@@ -0,0 +1,561 @@
+*> \brief \b CLARFB0C2 applies a block reflector or its conjugate-transpose
+* to a rectangular matrix with a 0 block while constructing the explicit Q
+* factor
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+* $ K, V, LDV, T, LDT, C, LDC)
+* ! Scalar arguments
+* INTEGER M, N, K, LDV, LDC, LDT
+* CHARACTER SIDE, TRANS, DIRECT, STOREV
+* ! True means that we are assuming C2 is the identity matrix
+* ! and thus don't reference whatever is present in C2
+* ! at the beginning.
+* LOGICAL C2I
+* ! Array arguments
+* COMPLEX V(LDV,*), C(LDC,*), T(LDT,*)
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CLARFB0C2 applies a real block reflector H or its transpose H**H to a
+*> complex m by n matrix C with a 0 block, while computing the explicit Q factor
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] C2I
+*> \verbatim
+*> C2I is LOGICAL
+*> = .TRUE.: Assume the nonzero block of C is the identity matrix
+*> = .FALSE.: Use existing data in the nonzero block of C
+*> \endverbatim
+*>
+*> \param[in] SIDE
+*> \verbatim
+*> SIDE is CHARACTER*1
+*> = 'L': apply H or H**H from the Left
+*> = 'R': apply H or H**H from the Right
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*> TRANS is CHARACTER*1
+*> = 'N': apply H (No transpose)
+*> = 'C': apply H**H (Conjugate transpose)
+*> \endverbatim
+*>
+*> \param[in] DIRECT
+*> \verbatim
+*> DIRECT is CHARACTER*1
+*> Indicates how H is formed from a product of elementary
+*> reflectors
+*> = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*> = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*> STOREV is CHARACTER*1
+*> Indicates how the vectors which define the elementary
+*> reflectors are stored:
+*> = 'C': Columnwise
+*> = 'R': Rowwise
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*> K is INTEGER
+*> The order of the matrix T (= the number of elementary
+*> reflectors whose product defines the block reflector).
+*> If SIDE = 'L', M >= K >= 0;
+*> if SIDE = 'R', N >= K >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*> V is COMPLEX array, dimension
+*> (LDV,K) if STOREV = 'C'
+*> (LDV,M) if STOREV = 'R' and SIDE = 'L'
+*> (LDV,N) if STOREV = 'R' and SIDE = 'R'
+*> See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*> LDV is INTEGER
+*> The leading dimension of the array V.
+*> If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+*> if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+*> if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*> T is COMPLEX array, dimension (LDT,K)
+*> The triangular K-by-K matrix T in the representation of the
+*> block reflector.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*> LDT is INTEGER
+*> The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*> C is COMPLEX array, dimension (LDC,N)
+*> On entry, the M-by-N matrix C.
+*> On exit, C is overwritten by H*C or H**H*C or C*H or C*H**H.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*> LDC is INTEGER
+*> The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup larfb
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> The shape of the matrix V and the storage of the vectors which define
+*> the H(i) is best illustrated by the following example with n = 5 and
+*> k = 3. The triangular part of V (including its diagonal) is not
+*> referenced.
+*>
+*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R':
+*>
+*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 )
+*> ( v1 1 ) ( 1 v2 v2 v2 )
+*> ( v1 v2 1 ) ( 1 v3 v3 )
+*> ( v1 v2 v3 )
+*> ( v1 v2 v3 )
+*>
+*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R':
+*>
+*> V = ( v1 v2 v3 ) V = ( v1 v1 1 )
+*> ( v1 v2 v3 ) ( v2 v2 v2 1 )
+*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 )
+*> ( 1 v3 )
+*> ( 1 )
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+ $ K, V, LDV, T, LDT, C, LDC)
+ ! Scalar arguments
+ INTEGER M, N, K, LDV, LDC, LDT
+ CHARACTER SIDE, TRANS, DIRECT, STOREV
+ ! True means that we are assuming C2 is the identity matrix
+ ! and thus don't reference whatever is present in C2
+ ! at the beginning.
+ LOGICAL C2I
+
+ ! Array arguments
+ COMPLEX V(LDV,*), C(LDC,*), T(LDT,*)
+ ! Local scalars
+ LOGICAL QR, LQ, QL, DIRF, COLV, SIDEL, SIDER,
+ $ TRANST
+ INTEGER I, J
+ ! Intrinsic Functions
+ INTRINSIC CONJG
+ ! External functions
+ LOGICAL LSAME
+ EXTERNAL LSAME
+ ! External Subroutines
+ EXTERNAL CGEMM, CTRMM
+ ! Parameters
+ COMPLEX ONE, ZERO, NEG_ONE
+ PARAMETER(ONE=(1.0E+0, 0.0E+0),
+ $ ZERO = (0.0E+0, 0.0E+0),
+ $ NEG_ONE = (-1.0E+0, 0.0E+0))
+
+ ! Beginning of executable statements
+ ! Convert our character flags to logical values
+ DIRF = LSAME(DIRECT,'F')
+ COLV = LSAME(STOREV,'C')
+ SIDEL = LSAME(SIDE,'L')
+ SIDER = LSAME(SIDE,'R')
+ TRANST = LSAME(TRANS,'C')
+
+ ! Determine which of the 4 modes are using.
+ ! QR is when we store the reflectors column by column and have the
+ ! 'first' reflector stored in the first column
+ QR = DIRF.AND.COLV
+
+ ! LQ is when we store the reflectors row by row and have the
+ ! 'first' reflector stored in the first row
+ LQ = DIRF.AND.(.NOT.COLV)
+
+ ! QL is when we store the reflectors column by column and have the
+ ! 'first' reflector stored in the last column
+ QL = (.NOT.DIRF).AND.COLV
+
+ ! RQ is when we store the reflectors row by row and have the
+ ! 'first' reflector stored in the last row
+ ! RQ = (.NOT.DIRF).AND.(.NOT.COLV)
+ ! Since we have exactly one of these 4 modes, we don't need to actually
+ ! store the value of RQ, instead we assume this is the case if we fail
+ ! the above 3 checks.
+
+ IF (QR) THEN
+ ! We are computing C = HC = (I - VTV')C
+ ! Where: V = [ V1 ] and C = [ C1 ]
+ ! [ V2 ] [ C2 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{M-K\times K}
+ ! C1=0\in\R^{K\times N}
+ ! C2\in\R^{M-K\times N}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0 ]
+ ! [ V2 ] [ C2 ]
+ ! = [ 0 ] - [ V1 ]T*V2'*C2
+ ! [ C2 ] [ V2 ]
+ !
+ ! = [ 0 ] - [ V1*T*V2'*C2 ]
+ ! [ C2 ] [ V2*T*V2'*C2 ]
+ !
+ ! = [ V1*T*V2'*C2 ]
+ ! [ C2 - V2*T*V2'*C2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = V2'*C2
+ ! C1 = T*C1
+ ! C2 = C2 - V2*C1
+ ! C1 = -V1*C1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDEL ) THEN
+ CALL XERBLA('CLARFB0C2', 2)
+ RETURN
+ ELSE IF(TRANST) THEN
+ CALL XERBLA('CLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = V2'*C2
+ !
+ IF (C2I) THEN
+ DO J = 1, N
+ DO I = 1, K
+ C(I,J) = CONJG(V(K+J,I))
+ END DO
+ END DO
+ ELSE
+ CALL CGEMM('Conjugate', 'No Transpose', K, N, M - K,
+ $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO,
+ $ C, LDC)
+ END IF
+ !
+ ! C1 = T*C1
+ !
+ CALL CTRMM('Left', 'Upper', 'No Transpose', 'Non-unit',
+ $ K, N, ONE, T, LDT, C, LDC)
+ !
+ ! C2 = C2 - V2*C1 = -V2*C1 + C2
+ !
+ IF (C2I) THEN
+ CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO,
+ $ C(K+1,1), LDC)
+ DO I = 1, N
+ C(K+I,I) = C(K+I,I) + ONE
+ END DO
+ ELSE
+ CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE,
+ $ C(K+1,1), LDC)
+ END IF
+ !
+ ! C1 = -V1*C1
+ !
+ CALL CTRMM('Left', 'Lower', 'No Transpose', 'Unit',
+ $ K, N, NEG_ONE, V, LDV, C, LDC)
+ ELSE IF (LQ) THEN
+ ! We are computing C = CH' = C(I-V'T'V)
+ ! Where: V = [ V1 V2 ] and C = [ C1 C2 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{K\times N-K}
+ ! C1=0\in\R^{M\times K}
+ ! C2\in\R^{M\times N-K}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ])
+ ! [ V2' ]
+ !
+ ! = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ]
+ ! [ V2' ]
+ !
+ ! = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ]
+ !
+ ! = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = C2*V2'
+ ! C1 = C1*T'
+ ! C2 = C2 - C1*V2
+ ! C1 = -C1*V1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDER ) THEN
+ CALL XERBLA('CLARFB0C2', 2)
+ RETURN
+ ELSE IF(.NOT.TRANST) THEN
+ CALL XERBLA('CLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = C2*V2'
+ !
+ IF( C2I ) THEN
+ DO J = 1, K
+ DO I = 1, M
+ C(I,J) = CONJG(V(J,K+I))
+ END DO
+ END DO
+ ELSE
+ CALL CGEMM('No Transpose', 'Conjugate', M, K, N-K,
+ $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C,
+ $ LDC)
+ END IF
+ !
+ ! C1 = C1*T'
+ !
+ CALL CTRMM('Right', 'Upper', 'Conjugate', 'Non-unit',
+ $ M, K, ONE, T, LDT, C, LDC)
+ !
+ ! C2 = C2 - C1*V2 = -C1*V2 + C2
+ !
+ IF( C2I ) THEN
+ CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1),
+ $ LDC)
+ DO I = 1, M
+ C(I,K+I) = C(I,K+I) + ONE
+ END DO
+ ELSE
+ CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1),
+ $ LDC)
+ END IF
+ !
+ ! C1 = -C1*V1
+ !
+ CALL CTRMM('Right', 'Upper', 'No Transpose', 'Unit',
+ $ M, K, NEG_ONE, V, LDV, C, LDC)
+ ELSE IF (QL) THEN
+ ! We are computing C = HC = (I - VTV')C
+ ! Where: V = [ V2 ] and C = [ C2 ]
+ ! [ V1 ] [ C1 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{M-K\times K}
+ ! C1=0\in\R^{K\times N}
+ ! C2\in\R^{M-K\times N}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ]
+ ! [ V1 ] [ 0 ]
+ !
+ ! = [ C2 ] - [ V2 ]T*V2'*C2
+ ! [ 0 ] [ V1 ]
+ !
+ ! = [ C2 ] - [ V2*T*V2'*C2 ]
+ ! [ 0 ] [ V1*T*V2'*C2 ]
+ !
+ ! = [ C2 - V2*T*V2'*C2 ]
+ ! [ - V1*T*V2'*C2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = V2'*C2
+ ! C1 = T*C1
+ ! C2 = C2 - V2*C1
+ ! C1 = -V1*C1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDEL ) THEN
+ CALL XERBLA('CLARFB0C2', 2)
+ RETURN
+ ELSE IF(TRANST) THEN
+ CALL XERBLA('CLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = V2'*C2
+ !
+ IF( C2I ) THEN
+ DO J = 1, N
+ DO I = 1, K
+ C(M-K+I,J) = CONJG(V(J,I))
+ END DO
+ END DO
+ ELSE
+ CALL CGEMM('Conjugate', 'No Transpose', K, N, M-K,
+ $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC)
+ END IF
+ !
+ ! C1 = T*C1
+ !
+ CALL CTRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+ $ K, N, ONE, T, LDT, C(M-K+1,1), LDC)
+ !
+ ! C2 = C2 - V2*C1 = -V2*C1 + C2
+ !
+ IF( C2I ) THEN
+ CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC)
+ DO I = 1, N
+ C(I,I) = C(I,I) + ONE
+ END DO
+ ELSE
+ CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC)
+ END IF
+ !
+ ! C1 = -V1*C1
+ !
+ CALL CTRMM('Left', 'Upper', 'No Transpose', 'Unit',
+ $ K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC)
+ ELSE ! IF (RQ) THEN
+ ! We are computing C = CH' = C(I-V'T'V)
+ ! Where: V = [ V2 V1] and C = [ C2 C1 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{K\times N-K}
+ ! C1=0\in\R^{M\times K}
+ ! C2\in\R^{M\times N-K}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ]
+ ! [ V1' ]
+ !
+ ! = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ]
+ ! [ V1' ]
+ !
+ ! = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ]
+ !
+ ! = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ]
+ !
+ ! = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = C2*V2'
+ ! C1 = C1*T'
+ ! C2 = C2 - C1*V2
+ ! C1 = -C1*V1
+ !
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDER ) THEN
+ CALL XERBLA('CLARFB0C2', 2)
+ RETURN
+ ELSE IF(.NOT.TRANST) THEN
+ CALL XERBLA('CLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = C2*V2'
+ !
+ IF( C2I ) THEN
+ DO J = 1, K
+ DO I = 1, M
+ C(I,N-K+J) = CONJG(V(J,I))
+ END DO
+ END DO
+ ELSE
+ CALL CGEMM('No Transpose', 'Conjugate', M, K, N-K,
+ $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC)
+ END IF
+ !
+ ! C1 = C1*T'
+ !
+ CALL CTRMM('Right', 'Lower', 'Conjugate', 'Non-unit',
+ $ M, K, ONE, T, LDT, C(1, N-K+1), LDC)
+ !
+ ! C2 = C2 - C1*V2 = -C1*V2 + C2
+ !
+ IF( C2I ) THEN
+ CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC)
+ DO I = 1, M
+ C(I,I) = C(I,I) + ONE
+ END DO
+ ELSE
+ CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC)
+ END IF
+ !
+ ! C1 = -C1*V1
+ !
+ CALL CTRMM('Right', 'Lower', 'No Transpose', 'Unit',
+ $ M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC)
+ END IF
+ END SUBROUTINE
diff --git a/SRC/cunglq.f b/SRC/cunglq.f
index 7f42c3dcb..a57f5a037 100644
--- a/SRC/cunglq.f
+++ b/SRC/cunglq.f
@@ -136,17 +136,13 @@ SUBROUTINE CUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
* =====================================================================
*
-* .. Parameters ..
- COMPLEX ZERO
- PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) )
-* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
- $ LWKOPT, NB, NBMIN, NX
+ INTEGER I, IB, IINFO, IWS, KI, KK, LWKOPT, LDWORK,
+ $ NB, NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL CLARFB, CLARFT, CUNGL2, XERBLA
+ EXTERNAL CLARFB0C2, CLARFT, CUNGL2, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -218,67 +214,95 @@ SUBROUTINE CUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Use blocked code after the last block.
-* The first kk rows are handled by the block method.
-*
- KI = ( ( K-NX-1 ) / NB )*NB
- KK = MIN( K, KI+NB )
-*
-* Set A(kk+1:m,1:kk) to zero.
+* Treat the last NB block starting at KK+1 specially then use our blocking
+* method from the block starting at KI+1 to 1
*
- DO 20 J = 1, KK
- DO 10 I = KK + 1, M
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
+ KI = K - 2 * NB
+ KK = K - NB
ELSE
KK = 0
END IF
*
-* Use unblocked code for the last or only block.
+* Potentially bail to the unblocked version
*
- IF( KK.LT.M )
- $ CALL CUNGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
- $ TAU( KK+1 ), WORK, IINFO )
+ IF( KK.EQ.0 ) THEN
+ CALL CUNGL2( M, N, K, A, LDA, TAU, WORK, IINFO )
+ END IF
*
IF( KK.GT.0 ) THEN
*
-* Use blocked code
+* Factor the last block assuming that our first application
+* will be on the Identity matrix
*
- DO 50 I = KI + 1, 1, -NB
- IB = MIN( NB, K-I+1 )
- IF( I+IB.LE.M ) THEN
+ I = KK + 1
+ IB = NB
*
-* Form the triangular factor of the block reflector
-* H = H(i) H(i+1) . . . H(i+ib-1)
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- CALL CLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I,
- $ I ),
- $ LDA, TAU( I ), WORK, LDWORK )
+ CALL CLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+ $ LDA, TAU( I ), WORK, LDWORK )
*
-* Apply H**H to A(i+ib:m,i:n) from the right
+* Apply H**T to A(i+ib:m,i:n) from the right
+* Exploit the fact that we are applying to an identity
*
- CALL CLARFB( 'Right', 'Conjugate transpose',
- $ 'Forward',
- $ 'Rowwise', M-I-IB+1, N-I+1, IB, A( I, I ),
- $ LDA, WORK, LDWORK, A( I+IB, I ), LDA,
- $ WORK( IB+1 ), LDWORK )
- END IF
+ CALL CLARFB0C2(.TRUE., 'Right', 'Conjugate', 'Forward',
+ $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK,
+ $ LDWORK, A(I+IB,I), LDA)
+*
+* Apply H**T to columns i:n of current block
+
+ CALL CUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+ $ WORK, IINFO )
+*
+* Use our standard blocking method after the last block
+*
+ DO I = KI + 1, 1, -NB
+ IB = NB
+*
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
-* Apply H**H to columns i:n of current block
+ CALL CLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+ $ LDA, TAU( I ), WORK, LDWORK )
*
+* Apply H**T to A(i+ib:m,i:n) from the right
+*
+ CALL CLARFB0C2(.FALSE., 'Right', 'Conjugate', 'Forward',
+ $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK,
+ $ LDWORK, A(I+IB,I), LDA)
+*
+* Apply H**T to columns i:n of current block
+
CALL CUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
- $ WORK,
- $ IINFO )
+ $ WORK, IINFO )
+ END DO
+*
+* This checks for if K was a perfect multiple of NB
+* so that we only have a special case for the last block when
+* necessary
+*
+ IF(I.LT.1) THEN
+ IB = I + NB - 1
+ I = 1
+*
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
-* Set columns 1:i-1 of current block to zero
+ CALL CLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+ $ LDA, TAU( I ), WORK, LDWORK )
*
- DO 40 J = 1, I - 1
- DO 30 L = I, I + IB - 1
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+* Apply H**T to A(i+ib:m,i:n) from the right
+*
+ CALL CLARFB0C2(.FALSE., 'Right', 'Conjugate', 'Forward',
+ $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK,
+ $ LDWORK, A(I+IB,I), LDA)
+*
+* Apply H**T to columns i:n of current block
+
+ CALL CUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+ $ WORK, IINFO )
+ END IF
END IF
*
WORK( 1 ) = SROUNDUP_LWORK(IWS)
diff --git a/SRC/cungql.f b/SRC/cungql.f
index 3da2702d8..9022d37ac 100644
--- a/SRC/cungql.f
+++ b/SRC/cungql.f
@@ -95,8 +95,6 @@
*> \verbatim
*> LWORK is INTEGER
*> The dimension of the array WORK. LWORK >= max(1,N).
-*> For optimum performance LWORK >= N*NB, where NB is the
-*> optimal blocksize.
*>
*> If LWORK = -1, then a workspace query is assumed; the routine
*> only calculates the optimal size of the WORK array, returns
@@ -137,17 +135,13 @@ SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
* =====================================================================
*
-* .. Parameters ..
- COMPLEX ZERO
- PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) )
-* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT,
- $ NB, NBMIN, NX
+ INTEGER I, IB, IINFO, IWS, KK, LWKOPT, NB, NBMIN,
+ $ NX
* ..
* .. External Subroutines ..
- EXTERNAL CLARFB, CLARFT, CUNG2L, XERBLA
+ EXTERNAL CLARFB0C2, CLARFT, CUNG2L, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -178,7 +172,11 @@ SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
LWKOPT = 1
ELSE
NB = ILAENV( 1, 'CUNGQL', ' ', M, N, K, -1 )
- LWKOPT = N*NB
+*
+* Only need a workspace for cung2l in case of bailout
+* and for the panel factorization
+*
+ LWKOPT = N
END IF
WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
*
@@ -201,88 +199,77 @@ SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
END IF
*
NBMIN = 2
- NX = 0
+ NX = MAX( 0, ILAENV( 3, 'CUNGQL', ' ', M, N, K, -1 ))
IWS = N
- IF( NB.GT.1 .AND. NB.LT.K ) THEN
*
-* Determine when to cross over from blocked to unblocked code.
-*
- NX = MAX( 0, ILAENV( 3, 'CUNGQL', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
+ IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Determine if workspace is large enough for blocked code.
+* We use blocked code for the entire construction
*
- LDWORK = N
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
+ KK = K
+ ELSE
+ KK = 0
+ END IF
*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
+* Possibly bail to the unblocked code.
*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'CUNGQL', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
+ IF( KK.EQ.0 ) THEN
+ CALL CUNG2L( M, N, K, A, LDA, TAU, WORK, IINFO )
END IF
*
- IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+ IF( KK.GT.0 ) THEN
*
-* Use blocked code after the first block.
-* The last kk columns are handled by the block method.
+* Factor the first block assuming that our first application
+* will be on the Identity matrix
*
- KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
+ I = 1
+ IB = NB
*
-* Set A(m-kk+1:m,1:n-kk) to zero.
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- DO 20 J = 1, N - KK
- DO 10 I = M - KK + 1, M
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
- ELSE
- KK = 0
- END IF
+ CALL CLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+ $ A( 1, N-K+I ), LDA, TAU( I ),
+ $ A( M-K+I, N-K+I ), LDA)
*
-* Use unblocked code for the first or only block.
+* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+* Exploit the fact that we are applying to an identity
*
- CALL CUNG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+ CALL CLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward',
+ $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I),
+ $ LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
*
- IF( KK.GT.0 ) THEN
+* Apply H to rows 1:m-k+i+ib-1 of current block
*
-* Use blocked code
+ CALL CUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
+ $ TAU( I ), WORK, IINFO )
+
+* Use blocked code on the remaining blocks if there are any.
*
- DO 50 I = K - KK + 1, K, NB
- IB = MIN( NB, K-I+1 )
- IF( N-K+I.GT.1 ) THEN
+ DO I = NB+1, K, NB
*
-* Form the triangular factor of the block reflector
-* H = H(i+ib-1) . . . H(i+1) H(i)
+* The last block may be less than size NB
*
- CALL CLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
- $ A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK )
+ IB = MIN(NB, K-I+1)
*
-* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- CALL CLARFB( 'Left', 'No transpose', 'Backward',
- $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
- $ A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA,
- $ WORK( IB+1 ), LDWORK )
- END IF
+ CALL CLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+ $ A( 1, N-K+I ), LDA, TAU( I ),
+ $ A( M-K+I, N-K+I ), LDA )
+*
+* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+*
+ CALL CLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
+ $ A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
*
* Apply H to rows 1:m-k+i+ib-1 of current block
*
CALL CUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
$ TAU( I ), WORK, IINFO )
-*
-* Set rows m-k+i+ib:m of current block to zero
-*
- DO 40 J = N - K + I, N - K + I + IB - 1
- DO 30 L = M - K + I + IB, M
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ END DO
END IF
*
WORK( 1 ) = CMPLX( IWS )
diff --git a/SRC/cungqr.f b/SRC/cungqr.f
index eb49d2fed..3d7aaceb9 100644
--- a/SRC/cungqr.f
+++ b/SRC/cungqr.f
@@ -95,8 +95,6 @@
*> \verbatim
*> LWORK is INTEGER
*> The dimension of the array WORK. LWORK >= max(1,N).
-*> For optimum performance LWORK >= N*NB, where NB is the
-*> optimal blocksize.
*>
*> If LWORK = -1, then a workspace query is assumed; the routine
*> only calculates the optimal size of the WORK array, returns
@@ -137,17 +135,13 @@ SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
* =====================================================================
*
-* .. Parameters ..
- COMPLEX ZERO
- PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) )
-* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
- $ LWKOPT, NB, NBMIN, NX
+ INTEGER I, IB, IINFO, IWS, KI, KK, LWKOPT, NB,
+ $ NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL CLARFB, CLARFT, CUNG2R, XERBLA
+ EXTERNAL CLARFB0C2, CLARFT, CUNG2R, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -163,7 +157,11 @@ SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
INFO = 0
NB = ILAENV( 1, 'CUNGQR', ' ', M, N, K, -1 )
- LWKOPT = MAX( 1, N )*NB
+*
+* Only need a workspace for zung2r in case of bailout
+* and for the panel factorization
+*
+ LWKOPT = MAX( 1, N )
WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
LQUERY = ( LWORK.EQ.-1 )
IF( M.LT.0 ) THEN
@@ -192,92 +190,102 @@ SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
END IF
*
NBMIN = 2
- NX = 0
+* Determine when to cross over from unblocked to blocked
+ NX = MAX( 0, ILAENV( 3, 'CUNGQR', ' ', M, N, K, -1 ) )
IWS = N
- IF( NB.GT.1 .AND. NB.LT.K ) THEN
*
-* Determine when to cross over from blocked to unblocked code.
-*
- NX = MAX( 0, ILAENV( 3, 'CUNGQR', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
+ IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Determine if workspace is large enough for blocked code.
+* Treat the last NB block starting at KK+1 specially then use our blocking
+* method from the block starting at KI+1 to 1
*
- LDWORK = N
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
+ KI = K - 2 * NB
+ KK = K - NB
+ ELSE
+ KK = 0
+ END IF
*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
+* Potentially bail to the unblocked code.
*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'CUNGQR', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
+ IF( KK.EQ.0 ) THEN
+ CALL CUNG2R( M, N, K, A, LDA, TAU, WORK, IINFO )
END IF
*
- IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+ IF( KK.GT.0 ) THEN
*
-* Use blocked code after the last block.
-* The first kk columns are handled by the block method.
+* Factor the last block assuming that our first application
+* will be on the Identity matrix
*
- KI = ( ( K-NX-1 ) / NB )*NB
- KK = MIN( K, KI+NB )
+ I = KK + 1
+ IB = NB
*
-* Set A(1:kk,kk+1:n) to zero.
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- DO 20 J = KK + 1, N
- DO 10 I = 1, KK
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
- ELSE
- KK = 0
- END IF
+ CALL CLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
*
-* Use unblocked code for the last or only block.
+* Apply H to A(i:m,i+ib:n) from the left
+* Exploit the fact that we are applying to an identity
*
- IF( KK.LT.N )
- $ CALL CUNG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
- $ TAU( KK+1 ), WORK, IINFO )
+ CALL CLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward',
+ $ 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I),
+ $ LDA, A(I,I+IB), LDA)
*
- IF( KK.GT.0 ) THEN
+* Apply H to rows i:m of current block
*
-* Use blocked code
+ CALL CUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, IINFO)
*
- DO 50 I = KI + 1, 1, -NB
- IB = MIN( NB, K-I+1 )
- IF( I+IB.LE.N ) THEN
+* Use our standard blocking method after the last block
*
-* Form the triangular factor of the block reflector
-* H = H(i) H(i+1) . . . H(i+ib-1)
+ DO I = KI + 1, 1, -NB
+ IB = NB
*
- CALL CLARFT( 'Forward', 'Columnwise', M-I+1, IB,
- $ A( I, I ), LDA, TAU( I ), WORK, LDWORK )
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
-* Apply H to A(i:m,i+ib:n) from the left
+ CALL CLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
*
- CALL CLARFB( 'Left', 'No transpose', 'Forward',
- $ 'Columnwise', M-I+1, N-I-IB+1, IB,
- $ A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ),
- $ LDA, WORK( IB+1 ), LDWORK )
- END IF
+* Apply H to A(i:m,i+ib:n) from the left
+*
+ CALL CLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
*
* Apply H to rows i:m of current block
*
- CALL CUNG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ),
- $ WORK,
- $ IINFO )
+ CALL CUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
+ $ IINFO)
+ END DO
+*
+* This checks for if K was a perfect multiple of NB
+* so that we only have a special case for the last block when
+* necessary
+*
+ IF(I.LT.1) THEN
+ IB = I + NB - 1
+ I = 1
*
-* Set rows 1:i-1 of current block to zero
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- DO 40 J = I, I + IB - 1
- DO 30 L = 1, I - 1
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ CALL CLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
+*
+* Apply H to A(i:m,i+ib:n) from the left
+*
+ CALL CLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
+*
+* Apply H to rows i:m of current block
+*
+ CALL CUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
+ $ IINFO)
+ END IF
END IF
*
WORK( 1 ) = SROUNDUP_LWORK(IWS)
diff --git a/SRC/cungrq.f b/SRC/cungrq.f
index 7e812b839..4a9b2fe51 100644
--- a/SRC/cungrq.f
+++ b/SRC/cungrq.f
@@ -137,17 +137,13 @@ SUBROUTINE CUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
* =====================================================================
*
-* .. Parameters ..
- COMPLEX ZERO
- PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) )
-* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, II, IINFO, IWS, J, KK, L, LDWORK,
+ INTEGER I, IB, II, IINFO, IWS, KK, LDWORK,
$ LWKOPT, NB, NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL CLARFB, CLARFT, CUNGR2, XERBLA
+ EXTERNAL CLARFB0C2, CLARFT, CUNGR2, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -231,61 +227,67 @@ SUBROUTINE CUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
* Use blocked code after the first block.
* The last kk rows are handled by the block method.
*
- KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
-*
-* Set A(1:m-kk,n-kk+1:n) to zero.
-*
- DO 20 J = N - KK + 1, N
- DO 10 I = 1, M - KK
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
+ KK = K
ELSE
KK = 0
END IF
*
-* Use unblocked code for the first or only block.
+* Potentially bail to the unblocked code
*
- CALL CUNGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+ IF( KK.EQ.0 ) THEN
+ CALL CUNGR2( M, N, K, A, LDA, TAU, WORK, IINFO )
+ END IF
*
IF( KK.GT.0 ) THEN
*
-* Use blocked code
+* Factor the first block assuming that our first application
+* will be on the Identity matrix
*
- DO 50 I = K - KK + 1, K, NB
- IB = MIN( NB, K-I+1 )
- II = M - K + I
- IF( II.GT.1 ) THEN
+ I = 1
+ IB = NB
+ II = M - K + I
*
-* Form the triangular factor of the block reflector
-* H = H(i+ib-1) . . . H(i+1) H(i)
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- CALL CLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
- $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+ CALL CLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+ $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
*
-* Apply H**H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
*
- CALL CLARFB( 'Right', 'Conjugate transpose',
- $ 'Backward',
- $ 'Rowwise', II-1, N-K+I+IB-1, IB, A( II, 1 ),
- $ LDA, WORK, LDWORK, A, LDA, WORK( IB+1 ),
- $ LDWORK )
- END IF
+ CALL CLARFB0C2(.TRUE., 'Right', 'Conjugate', 'Backward',
+ $ 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK,
+ $ LDWORK, A, LDA)
*
-* Apply H**H to columns 1:n-k+i+ib-1 of current block
+* Apply H**T to columns 1:n-k+i+ib-1 of current block
*
- CALL CUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
- $ TAU( I ),
- $ WORK, IINFO )
+ CALL CUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
+ $ TAU( I ), WORK, IINFO )
+
+ DO I = NB+1, K, NB
*
-* Set columns n-k+i+ib:n of current block to zero
+* The last block may be less than size NB
*
- DO 40 L = N - K + I + IB, N
- DO 30 J = II, II + IB - 1
- A( J, L ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ IB = MIN(NB, K-I+1)
+ II = M - K + I
+*
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
+*
+ CALL CLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+ $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+*
+* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+*
+ CALL CLARFB0C2(.FALSE., 'Right', 'Conjugate',
+ $ 'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB,
+ $ A(II,1), LDA, WORK, LDWORK, A, LDA)
+*
+* Apply H**T to columns 1:n-k+i+ib-1 of current block
+*
+ CALL CUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
+ $ TAU( I ), WORK, IINFO )
+ END DO
END IF
*
WORK( 1 ) = SROUNDUP_LWORK(IWS)
diff --git a/SRC/dlarfb0c2.f b/SRC/dlarfb0c2.f
new file mode 100644
index 000000000..493580e09
--- /dev/null
+++ b/SRC/dlarfb0c2.f
@@ -0,0 +1,555 @@
+*> \brief \b DLARFB0C2 applies a block reflector or its transpose to a
+* rectangular matrix with a 0 block while constructing the explicit Q factor
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE DLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+* $ K, V, LDV, T, LDT, C, LDC)
+* ! Scalar arguments
+* INTEGER M, N, K, LDV, LDC, LDT
+* CHARACTER SIDE, TRANS, DIRECT, STOREV
+* ! True means that we are assuming C2 is the identity matrix
+* ! and thus don't reference whatever is present in C2
+* ! at the beginning.
+* LOGICAL C2I
+* ! Array arguments
+* DOUBLE PRECISION V(LDV,*), C(LDC,*), T(LDT,*)
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DLARFB0C2 applies a real block reflector H or its transpose H**T to a
+*> real m by n matrix C with a 0 block, while computing the explicit Q factor
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] C2I
+*> \verbatim
+*> C2I is LOGICAL
+*> = .TRUE.: Assume the nonzero block of C is the identity matrix
+*> = .FALSE.: Use existing data in the nonzero block of C
+*> \endverbatim
+*>
+*> \param[in] SIDE
+*> \verbatim
+*> SIDE is CHARACTER*1
+*> = 'L': apply H or H**T from the Left
+*> = 'R': apply H or H**T from the Right
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*> TRANS is CHARACTER*1
+*> = 'N': apply H (No transpose)
+*> = 'T': apply H**T (Transpose)
+*> \endverbatim
+*>
+*> \param[in] DIRECT
+*> \verbatim
+*> DIRECT is CHARACTER*1
+*> Indicates how H is formed from a product of elementary
+*> reflectors
+*> = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*> = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*> STOREV is CHARACTER*1
+*> Indicates how the vectors which define the elementary
+*> reflectors are stored:
+*> = 'C': Columnwise
+*> = 'R': Rowwise
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*> K is INTEGER
+*> The order of the matrix T (= the number of elementary
+*> reflectors whose product defines the block reflector).
+*> If SIDE = 'L', M >= K >= 0;
+*> if SIDE = 'R', N >= K >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*> V is DOUBLE PRECISION array, dimension
+*> (LDV,K) if STOREV = 'C'
+*> (LDV,M) if STOREV = 'R' and SIDE = 'L'
+*> (LDV,N) if STOREV = 'R' and SIDE = 'R'
+*> The matrix V. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*> LDV is INTEGER
+*> The leading dimension of the array V.
+*> If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+*> if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+*> if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*> T is DOUBLE PRECISION array, dimension (LDT,K)
+*> The triangular k by k matrix T in the representation of the
+*> block reflector.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*> LDT is INTEGER
+*> The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*> C is DOUBLE PRECISION array, dimension (LDC,N)
+*> On entry, the m by n matrix C.
+*> On exit, C is overwritten by H*C or H**T*C or C*H or C*H**T.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*> LDC is INTEGER
+*> The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup larfb
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> The shape of the matrix V and the storage of the vectors which define
+*> the H(i) is best illustrated by the following example with n = 5 and
+*> k = 3. The triangular part of V (including its diagonal) is not
+*> referenced.
+*>
+*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R':
+*>
+*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 )
+*> ( v1 1 ) ( 1 v2 v2 v2 )
+*> ( v1 v2 1 ) ( 1 v3 v3 )
+*> ( v1 v2 v3 )
+*> ( v1 v2 v3 )
+*>
+*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R':
+*>
+*> V = ( v1 v2 v3 ) V = ( v1 v1 1 )
+*> ( v1 v2 v3 ) ( v2 v2 v2 1 )
+*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 )
+*> ( 1 v3 )
+*> ( 1 )
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE DLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+ $ K, V, LDV, T, LDT, C, LDC)
+ ! Scalar arguments
+ INTEGER M, N, K, LDV, LDC, LDT
+ CHARACTER SIDE, TRANS, DIRECT, STOREV
+ ! True means that we are assuming C2 is the identity matrix
+ ! and thus don't reference whatever is present in C2
+ ! at the beginning.
+ LOGICAL C2I
+ ! Array arguments
+ DOUBLE PRECISION V(LDV,*), C(LDC,*), T(LDT,*)
+ ! Local scalars
+ LOGICAL QR, LQ, QL, DIRF, COLV, SIDEL, SIDER,
+ $ TRANST
+ INTEGER I, J
+ ! External functions
+ LOGICAL LSAME
+ EXTERNAL LSAME
+ ! External subroutines
+ EXTERNAL DGEMM, DTRMM, XERBLA
+ ! Parameters
+ DOUBLE PRECISION ONE, ZERO, NEG_ONE
+ PARAMETER(ONE=1.0D+0, ZERO = 0.0D+0, NEG_ONE = -1.0D+0)
+
+ ! Beginning of executable statements
+ ! Convert our character flags to logical values
+ DIRF = LSAME(DIRECT,'F')
+ COLV = LSAME(STOREV,'C')
+ SIDEL = LSAME(SIDE,'L')
+ SIDER = LSAME(SIDE,'R')
+ TRANST = LSAME(TRANS,'T')
+
+ ! Determine which of the 4 modes are using.
+ ! QR is when we store the reflectors column by column and have the
+ ! 'first' reflector stored in the first column
+ QR = DIRF.AND.COLV
+
+ ! LQ is when we store the reflectors row by row and have the
+ ! 'first' reflector stored in the first row
+ LQ = DIRF.AND.(.NOT.COLV)
+
+ ! QL is when we store the reflectors column by column and have the
+ ! 'first' reflector stored in the last column
+ QL = (.NOT.DIRF).AND.COLV
+
+ ! RQ is when we store the reflectors row by row and have the
+ ! 'first' reflector stored in the last row
+ ! RQ = (.NOT.DIRF).AND.(.NOT.COLV)
+ ! Since we have exactly one of these 4 modes, we don't need to actually
+ ! store the value of RQ, instead we assume this is the case if we fail
+ ! the above 3 checks.
+
+ IF (QR) THEN
+ ! We are computing C = HC = (I - VTV')C
+ ! Where: V = [ V1 ] and C = [ C1 ]
+ ! [ V2 ] [ C2 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{M-K\times K}
+ ! C1=0\in\R^{K\times N}
+ ! C2\in\R^{M-K\times N}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0 ]
+ ! [ V2 ] [ C2 ]
+ ! = [ 0 ] - [ V1 ]T*V2'*C2
+ ! [ C2 ] [ V2 ]
+ !
+ ! = [ 0 ] - [ V1*T*V2'*C2 ]
+ ! [ C2 ] [ V2*T*V2'*C2 ]
+ !
+ ! = [ V1*T*V2'*C2 ]
+ ! [ C2 - V2*T*V2'*C2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = V2'*C2
+ ! C1 = T*C1
+ ! C2 = C2 - V2*C1
+ ! C1 = -V1*C1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDEL ) THEN
+ CALL XERBLA('DLARFB0C2', 2)
+ RETURN
+ ELSE IF(TRANST) THEN
+ CALL XERBLA('DLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = V2'*C2
+ !
+ IF (C2I) THEN
+ DO J = 1, N
+ DO I = 1, K
+ C(I,J) = V(K+J,I)
+ END DO
+ END DO
+ ELSE
+ CALL DGEMM('Transpose', 'No Transpose', K, N, M - K,
+ $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO,
+ $ C, LDC)
+ END IF
+ !
+ ! C1 = T*C1
+ !
+ CALL DTRMM('Left', 'Upper', 'No Transpose', 'Non-unit',
+ $ K, N, ONE, T, LDT, C, LDC)
+ !
+ ! C2 = C2 - V2*C1 = -V2*C1 + C2
+ !
+ IF (C2I) THEN
+ CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO,
+ $ C(K+1,1), LDC)
+ DO I = 1, N
+ C(K+I,I) = C(K+I,I) + ONE
+ END DO
+ ELSE
+ CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE,
+ $ C(K+1,1), LDC)
+ END IF
+ !
+ ! C1 = -V1*C1
+ !
+ CALL DTRMM('Left', 'Lower', 'No Transpose', 'Unit',
+ $ K, N, NEG_ONE, V, LDV, C, LDC)
+ ELSE IF (LQ) THEN
+ ! We are computing C = CH' = C(I-V'T'V)
+ ! Where: V = [ V1 V2 ] and C = [ C1 C2 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{K\times N-K}
+ ! C1=0\in\R^{M\times K}
+ ! C2\in\R^{M\times N-K}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ])
+ ! [ V2' ]
+ !
+ ! = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ]
+ ! [ V2' ]
+ !
+ ! = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ]
+ !
+ ! = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = C2*V2'
+ ! C1 = C1*T'
+ ! C2 = C2 - C1*V2
+ ! C1 = -C1*V1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDER ) THEN
+ CALL XERBLA('DLARFB0C2', 2)
+ RETURN
+ ELSE IF(.NOT.TRANST) THEN
+ CALL XERBLA('DLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = C2*V2'
+ !
+ IF( C2I ) THEN
+ DO J = 1, K
+ DO I = 1, M
+ C(I,J) = V(J,K+I)
+ END DO
+ END DO
+ ELSE
+ CALL DGEMM('No Transpose', 'Transpose', M, K, N-K,
+ $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C,
+ $ LDC)
+ END IF
+ !
+ ! C1 = C1*T'
+ !
+ CALL DTRMM('Right', 'Upper', 'Transpose', 'Non-unit',
+ $ M, K, ONE, T, LDT, C, LDC)
+ !
+ ! C2 = C2 - C1*V2 = -C1*V2 + C2
+ !
+ IF( C2I ) THEN
+ CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1),
+ $ LDC)
+ DO I = 1, M
+ C(I,K+I) = C(I,K+I) + ONE
+ END DO
+ ELSE
+ CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1),
+ $ LDC)
+ END IF
+ !
+ ! C1 = -C1*V1
+ !
+ CALL DTRMM('Right', 'Upper', 'No Transpose', 'Unit',
+ $ M, K, NEG_ONE, V, LDV, C, LDC)
+ ELSE IF (QL) THEN
+ ! We are computing C = HC = (I - VTV')C
+ ! Where: V = [ V2 ] and C = [ C2 ]
+ ! [ V1 ] [ C1 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{M-K\times K}
+ ! C1=0\in\R^{K\times N}
+ ! C2\in\R^{M-K\times N}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ]
+ ! [ V1 ] [ 0 ]
+ !
+ ! = [ C2 ] - [ V2 ]T*V2'*C2
+ ! [ 0 ] [ V1 ]
+ !
+ ! = [ C2 ] - [ V2*T*V2'*C2 ]
+ ! [ 0 ] [ V1*T*V2'*C2 ]
+ !
+ ! = [ C2 - V2*T*V2'*C2 ]
+ ! [ - V1*T*V2'*C2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = V2'*C2
+ ! C1 = T*C1
+ ! C2 = C2 - V2*C1
+ ! C1 = -V1*C1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDEL ) THEN
+ CALL XERBLA('DLARFB0C2', 2)
+ RETURN
+ ELSE IF(TRANST) THEN
+ CALL XERBLA('DLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = V2'*C2
+ !
+ IF( C2I ) THEN
+ DO J = 1, N
+ DO I = 1, K
+ C(M-K+I,J) = V(J,I)
+ END DO
+ END DO
+ ELSE
+ CALL DGEMM('Transpose', 'No Transpose', K, N, M-K,
+ $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC)
+ END IF
+ !
+ ! C1 = T*C1
+ !
+ CALL DTRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+ $ K, N, ONE, T, LDT, C(M-K+1,1), LDC)
+ !
+ ! C2 = C2 - V2*C1 = -V2*C1 + C2
+ !
+ IF( C2I ) THEN
+ CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC)
+ DO I = 1, N
+ C(I,I) = C(I,I) + ONE
+ END DO
+ ELSE
+ CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC)
+ END IF
+ !
+ ! C1 = -V1*C1
+ !
+ CALL DTRMM('Left', 'Upper', 'No Transpose', 'Unit',
+ $ K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC)
+ ELSE ! IF (RQ) THEN
+ ! We are computing C = CH' = C(I-V'T'V)
+ ! Where: V = [ V2 V1] and C = [ C2 C1 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{K\times N-K}
+ ! C1=0\in\R^{M\times K}
+ ! C2\in\R^{M\times N-K}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ]
+ ! [ V1' ]
+ !
+ ! = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ]
+ ! [ V1' ]
+ !
+ ! = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ]
+ !
+ ! = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ]
+ !
+ ! = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = C2*V2'
+ ! C1 = C1*T'
+ ! C2 = C2 - C1*V2
+ ! C1 = -C1*V1
+ !
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDER ) THEN
+ CALL XERBLA('DLARFB0C2', 2)
+ RETURN
+ ELSE IF(.NOT.TRANST) THEN
+ CALL XERBLA('DLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = C2*V2'
+ !
+ IF( C2I ) THEN
+ DO J = 1, K
+ DO I = 1, M
+ C(I,N-K+J) = V(J,I)
+ END DO
+ END DO
+ ELSE
+ CALL DGEMM('No Transpose', 'Transpose', M, K, N-K,
+ $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC)
+ END IF
+ !
+ ! C1 = C1*T'
+ !
+ CALL DTRMM('Right', 'Lower', 'Transpose', 'Non-unit',
+ $ M, K, ONE, T, LDT, C(1, N-K+1), LDC)
+ !
+ ! C2 = C2 - C1*V2 = -C1*V2 + C2
+ !
+ IF( C2I ) THEN
+ CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC)
+ DO I = 1, M
+ C(I,I) = C(I,I) + ONE
+ END DO
+ ELSE
+ CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC)
+ END IF
+ !
+ ! C1 = -C1*V1
+ !
+ CALL DTRMM('Right', 'Lower', 'No Transpose', 'Unit',
+ $ M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC)
+ END IF
+ END SUBROUTINE
diff --git a/SRC/dorglq.f b/SRC/dorglq.f
index 71ff93fa6..c7f32afb8 100644
--- a/SRC/dorglq.f
+++ b/SRC/dorglq.f
@@ -5,6 +5,7 @@
* Online html documentation available at
* http://www.netlib.org/lapack/explore-html/
*
+*> \htmlonly
*> Download DORGLQ + dependencies
*>
*> [TGZ]
@@ -12,6 +13,7 @@
*> [ZIP]
*>
*> [TXT]
+*> \endhtmlonly
*
* Definition:
* ===========
@@ -136,17 +138,13 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
* =====================================================================
*
-* .. Parameters ..
- DOUBLE PRECISION ZERO
- PARAMETER ( ZERO = 0.0D+0 )
-* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
- $ LWKOPT, NB, NBMIN, NX
+ INTEGER I, IB, IINFO, IWS, KI, KK, LDWORK, LWKOPT,
+ $ NB, NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL DLARFB, DLARFT, DORGL2, XERBLA
+ EXTERNAL DLARFB0C2, DLARFT, DORGL2, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -161,7 +159,7 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
INFO = 0
NB = ILAENV( 1, 'DORGLQ', ' ', M, N, K, -1 )
- LWKOPT = MAX( 1, M )*NB
+ LWKOPT = MAX( 1, M ) * NB
WORK( 1 ) = LWKOPT
LQUERY = ( LWORK.EQ.-1 )
IF( M.LT.0 ) THEN
@@ -217,67 +215,90 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Use blocked code after the last block.
-* The first kk rows are handled by the block method.
-*
- KI = ( ( K-NX-1 ) / NB )*NB
- KK = MIN( K, KI+NB )
-*
-* Set A(kk+1:m,1:kk) to zero.
+* Handle the first block assuming we are applying to the
+* identity, then resume regular blocking method after
*
- DO 20 J = 1, KK
- DO 10 I = KK + 1, M
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
+ KI = K - 2 * NB
+ KK = K - NB
ELSE
KK = 0
END IF
*
-* Use unblocked code for the last or only block.
+* Potentially bail to the unblocked version
*
- IF( KK.LT.M )
- $ CALL DORGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
- $ TAU( KK+1 ), WORK, IINFO )
+ IF( KK.EQ.0 ) THEN
+ CALL DORGL2( M, N, K, A, LDA, TAU, WORK, IINFO )
+ END IF
*
IF( KK.GT.0 ) THEN
+ I = KK + 1
+ IB = NB
+*
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
+*
+ CALL DLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+ $ LDA, TAU( I ), WORK, LDWORK )
+*
+* Apply H**T to A(i+ib:m,i:n) from the right
+*
+ CALL DLARFB0C2(.TRUE., 'Right', 'Transpose', 'Forward',
+ $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK,
+ $ LDWORK, A(I+IB,I), LDA)
+*
+* Apply H**T to columns i:n of current block
+
+ CALL DORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+ $ WORK, IINFO )
*
* Use blocked code
*
- DO 50 I = KI + 1, 1, -NB
- IB = MIN( NB, K-I+1 )
- IF( I+IB.LE.M ) THEN
+ DO I = KI + 1, 1, -NB
+ IB = NB
*
-* Form the triangular factor of the block reflector
-* H = H(i) H(i+1) . . . H(i+ib-1)
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- CALL DLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I,
- $ I ),
- $ LDA, TAU( I ), WORK, LDWORK )
+ CALL DLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+ $ LDA, TAU( I ), WORK, LDWORK )
*
-* Apply H**T to A(i+ib:m,i:n) from the right
+* Apply H**T to A(i+ib:m,i:n) from the right
*
- CALL DLARFB( 'Right', 'Transpose', 'Forward',
- $ 'Rowwise',
- $ M-I-IB+1, N-I+1, IB, A( I, I ), LDA, WORK,
- $ LDWORK, A( I+IB, I ), LDA, WORK( IB+1 ),
- $ LDWORK )
- END IF
+ CALL DLARFB0C2(.FALSE., 'Right', 'Transpose', 'Forward',
+ $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK,
+ $ LDWORK, A(I+IB,I), LDA)
*
* Apply H**T to columns i:n of current block
-*
+
CALL DORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
- $ WORK,
- $ IINFO )
+ $ WORK, IINFO )
+ END DO
+*
+* This checks for if K was a perfect multiple of NB
+* so that we only have a special case for the last block when
+* necessary
+*
+ IF(I.LT.1) THEN
+ IB = I + NB - 1
+ I = 1
+*
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
+*
+ CALL DLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+ $ LDA, TAU( I ), WORK, LDWORK )
*
-* Set columns 1:i-1 of current block to zero
+* Apply H**T to A(i+ib:m,i:n) from the right
*
- DO 40 J = 1, I - 1
- DO 30 L = I, I + IB - 1
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ CALL DLARFB0C2(.FALSE., 'Right', 'Transpose', 'Forward',
+ $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK,
+ $ LDWORK, A(I+IB,I), LDA)
+*
+* Apply H**T to columns i:n of current block
+
+ CALL DORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+ $ WORK, IINFO )
+ END IF
END IF
*
WORK( 1 ) = IWS
diff --git a/SRC/dorgql.f b/SRC/dorgql.f
index c7bba63c2..378147844 100644
--- a/SRC/dorgql.f
+++ b/SRC/dorgql.f
@@ -5,6 +5,7 @@
* Online html documentation available at
* http://www.netlib.org/lapack/explore-html/
*
+*> \htmlonly
*> Download DORGQL + dependencies
*>
*> [TGZ]
@@ -12,6 +13,7 @@
*> [ZIP]
*>
*> [TXT]
+*> \endhtmlonly
*
* Definition:
* ===========
@@ -95,8 +97,6 @@
*> \verbatim
*> LWORK is INTEGER
*> The dimension of the array WORK. LWORK >= max(1,N).
-*> For optimum performance LWORK >= N*NB, where NB is the
-*> optimal blocksize.
*>
*> If LWORK = -1, then a workspace query is assumed; the routine
*> only calculates the optimal size of the WORK array, returns
@@ -137,17 +137,12 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
* =====================================================================
*
-* .. Parameters ..
- DOUBLE PRECISION ZERO
- PARAMETER ( ZERO = 0.0D+0 )
-* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT,
- $ NB, NBMIN, NX
+ INTEGER I, IB, IINFO, IWS, KK, LWKOPT, NB, NBMIN
* ..
* .. External Subroutines ..
- EXTERNAL DLARFB, DLARFT, DORG2L, XERBLA
+ EXTERNAL DLARFB0C2, DLARFT, DORG2L, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -177,7 +172,8 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
LWKOPT = 1
ELSE
NB = ILAENV( 1, 'DORGQL', ' ', M, N, K, -1 )
- LWKOPT = N*NB
+ ! Only need a workspace for calls to dorg2l
+ LWKOPT = N
END IF
WORK( 1 ) = LWKOPT
*
@@ -200,88 +196,77 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
END IF
*
NBMIN = 2
- NX = 0
IWS = N
- IF( NB.GT.1 .AND. NB.LT.K ) THEN
-*
-* Determine when to cross over from blocked to unblocked code.
*
- NX = MAX( 0, ILAENV( 3, 'DORGQL', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
+ IF( NB.GE.NBMIN .AND. NB.LT.K ) THEN
*
-* Determine if workspace is large enough for blocked code.
+* We want to use the blocking method as long as our matrix is big enough
*
- LDWORK = N
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
+ KK = K
+ ELSE
+ KK = 0
+ END IF
*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
+* Possibly bail to the unblocked code.
*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'DORGQL', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
+ IF( KK.EQ.0 ) THEN
+ CALL DORG2L( M, N, K, A, LDA, TAU, WORK, IINFO )
END IF
*
- IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+* Use our blocked code for everything
*
-* Use blocked code after the first block.
-* The last kk columns are handled by the block method.
+ IF( KK.GT.0 ) THEN
*
- KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
+* Factor the first block assuming that our first application
+* will be on the Identity matrix
*
-* Set A(m-kk+1:m,1:n-kk) to zero.
+ I = 1
+ IB = NB
*
- DO 20 J = 1, N - KK
- DO 10 I = M - KK + 1, M
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
- ELSE
- KK = 0
- END IF
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
-* Use unblocked code for the first or only block.
+ CALL DLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+ $ A( 1, N-K+I ), LDA, TAU( I ),
+ $ A( M-K+I, N-K+I ), LDA)
*
- CALL DORG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
*
- IF( KK.GT.0 ) THEN
+ CALL DLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward',
+ $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I),
+ $ LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
+*
+* Apply H to rows 1:m-k+i+ib-1 of current block
*
-* Use blocked code
+ CALL DORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
+ $ TAU( I ), WORK, IINFO )
+
+* Use blocked code on the remaining blocks if there are any.
*
- DO 50 I = K - KK + 1, K, NB
- IB = MIN( NB, K-I+1 )
- IF( N-K+I.GT.1 ) THEN
+ DO I = NB+1, K, NB
*
-* Form the triangular factor of the block reflector
-* H = H(i+ib-1) . . . H(i+1) H(i)
+* The last block may be less than size NB
*
- CALL DLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
- $ A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK )
+ IB = MIN(NB, K-I+1)
*
-* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- CALL DLARFB( 'Left', 'No transpose', 'Backward',
- $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
- $ A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA,
- $ WORK( IB+1 ), LDWORK )
- END IF
+ CALL DLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+ $ A( 1, N-K+I ), LDA, TAU( I ),
+ $ A( M-K+I, N-K+I ), LDA )
+*
+* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+*
+ CALL DLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
+ $ A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
*
* Apply H to rows 1:m-k+i+ib-1 of current block
*
CALL DORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
$ TAU( I ), WORK, IINFO )
-*
-* Set rows m-k+i+ib:m of current block to zero
-*
- DO 40 J = N - K + I, N - K + I + IB - 1
- DO 30 L = M - K + I + IB, M
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ END DO
END IF
*
WORK( 1 ) = IWS
diff --git a/SRC/dorgqr.f b/SRC/dorgqr.f
index 83e26588e..8f3c44278 100644
--- a/SRC/dorgqr.f
+++ b/SRC/dorgqr.f
@@ -5,6 +5,7 @@
* Online html documentation available at
* http://www.netlib.org/lapack/explore-html/
*
+*> \htmlonly
*> Download DORGQR + dependencies
*>
*> [TGZ]
@@ -12,6 +13,7 @@
*> [ZIP]
*>
*> [TXT]
+*> \endhtmlonly
*
* Definition:
* ===========
@@ -95,8 +97,6 @@
*> \verbatim
*> LWORK is INTEGER
*> The dimension of the array WORK. LWORK >= max(1,N).
-*> For optimum performance LWORK >= N*NB, where NB is the
-*> optimal blocksize.
*>
*> If LWORK = -1, then a workspace query is assumed; the routine
*> only calculates the optimal size of the WORK array, returns
@@ -119,7 +119,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
-*> \ingroup ungqr
+*> \ingroup doubleOTHERcomputational
*
* =====================================================================
SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
@@ -137,17 +137,13 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
* =====================================================================
*
-* .. Parameters ..
- DOUBLE PRECISION ZERO
- PARAMETER ( ZERO = 0.0D+0 )
-* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
- $ LWKOPT, NB, NBMIN, NX
+ INTEGER I, IB, IINFO, IWS, KI, KK, LWKOPT,
+ $ NB, NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL DLARFB, DLARFT, DORG2R, XERBLA
+ EXTERNAL DLARFB0C2, DLARFT, DORG2R, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -162,7 +158,9 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
INFO = 0
NB = ILAENV( 1, 'DORGQR', ' ', M, N, K, -1 )
- LWKOPT = MAX( 1, N )*NB
+ ! Only need a workspace for dorg2r in case of bailout and
+ ! for the panel factorization
+ LWKOPT = MAX( 1, N )
WORK( 1 ) = LWKOPT
LQUERY = ( LWORK.EQ.-1 )
IF( M.LT.0 ) THEN
@@ -191,92 +189,93 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
END IF
*
NBMIN = 2
- NX = 0
+ NX = MAX(0, ILAENV(3, 'SORGQR', ' ', M, N, K, -1))
IWS = N
- IF( NB.GT.1 .AND. NB.LT.K ) THEN
-*
-* Determine when to cross over from blocked to unblocked code.
*
- NX = MAX( 0, ILAENV( 3, 'DORGQR', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
+ IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Determine if workspace is large enough for blocked code.
+* Handle the first block assuming we are applying to the
+* identity, then resume regular blocking method after
*
- LDWORK = N
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
+ KI = K - 2 * NB
+ KK = K - NB
+ ELSE
+ KK = 0
+ END IF
*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
+* Potentially bail to the unblocked code.
*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'DORGQR', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
+ IF( KK.EQ.0 ) THEN
+ CALL DORG2R( M, N, K, A, LDA, TAU, WORK, IINFO )
END IF
*
- IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+ IF( KK.GT.0 ) THEN
+ I = KK + 1
+ IB = NB
*
-* Use blocked code after the last block.
-* The first kk columns are handled by the block method.
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- KI = ( ( K-NX-1 ) / NB )*NB
- KK = MIN( K, KI+NB )
+ CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
*
-* Set A(1:kk,kk+1:n) to zero.
+* Apply H to A(i:m,i+ib:n) from the left
*
- DO 20 J = KK + 1, N
- DO 10 I = 1, KK
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
- ELSE
- KK = 0
- END IF
+ CALL DLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward',
+ $ 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I),
+ $ LDA, A(I,I+IB), LDA)
*
-* Use unblocked code for the last or only block.
+* Apply H to rows i:m of current block
*
- IF( KK.LT.N )
- $ CALL DORG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
- $ TAU( KK+1 ), WORK, IINFO )
+ CALL DORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, IINFO)
+ DO I = KI + 1, 1, -NB
+ IB = NB
*
- IF( KK.GT.0 ) THEN
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
-* Use blocked code
+ CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
*
- DO 50 I = KI + 1, 1, -NB
- IB = MIN( NB, K-I+1 )
- IF( I+IB.LE.N ) THEN
+* Apply H to A(i:m,i+ib:n) from the left
*
-* Form the triangular factor of the block reflector
-* H = H(i) H(i+1) . . . H(i+ib-1)
+ CALL DLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
*
- CALL DLARFT( 'Forward', 'Columnwise', M-I+1, IB,
- $ A( I, I ), LDA, TAU( I ), WORK, LDWORK )
+* Apply H to rows i:m of current block
*
-* Apply H to A(i:m,i+ib:n) from the left
+ CALL DORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
+ $ IINFO)
+ END DO
*
- CALL DLARFB( 'Left', 'No transpose', 'Forward',
- $ 'Columnwise', M-I+1, N-I-IB+1, IB,
- $ A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ),
- $ LDA, WORK( IB+1 ), LDWORK )
- END IF
+* This checks for if K was a perfect multiple of NB
+* so that we only have a special case for the last block when
+* necessary
*
-* Apply H to rows i:m of current block
+ IF(I.LT.1) THEN
+ IB = I + NB - 1
+ I = 1
*
- CALL DORG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ),
- $ WORK,
- $ IINFO )
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
-* Set rows 1:i-1 of current block to zero
+ CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
*
- DO 40 J = I, I + IB - 1
- DO 30 L = 1, I - 1
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+* Apply H to A(i:m,i+ib:n) from the left
+*
+ CALL DLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
+*
+* Apply H to rows i:m of current block
+*
+ CALL DORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
+ $ IINFO)
+ END IF
END IF
*
WORK( 1 ) = IWS
diff --git a/SRC/dorgrq.f b/SRC/dorgrq.f
index 126489bbe..811c2cdb5 100644
--- a/SRC/dorgrq.f
+++ b/SRC/dorgrq.f
@@ -5,6 +5,7 @@
* Online html documentation available at
* http://www.netlib.org/lapack/explore-html/
*
+*> \htmlonly
*> Download DORGRQ + dependencies
*>
*> [TGZ]
@@ -12,6 +13,7 @@
*> [ZIP]
*>
*> [TXT]
+*> \endhtmlonly
*
* Definition:
* ===========
@@ -137,17 +139,13 @@ SUBROUTINE DORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
* =====================================================================
*
-* .. Parameters ..
- DOUBLE PRECISION ZERO
- PARAMETER ( ZERO = 0.0D+0 )
-* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, II, IINFO, IWS, J, KK, L, LDWORK,
+ INTEGER I, IB, II, IINFO, IWS, KK, LDWORK,
$ LWKOPT, NB, NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL DLARFB, DLARFT, DORGR2, XERBLA
+ EXTERNAL DLARFB0C2, DLARFT, DORGR2, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -227,63 +225,70 @@ SUBROUTINE DORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Use blocked code after the first block.
-* The last kk rows are handled by the block method.
-*
- KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
+* We want to use the blocking method as long as our matrix is big enough
+* and it's deemed worthwhile with the extra memory allocations
*
-* Set A(1:m-kk,n-kk+1:n) to zero.
-*
- DO 20 J = N - KK + 1, N
- DO 10 I = 1, M - KK
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
+ KK = K
ELSE
KK = 0
END IF
*
-* Use unblocked code for the first or only block.
+* Potentially bail to the unblocked code
*
- CALL DORGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+ IF( KK.EQ.0 ) THEN
+ CALL DORGR2( M, N, K, A, LDA, TAU, WORK, IINFO )
+ END IF
*
IF( KK.GT.0 ) THEN
*
-* Use blocked code
+* Factor the first block assuming that our first application
+* will be on the Identity matrix
*
- DO 50 I = K - KK + 1, K, NB
- IB = MIN( NB, K-I+1 )
- II = M - K + I
- IF( II.GT.1 ) THEN
+ I = 1
+ IB = NB
+ II = M - K + I
*
-* Form the triangular factor of the block reflector
-* H = H(i+ib-1) . . . H(i+1) H(i)
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- CALL DLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
- $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+ CALL DLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+ $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
*
-* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
*
- CALL DLARFB( 'Right', 'Transpose', 'Backward',
- $ 'Rowwise',
- $ II-1, N-K+I+IB-1, IB, A( II, 1 ), LDA, WORK,
- $ LDWORK, A, LDA, WORK( IB+1 ), LDWORK )
- END IF
+ CALL DLARFB0C2(.TRUE., 'Right', 'Transpose', 'Backward',
+ $ 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK,
+ $ LDWORK, A, LDA)
*
* Apply H**T to columns 1:n-k+i+ib-1 of current block
*
- CALL DORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
- $ TAU( I ),
- $ WORK, IINFO )
+ CALL DORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
+ $ TAU( I ), WORK, IINFO )
+
+ DO I = NB + 1, K, NB
+*
+* The last block may be less than size NB
*
-* Set columns n-k+i+ib:n of current block to zero
+ IB = MIN(NB, K-I+1)
+ II = M - K + I
+*
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
+*
+ CALL DLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+ $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+*
+* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
*
- DO 40 L = N - K + I + IB, N
- DO 30 J = II, II + IB - 1
- A( J, L ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ CALL DLARFB0C2(.FALSE., 'Right', 'Transpose',
+ $ 'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB,
+ $ A(II,1), LDA, WORK, LDWORK, A, LDA)
+*
+* Apply H**T to columns 1:n-k+i+ib-1 of current block
+*
+ CALL DORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
+ $ TAU( I ), WORK, IINFO )
+ END DO
END IF
*
WORK( 1 ) = IWS
diff --git a/SRC/lapack_64.h b/SRC/lapack_64.h
index e8000bf2c..e8d792094 100644
--- a/SRC/lapack_64.h
+++ b/SRC/lapack_64.h
@@ -331,6 +331,7 @@
#define CLARF1F CLARF1F_64
#define CLARF1L CLARF1L_64
#define CLARFB CLARFB_64
+#define CLARFB0C2 CLARFB0C2_64
#define CLARFB_GETT CLARFB_GETT_64
#define CLARFG CLARFG_64
#define CLARFGP CLARFGP_64
@@ -805,6 +806,7 @@
#define DLARF1F DLARF1F_64
#define DLARF1L DLARF1L_64
#define DLARFB DLARFB_64
+#define DLARFB0C2 DLARFB0C2_64
#define DLARFB_GETT DLARFB_GETT_64
#define DLARFG DLARFG_64
#define DLARFGP DLARFGP_64
@@ -1400,6 +1402,7 @@
#define SLARF1F SLARF1F_64
#define SLARF1L SLARF1L_64
#define SLARFB SLARFB_64
+#define SLARFB0C2 SLARFB0C2_64
#define SLARFB_GETT SLARFB_GETT_64
#define SLARFG SLARFG_64
#define SLARFGP SLARFGP_64
@@ -2046,6 +2049,7 @@
#define ZLARF1F ZLARF1F_64
#define ZLARF1L ZLARF1L_64
#define ZLARFB ZLARFB_64
+#define ZLARFB0C2 ZLARFB0C2_64
#define ZLARFB_GETT ZLARFB_GETT_64
#define ZLARFG ZLARFG_64
#define ZLARFGP ZLARFGP_64
diff --git a/SRC/slarfb0c2.f b/SRC/slarfb0c2.f
new file mode 100644
index 000000000..e3c222904
--- /dev/null
+++ b/SRC/slarfb0c2.f
@@ -0,0 +1,556 @@
+*> \brief \b SLARFB0C2 applies a block reflector or its transpose to a
+* rectangular matrix with a 0 block while constructing the explicit Q factor
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+* $ K, V, LDV, T, LDT, C, LDC)
+* ! Scalar arguments
+* INTEGER M, N, K, LDV, LDC, LDT
+* CHARACTER SIDE, TRANS, DIRECT, STOREV
+* ! True means that we are assuming C2 is the identity matrix
+* ! and thus don't reference whatever is present in C2
+* ! at the beginning.
+* LOGICAL C2I
+* ! Array arguments
+* REAL V(LDV,*), C(LDC,*), T(LDT,*)
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SLARFB0C2 applies a real block reflector H or its transpose H**T to a
+*> real m by n matrix C with a 0 block, while computing the explicit Q factor
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] C2I
+*> \verbatim
+*> C2I is LOGICAL
+*> = .TRUE.: Assume the nonzero block of C is the identity matrix
+*> = .FALSE.: Use existing data in the nonzero block of C
+*> \endverbatim
+*>
+*> \param[in] SIDE
+*> \verbatim
+*> SIDE is CHARACTER*1
+*> = 'L': apply H or H**T from the Left
+*> = 'R': apply H or H**T from the Right
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*> TRANS is CHARACTER*1
+*> = 'N': apply H (No transpose)
+*> = 'T': apply H**T (Transpose)
+*> \endverbatim
+*>
+*> \param[in] DIRECT
+*> \verbatim
+*> DIRECT is CHARACTER*1
+*> Indicates how H is formed from a product of elementary
+*> reflectors
+*> = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*> = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*> STOREV is CHARACTER*1
+*> Indicates how the vectors which define the elementary
+*> reflectors are stored:
+*> = 'C': Columnwise
+*> = 'R': Rowwise
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*> K is INTEGER
+*> The order of the matrix T (= the number of elementary
+*> reflectors whose product defines the block reflector).
+*> If SIDE = 'L', M >= K >= 0;
+*> if SIDE = 'R', N >= K >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*> V is REAL array, dimension
+*> (LDV,K) if STOREV = 'C'
+*> (LDV,M) if STOREV = 'R' and SIDE = 'L'
+*> (LDV,N) if STOREV = 'R' and SIDE = 'R'
+*> The matrix V. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*> LDV is INTEGER
+*> The leading dimension of the array V.
+*> If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+*> if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+*> if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*> T is REAL array, dimension (LDT,K)
+*> The triangular k by k matrix T in the representation of the
+*> block reflector.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*> LDT is INTEGER
+*> The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*> C is REAL array, dimension (LDC,N)
+*> On entry, the m by n matrix C.
+*> On exit, C is overwritten by H*C or H**T*C or C*H or C*H**T.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*> LDC is INTEGER
+*> The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup larfb
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> The shape of the matrix V and the storage of the vectors which define
+*> the H(i) is best illustrated by the following example with n = 5 and
+*> k = 3. The triangular part of V (including its diagonal) is not
+*> referenced.
+*>
+*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R':
+*>
+*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 )
+*> ( v1 1 ) ( 1 v2 v2 v2 )
+*> ( v1 v2 1 ) ( 1 v3 v3 )
+*> ( v1 v2 v3 )
+*> ( v1 v2 v3 )
+*>
+*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R':
+*>
+*> V = ( v1 v2 v3 ) V = ( v1 v1 1 )
+*> ( v1 v2 v3 ) ( v2 v2 v2 1 )
+*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 )
+*> ( 1 v3 )
+*> ( 1 )
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+ $ K, V, LDV, T, LDT, C, LDC)
+ ! Scalar arguments
+ INTEGER M, N, K, LDV, LDC, LDT
+ CHARACTER SIDE, TRANS, DIRECT, STOREV
+ ! True means that we are assuming C2 is the identity matrix
+ ! and thus don't reference whatever is present in C2
+ ! at the beginning.
+ LOGICAL C2I
+
+ ! Array arguments
+ REAL V(LDV,*), C(LDC,*), T(LDT,*)
+ ! Local scalars
+ LOGICAL QR, LQ, QL, DIRF, COLV, SIDEL, SIDER,
+ $ TRANST
+ INTEGER I, J
+ ! External functions
+ LOGICAL LSAME
+ EXTERNAL LSAME
+ ! External Subroutines
+ EXTERNAL SGEMM, STRMM, XERBLA
+ ! Parameters
+ REAL ONE, ZERO, NEG_ONE
+ PARAMETER(ONE=1.0E+0, ZERO = 0.0E+0, NEG_ONE = -1.0E+0)
+
+ ! Beginning of executable statements
+ ! Convert our character flags to logical values
+ DIRF = LSAME(DIRECT,'F')
+ COLV = LSAME(STOREV,'C')
+ SIDEL = LSAME(SIDE,'L')
+ SIDER = LSAME(SIDE,'R')
+ TRANST = LSAME(TRANS,'T')
+
+ ! Determine which of the 4 modes are using.
+ ! QR is when we store the reflectors column by column and have the
+ ! 'first' reflector stored in the first column
+ QR = DIRF.AND.COLV
+
+ ! LQ is when we store the reflectors row by row and have the
+ ! 'first' reflector stored in the first row
+ LQ = DIRF.AND.(.NOT.COLV)
+
+ ! QL is when we store the reflectors column by column and have the
+ ! 'first' reflector stored in the last column
+ QL = (.NOT.DIRF).AND.COLV
+
+ ! RQ is when we store the reflectors row by row and have the
+ ! 'first' reflector stored in the last row
+ ! RQ = (.NOT.DIRF).AND.(.NOT.COLV)
+ ! Since we have exactly one of these 4 modes, we don't need to actually
+ ! store the value of RQ, instead we assume this is the case if we fail
+ ! the above 3 checks.
+
+ IF (QR) THEN
+ ! We are computing C = HC = (I - VTV')C
+ ! Where: V = [ V1 ] and C = [ C1 ]
+ ! [ V2 ] [ C2 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{M-K\times K}
+ ! C1=0\in\R^{K\times N}
+ ! C2\in\R^{M-K\times N}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0 ]
+ ! [ V2 ] [ C2 ]
+ ! = [ 0 ] - [ V1 ]T*V2'*C2
+ ! [ C2 ] [ V2 ]
+ !
+ ! = [ 0 ] - [ V1*T*V2'*C2 ]
+ ! [ C2 ] [ V2*T*V2'*C2 ]
+ !
+ ! = [ V1*T*V2'*C2 ]
+ ! [ C2 - V2*T*V2'*C2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = V2'*C2
+ ! C1 = T*C1
+ ! C2 = C2 - V2*C1
+ ! C1 = -V1*C1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDEL ) THEN
+ CALL XERBLA('SLARFB0C2', 2)
+ RETURN
+ ELSE IF(TRANST) THEN
+ CALL XERBLA('SLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = V2'*C2
+ !
+ IF (C2I) THEN
+ DO J = 1, N
+ DO I = 1, K
+ C(I,J) = V(K+J,I)
+ END DO
+ END DO
+ ELSE
+ CALL SGEMM('Transpose', 'No Transpose', K, N, M - K,
+ $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO,
+ $ C, LDC)
+ END IF
+ !
+ ! C1 = T*C1
+ !
+ CALL STRMM('Left', 'Upper', 'No Transpose', 'Non-unit',
+ $ K, N, ONE, T, LDT, C, LDC)
+ !
+ ! C2 = C2 - V2*C1 = -V2*C1 + C2
+ !
+ IF (C2I) THEN
+ CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO,
+ $ C(K+1,1), LDC)
+ DO I = 1, N
+ C(K+I,I) = C(K+I,I) + ONE
+ END DO
+ ELSE
+ CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE,
+ $ C(K+1,1), LDC)
+ END IF
+ !
+ ! C1 = -V1*C1
+ !
+ CALL STRMM('Left', 'Lower', 'No Transpose', 'Unit',
+ $ K, N, NEG_ONE, V, LDV, C, LDC)
+ ELSE IF (LQ) THEN
+ ! We are computing C = CH' = C(I-V'T'V)
+ ! Where: V = [ V1 V2 ] and C = [ C1 C2 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{K\times N-K}
+ ! C1=0\in\R^{M\times K}
+ ! C2\in\R^{M\times N-K}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ])
+ ! [ V2' ]
+ !
+ ! = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ]
+ ! [ V2' ]
+ !
+ ! = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ]
+ !
+ ! = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = C2*V2'
+ ! C1 = C1*T'
+ ! C2 = C2 - C1*V2
+ ! C1 = -C1*V1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDER ) THEN
+ CALL XERBLA('SLARFB0C2', 2)
+ RETURN
+ ELSE IF(.NOT.TRANST) THEN
+ CALL XERBLA('SLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = C2*V2'
+ !
+ IF( C2I ) THEN
+ DO J = 1, K
+ DO I = 1, M
+ C(I,J) = V(J,K+I)
+ END DO
+ END DO
+ ELSE
+ CALL SGEMM('No Transpose', 'Transpose', M, K, N-K,
+ $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C,
+ $ LDC)
+ END IF
+ !
+ ! C1 = C1*T'
+ !
+ CALL STRMM('Right', 'Upper', 'Transpose', 'Non-unit',
+ $ M, K, ONE, T, LDT, C, LDC)
+ !
+ ! C2 = C2 - C1*V2 = -C1*V2 + C2
+ !
+ IF( C2I ) THEN
+ CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1),
+ $ LDC)
+ DO I = 1, M
+ C(I,K+I) = C(I,K+I) + ONE
+ END DO
+ ELSE
+ CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1),
+ $ LDC)
+ END IF
+ !
+ ! C1 = -C1*V1
+ !
+ CALL STRMM('Right', 'Upper', 'No Transpose', 'Unit',
+ $ M, K, NEG_ONE, V, LDV, C, LDC)
+ ELSE IF (QL) THEN
+ ! We are computing C = HC = (I - VTV')C
+ ! Where: V = [ V2 ] and C = [ C2 ]
+ ! [ V1 ] [ C1 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{M-K\times K}
+ ! C1=0\in\R^{K\times N}
+ ! C2\in\R^{M-K\times N}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ]
+ ! [ V1 ] [ 0 ]
+ !
+ ! = [ C2 ] - [ V2 ]T*V2'*C2
+ ! [ 0 ] [ V1 ]
+ !
+ ! = [ C2 ] - [ V2*T*V2'*C2 ]
+ ! [ 0 ] [ V1*T*V2'*C2 ]
+ !
+ ! = [ C2 - V2*T*V2'*C2 ]
+ ! [ - V1*T*V2'*C2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = V2'*C2
+ ! C1 = T*C1
+ ! C2 = C2 - V2*C1
+ ! C1 = -V1*C1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDEL ) THEN
+ CALL XERBLA('SLARFB0C2', 2)
+ RETURN
+ ELSE IF(TRANST) THEN
+ CALL XERBLA('SLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = V2'*C2
+ !
+ IF( C2I ) THEN
+ DO J = 1, N
+ DO I = 1, K
+ C(M-K+I,J) = V(J,I)
+ END DO
+ END DO
+ ELSE
+ CALL SGEMM('Transpose', 'No Transpose', K, N, M-K,
+ $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC)
+ END IF
+ !
+ ! C1 = T*C1
+ !
+ CALL STRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+ $ K, N, ONE, T, LDT, C(M-K+1,1), LDC)
+ !
+ ! C2 = C2 - V2*C1 = -V2*C1 + C2
+ !
+ IF( C2I ) THEN
+ CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC)
+ DO I = 1, N
+ C(I,I) = C(I,I) + ONE
+ END DO
+ ELSE
+ CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC)
+ END IF
+ !
+ ! C1 = -V1*C1
+ !
+ CALL STRMM('Left', 'Upper', 'No Transpose', 'Unit',
+ $ K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC)
+ ELSE ! IF (RQ) THEN
+ ! We are computing C = CH' = C(I-V'T'V)
+ ! Where: V = [ V2 V1] and C = [ C2 C1 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{K\times N-K}
+ ! C1=0\in\R^{M\times K}
+ ! C2\in\R^{M\times N-K}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ]
+ ! [ V1' ]
+ !
+ ! = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ]
+ ! [ V1' ]
+ !
+ ! = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ]
+ !
+ ! = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ]
+ !
+ ! = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = C2*V2'
+ ! C1 = C1*T'
+ ! C2 = C2 - C1*V2
+ ! C1 = -C1*V1
+ !
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDER ) THEN
+ CALL XERBLA('SLARFB0C2', 2)
+ RETURN
+ ELSE IF(.NOT.TRANST) THEN
+ CALL XERBLA('SLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = C2*V2'
+ !
+ IF( C2I ) THEN
+ DO J = 1, K
+ DO I = 1, M
+ C(I,N-K+J) = V(J,I)
+ END DO
+ END DO
+ ELSE
+ CALL SGEMM('No Transpose', 'Transpose', M, K, N-K,
+ $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC)
+ END IF
+ !
+ ! C1 = C1*T'
+ !
+ CALL STRMM('Right', 'Lower', 'Transpose', 'Non-unit',
+ $ M, K, ONE, T, LDT, C(1, N-K+1), LDC)
+ !
+ ! C2 = C2 - C1*V2 = -C1*V2 + C2
+ !
+ IF( C2I ) THEN
+ CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC)
+ DO I = 1, M
+ C(I,I) = C(I,I) + ONE
+ END DO
+ ELSE
+ CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC)
+ END IF
+ !
+ ! C1 = -C1*V1
+ !
+ CALL STRMM('Right', 'Lower', 'No Transpose', 'Unit',
+ $ M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC)
+ END IF
+ END SUBROUTINE
diff --git a/SRC/sorglq.f b/SRC/sorglq.f
index fb5ef79cc..574b3ae2c 100644
--- a/SRC/sorglq.f
+++ b/SRC/sorglq.f
@@ -146,7 +146,7 @@ SUBROUTINE SORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
$ LWKOPT, NB, NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL SLARFB, SLARFT, SORGL2, XERBLA
+ EXTERNAL SLARFB0C2, SLARFT, SORGL2, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -218,67 +218,90 @@ SUBROUTINE SORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Use blocked code after the last block.
-* The first kk rows are handled by the block method.
+* Handle the first block assuming we are applying to the
+* identity, then resume regular blocking method after
*
- KI = ( ( K-NX-1 ) / NB )*NB
- KK = MIN( K, KI+NB )
-*
-* Set A(kk+1:m,1:kk) to zero.
-*
- DO 20 J = 1, KK
- DO 10 I = KK + 1, M
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
+ KI = K - 2 * NB
+ KK = K - NB
ELSE
KK = 0
END IF
*
-* Use unblocked code for the last or only block.
+* Potentially bail to the unblocked version
*
- IF( KK.LT.M )
- $ CALL SORGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
- $ TAU( KK+1 ), WORK, IINFO )
+ IF( KK.EQ.0 ) THEN
+ CALL SORGL2( M, N, K, A, LDA, TAU, WORK, IINFO )
+ END IF
*
IF( KK.GT.0 ) THEN
+ I = KK + 1
+ IB = NB
+*
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
+*
+ CALL SLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+ $ LDA, TAU( I ), WORK, LDWORK )
+*
+* Apply H**T to A(i+ib:m,i:n) from the right
+*
+ CALL SLARFB0C2(.TRUE., 'Right', 'Transpose', 'Forward',
+ $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK,
+ $ LDWORK, A(I+IB,I), LDA)
+*
+* Apply H**T to columns i:n of current block
+
+ CALL SORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+ $ WORK, IINFO )
*
* Use blocked code
*
- DO 50 I = KI + 1, 1, -NB
- IB = MIN( NB, K-I+1 )
- IF( I+IB.LE.M ) THEN
+ DO I = KI + 1, 1, -NB
+ IB = NB
*
-* Form the triangular factor of the block reflector
-* H = H(i) H(i+1) . . . H(i+ib-1)
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- CALL SLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I,
- $ I ),
- $ LDA, TAU( I ), WORK, LDWORK )
+ CALL SLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+ $ LDA, TAU( I ), WORK, LDWORK )
*
-* Apply H**T to A(i+ib:m,i:n) from the right
+* Apply H**T to A(i+ib:m,i:n) from the right
*
- CALL SLARFB( 'Right', 'Transpose', 'Forward',
- $ 'Rowwise',
- $ M-I-IB+1, N-I+1, IB, A( I, I ), LDA, WORK,
- $ LDWORK, A( I+IB, I ), LDA, WORK( IB+1 ),
- $ LDWORK )
- END IF
+ CALL SLARFB0C2(.FALSE., 'Right', 'Transpose', 'Forward',
+ $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK,
+ $ LDWORK, A(I+IB,I), LDA)
*
* Apply H**T to columns i:n of current block
-*
+
CALL SORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
- $ WORK,
- $ IINFO )
+ $ WORK, IINFO )
+ END DO
+*
+* This checks for if K was a perfect multiple of NB
+* so that we only have a special case for the last block when
+* necessary
+*
+ IF(I.LT.1) THEN
+ IB = I + NB - 1
+ I = 1
+*
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
+*
+ CALL SLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+ $ LDA, TAU( I ), WORK, LDWORK )
*
-* Set columns 1:i-1 of current block to zero
+* Apply H**T to A(i+ib:m,i:n) from the right
*
- DO 40 J = 1, I - 1
- DO 30 L = I, I + IB - 1
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ CALL SLARFB0C2(.FALSE., 'Right', 'Transpose', 'Forward',
+ $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK,
+ $ LDWORK, A(I+IB,I), LDA)
+*
+* Apply H**T to columns i:n of current block
+
+ CALL SORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+ $ WORK, IINFO )
+ END IF
END IF
*
WORK( 1 ) = SROUNDUP_LWORK(IWS)
diff --git a/SRC/sorgql.f b/SRC/sorgql.f
index 7257ff2d9..c9b2918b7 100644
--- a/SRC/sorgql.f
+++ b/SRC/sorgql.f
@@ -95,8 +95,6 @@
*> \verbatim
*> LWORK is INTEGER
*> The dimension of the array WORK. LWORK >= max(1,N).
-*> For optimum performance LWORK >= N*NB, where NB is the
-*> optimal blocksize.
*>
*> If LWORK = -1, then a workspace query is assumed; the routine
*> only calculates the optimal size of the WORK array, returns
@@ -143,11 +141,11 @@ SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT,
- $ NB, NBMIN, NX
+ INTEGER I, IB, IINFO, IWS, J, KK, LWKOPT, NB,
+ $ NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL SLARFB, SLARFT, SORG2L, XERBLA
+ EXTERNAL SLARFB0C2, SLARFT, SORG2L, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -178,7 +176,8 @@ SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
LWKOPT = 1
ELSE
NB = ILAENV( 1, 'SORGQL', ' ', M, N, K, -1 )
- LWKOPT = N*NB
+ ! Only need a workspace for calls to dorg2l
+ LWKOPT = N
END IF
WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
*
@@ -201,88 +200,76 @@ SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
END IF
*
NBMIN = 2
- NX = 0
+ NX = MAX(0, ILAENV(3, 'SORGQL', ' ', M, N, K, -1))
IWS = N
- IF( NB.GT.1 .AND. NB.LT.K ) THEN
*
-* Determine when to cross over from blocked to unblocked code.
-*
- NX = MAX( 0, ILAENV( 3, 'SORGQL', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
+ IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Determine if workspace is large enough for blocked code.
+* We want to use the blocking method as long as our matrix is big enough
*
- LDWORK = N
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
+ KK = K
+ ELSE
+ KK = 0
+ END IF
*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
+* Possibly bail to the unblocked code.
*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'SORGQL', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
+ IF( KK.EQ.0 ) THEN
+ CALL SORG2L( M, N, K, A, LDA, TAU, WORK, IINFO )
END IF
*
- IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+ IF( KK.GT.0 ) THEN
*
-* Use blocked code after the first block.
-* The last kk columns are handled by the block method.
+* Factor the first block assuming that our first application
+* will be on the Identity matrix
*
- KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
+ I = 1
+ IB = NB
*
-* Set A(m-kk+1:m,1:n-kk) to zero.
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- DO 20 J = 1, N - KK
- DO 10 I = M - KK + 1, M
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
- ELSE
- KK = 0
- END IF
+ CALL SLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+ $ A( 1, N-K+I ), LDA, TAU( I ),
+ $ A( M-K+I, N-K+I ), LDA)
*
-* Use unblocked code for the first or only block.
+* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
*
- CALL SORG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+ CALL SLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward',
+ $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I),
+ $ LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
*
- IF( KK.GT.0 ) THEN
+* Apply H to rows 1:m-k+i+ib-1 of current block
*
-* Use blocked code
+ CALL SORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
+ $ TAU( I ), WORK, IINFO )
+
+* Use blocked code on the remaining blocks if there are any.
*
- DO 50 I = K - KK + 1, K, NB
- IB = MIN( NB, K-I+1 )
- IF( N-K+I.GT.1 ) THEN
+ DO I = NB+1, K, NB
*
-* Form the triangular factor of the block reflector
-* H = H(i+ib-1) . . . H(i+1) H(i)
+* The last block may be less than size NB
*
- CALL SLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
- $ A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK )
+ IB = MIN(NB, K-I+1)
*
-* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- CALL SLARFB( 'Left', 'No transpose', 'Backward',
- $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
- $ A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA,
- $ WORK( IB+1 ), LDWORK )
- END IF
+ CALL SLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+ $ A( 1, N-K+I ), LDA, TAU( I ),
+ $ A( M-K+I, N-K+I ), LDA )
+*
+* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+*
+ CALL SLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
+ $ A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
*
* Apply H to rows 1:m-k+i+ib-1 of current block
*
CALL SORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
$ TAU( I ), WORK, IINFO )
-*
-* Set rows m-k+i+ib:m of current block to zero
-*
- DO 40 J = N - K + I, N - K + I + IB - 1
- DO 30 L = M - K + I + IB, M
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ END DO
END IF
*
WORK( 1 ) = SROUNDUP_LWORK(IWS)
diff --git a/SRC/sorgqr.f b/SRC/sorgqr.f
index 47d1fd248..070257c46 100644
--- a/SRC/sorgqr.f
+++ b/SRC/sorgqr.f
@@ -95,8 +95,6 @@
*> \verbatim
*> LWORK is INTEGER
*> The dimension of the array WORK. LWORK >= max(1,N).
-*> For optimum performance LWORK >= N*NB, where NB is the
-*> optimal blocksize.
*>
*> If LWORK = -1, then a workspace query is assumed; the routine
*> only calculates the optimal size of the WORK array, returns
@@ -147,7 +145,7 @@ SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
$ LWKOPT, NB, NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL SLARFB, SLARFT, SORG2R, XERBLA
+ EXTERNAL SLARFB0C2, SLARFT, SORG2R, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -163,7 +161,7 @@ SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
INFO = 0
NB = ILAENV( 1, 'SORGQR', ' ', M, N, K, -1 )
- LWKOPT = MAX( 1, N )*NB
+ LWKOPT = MAX( 1, N )
WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
LQUERY = ( LWORK.EQ.-1 )
IF( M.LT.0 ) THEN
@@ -194,92 +192,92 @@ SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
NBMIN = 2
NX = 0
IWS = N
- IF( NB.GT.1 .AND. NB.LT.K ) THEN
*
-* Determine when to cross over from blocked to unblocked code.
+ IF( NB.GE.NBMIN .AND. NB.LT.K ) THEN
*
- NX = MAX( 0, ILAENV( 3, 'SORGQR', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
+* Handle the first block assuming we are applying to the
+* identity, then resume regular blocking method after
*
-* Determine if workspace is large enough for blocked code.
-*
- LDWORK = N
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
+ KI = K - 2 * NB
+ KK = K - NB
+ ELSE
+ KK = 0
+ END IF
*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
+* Potentially bail to the unblocked code.
*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'SORGQR', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
+ IF( KK.EQ.0 ) THEN
+ CALL SORG2R( M, N, K, A, LDA, TAU, WORK, IINFO )
END IF
*
- IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+ IF( KK.GT.0 ) THEN
+ I = KK + 1
+ IB = NB
*
-* Use blocked code after the last block.
-* The first kk columns are handled by the block method.
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- KI = ( ( K-NX-1 ) / NB )*NB
- KK = MIN( K, KI+NB )
+ CALL SLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
*
-* Set A(1:kk,kk+1:n) to zero.
+* Apply H to A(i:m,i+ib:n) from the left
*
- DO 20 J = KK + 1, N
- DO 10 I = 1, KK
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
- ELSE
- KK = 0
- END IF
+ CALL SLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward',
+ $ 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I),
+ $ LDA, A(I,I+IB), LDA)
*
-* Use unblocked code for the last or only block.
+* Apply H to rows i:m of current block
*
- IF( KK.LT.N )
- $ CALL SORG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
- $ TAU( KK+1 ), WORK, IINFO )
+ CALL SORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, IINFO)
+ DO I = KI + 1, 1, -NB
+ IB = NB
*
- IF( KK.GT.0 ) THEN
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
-* Use blocked code
+ CALL SLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
*
- DO 50 I = KI + 1, 1, -NB
- IB = MIN( NB, K-I+1 )
- IF( I+IB.LE.N ) THEN
+* Apply H to A(i:m,i+ib:n) from the left
*
-* Form the triangular factor of the block reflector
-* H = H(i) H(i+1) . . . H(i+ib-1)
+ CALL SLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
*
- CALL SLARFT( 'Forward', 'Columnwise', M-I+1, IB,
- $ A( I, I ), LDA, TAU( I ), WORK, LDWORK )
+* Apply H to rows i:m of current block
*
-* Apply H to A(i:m,i+ib:n) from the left
+ CALL SORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
+ $ IINFO)
+ END DO
*
- CALL SLARFB( 'Left', 'No transpose', 'Forward',
- $ 'Columnwise', M-I+1, N-I-IB+1, IB,
- $ A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ),
- $ LDA, WORK( IB+1 ), LDWORK )
- END IF
+* This checks for if K was a perfect multiple of NB
+* so that we only have a special case for the last block when
+* necessary
*
-* Apply H to rows i:m of current block
+ IF(I.LT.1) THEN
+ IB = I + NB - 1
+ I = 1
*
- CALL SORG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ),
- $ WORK,
- $ IINFO )
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
-* Set rows 1:i-1 of current block to zero
+ CALL SLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
*
- DO 40 J = I, I + IB - 1
- DO 30 L = 1, I - 1
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
- END IF
+* Apply H to A(i:m,i+ib:n) from the left
+*
+ CALL SLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
*
+* Apply H to rows i:m of current block
+*
+ CALL SORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
+ $ IINFO)
+ END IF
+ END IF
WORK( 1 ) = SROUNDUP_LWORK(IWS)
RETURN
*
diff --git a/SRC/sorgrq.f b/SRC/sorgrq.f
index 5fe685c1d..ea1a4e73d 100644
--- a/SRC/sorgrq.f
+++ b/SRC/sorgrq.f
@@ -147,7 +147,7 @@ SUBROUTINE SORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
$ LWKOPT, NB, NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL SLARFB, SLARFT, SORGR2, XERBLA
+ EXTERNAL SLARFB0C2, SLARFT, SORGR2, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -228,63 +228,70 @@ SUBROUTINE SORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Use blocked code after the first block.
-* The last kk rows are handled by the block method.
+* We want to use the blocking method as long as our matrix is big enough
+* and it's deemed worthwhile with the extra memory allocations
*
- KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
-*
-* Set A(1:m-kk,n-kk+1:n) to zero.
-*
- DO 20 J = N - KK + 1, N
- DO 10 I = 1, M - KK
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
+ KK = K
ELSE
KK = 0
END IF
*
-* Use unblocked code for the first or only block.
+* Potentially bail to the unblocked code
*
- CALL SORGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+ IF( KK.EQ.0 ) THEN
+ CALL SORGR2( M, N, K, A, LDA, TAU, WORK, IINFO )
+ END IF
*
IF( KK.GT.0 ) THEN
*
-* Use blocked code
+* Factor the first block assuming that our first application
+* will be on the Identity matrix
*
- DO 50 I = K - KK + 1, K, NB
- IB = MIN( NB, K-I+1 )
- II = M - K + I
- IF( II.GT.1 ) THEN
+ I = 1
+ IB = NB
+ II = M - K + I
*
-* Form the triangular factor of the block reflector
-* H = H(i+ib-1) . . . H(i+1) H(i)
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- CALL SLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
- $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+ CALL SLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+ $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
*
-* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
*
- CALL SLARFB( 'Right', 'Transpose', 'Backward',
- $ 'Rowwise',
- $ II-1, N-K+I+IB-1, IB, A( II, 1 ), LDA, WORK,
- $ LDWORK, A, LDA, WORK( IB+1 ), LDWORK )
- END IF
+ CALL SLARFB0C2(.TRUE., 'Right', 'Transpose', 'Backward',
+ $ 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK,
+ $ LDWORK, A, LDA)
*
* Apply H**T to columns 1:n-k+i+ib-1 of current block
*
- CALL SORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
- $ TAU( I ),
- $ WORK, IINFO )
+ CALL SORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
+ $ TAU( I ), WORK, IINFO )
+
+ DO I = NB + 1, K, NB
*
-* Set columns n-k+i+ib:n of current block to zero
+* The last block may be less than size NB
+*
+ IB = MIN(NB, K-I+1)
+ II = M - K + I
*
- DO 40 L = N - K + I + IB, N
- DO 30 J = II, II + IB - 1
- A( J, L ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
+*
+ CALL SLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+ $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+*
+* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+*
+ CALL SLARFB0C2(.FALSE., 'Right', 'Transpose',
+ $ 'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB,
+ $ A(II,1), LDA, WORK, LDWORK, A, LDA)
+*
+* Apply H**T to columns 1:n-k+i+ib-1 of current block
+*
+ CALL SORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
+ $ TAU( I ), WORK, IINFO )
+ END DO
END IF
*
WORK( 1 ) = SROUNDUP_LWORK(IWS)
diff --git a/SRC/zlarfb0c2.f b/SRC/zlarfb0c2.f
new file mode 100644
index 000000000..1f43b4b14
--- /dev/null
+++ b/SRC/zlarfb0c2.f
@@ -0,0 +1,560 @@
+*> \brief \b ZLARFB0C2 applies a block reflector or its conjugate-transpose
+* to a rectangular matrix with a 0 block while constructing the explicit Q
+* factor
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+* $ K, V, LDV, T, LDT, C, LDC)
+* ! Scalar arguments
+* INTEGER M, N, K, LDV, LDC, LDT
+* CHARACTER SIDE, TRANS, DIRECT, STOREV
+* ! True means that we are assuming C2 is the identity matrix
+* ! and thus don't reference whatever is present in C2
+* ! at the beginning.
+* LOGICAL C2I
+* ! Array arguments
+* COMPLEX*16 V(LDV,*), C(LDC,*), T(LDT,*)
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZLARFB0C2 applies a real block reflector H or its transpose H**H to a
+*> complex m by n matrix C with a 0 block, while computing the explicit Q factor
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] C2I
+*> \verbatim
+*> C2I is LOGICAL
+*> = .TRUE.: Assume the nonzero block of C is the identity matrix
+*> = .FALSE.: Use existing data in the nonzero block of C
+*> \endverbatim
+*>
+*> \param[in] SIDE
+*> \verbatim
+*> SIDE is CHARACTER*1
+*> = 'L': apply H or H**H from the Left
+*> = 'R': apply H or H**H from the Right
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*> TRANS is CHARACTER*1
+*> = 'N': apply H (No transpose)
+*> = 'C': apply H**H (Conjugate transpose)
+*> \endverbatim
+*>
+*> \param[in] DIRECT
+*> \verbatim
+*> DIRECT is CHARACTER*1
+*> Indicates how H is formed from a product of elementary
+*> reflectors
+*> = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*> = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*> STOREV is CHARACTER*1
+*> Indicates how the vectors which define the elementary
+*> reflectors are stored:
+*> = 'C': Columnwise
+*> = 'R': Rowwise
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*> K is INTEGER
+*> The order of the matrix T (= the number of elementary
+*> reflectors whose product defines the block reflector).
+*> If SIDE = 'L', M >= K >= 0;
+*> if SIDE = 'R', N >= K >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*> V is COMPLEX*16 array, dimension
+*> (LDV,K) if STOREV = 'C'
+*> (LDV,M) if STOREV = 'R' and SIDE = 'L'
+*> (LDV,N) if STOREV = 'R' and SIDE = 'R'
+*> See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*> LDV is INTEGER
+*> The leading dimension of the array V.
+*> If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+*> if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+*> if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*> T is COMPLEX*16 array, dimension (LDT,K)
+*> The triangular K-by-K matrix T in the representation of the
+*> block reflector.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*> LDT is INTEGER
+*> The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*> C is COMPLEX*16 array, dimension (LDC,N)
+*> On entry, the M-by-N matrix C.
+*> On exit, C is overwritten by H*C or H**H*C or C*H or C*H**H.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*> LDC is INTEGER
+*> The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup larfb
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> The shape of the matrix V and the storage of the vectors which define
+*> the H(i) is best illustrated by the following example with n = 5 and
+*> k = 3. The triangular part of V (including its diagonal) is not
+*> referenced.
+*>
+*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R':
+*>
+*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 )
+*> ( v1 1 ) ( 1 v2 v2 v2 )
+*> ( v1 v2 1 ) ( 1 v3 v3 )
+*> ( v1 v2 v3 )
+*> ( v1 v2 v3 )
+*>
+*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R':
+*>
+*> V = ( v1 v2 v3 ) V = ( v1 v1 1 )
+*> ( v1 v2 v3 ) ( v2 v2 v2 1 )
+*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 )
+*> ( 1 v3 )
+*> ( 1 )
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+ $ K, V, LDV, T, LDT, C, LDC)
+ ! Scalar arguments
+ INTEGER M, N, K, LDV, LDC, LDT
+ CHARACTER SIDE, TRANS, DIRECT, STOREV
+ ! True means that we are assuming C2 is the identity matrix
+ ! and thus don't reference whatever is present in C2
+ ! at the beginning.
+ LOGICAL C2I
+ ! Array arguments
+ COMPLEX*16 V(LDV,*), C(LDC,*), T(LDT,*)
+ ! Local scalars
+ LOGICAL QR, LQ, QL, DIRF, COLV, SIDEL, SIDER,
+ $ TRANST
+ INTEGER I, J
+ ! Intrinsic Functions
+ INTRINSIC CONJG
+ ! External functions
+ LOGICAL LSAME
+ EXTERNAL LSAME
+ ! External Subroutines
+ EXTERNAL ZGEMM, ZTRMM
+ ! Parameters
+ COMPLEX*16 ONE, ZERO, NEG_ONE
+ PARAMETER(ONE=(1.0D+0, 0.0D+0),
+ $ ZERO = (0.0D+0, 0.0D+0),
+ $ NEG_ONE = (-1.0D+0, 0.0D+0))
+
+ ! Beginning of executable statements
+ ! Convert our character flags to logical values
+ DIRF = LSAME(DIRECT,'F')
+ COLV = LSAME(STOREV,'C')
+ SIDEL = LSAME(SIDE,'L')
+ SIDER = LSAME(SIDE,'R')
+ TRANST = LSAME(TRANS,'C')
+
+ ! Determine which of the 4 modes are using.
+ ! QR is when we store the reflectors column by column and have the
+ ! 'first' reflector stored in the first column
+ QR = DIRF.AND.COLV
+
+ ! LQ is when we store the reflectors row by row and have the
+ ! 'first' reflector stored in the first row
+ LQ = DIRF.AND.(.NOT.COLV)
+
+ ! QL is when we store the reflectors column by column and have the
+ ! 'first' reflector stored in the last column
+ QL = (.NOT.DIRF).AND.COLV
+
+ ! RQ is when we store the reflectors row by row and have the
+ ! 'first' reflector stored in the last row
+ ! RQ = (.NOT.DIRF).AND.(.NOT.COLV)
+ ! Since we have exactly one of these 4 modes, we don't need to actually
+ ! store the value of RQ, instead we assume this is the case if we fail
+ ! the above 3 checks.
+
+ IF (QR) THEN
+ ! We are computing C = HC = (I - VTV')C
+ ! Where: V = [ V1 ] and C = [ C1 ]
+ ! [ V2 ] [ C2 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{M-K\times K}
+ ! C1=0\in\R^{K\times N}
+ ! C2\in\R^{M-K\times N}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0 ]
+ ! [ V2 ] [ C2 ]
+ ! = [ 0 ] - [ V1 ]T*V2'*C2
+ ! [ C2 ] [ V2 ]
+ !
+ ! = [ 0 ] - [ V1*T*V2'*C2 ]
+ ! [ C2 ] [ V2*T*V2'*C2 ]
+ !
+ ! = [ V1*T*V2'*C2 ]
+ ! [ C2 - V2*T*V2'*C2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = V2'*C2
+ ! C1 = T*C1
+ ! C2 = C2 - V2*C1
+ ! C1 = -V1*C1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDEL ) THEN
+ CALL XERBLA('ZLARFB0C2', 2)
+ RETURN
+ ELSE IF(TRANST) THEN
+ CALL XERBLA('ZLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = V2'*C2
+ !
+ IF (C2I) THEN
+ DO J = 1, N
+ DO I = 1, K
+ C(I,J) = CONJG(V(K+J,I))
+ END DO
+ END DO
+ ELSE
+ CALL ZGEMM('Conjugate', 'No Transpose', K, N, M - K,
+ $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO,
+ $ C, LDC)
+ END IF
+ !
+ ! C1 = T*C1
+ !
+ CALL ZTRMM('Left', 'Upper', 'No Transpose', 'Non-unit',
+ $ K, N, ONE, T, LDT, C, LDC)
+ !
+ ! C2 = C2 - V2*C1 = -V2*C1 + C2
+ !
+ IF (C2I) THEN
+ CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO,
+ $ C(K+1,1), LDC)
+ DO I = 1, N
+ C(K+I,I) = C(K+I,I) + ONE
+ END DO
+ ELSE
+ CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE,
+ $ C(K+1,1), LDC)
+ END IF
+ !
+ ! C1 = -V1*C1
+ !
+ CALL ZTRMM('Left', 'Lower', 'No Transpose', 'Unit',
+ $ K, N, NEG_ONE, V, LDV, C, LDC)
+ ELSE IF (LQ) THEN
+ ! We are computing C = CH' = C(I-V'T'V)
+ ! Where: V = [ V1 V2 ] and C = [ C1 C2 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{K\times N-K}
+ ! C1=0\in\R^{M\times K}
+ ! C2\in\R^{M\times N-K}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ])
+ ! [ V2' ]
+ !
+ ! = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ]
+ ! [ V2' ]
+ !
+ ! = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ]
+ !
+ ! = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = C2*V2'
+ ! C1 = C1*T'
+ ! C2 = C2 - C1*V2
+ ! C1 = -C1*V1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDER ) THEN
+ CALL XERBLA('ZLARFB0C2', 2)
+ RETURN
+ ELSE IF(.NOT.TRANST) THEN
+ CALL XERBLA('ZLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = C2*V2'
+ !
+ IF( C2I ) THEN
+ DO J = 1, K
+ DO I = 1, M
+ C(I,J) = CONJG(V(J,K+I))
+ END DO
+ END DO
+ ELSE
+ CALL ZGEMM('No Transpose', 'Conjugate', M, K, N-K,
+ $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C,
+ $ LDC)
+ END IF
+ !
+ ! C1 = C1*T'
+ !
+ CALL ZTRMM('Right', 'Upper', 'Conjugate', 'Non-unit',
+ $ M, K, ONE, T, LDT, C, LDC)
+ !
+ ! C2 = C2 - C1*V2 = -C1*V2 + C2
+ !
+ IF( C2I ) THEN
+ CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1),
+ $ LDC)
+ DO I = 1, M
+ C(I,K+I) = C(I,K+I) + ONE
+ END DO
+ ELSE
+ CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1),
+ $ LDC)
+ END IF
+ !
+ ! C1 = -C1*V1
+ !
+ CALL ZTRMM('Right', 'Upper', 'No Transpose', 'Unit',
+ $ M, K, NEG_ONE, V, LDV, C, LDC)
+ ELSE IF (QL) THEN
+ ! We are computing C = HC = (I - VTV')C
+ ! Where: V = [ V2 ] and C = [ C2 ]
+ ! [ V1 ] [ C1 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{M-K\times K}
+ ! C1=0\in\R^{K\times N}
+ ! C2\in\R^{M-K\times N}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ]
+ ! [ V1 ] [ 0 ]
+ !
+ ! = [ C2 ] - [ V2 ]T*V2'*C2
+ ! [ 0 ] [ V1 ]
+ !
+ ! = [ C2 ] - [ V2*T*V2'*C2 ]
+ ! [ 0 ] [ V1*T*V2'*C2 ]
+ !
+ ! = [ C2 - V2*T*V2'*C2 ]
+ ! [ - V1*T*V2'*C2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = V2'*C2
+ ! C1 = T*C1
+ ! C2 = C2 - V2*C1
+ ! C1 = -V1*C1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDEL ) THEN
+ CALL XERBLA('ZLARFB0C2', 2)
+ RETURN
+ ELSE IF(TRANST) THEN
+ CALL XERBLA('ZLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = V2'*C2
+ !
+ IF( C2I ) THEN
+ DO J = 1, N
+ DO I = 1, K
+ C(M-K+I,J) = CONJG(V(J,I))
+ END DO
+ END DO
+ ELSE
+ CALL ZGEMM('Conjugate', 'No Transpose', K, N, M-K,
+ $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC)
+ END IF
+ !
+ ! C1 = T*C1
+ !
+ CALL ZTRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+ $ K, N, ONE, T, LDT, C(M-K+1,1), LDC)
+ !
+ ! C2 = C2 - V2*C1 = -V2*C1 + C2
+ !
+ IF( C2I ) THEN
+ CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC)
+ DO I = 1, N
+ C(I,I) = C(I,I) + ONE
+ END DO
+ ELSE
+ CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC)
+ END IF
+ !
+ ! C1 = -V1*C1
+ !
+ CALL ZTRMM('Left', 'Upper', 'No Transpose', 'Unit',
+ $ K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC)
+ ELSE ! IF (RQ) THEN
+ ! We are computing C = CH' = C(I-V'T'V)
+ ! Where: V = [ V2 V1] and C = [ C2 C1 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{K\times N-K}
+ ! C1=0\in\R^{M\times K}
+ ! C2\in\R^{M\times N-K}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ]
+ ! [ V1' ]
+ !
+ ! = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ]
+ ! [ V1' ]
+ !
+ ! = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ]
+ !
+ ! = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ]
+ !
+ ! = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = C2*V2'
+ ! C1 = C1*T'
+ ! C2 = C2 - C1*V2
+ ! C1 = -C1*V1
+ !
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDER ) THEN
+ CALL XERBLA('ZLARFB0C2', 2)
+ RETURN
+ ELSE IF(.NOT.TRANST) THEN
+ CALL XERBLA('ZLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = C2*V2'
+ !
+ IF( C2I ) THEN
+ DO J = 1, K
+ DO I = 1, M
+ C(I,N-K+J) = CONJG(V(J,I))
+ END DO
+ END DO
+ ELSE
+ CALL ZGEMM('No Transpose', 'Conjugate', M, K, N-K,
+ $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC)
+ END IF
+ !
+ ! C1 = C1*T'
+ !
+ CALL ZTRMM('Right', 'Lower', 'Conjugate', 'Non-unit',
+ $ M, K, ONE, T, LDT, C(1, N-K+1), LDC)
+ !
+ ! C2 = C2 - C1*V2 = -C1*V2 + C2
+ !
+ IF( C2I ) THEN
+ CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC)
+ DO I = 1, M
+ C(I,I) = C(I,I) + ONE
+ END DO
+ ELSE
+ CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC)
+ END IF
+ !
+ ! C1 = -C1*V1
+ !
+ CALL ZTRMM('Right', 'Lower', 'No Transpose', 'Unit',
+ $ M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC)
+ END IF
+ END SUBROUTINE
diff --git a/SRC/zunglq.f b/SRC/zunglq.f
index 3cc107560..54aaf7686 100644
--- a/SRC/zunglq.f
+++ b/SRC/zunglq.f
@@ -136,17 +136,13 @@ SUBROUTINE ZUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
* =====================================================================
*
-* .. Parameters ..
- COMPLEX*16 ZERO
- PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) )
-* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
- $ LWKOPT, NB, NBMIN, NX
+ INTEGER I, IB, IINFO, IWS, KI, KK, LWKOPT, LDWORK,
+ $ NB, NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL XERBLA, ZLARFB, ZLARFT, ZUNGL2
+ EXTERNAL XERBLA, ZLARFB0C2, ZLARFT, ZUNGL2
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -217,67 +213,95 @@ SUBROUTINE ZUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Use blocked code after the last block.
-* The first kk rows are handled by the block method.
-*
- KI = ( ( K-NX-1 ) / NB )*NB
- KK = MIN( K, KI+NB )
-*
-* Set A(kk+1:m,1:kk) to zero.
+* Treat the last NB block starting at KK+1 specially then use our blocking
+* method from the block starting at KI+1 to 1
*
- DO 20 J = 1, KK
- DO 10 I = KK + 1, M
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
+ KI = K - 2 * NB
+ KK = K - NB
ELSE
KK = 0
END IF
*
-* Use unblocked code for the last or only block.
+* Potentially bail to the unblocked version
*
- IF( KK.LT.M )
- $ CALL ZUNGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
- $ TAU( KK+1 ), WORK, IINFO )
+ IF( KK.EQ.0 ) THEN
+ CALL ZUNGL2( M, N, K, A, LDA, TAU, WORK, IINFO )
+ END IF
*
IF( KK.GT.0 ) THEN
*
-* Use blocked code
+* Factor the last block assuming that our first application
+* will be on the Identity matrix
*
- DO 50 I = KI + 1, 1, -NB
- IB = MIN( NB, K-I+1 )
- IF( I+IB.LE.M ) THEN
+ I = KK + 1
+ IB = NB
*
-* Form the triangular factor of the block reflector
-* H = H(i) H(i+1) . . . H(i+ib-1)
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- CALL ZLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I,
- $ I ),
- $ LDA, TAU( I ), WORK, LDWORK )
+ CALL ZLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+ $ LDA, TAU( I ), WORK, LDWORK )
*
-* Apply H**H to A(i+ib:m,i:n) from the right
+* Apply H**T to A(i+ib:m,i:n) from the right
+* Exploit the fact that we are applying to an identity
*
- CALL ZLARFB( 'Right', 'Conjugate transpose',
- $ 'Forward',
- $ 'Rowwise', M-I-IB+1, N-I+1, IB, A( I, I ),
- $ LDA, WORK, LDWORK, A( I+IB, I ), LDA,
- $ WORK( IB+1 ), LDWORK )
- END IF
+ CALL ZLARFB0C2(.TRUE., 'Right', 'Conjugate', 'Forward',
+ $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK,
+ $ LDWORK, A(I+IB,I), LDA)
+*
+* Apply H**T to columns i:n of current block
+
+ CALL ZUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+ $ WORK, IINFO )
+*
+* Use our standard blocking method after the last block
+*
+ DO I = KI + 1, 1, -NB
+ IB = NB
+*
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
-* Apply H**H to columns i:n of current block
+ CALL ZLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+ $ LDA, TAU( I ), WORK, LDWORK )
*
+* Apply H**T to A(i+ib:m,i:n) from the right
+*
+ CALL ZLARFB0C2(.FALSE., 'Right', 'Conjugate', 'Forward',
+ $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK,
+ $ LDWORK, A(I+IB,I), LDA)
+*
+* Apply H**T to columns i:n of current block
+
CALL ZUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
- $ WORK,
- $ IINFO )
+ $ WORK, IINFO )
+ END DO
+*
+* This checks for if K was a perfect multiple of NB
+* so that we only have a special case for the last block when
+* necessary
+*
+ IF(I.LT.1) THEN
+ IB = I + NB - 1
+ I = 1
+*
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
-* Set columns 1:i-1 of current block to zero
+ CALL ZLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ),
+ $ LDA, TAU( I ), WORK, LDWORK )
*
- DO 40 J = 1, I - 1
- DO 30 L = I, I + IB - 1
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+* Apply H**T to A(i+ib:m,i:n) from the right
+*
+ CALL ZLARFB0C2(.FALSE., 'Right', 'Conjugate', 'Forward',
+ $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK,
+ $ LDWORK, A(I+IB,I), LDA)
+*
+* Apply H**T to columns i:n of current block
+
+ CALL ZUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
+ $ WORK, IINFO )
+ END IF
END IF
*
WORK( 1 ) = IWS
diff --git a/SRC/zungql.f b/SRC/zungql.f
index a64de501d..74c5ec8e6 100644
--- a/SRC/zungql.f
+++ b/SRC/zungql.f
@@ -95,8 +95,6 @@
*> \verbatim
*> LWORK is INTEGER
*> The dimension of the array WORK. LWORK >= max(1,N).
-*> For optimum performance LWORK >= N*NB, where NB is the
-*> optimal blocksize.
*>
*> If LWORK = -1, then a workspace query is assumed; the routine
*> only calculates the optimal size of the WORK array, returns
@@ -136,18 +134,14 @@ SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
* ..
*
* =====================================================================
-*
-* .. Parameters ..
- COMPLEX*16 ZERO
- PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) )
-* ..
+*
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT,
- $ NB, NBMIN, NX
+ INTEGER I, IB, IINFO, IWS, KK, LWKOPT, NB, NBMIN,
+ $ NX
* ..
* .. External Subroutines ..
- EXTERNAL XERBLA, ZLARFB, ZLARFT, ZUNG2L
+ EXTERNAL XERBLA, ZLARFB0C2, ZLARFT, ZUNG2L
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -177,7 +171,11 @@ SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
LWKOPT = 1
ELSE
NB = ILAENV( 1, 'ZUNGQL', ' ', M, N, K, -1 )
- LWKOPT = N*NB
+*
+* Only need a workspace for zung2l in case of bailout
+* and for the panel factorization
+*
+ LWKOPT = N
END IF
WORK( 1 ) = LWKOPT
*
@@ -200,88 +198,77 @@ SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
END IF
*
NBMIN = 2
- NX = 0
+ NX = MAX( 0, ILAENV( 3, 'ZUNGQL', ' ', M, N, K, -1 ))
IWS = N
- IF( NB.GT.1 .AND. NB.LT.K ) THEN
-*
-* Determine when to cross over from blocked to unblocked code.
*
- NX = MAX( 0, ILAENV( 3, 'ZUNGQL', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
+ IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Determine if workspace is large enough for blocked code.
+* We use blocked code for the entire construction
*
- LDWORK = N
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
+ KK = K
+ ELSE
+ KK = 0
+ END IF
*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
+* Possibly bail to the unblocked code.
*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'ZUNGQL', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
+ IF( KK.EQ.0 ) THEN
+ CALL ZUNG2L( M, N, K, A, LDA, TAU, WORK, IINFO )
END IF
*
- IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+ IF( KK.GT.0 ) THEN
*
-* Use blocked code after the first block.
-* The last kk columns are handled by the block method.
+* Factor the first block assuming that our first application
+* will be on the Identity matrix
*
- KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
+ I = 1
+ IB = NB
*
-* Set A(m-kk+1:m,1:n-kk) to zero.
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- DO 20 J = 1, N - KK
- DO 10 I = M - KK + 1, M
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
- ELSE
- KK = 0
- END IF
+ CALL ZLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+ $ A( 1, N-K+I ), LDA, TAU( I ),
+ $ A( M-K+I, N-K+I ), LDA)
*
-* Use unblocked code for the first or only block.
+* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+* Exploit the fact that we are applying to an identity
*
- CALL ZUNG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+ CALL ZLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward',
+ $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I),
+ $ LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
*
- IF( KK.GT.0 ) THEN
+* Apply H to rows 1:m-k+i+ib-1 of current block
+*
+ CALL ZUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
+ $ TAU( I ), WORK, IINFO )
+
+* Use blocked code on the remaining blocks if there are any.
+*
+ DO I = NB+1, K, NB
*
-* Use blocked code
+* The last block may be less than size NB
*
- DO 50 I = K - KK + 1, K, NB
- IB = MIN( NB, K-I+1 )
- IF( N-K+I.GT.1 ) THEN
+ IB = MIN(NB, K-I+1)
*
-* Form the triangular factor of the block reflector
-* H = H(i+ib-1) . . . H(i+1) H(i)
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- CALL ZLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
- $ A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK )
+ CALL ZLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+ $ A( 1, N-K+I ), LDA, TAU( I ),
+ $ A( M-K+I, N-K+I ), LDA )
*
-* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
*
- CALL ZLARFB( 'Left', 'No transpose', 'Backward',
- $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
- $ A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA,
- $ WORK( IB+1 ), LDWORK )
- END IF
+ CALL ZLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
+ $ A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
*
* Apply H to rows 1:m-k+i+ib-1 of current block
*
CALL ZUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
$ TAU( I ), WORK, IINFO )
-*
-* Set rows m-k+i+ib:m of current block to zero
-*
- DO 40 J = N - K + I, N - K + I + IB - 1
- DO 30 L = M - K + I + IB, M
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ END DO
END IF
*
WORK( 1 ) = IWS
diff --git a/SRC/zungqr.f b/SRC/zungqr.f
index 6c9f2e3ff..50d5d91a2 100644
--- a/SRC/zungqr.f
+++ b/SRC/zungqr.f
@@ -95,8 +95,6 @@
*> \verbatim
*> LWORK is INTEGER
*> The dimension of the array WORK. LWORK >= max(1,N).
-*> For optimum performance LWORK >= N*NB, where NB is the
-*> optimal blocksize.
*>
*> If LWORK = -1, then a workspace query is assumed; the routine
*> only calculates the optimal size of the WORK array, returns
@@ -137,17 +135,13 @@ SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
* =====================================================================
*
-* .. Parameters ..
- COMPLEX*16 ZERO
- PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) )
-* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
- $ LWKOPT, NB, NBMIN, NX
+ INTEGER I, IB, IINFO, IWS, KI, KK, LWKOPT, NB,
+ $ NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL XERBLA, ZLARFB, ZLARFT, ZUNG2R
+ EXTERNAL XERBLA, ZLARFB0C2, ZLARFT, ZUNG2R
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -162,7 +156,11 @@ SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
INFO = 0
NB = ILAENV( 1, 'ZUNGQR', ' ', M, N, K, -1 )
- LWKOPT = MAX( 1, N )*NB
+*
+* Only need a workspace for zung2r in case of bailout
+* and for the panel factorization
+*
+ LWKOPT = MAX( 1, N )
WORK( 1 ) = LWKOPT
LQUERY = ( LWORK.EQ.-1 )
IF( M.LT.0 ) THEN
@@ -191,92 +189,102 @@ SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
END IF
*
NBMIN = 2
- NX = 0
+* Determine when to cross over from unblocked to blocked
+ NX = MAX( 0, ILAENV( 3, 'ZUNGQR', ' ', M, N, K, -1 ) )
IWS = N
- IF( NB.GT.1 .AND. NB.LT.K ) THEN
*
-* Determine when to cross over from blocked to unblocked code.
-*
- NX = MAX( 0, ILAENV( 3, 'ZUNGQR', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
+ IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Determine if workspace is large enough for blocked code.
+* Treat the last NB block starting at KK+1 specially then use our blocking
+* method from the block starting at KI+1 to 1
*
- LDWORK = N
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
+ KI = K - 2 * NB
+ KK = K - NB
+ ELSE
+ KK = 0
+ END IF
*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
+* Potentially bail to the unblocked code.
*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'ZUNGQR', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
+ IF( KK.EQ.0 ) THEN
+ CALL ZUNG2R( M, N, K, A, LDA, TAU, WORK, IINFO )
END IF
*
- IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+ IF( KK.GT.0 ) THEN
*
-* Use blocked code after the last block.
-* The first kk columns are handled by the block method.
+* Factor the last block assuming that our first application
+* will be on the Identity matrix
*
- KI = ( ( K-NX-1 ) / NB )*NB
- KK = MIN( K, KI+NB )
+ I = KK + 1
+ IB = NB
*
-* Set A(1:kk,kk+1:n) to zero.
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- DO 20 J = KK + 1, N
- DO 10 I = 1, KK
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
- ELSE
- KK = 0
- END IF
+ CALL ZLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
*
-* Use unblocked code for the last or only block.
+* Apply H to A(i:m,i+ib:n) from the left
+* Exploit the fact that we are applying to an identity
*
- IF( KK.LT.N )
- $ CALL ZUNG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
- $ TAU( KK+1 ), WORK, IINFO )
+ CALL ZLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward',
+ $ 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I),
+ $ LDA, A(I,I+IB), LDA)
*
- IF( KK.GT.0 ) THEN
+* Apply H to rows i:m of current block
*
-* Use blocked code
+ CALL ZUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, IINFO)
*
- DO 50 I = KI + 1, 1, -NB
- IB = MIN( NB, K-I+1 )
- IF( I+IB.LE.N ) THEN
+* Use our standard blocking method after the last block
*
-* Form the triangular factor of the block reflector
-* H = H(i) H(i+1) . . . H(i+ib-1)
+ DO I = KI + 1, 1, -NB
+ IB = NB
*
- CALL ZLARFT( 'Forward', 'Columnwise', M-I+1, IB,
- $ A( I, I ), LDA, TAU( I ), WORK, LDWORK )
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
-* Apply H to A(i:m,i+ib:n) from the left
+ CALL ZLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
*
- CALL ZLARFB( 'Left', 'No transpose', 'Forward',
- $ 'Columnwise', M-I+1, N-I-IB+1, IB,
- $ A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ),
- $ LDA, WORK( IB+1 ), LDWORK )
- END IF
+* Apply H to A(i:m,i+ib:n) from the left
+*
+ CALL ZLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
*
* Apply H to rows i:m of current block
*
- CALL ZUNG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ),
- $ WORK,
- $ IINFO )
+ CALL ZUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
+ $ IINFO)
+ END DO
+*
+* This checks for if K was a perfect multiple of NB
+* so that we only have a special case for the last block when
+* necessary
+*
+ IF(I.LT.1) THEN
+ IB = I + NB - 1
+ I = 1
*
-* Set rows 1:i-1 of current block to zero
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- DO 40 J = I, I + IB - 1
- DO 30 L = 1, I - 1
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ CALL ZLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
+*
+* Apply H to A(i:m,i+ib:n) from the left
+*
+ CALL ZLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
+*
+* Apply H to rows i:m of current block
+*
+ CALL ZUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK,
+ $ IINFO)
+ END IF
END IF
*
WORK( 1 ) = IWS
diff --git a/SRC/zungrq.f b/SRC/zungrq.f
index dcc2772d6..939fff3fc 100644
--- a/SRC/zungrq.f
+++ b/SRC/zungrq.f
@@ -137,13 +137,9 @@ SUBROUTINE ZUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
* =====================================================================
*
-* .. Parameters ..
- COMPLEX*16 ZERO
- PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) )
-* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, II, IINFO, IWS, J, KK, L, LDWORK,
+ INTEGER I, IB, II, IINFO, IWS, KK, LDWORK,
$ LWKOPT, NB, NBMIN, NX
* ..
* .. External Subroutines ..
@@ -230,61 +226,67 @@ SUBROUTINE ZUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
* Use blocked code after the first block.
* The last kk rows are handled by the block method.
*
- KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
-*
-* Set A(1:m-kk,n-kk+1:n) to zero.
-*
- DO 20 J = N - KK + 1, N
- DO 10 I = 1, M - KK
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
+ KK = K
ELSE
KK = 0
END IF
*
-* Use unblocked code for the first or only block.
+* Potentially bail to the unblocked code
*
- CALL ZUNGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+ IF( KK.EQ.0 ) THEN
+ CALL ZUNGR2( M, N, K, A, LDA, TAU, WORK, IINFO )
+ END IF
*
IF( KK.GT.0 ) THEN
*
-* Use blocked code
+* Factor the first block assuming that our first application
+* will be on the Identity matrix
*
- DO 50 I = K - KK + 1, K, NB
- IB = MIN( NB, K-I+1 )
- II = M - K + I
- IF( II.GT.1 ) THEN
+ I = 1
+ IB = NB
+ II = M - K + I
*
-* Form the triangular factor of the block reflector
-* H = H(i+ib-1) . . . H(i+1) H(i)
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- CALL ZLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
- $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+ CALL ZLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+ $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
*
-* Apply H**H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
*
- CALL ZLARFB( 'Right', 'Conjugate transpose',
- $ 'Backward',
- $ 'Rowwise', II-1, N-K+I+IB-1, IB, A( II, 1 ),
- $ LDA, WORK, LDWORK, A, LDA, WORK( IB+1 ),
- $ LDWORK )
- END IF
+ CALL ZLARFB0C2(.TRUE., 'Right', 'Conjugate', 'Backward',
+ $ 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK,
+ $ LDWORK, A, LDA)
*
-* Apply H**H to columns 1:n-k+i+ib-1 of current block
+* Apply H**T to columns 1:n-k+i+ib-1 of current block
*
- CALL ZUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
- $ TAU( I ),
- $ WORK, IINFO )
+ CALL ZUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
+ $ TAU( I ), WORK, IINFO )
+
+ DO I = NB+1, K, NB
*
-* Set columns n-k+i+ib:n of current block to zero
+* The last block may be less than size NB
*
- DO 40 L = N - K + I + IB, N
- DO 30 J = II, II + IB - 1
- A( J, L ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ IB = MIN(NB, K-I+1)
+ II = M - K + I
+*
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
+*
+ CALL ZLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
+ $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+*
+* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+*
+ CALL ZLARFB0C2(.FALSE., 'Right', 'Conjugate',
+ $ 'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB,
+ $ A(II,1), LDA, WORK, LDWORK, A, LDA)
+*
+* Apply H**T to columns 1:n-k+i+ib-1 of current block
+*
+ CALL ZUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
+ $ TAU( I ), WORK, IINFO )
+ END DO
END IF
*
WORK( 1 ) = IWS