diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index be426cecd..24740835b 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -106,7 +106,7 @@ set(SLASRC slaqgb.f slaqge.f slaqp2.f slaqps.f slaqp2rk.f slaqp3rk.f slaqsb.f slaqsp.f slaqsy.f slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f - slarf.f slarf1f.f slarf1l.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f + slarf.f slarf1f.f slarf1l.f slarfb.f slarfb0c2.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f slarmm.f slarrv.f slartv.f slarz.f slarzb.f slarzt.f slasy2.f slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f @@ -218,7 +218,7 @@ set(CLASRC claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqp2rk.f claqp3rk.f claqsb.f claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f - clarf.f clarf1f.f clarf1l.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f + clarf.f clarf1f.f clarf1l.f clarfb.f clarfb0c2.f clarfb_gett.f clarfg.f clarfgp.f clarft.f clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f90 clartv.f clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f90 claswp.f clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f @@ -307,7 +307,7 @@ set(DLASRC dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqp2rk.f dlaqp3rk.f dlaqsb.f dlaqsp.f dlaqsy.f dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f - dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f dlarf1f.f dlarf1l.f + dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f dlarf1f.f dlarf1l.f dlarfb0c2.f dlargv.f dlarmm.f dlarrv.f dlartv.f dlarz.f dlarzb.f dlarzt.f dlaswp.f dlasy2.f dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f @@ -418,7 +418,7 @@ set(ZLASRC zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqp2rk.f zlaqp3rk.f zlaqsb.f zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f - zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f zlarf1f.f zlarf1l.f + zlarcm.f zlarf.f zlarfb.f zlarfb0c2.f zlarfb_gett.f zlarf1f.f zlarf1l.f zlarfg.f zlarfgp.f zlarft.f zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f90 zlartv.f zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f diff --git a/SRC/Makefile b/SRC/Makefile index 0191626f0..b9eaf1bca 100644 --- a/SRC/Makefile +++ b/SRC/Makefile @@ -137,7 +137,7 @@ SLASRC = \ slaqgb.o slaqge.o slaqp2.o slaqps.o slaqp2rk.o slaqp3rk.o slaqsb.o slaqsp.o slaqsy.o \ slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \ slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \ - slarf.o slarf1f.o slarf1l.o slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o \ + slarf.o slarf1f.o slarf1l.o slarfb.o slarfb0c2.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o \ slargv.o slarmm.o slarrv.o slartv.o \ slarz.o slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o slasyf_rook.o \ slasyf_rk.o \ @@ -249,7 +249,7 @@ CLASRC = \ claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqp2rk.o claqp3rk.o claqsb.o \ claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \ claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \ - clarf.o clarf1f.o clarf1l.o clarfb.o clarfb_gett.o clarfg.o clarft.o clarfgp.o \ + clarf.o clarf1f.o clarf1l.o clarfb.o clarfb0c2.o clarfb_gett.o clarfg.o clarft.o clarfgp.o \ clarfx.o clarfy.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \ clarz.o clarzb.o clarzt.o clascl.o claset.o clasr.o classq.o \ claswp.o clasyf.o clasyf_rook.o clasyf_rk.o clasyf_aa.o \ @@ -339,7 +339,7 @@ DLASRC = \ dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqp2rk.o dlaqp3rk.o dlaqsb.o dlaqsp.o dlaqsy.o \ dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \ dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \ - dlarf.o dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o dlarf1f.o dlarf1l.o\ + dlarf.o dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o dlarf1f.o dlarf1l.o dlarfb0c2.o\ dlargv.o dlarmm.o dlarrv.o dlartv.o \ dlarz.o dlarzb.o dlarzt.o dlaswp.o dlasy2.o \ dlasyf.o dlasyf_rook.o dlasyf_rk.o \ @@ -453,7 +453,7 @@ ZLASRC = \ zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqp2rk.o zlaqp3rk.o zlaqsb.o \ zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \ zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \ - zlarcm.o zlarf.o zlarfb.o zlarfb_gett.o zlarf1f.o zlarf1l.o \ + zlarcm.o zlarf.o zlarfb.o zlarfb0c2.o zlarfb_gett.o zlarf1f.o zlarf1l.o \ zlarfg.o zlarft.o zlarfgp.o \ zlarfx.o zlarfy.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \ zlarz.o zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \ diff --git a/SRC/clarfb0c2.f b/SRC/clarfb0c2.f new file mode 100644 index 000000000..4a7a2018d --- /dev/null +++ b/SRC/clarfb0c2.f @@ -0,0 +1,561 @@ +*> \brief \b CLARFB0C2 applies a block reflector or its conjugate-transpose +* to a rectangular matrix with a 0 block while constructing the explicit Q +* factor +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* +* Definition: +* =========== +* +* SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, +* $ K, V, LDV, T, LDT, C, LDC) +* ! Scalar arguments +* INTEGER M, N, K, LDV, LDC, LDT +* CHARACTER SIDE, TRANS, DIRECT, STOREV +* ! True means that we are assuming C2 is the identity matrix +* ! and thus don't reference whatever is present in C2 +* ! at the beginning. +* LOGICAL C2I +* ! Array arguments +* COMPLEX V(LDV,*), C(LDC,*), T(LDT,*) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLARFB0C2 applies a real block reflector H or its transpose H**H to a +*> complex m by n matrix C with a 0 block, while computing the explicit Q factor +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] C2I +*> \verbatim +*> C2I is LOGICAL +*> = .TRUE.: Assume the nonzero block of C is the identity matrix +*> = .FALSE.: Use existing data in the nonzero block of C +*> \endverbatim +*> +*> \param[in] SIDE +*> \verbatim +*> SIDE is CHARACTER*1 +*> = 'L': apply H or H**H from the Left +*> = 'R': apply H or H**H from the Right +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': apply H (No transpose) +*> = 'C': apply H**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] DIRECT +*> \verbatim +*> DIRECT is CHARACTER*1 +*> Indicates how H is formed from a product of elementary +*> reflectors +*> = 'F': H = H(1) H(2) . . . H(k) (Forward) +*> = 'B': H = H(k) . . . H(2) H(1) (Backward) +*> \endverbatim +*> +*> \param[in] STOREV +*> \verbatim +*> STOREV is CHARACTER*1 +*> Indicates how the vectors which define the elementary +*> reflectors are stored: +*> = 'C': Columnwise +*> = 'R': Rowwise +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix C. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix C. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The order of the matrix T (= the number of elementary +*> reflectors whose product defines the block reflector). +*> If SIDE = 'L', M >= K >= 0; +*> if SIDE = 'R', N >= K >= 0. +*> \endverbatim +*> +*> \param[in] V +*> \verbatim +*> V is COMPLEX array, dimension +*> (LDV,K) if STOREV = 'C' +*> (LDV,M) if STOREV = 'R' and SIDE = 'L' +*> (LDV,N) if STOREV = 'R' and SIDE = 'R' +*> See Further Details. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V. +*> If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M); +*> if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N); +*> if STOREV = 'R', LDV >= K. +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX array, dimension (LDT,K) +*> The triangular K-by-K matrix T in the representation of the +*> block reflector. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is COMPLEX array, dimension (LDC,N) +*> On entry, the M-by-N matrix C. +*> On exit, C is overwritten by H*C or H**H*C or C*H or C*H**H. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup larfb +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The shape of the matrix V and the storage of the vectors which define +*> the H(i) is best illustrated by the following example with n = 5 and +*> k = 3. The triangular part of V (including its diagonal) is not +*> referenced. +*> +*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': +*> +*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 ) +*> ( v1 1 ) ( 1 v2 v2 v2 ) +*> ( v1 v2 1 ) ( 1 v3 v3 ) +*> ( v1 v2 v3 ) +*> ( v1 v2 v3 ) +*> +*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': +*> +*> V = ( v1 v2 v3 ) V = ( v1 v1 1 ) +*> ( v1 v2 v3 ) ( v2 v2 v2 1 ) +*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 ) +*> ( 1 v3 ) +*> ( 1 ) +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, + $ K, V, LDV, T, LDT, C, LDC) + ! Scalar arguments + INTEGER M, N, K, LDV, LDC, LDT + CHARACTER SIDE, TRANS, DIRECT, STOREV + ! True means that we are assuming C2 is the identity matrix + ! and thus don't reference whatever is present in C2 + ! at the beginning. + LOGICAL C2I + + ! Array arguments + COMPLEX V(LDV,*), C(LDC,*), T(LDT,*) + ! Local scalars + LOGICAL QR, LQ, QL, DIRF, COLV, SIDEL, SIDER, + $ TRANST + INTEGER I, J + ! Intrinsic Functions + INTRINSIC CONJG + ! External functions + LOGICAL LSAME + EXTERNAL LSAME + ! External Subroutines + EXTERNAL CGEMM, CTRMM + ! Parameters + COMPLEX ONE, ZERO, NEG_ONE + PARAMETER(ONE=(1.0E+0, 0.0E+0), + $ ZERO = (0.0E+0, 0.0E+0), + $ NEG_ONE = (-1.0E+0, 0.0E+0)) + + ! Beginning of executable statements + ! Convert our character flags to logical values + DIRF = LSAME(DIRECT,'F') + COLV = LSAME(STOREV,'C') + SIDEL = LSAME(SIDE,'L') + SIDER = LSAME(SIDE,'R') + TRANST = LSAME(TRANS,'C') + + ! Determine which of the 4 modes are using. + ! QR is when we store the reflectors column by column and have the + ! 'first' reflector stored in the first column + QR = DIRF.AND.COLV + + ! LQ is when we store the reflectors row by row and have the + ! 'first' reflector stored in the first row + LQ = DIRF.AND.(.NOT.COLV) + + ! QL is when we store the reflectors column by column and have the + ! 'first' reflector stored in the last column + QL = (.NOT.DIRF).AND.COLV + + ! RQ is when we store the reflectors row by row and have the + ! 'first' reflector stored in the last row + ! RQ = (.NOT.DIRF).AND.(.NOT.COLV) + ! Since we have exactly one of these 4 modes, we don't need to actually + ! store the value of RQ, instead we assume this is the case if we fail + ! the above 3 checks. + + IF (QR) THEN + ! We are computing C = HC = (I - VTV')C + ! Where: V = [ V1 ] and C = [ C1 ] + ! [ V2 ] [ C2 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{M-K\times K} + ! C1=0\in\R^{K\times N} + ! C2\in\R^{M-K\times N} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0 ] + ! [ V2 ] [ C2 ] + ! = [ 0 ] - [ V1 ]T*V2'*C2 + ! [ C2 ] [ V2 ] + ! + ! = [ 0 ] - [ V1*T*V2'*C2 ] + ! [ C2 ] [ V2*T*V2'*C2 ] + ! + ! = [ V1*T*V2'*C2 ] + ! [ C2 - V2*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('CLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('CLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = V2'*C2 + ! + IF (C2I) THEN + DO J = 1, N + DO I = 1, K + C(I,J) = CONJG(V(K+J,I)) + END DO + END DO + ELSE + CALL CGEMM('Conjugate', 'No Transpose', K, N, M - K, + $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO, + $ C, LDC) + END IF + ! + ! C1 = T*C1 + ! + CALL CTRMM('Left', 'Upper', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C, LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! + IF (C2I) THEN + CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO, + $ C(K+1,1), LDC) + DO I = 1, N + C(K+I,I) = C(K+I,I) + ONE + END DO + ELSE + CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE, + $ C(K+1,1), LDC) + END IF + ! + ! C1 = -V1*C1 + ! + CALL CTRMM('Left', 'Lower', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V, LDV, C, LDC) + ELSE IF (LQ) THEN + ! We are computing C = CH' = C(I-V'T'V) + ! Where: V = [ V1 V2 ] and C = [ C1 C2 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{K\times N-K} + ! C1=0\in\R^{M\times K} + ! C2\in\R^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ]) + ! [ V2' ] + ! + ! = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ] + ! [ V2' ] + ! + ! = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ] + ! + ! = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*T' + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('CLARFB0C2', 2) + RETURN + ELSE IF(.NOT.TRANST) THEN + CALL XERBLA('CLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = C2*V2' + ! + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,J) = CONJG(V(J,K+I)) + END DO + END DO + ELSE + CALL CGEMM('No Transpose', 'Conjugate', M, K, N-K, + $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C, + $ LDC) + END IF + ! + ! C1 = C1*T' + ! + CALL CTRMM('Right', 'Upper', 'Conjugate', 'Non-unit', + $ M, K, ONE, T, LDT, C, LDC) + ! + ! C2 = C2 - C1*V2 = -C1*V2 + C2 + ! + IF( C2I ) THEN + CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1), + $ LDC) + DO I = 1, M + C(I,K+I) = C(I,K+I) + ONE + END DO + ELSE + CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1), + $ LDC) + END IF + ! + ! C1 = -C1*V1 + ! + CALL CTRMM('Right', 'Upper', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V, LDV, C, LDC) + ELSE IF (QL) THEN + ! We are computing C = HC = (I - VTV')C + ! Where: V = [ V2 ] and C = [ C2 ] + ! [ V1 ] [ C1 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{M-K\times K} + ! C1=0\in\R^{K\times N} + ! C2\in\R^{M-K\times N} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ] + ! [ V1 ] [ 0 ] + ! + ! = [ C2 ] - [ V2 ]T*V2'*C2 + ! [ 0 ] [ V1 ] + ! + ! = [ C2 ] - [ V2*T*V2'*C2 ] + ! [ 0 ] [ V1*T*V2'*C2 ] + ! + ! = [ C2 - V2*T*V2'*C2 ] + ! [ - V1*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('CLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('CLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = V2'*C2 + ! + IF( C2I ) THEN + DO J = 1, N + DO I = 1, K + C(M-K+I,J) = CONJG(V(J,I)) + END DO + END DO + ELSE + CALL CGEMM('Conjugate', 'No Transpose', K, N, M-K, + $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC) + END IF + ! + ! C1 = T*C1 + ! + CALL CTRMM('Left', 'Lower', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C(M-K+1,1), LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! + IF( C2I ) THEN + CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC) + DO I = 1, N + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC) + END IF + ! + ! C1 = -V1*C1 + ! + CALL CTRMM('Left', 'Upper', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC) + ELSE ! IF (RQ) THEN + ! We are computing C = CH' = C(I-V'T'V) + ! Where: V = [ V2 V1] and C = [ C2 C1 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{K\times N-K} + ! C1=0\in\R^{M\times K} + ! C2\in\R^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ] + ! + ! = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ] + ! + ! = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*T' + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('CLARFB0C2', 2) + RETURN + ELSE IF(.NOT.TRANST) THEN + CALL XERBLA('CLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = C2*V2' + ! + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,N-K+J) = CONJG(V(J,I)) + END DO + END DO + ELSE + CALL CGEMM('No Transpose', 'Conjugate', M, K, N-K, + $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC) + END IF + ! + ! C1 = C1*T' + ! + CALL CTRMM('Right', 'Lower', 'Conjugate', 'Non-unit', + $ M, K, ONE, T, LDT, C(1, N-K+1), LDC) + ! + ! C2 = C2 - C1*V2 = -C1*V2 + C2 + ! + IF( C2I ) THEN + CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC) + DO I = 1, M + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC) + END IF + ! + ! C1 = -C1*V1 + ! + CALL CTRMM('Right', 'Lower', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC) + END IF + END SUBROUTINE diff --git a/SRC/cunglq.f b/SRC/cunglq.f index 7f42c3dcb..a57f5a037 100644 --- a/SRC/cunglq.f +++ b/SRC/cunglq.f @@ -136,17 +136,13 @@ SUBROUTINE CUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - COMPLEX ZERO - PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK, - $ LWKOPT, NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KI, KK, LWKOPT, LDWORK, + $ NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL CLARFB, CLARFT, CUNGL2, XERBLA + EXTERNAL CLARFB0C2, CLARFT, CUNGL2, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -218,67 +214,95 @@ SUBROUTINE CUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Use blocked code after the last block. -* The first kk rows are handled by the block method. -* - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) -* -* Set A(kk+1:m,1:kk) to zero. +* Treat the last NB block starting at KK+1 specially then use our blocking +* method from the block starting at KI+1 to 1 * - DO 20 J = 1, KK - DO 10 I = KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE + KI = K - 2 * NB + KK = K - NB ELSE KK = 0 END IF * -* Use unblocked code for the last or only block. +* Potentially bail to the unblocked version * - IF( KK.LT.M ) - $ CALL CUNGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) + IF( KK.EQ.0 ) THEN + CALL CUNGL2( M, N, K, A, LDA, TAU, WORK, IINFO ) + END IF * IF( KK.GT.0 ) THEN * -* Use blocked code +* Factor the last block assuming that our first application +* will be on the Identity matrix * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.M ) THEN + I = KK + 1 + IB = NB * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . H(i+ib-1) +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - CALL CLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, - $ I ), - $ LDA, TAU( I ), WORK, LDWORK ) + CALL CLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) * -* Apply H**H to A(i+ib:m,i:n) from the right +* Apply H**T to A(i+ib:m,i:n) from the right +* Exploit the fact that we are applying to an identity * - CALL CLARFB( 'Right', 'Conjugate transpose', - $ 'Forward', - $ 'Rowwise', M-I-IB+1, N-I+1, IB, A( I, I ), - $ LDA, WORK, LDWORK, A( I+IB, I ), LDA, - $ WORK( IB+1 ), LDWORK ) - END IF + CALL CLARFB0C2(.TRUE., 'Right', 'Conjugate', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) +* +* Apply H**T to columns i:n of current block + + CALL CUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), + $ WORK, IINFO ) +* +* Use our standard blocking method after the last block +* + DO I = KI + 1, 1, -NB + IB = NB +* +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * -* Apply H**H to columns i:n of current block + CALL CLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) * +* Apply H**T to A(i+ib:m,i:n) from the right +* + CALL CLARFB0C2(.FALSE., 'Right', 'Conjugate', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) +* +* Apply H**T to columns i:n of current block + CALL CUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) + $ WORK, IINFO ) + END DO +* +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary +* + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 +* +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * -* Set columns 1:i-1 of current block to zero + CALL CLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) * - DO 40 J = 1, I - 1 - DO 30 L = I, I + IB - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE +* Apply H**T to A(i+ib:m,i:n) from the right +* + CALL CLARFB0C2(.FALSE., 'Right', 'Conjugate', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) +* +* Apply H**T to columns i:n of current block + + CALL CUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), + $ WORK, IINFO ) + END IF END IF * WORK( 1 ) = SROUNDUP_LWORK(IWS) diff --git a/SRC/cungql.f b/SRC/cungql.f index 3da2702d8..9022d37ac 100644 --- a/SRC/cungql.f +++ b/SRC/cungql.f @@ -95,8 +95,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -137,17 +135,13 @@ SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - COMPLEX ZERO - PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT, - $ NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KK, LWKOPT, NB, NBMIN, + $ NX * .. * .. External Subroutines .. - EXTERNAL CLARFB, CLARFT, CUNG2L, XERBLA + EXTERNAL CLARFB0C2, CLARFT, CUNG2L, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -178,7 +172,11 @@ SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) LWKOPT = 1 ELSE NB = ILAENV( 1, 'CUNGQL', ' ', M, N, K, -1 ) - LWKOPT = N*NB +* +* Only need a workspace for cung2l in case of bailout +* and for the panel factorization +* + LWKOPT = N END IF WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * @@ -201,88 +199,77 @@ SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * NBMIN = 2 - NX = 0 + NX = MAX( 0, ILAENV( 3, 'CUNGQL', ' ', M, N, K, -1 )) IWS = N - IF( NB.GT.1 .AND. NB.LT.K ) THEN * -* Determine when to cross over from blocked to unblocked code. -* - NX = MAX( 0, ILAENV( 3, 'CUNGQL', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* We use blocked code for the entire construction * - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KK = K + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Possibly bail to the unblocked code. * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'CUNGQL', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL CUNG2L( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN * -* Use blocked code after the first block. -* The last kk columns are handled by the block method. +* Factor the first block assuming that our first application +* will be on the Identity matrix * - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) + I = 1 + IB = NB * -* Set A(m-kk+1:m,1:n-kk) to zero. +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - DO 20 J = 1, N - KK - DO 10 I = M - KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL CLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA) * -* Use unblocked code for the first or only block. +* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left +* Exploit the fact that we are applying to an identity * - CALL CUNG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) + CALL CLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward', + $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), + $ LDA, A( M-K+I, N-K+I ), LDA, A, LDA) * - IF( KK.GT.0 ) THEN +* Apply H to rows 1:m-k+i+ib-1 of current block * -* Use blocked code + CALL CUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA, + $ TAU( I ), WORK, IINFO ) + +* Use blocked code on the remaining blocks if there are any. * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) - IF( N-K+I.GT.1 ) THEN + DO I = NB+1, K, NB * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) +* The last block may be less than size NB * - CALL CLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, - $ A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK ) + IB = MIN(NB, K-I+1) * -* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - CALL CLARFB( 'Left', 'No transpose', 'Backward', - $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, - $ A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA, - $ WORK( IB+1 ), LDWORK ) - END IF + CALL CLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA ) +* +* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left +* + CALL CLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, + $ A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA) * * Apply H to rows 1:m-k+i+ib-1 of current block * CALL CUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA, $ TAU( I ), WORK, IINFO ) -* -* Set rows m-k+i+ib:m of current block to zero -* - DO 40 J = N - K + I, N - K + I + IB - 1 - DO 30 L = M - K + I + IB, M - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + END DO END IF * WORK( 1 ) = CMPLX( IWS ) diff --git a/SRC/cungqr.f b/SRC/cungqr.f index eb49d2fed..3d7aaceb9 100644 --- a/SRC/cungqr.f +++ b/SRC/cungqr.f @@ -95,8 +95,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -137,17 +135,13 @@ SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - COMPLEX ZERO - PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK, - $ LWKOPT, NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KI, KK, LWKOPT, NB, + $ NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL CLARFB, CLARFT, CUNG2R, XERBLA + EXTERNAL CLARFB0C2, CLARFT, CUNG2R, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -163,7 +157,11 @@ SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * INFO = 0 NB = ILAENV( 1, 'CUNGQR', ' ', M, N, K, -1 ) - LWKOPT = MAX( 1, N )*NB +* +* Only need a workspace for zung2r in case of bailout +* and for the panel factorization +* + LWKOPT = MAX( 1, N ) WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN @@ -192,92 +190,102 @@ SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * NBMIN = 2 - NX = 0 +* Determine when to cross over from unblocked to blocked + NX = MAX( 0, ILAENV( 3, 'CUNGQR', ' ', M, N, K, -1 ) ) IWS = N - IF( NB.GT.1 .AND. NB.LT.K ) THEN * -* Determine when to cross over from blocked to unblocked code. -* - NX = MAX( 0, ILAENV( 3, 'CUNGQR', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* Treat the last NB block starting at KK+1 specially then use our blocking +* method from the block starting at KI+1 to 1 * - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KI = K - 2 * NB + KK = K - NB + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Potentially bail to the unblocked code. * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'CUNGQR', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL CUNG2R( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN * -* Use blocked code after the last block. -* The first kk columns are handled by the block method. +* Factor the last block assuming that our first application +* will be on the Identity matrix * - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) + I = KK + 1 + IB = NB * -* Set A(1:kk,kk+1:n) to zero. +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - DO 20 J = KK + 1, N - DO 10 I = 1, KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL CLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * -* Use unblocked code for the last or only block. +* Apply H to A(i:m,i+ib:n) from the left +* Exploit the fact that we are applying to an identity * - IF( KK.LT.N ) - $ CALL CUNG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) + CALL CLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward', + $ 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I), + $ LDA, A(I,I+IB), LDA) * - IF( KK.GT.0 ) THEN +* Apply H to rows i:m of current block * -* Use blocked code + CALL CUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, IINFO) * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.N ) THEN +* Use our standard blocking method after the last block * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . H(i+ib-1) + DO I = KI + 1, 1, -NB + IB = NB * - CALL CLARFT( 'Forward', 'Columnwise', M-I+1, IB, - $ A( I, I ), LDA, TAU( I ), WORK, LDWORK ) +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * -* Apply H to A(i:m,i+ib:n) from the left + CALL CLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * - CALL CLARFB( 'Left', 'No transpose', 'Forward', - $ 'Columnwise', M-I+1, N-I-IB+1, IB, - $ A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ), - $ LDA, WORK( IB+1 ), LDWORK ) - END IF +* Apply H to A(i:m,i+ib:n) from the left +* + CALL CLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) + * * Apply H to rows i:m of current block * - CALL CUNG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) + CALL CUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, + $ IINFO) + END DO +* +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary +* + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 * -* Set rows 1:i-1 of current block to zero +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - DO 40 J = I, I + IB - 1 - DO 30 L = 1, I - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + CALL CLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) +* +* Apply H to A(i:m,i+ib:n) from the left +* + CALL CLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) + +* +* Apply H to rows i:m of current block +* + CALL CUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, + $ IINFO) + END IF END IF * WORK( 1 ) = SROUNDUP_LWORK(IWS) diff --git a/SRC/cungrq.f b/SRC/cungrq.f index 7e812b839..4a9b2fe51 100644 --- a/SRC/cungrq.f +++ b/SRC/cungrq.f @@ -137,17 +137,13 @@ SUBROUTINE CUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - COMPLEX ZERO - PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, II, IINFO, IWS, J, KK, L, LDWORK, + INTEGER I, IB, II, IINFO, IWS, KK, LDWORK, $ LWKOPT, NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL CLARFB, CLARFT, CUNGR2, XERBLA + EXTERNAL CLARFB0C2, CLARFT, CUNGR2, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -231,61 +227,67 @@ SUBROUTINE CUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * Use blocked code after the first block. * The last kk rows are handled by the block method. * - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) -* -* Set A(1:m-kk,n-kk+1:n) to zero. -* - DO 20 J = N - KK + 1, N - DO 10 I = 1, M - KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE + KK = K ELSE KK = 0 END IF * -* Use unblocked code for the first or only block. +* Potentially bail to the unblocked code * - CALL CUNGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) + IF( KK.EQ.0 ) THEN + CALL CUNGR2( M, N, K, A, LDA, TAU, WORK, IINFO ) + END IF * IF( KK.GT.0 ) THEN * -* Use blocked code +* Factor the first block assuming that our first application +* will be on the Identity matrix * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) - II = M - K + I - IF( II.GT.1 ) THEN + I = 1 + IB = NB + II = M - K + I * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - CALL CLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, - $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) + CALL CLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) * -* Apply H**H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right +* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right * - CALL CLARFB( 'Right', 'Conjugate transpose', - $ 'Backward', - $ 'Rowwise', II-1, N-K+I+IB-1, IB, A( II, 1 ), - $ LDA, WORK, LDWORK, A, LDA, WORK( IB+1 ), - $ LDWORK ) - END IF + CALL CLARFB0C2(.TRUE., 'Right', 'Conjugate', 'Backward', + $ 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK, + $ LDWORK, A, LDA) * -* Apply H**H to columns 1:n-k+i+ib-1 of current block +* Apply H**T to columns 1:n-k+i+ib-1 of current block * - CALL CUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, - $ TAU( I ), - $ WORK, IINFO ) + CALL CUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, + $ TAU( I ), WORK, IINFO ) + + DO I = NB+1, K, NB * -* Set columns n-k+i+ib:n of current block to zero +* The last block may be less than size NB * - DO 40 L = N - K + I + IB, N - DO 30 J = II, II + IB - 1 - A( J, L ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + IB = MIN(NB, K-I+1) + II = M - K + I +* +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) +* + CALL CLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) +* +* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right +* + CALL CLARFB0C2(.FALSE., 'Right', 'Conjugate', + $ 'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, + $ A(II,1), LDA, WORK, LDWORK, A, LDA) +* +* Apply H**T to columns 1:n-k+i+ib-1 of current block +* + CALL CUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, + $ TAU( I ), WORK, IINFO ) + END DO END IF * WORK( 1 ) = SROUNDUP_LWORK(IWS) diff --git a/SRC/dlarfb0c2.f b/SRC/dlarfb0c2.f new file mode 100644 index 000000000..493580e09 --- /dev/null +++ b/SRC/dlarfb0c2.f @@ -0,0 +1,555 @@ +*> \brief \b DLARFB0C2 applies a block reflector or its transpose to a +* rectangular matrix with a 0 block while constructing the explicit Q factor +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* +* Definition: +* =========== +* +* SUBROUTINE DLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, +* $ K, V, LDV, T, LDT, C, LDC) +* ! Scalar arguments +* INTEGER M, N, K, LDV, LDC, LDT +* CHARACTER SIDE, TRANS, DIRECT, STOREV +* ! True means that we are assuming C2 is the identity matrix +* ! and thus don't reference whatever is present in C2 +* ! at the beginning. +* LOGICAL C2I +* ! Array arguments +* DOUBLE PRECISION V(LDV,*), C(LDC,*), T(LDT,*) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLARFB0C2 applies a real block reflector H or its transpose H**T to a +*> real m by n matrix C with a 0 block, while computing the explicit Q factor +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] C2I +*> \verbatim +*> C2I is LOGICAL +*> = .TRUE.: Assume the nonzero block of C is the identity matrix +*> = .FALSE.: Use existing data in the nonzero block of C +*> \endverbatim +*> +*> \param[in] SIDE +*> \verbatim +*> SIDE is CHARACTER*1 +*> = 'L': apply H or H**T from the Left +*> = 'R': apply H or H**T from the Right +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': apply H (No transpose) +*> = 'T': apply H**T (Transpose) +*> \endverbatim +*> +*> \param[in] DIRECT +*> \verbatim +*> DIRECT is CHARACTER*1 +*> Indicates how H is formed from a product of elementary +*> reflectors +*> = 'F': H = H(1) H(2) . . . H(k) (Forward) +*> = 'B': H = H(k) . . . H(2) H(1) (Backward) +*> \endverbatim +*> +*> \param[in] STOREV +*> \verbatim +*> STOREV is CHARACTER*1 +*> Indicates how the vectors which define the elementary +*> reflectors are stored: +*> = 'C': Columnwise +*> = 'R': Rowwise +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix C. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix C. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The order of the matrix T (= the number of elementary +*> reflectors whose product defines the block reflector). +*> If SIDE = 'L', M >= K >= 0; +*> if SIDE = 'R', N >= K >= 0. +*> \endverbatim +*> +*> \param[in] V +*> \verbatim +*> V is DOUBLE PRECISION array, dimension +*> (LDV,K) if STOREV = 'C' +*> (LDV,M) if STOREV = 'R' and SIDE = 'L' +*> (LDV,N) if STOREV = 'R' and SIDE = 'R' +*> The matrix V. See Further Details. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V. +*> If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M); +*> if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N); +*> if STOREV = 'R', LDV >= K. +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is DOUBLE PRECISION array, dimension (LDT,K) +*> The triangular k by k matrix T in the representation of the +*> block reflector. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension (LDC,N) +*> On entry, the m by n matrix C. +*> On exit, C is overwritten by H*C or H**T*C or C*H or C*H**T. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup larfb +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The shape of the matrix V and the storage of the vectors which define +*> the H(i) is best illustrated by the following example with n = 5 and +*> k = 3. The triangular part of V (including its diagonal) is not +*> referenced. +*> +*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': +*> +*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 ) +*> ( v1 1 ) ( 1 v2 v2 v2 ) +*> ( v1 v2 1 ) ( 1 v3 v3 ) +*> ( v1 v2 v3 ) +*> ( v1 v2 v3 ) +*> +*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': +*> +*> V = ( v1 v2 v3 ) V = ( v1 v1 1 ) +*> ( v1 v2 v3 ) ( v2 v2 v2 1 ) +*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 ) +*> ( 1 v3 ) +*> ( 1 ) +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE DLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, + $ K, V, LDV, T, LDT, C, LDC) + ! Scalar arguments + INTEGER M, N, K, LDV, LDC, LDT + CHARACTER SIDE, TRANS, DIRECT, STOREV + ! True means that we are assuming C2 is the identity matrix + ! and thus don't reference whatever is present in C2 + ! at the beginning. + LOGICAL C2I + ! Array arguments + DOUBLE PRECISION V(LDV,*), C(LDC,*), T(LDT,*) + ! Local scalars + LOGICAL QR, LQ, QL, DIRF, COLV, SIDEL, SIDER, + $ TRANST + INTEGER I, J + ! External functions + LOGICAL LSAME + EXTERNAL LSAME + ! External subroutines + EXTERNAL DGEMM, DTRMM, XERBLA + ! Parameters + DOUBLE PRECISION ONE, ZERO, NEG_ONE + PARAMETER(ONE=1.0D+0, ZERO = 0.0D+0, NEG_ONE = -1.0D+0) + + ! Beginning of executable statements + ! Convert our character flags to logical values + DIRF = LSAME(DIRECT,'F') + COLV = LSAME(STOREV,'C') + SIDEL = LSAME(SIDE,'L') + SIDER = LSAME(SIDE,'R') + TRANST = LSAME(TRANS,'T') + + ! Determine which of the 4 modes are using. + ! QR is when we store the reflectors column by column and have the + ! 'first' reflector stored in the first column + QR = DIRF.AND.COLV + + ! LQ is when we store the reflectors row by row and have the + ! 'first' reflector stored in the first row + LQ = DIRF.AND.(.NOT.COLV) + + ! QL is when we store the reflectors column by column and have the + ! 'first' reflector stored in the last column + QL = (.NOT.DIRF).AND.COLV + + ! RQ is when we store the reflectors row by row and have the + ! 'first' reflector stored in the last row + ! RQ = (.NOT.DIRF).AND.(.NOT.COLV) + ! Since we have exactly one of these 4 modes, we don't need to actually + ! store the value of RQ, instead we assume this is the case if we fail + ! the above 3 checks. + + IF (QR) THEN + ! We are computing C = HC = (I - VTV')C + ! Where: V = [ V1 ] and C = [ C1 ] + ! [ V2 ] [ C2 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{M-K\times K} + ! C1=0\in\R^{K\times N} + ! C2\in\R^{M-K\times N} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0 ] + ! [ V2 ] [ C2 ] + ! = [ 0 ] - [ V1 ]T*V2'*C2 + ! [ C2 ] [ V2 ] + ! + ! = [ 0 ] - [ V1*T*V2'*C2 ] + ! [ C2 ] [ V2*T*V2'*C2 ] + ! + ! = [ V1*T*V2'*C2 ] + ! [ C2 - V2*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('DLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('DLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = V2'*C2 + ! + IF (C2I) THEN + DO J = 1, N + DO I = 1, K + C(I,J) = V(K+J,I) + END DO + END DO + ELSE + CALL DGEMM('Transpose', 'No Transpose', K, N, M - K, + $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO, + $ C, LDC) + END IF + ! + ! C1 = T*C1 + ! + CALL DTRMM('Left', 'Upper', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C, LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! + IF (C2I) THEN + CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO, + $ C(K+1,1), LDC) + DO I = 1, N + C(K+I,I) = C(K+I,I) + ONE + END DO + ELSE + CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE, + $ C(K+1,1), LDC) + END IF + ! + ! C1 = -V1*C1 + ! + CALL DTRMM('Left', 'Lower', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V, LDV, C, LDC) + ELSE IF (LQ) THEN + ! We are computing C = CH' = C(I-V'T'V) + ! Where: V = [ V1 V2 ] and C = [ C1 C2 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{K\times N-K} + ! C1=0\in\R^{M\times K} + ! C2\in\R^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ]) + ! [ V2' ] + ! + ! = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ] + ! [ V2' ] + ! + ! = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ] + ! + ! = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*T' + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('DLARFB0C2', 2) + RETURN + ELSE IF(.NOT.TRANST) THEN + CALL XERBLA('DLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = C2*V2' + ! + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,J) = V(J,K+I) + END DO + END DO + ELSE + CALL DGEMM('No Transpose', 'Transpose', M, K, N-K, + $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C, + $ LDC) + END IF + ! + ! C1 = C1*T' + ! + CALL DTRMM('Right', 'Upper', 'Transpose', 'Non-unit', + $ M, K, ONE, T, LDT, C, LDC) + ! + ! C2 = C2 - C1*V2 = -C1*V2 + C2 + ! + IF( C2I ) THEN + CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1), + $ LDC) + DO I = 1, M + C(I,K+I) = C(I,K+I) + ONE + END DO + ELSE + CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1), + $ LDC) + END IF + ! + ! C1 = -C1*V1 + ! + CALL DTRMM('Right', 'Upper', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V, LDV, C, LDC) + ELSE IF (QL) THEN + ! We are computing C = HC = (I - VTV')C + ! Where: V = [ V2 ] and C = [ C2 ] + ! [ V1 ] [ C1 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{M-K\times K} + ! C1=0\in\R^{K\times N} + ! C2\in\R^{M-K\times N} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ] + ! [ V1 ] [ 0 ] + ! + ! = [ C2 ] - [ V2 ]T*V2'*C2 + ! [ 0 ] [ V1 ] + ! + ! = [ C2 ] - [ V2*T*V2'*C2 ] + ! [ 0 ] [ V1*T*V2'*C2 ] + ! + ! = [ C2 - V2*T*V2'*C2 ] + ! [ - V1*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('DLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('DLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = V2'*C2 + ! + IF( C2I ) THEN + DO J = 1, N + DO I = 1, K + C(M-K+I,J) = V(J,I) + END DO + END DO + ELSE + CALL DGEMM('Transpose', 'No Transpose', K, N, M-K, + $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC) + END IF + ! + ! C1 = T*C1 + ! + CALL DTRMM('Left', 'Lower', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C(M-K+1,1), LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! + IF( C2I ) THEN + CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC) + DO I = 1, N + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC) + END IF + ! + ! C1 = -V1*C1 + ! + CALL DTRMM('Left', 'Upper', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC) + ELSE ! IF (RQ) THEN + ! We are computing C = CH' = C(I-V'T'V) + ! Where: V = [ V2 V1] and C = [ C2 C1 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{K\times N-K} + ! C1=0\in\R^{M\times K} + ! C2\in\R^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ] + ! + ! = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ] + ! + ! = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*T' + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('DLARFB0C2', 2) + RETURN + ELSE IF(.NOT.TRANST) THEN + CALL XERBLA('DLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = C2*V2' + ! + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,N-K+J) = V(J,I) + END DO + END DO + ELSE + CALL DGEMM('No Transpose', 'Transpose', M, K, N-K, + $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC) + END IF + ! + ! C1 = C1*T' + ! + CALL DTRMM('Right', 'Lower', 'Transpose', 'Non-unit', + $ M, K, ONE, T, LDT, C(1, N-K+1), LDC) + ! + ! C2 = C2 - C1*V2 = -C1*V2 + C2 + ! + IF( C2I ) THEN + CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC) + DO I = 1, M + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC) + END IF + ! + ! C1 = -C1*V1 + ! + CALL DTRMM('Right', 'Lower', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC) + END IF + END SUBROUTINE diff --git a/SRC/dorglq.f b/SRC/dorglq.f index 71ff93fa6..c7f32afb8 100644 --- a/SRC/dorglq.f +++ b/SRC/dorglq.f @@ -5,6 +5,7 @@ * Online html documentation available at * http://www.netlib.org/lapack/explore-html/ * +*> \htmlonly *> Download DORGLQ + dependencies *> *> [TGZ] @@ -12,6 +13,7 @@ *> [ZIP] *> *> [TXT] +*> \endhtmlonly * * Definition: * =========== @@ -136,17 +138,13 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - DOUBLE PRECISION ZERO - PARAMETER ( ZERO = 0.0D+0 ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK, - $ LWKOPT, NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KI, KK, LDWORK, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL DLARFB, DLARFT, DORGL2, XERBLA + EXTERNAL DLARFB0C2, DLARFT, DORGL2, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -161,7 +159,7 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * INFO = 0 NB = ILAENV( 1, 'DORGLQ', ' ', M, N, K, -1 ) - LWKOPT = MAX( 1, M )*NB + LWKOPT = MAX( 1, M ) * NB WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN @@ -217,67 +215,90 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Use blocked code after the last block. -* The first kk rows are handled by the block method. -* - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) -* -* Set A(kk+1:m,1:kk) to zero. +* Handle the first block assuming we are applying to the +* identity, then resume regular blocking method after * - DO 20 J = 1, KK - DO 10 I = KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE + KI = K - 2 * NB + KK = K - NB ELSE KK = 0 END IF * -* Use unblocked code for the last or only block. +* Potentially bail to the unblocked version * - IF( KK.LT.M ) - $ CALL DORGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) + IF( KK.EQ.0 ) THEN + CALL DORGL2( M, N, K, A, LDA, TAU, WORK, IINFO ) + END IF * IF( KK.GT.0 ) THEN + I = KK + 1 + IB = NB +* +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) +* + CALL DLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) +* +* Apply H**T to A(i+ib:m,i:n) from the right +* + CALL DLARFB0C2(.TRUE., 'Right', 'Transpose', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) +* +* Apply H**T to columns i:n of current block + + CALL DORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), + $ WORK, IINFO ) * * Use blocked code * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.M ) THEN + DO I = KI + 1, 1, -NB + IB = NB * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . H(i+ib-1) +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - CALL DLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, - $ I ), - $ LDA, TAU( I ), WORK, LDWORK ) + CALL DLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) * -* Apply H**T to A(i+ib:m,i:n) from the right +* Apply H**T to A(i+ib:m,i:n) from the right * - CALL DLARFB( 'Right', 'Transpose', 'Forward', - $ 'Rowwise', - $ M-I-IB+1, N-I+1, IB, A( I, I ), LDA, WORK, - $ LDWORK, A( I+IB, I ), LDA, WORK( IB+1 ), - $ LDWORK ) - END IF + CALL DLARFB0C2(.FALSE., 'Right', 'Transpose', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) * * Apply H**T to columns i:n of current block -* + CALL DORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) + $ WORK, IINFO ) + END DO +* +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary +* + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 +* +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) +* + CALL DLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) * -* Set columns 1:i-1 of current block to zero +* Apply H**T to A(i+ib:m,i:n) from the right * - DO 40 J = 1, I - 1 - DO 30 L = I, I + IB - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + CALL DLARFB0C2(.FALSE., 'Right', 'Transpose', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) +* +* Apply H**T to columns i:n of current block + + CALL DORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), + $ WORK, IINFO ) + END IF END IF * WORK( 1 ) = IWS diff --git a/SRC/dorgql.f b/SRC/dorgql.f index c7bba63c2..378147844 100644 --- a/SRC/dorgql.f +++ b/SRC/dorgql.f @@ -5,6 +5,7 @@ * Online html documentation available at * http://www.netlib.org/lapack/explore-html/ * +*> \htmlonly *> Download DORGQL + dependencies *> *> [TGZ] @@ -12,6 +13,7 @@ *> [ZIP] *> *> [TXT] +*> \endhtmlonly * * Definition: * =========== @@ -95,8 +97,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -137,17 +137,12 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - DOUBLE PRECISION ZERO - PARAMETER ( ZERO = 0.0D+0 ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT, - $ NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KK, LWKOPT, NB, NBMIN * .. * .. External Subroutines .. - EXTERNAL DLARFB, DLARFT, DORG2L, XERBLA + EXTERNAL DLARFB0C2, DLARFT, DORG2L, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -177,7 +172,8 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) LWKOPT = 1 ELSE NB = ILAENV( 1, 'DORGQL', ' ', M, N, K, -1 ) - LWKOPT = N*NB + ! Only need a workspace for calls to dorg2l + LWKOPT = N END IF WORK( 1 ) = LWKOPT * @@ -200,88 +196,77 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * NBMIN = 2 - NX = 0 IWS = N - IF( NB.GT.1 .AND. NB.LT.K ) THEN -* -* Determine when to cross over from blocked to unblocked code. * - NX = MAX( 0, ILAENV( 3, 'DORGQL', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* We want to use the blocking method as long as our matrix is big enough * - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KK = K + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Possibly bail to the unblocked code. * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'DORGQL', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL DORG2L( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN +* Use our blocked code for everything * -* Use blocked code after the first block. -* The last kk columns are handled by the block method. + IF( KK.GT.0 ) THEN * - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) +* Factor the first block assuming that our first application +* will be on the Identity matrix * -* Set A(m-kk+1:m,1:n-kk) to zero. + I = 1 + IB = NB * - DO 20 J = 1, N - KK - DO 10 I = M - KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * -* Use unblocked code for the first or only block. + CALL DLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA) * - CALL DORG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) +* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left * - IF( KK.GT.0 ) THEN + CALL DLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward', + $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), + $ LDA, A( M-K+I, N-K+I ), LDA, A, LDA) +* +* Apply H to rows 1:m-k+i+ib-1 of current block * -* Use blocked code + CALL DORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA, + $ TAU( I ), WORK, IINFO ) + +* Use blocked code on the remaining blocks if there are any. * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) - IF( N-K+I.GT.1 ) THEN + DO I = NB+1, K, NB * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) +* The last block may be less than size NB * - CALL DLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, - $ A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK ) + IB = MIN(NB, K-I+1) * -* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - CALL DLARFB( 'Left', 'No transpose', 'Backward', - $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, - $ A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA, - $ WORK( IB+1 ), LDWORK ) - END IF + CALL DLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA ) +* +* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left +* + CALL DLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, + $ A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA) * * Apply H to rows 1:m-k+i+ib-1 of current block * CALL DORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA, $ TAU( I ), WORK, IINFO ) -* -* Set rows m-k+i+ib:m of current block to zero -* - DO 40 J = N - K + I, N - K + I + IB - 1 - DO 30 L = M - K + I + IB, M - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + END DO END IF * WORK( 1 ) = IWS diff --git a/SRC/dorgqr.f b/SRC/dorgqr.f index 83e26588e..8f3c44278 100644 --- a/SRC/dorgqr.f +++ b/SRC/dorgqr.f @@ -5,6 +5,7 @@ * Online html documentation available at * http://www.netlib.org/lapack/explore-html/ * +*> \htmlonly *> Download DORGQR + dependencies *> *> [TGZ] @@ -12,6 +13,7 @@ *> [ZIP] *> *> [TXT] +*> \endhtmlonly * * Definition: * =========== @@ -95,8 +97,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -119,7 +119,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup ungqr +*> \ingroup doubleOTHERcomputational * * ===================================================================== SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) @@ -137,17 +137,13 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - DOUBLE PRECISION ZERO - PARAMETER ( ZERO = 0.0D+0 ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK, - $ LWKOPT, NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KI, KK, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL DLARFB, DLARFT, DORG2R, XERBLA + EXTERNAL DLARFB0C2, DLARFT, DORG2R, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -162,7 +158,9 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * INFO = 0 NB = ILAENV( 1, 'DORGQR', ' ', M, N, K, -1 ) - LWKOPT = MAX( 1, N )*NB + ! Only need a workspace for dorg2r in case of bailout and + ! for the panel factorization + LWKOPT = MAX( 1, N ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN @@ -191,92 +189,93 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * NBMIN = 2 - NX = 0 + NX = MAX(0, ILAENV(3, 'SORGQR', ' ', M, N, K, -1)) IWS = N - IF( NB.GT.1 .AND. NB.LT.K ) THEN -* -* Determine when to cross over from blocked to unblocked code. * - NX = MAX( 0, ILAENV( 3, 'DORGQR', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* Handle the first block assuming we are applying to the +* identity, then resume regular blocking method after * - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KI = K - 2 * NB + KK = K - NB + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Potentially bail to the unblocked code. * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'DORGQR', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL DORG2R( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN + I = KK + 1 + IB = NB * -* Use blocked code after the last block. -* The first kk columns are handled by the block method. +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) + CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * -* Set A(1:kk,kk+1:n) to zero. +* Apply H to A(i:m,i+ib:n) from the left * - DO 20 J = KK + 1, N - DO 10 I = 1, KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL DLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward', + $ 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I), + $ LDA, A(I,I+IB), LDA) * -* Use unblocked code for the last or only block. +* Apply H to rows i:m of current block * - IF( KK.LT.N ) - $ CALL DORG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) + CALL DORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, IINFO) + DO I = KI + 1, 1, -NB + IB = NB * - IF( KK.GT.0 ) THEN +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * -* Use blocked code + CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.N ) THEN +* Apply H to A(i:m,i+ib:n) from the left * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . H(i+ib-1) + CALL DLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) + * - CALL DLARFT( 'Forward', 'Columnwise', M-I+1, IB, - $ A( I, I ), LDA, TAU( I ), WORK, LDWORK ) +* Apply H to rows i:m of current block * -* Apply H to A(i:m,i+ib:n) from the left + CALL DORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, + $ IINFO) + END DO * - CALL DLARFB( 'Left', 'No transpose', 'Forward', - $ 'Columnwise', M-I+1, N-I-IB+1, IB, - $ A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ), - $ LDA, WORK( IB+1 ), LDWORK ) - END IF +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary * -* Apply H to rows i:m of current block + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 * - CALL DORG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * -* Set rows 1:i-1 of current block to zero + CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * - DO 40 J = I, I + IB - 1 - DO 30 L = 1, I - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE +* Apply H to A(i:m,i+ib:n) from the left +* + CALL DLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) + +* +* Apply H to rows i:m of current block +* + CALL DORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, + $ IINFO) + END IF END IF * WORK( 1 ) = IWS diff --git a/SRC/dorgrq.f b/SRC/dorgrq.f index 126489bbe..811c2cdb5 100644 --- a/SRC/dorgrq.f +++ b/SRC/dorgrq.f @@ -5,6 +5,7 @@ * Online html documentation available at * http://www.netlib.org/lapack/explore-html/ * +*> \htmlonly *> Download DORGRQ + dependencies *> *> [TGZ] @@ -12,6 +13,7 @@ *> [ZIP] *> *> [TXT] +*> \endhtmlonly * * Definition: * =========== @@ -137,17 +139,13 @@ SUBROUTINE DORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - DOUBLE PRECISION ZERO - PARAMETER ( ZERO = 0.0D+0 ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, II, IINFO, IWS, J, KK, L, LDWORK, + INTEGER I, IB, II, IINFO, IWS, KK, LDWORK, $ LWKOPT, NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL DLARFB, DLARFT, DORGR2, XERBLA + EXTERNAL DLARFB0C2, DLARFT, DORGR2, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -227,63 +225,70 @@ SUBROUTINE DORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Use blocked code after the first block. -* The last kk rows are handled by the block method. -* - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) +* We want to use the blocking method as long as our matrix is big enough +* and it's deemed worthwhile with the extra memory allocations * -* Set A(1:m-kk,n-kk+1:n) to zero. -* - DO 20 J = N - KK + 1, N - DO 10 I = 1, M - KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE + KK = K ELSE KK = 0 END IF * -* Use unblocked code for the first or only block. +* Potentially bail to the unblocked code * - CALL DORGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) + IF( KK.EQ.0 ) THEN + CALL DORGR2( M, N, K, A, LDA, TAU, WORK, IINFO ) + END IF * IF( KK.GT.0 ) THEN * -* Use blocked code +* Factor the first block assuming that our first application +* will be on the Identity matrix * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) - II = M - K + I - IF( II.GT.1 ) THEN + I = 1 + IB = NB + II = M - K + I * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - CALL DLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, - $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) + CALL DLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) * -* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right +* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right * - CALL DLARFB( 'Right', 'Transpose', 'Backward', - $ 'Rowwise', - $ II-1, N-K+I+IB-1, IB, A( II, 1 ), LDA, WORK, - $ LDWORK, A, LDA, WORK( IB+1 ), LDWORK ) - END IF + CALL DLARFB0C2(.TRUE., 'Right', 'Transpose', 'Backward', + $ 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK, + $ LDWORK, A, LDA) * * Apply H**T to columns 1:n-k+i+ib-1 of current block * - CALL DORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, - $ TAU( I ), - $ WORK, IINFO ) + CALL DORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, + $ TAU( I ), WORK, IINFO ) + + DO I = NB + 1, K, NB +* +* The last block may be less than size NB * -* Set columns n-k+i+ib:n of current block to zero + IB = MIN(NB, K-I+1) + II = M - K + I +* +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) +* + CALL DLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) +* +* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right * - DO 40 L = N - K + I + IB, N - DO 30 J = II, II + IB - 1 - A( J, L ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + CALL DLARFB0C2(.FALSE., 'Right', 'Transpose', + $ 'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, + $ A(II,1), LDA, WORK, LDWORK, A, LDA) +* +* Apply H**T to columns 1:n-k+i+ib-1 of current block +* + CALL DORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, + $ TAU( I ), WORK, IINFO ) + END DO END IF * WORK( 1 ) = IWS diff --git a/SRC/lapack_64.h b/SRC/lapack_64.h index e8000bf2c..e8d792094 100644 --- a/SRC/lapack_64.h +++ b/SRC/lapack_64.h @@ -331,6 +331,7 @@ #define CLARF1F CLARF1F_64 #define CLARF1L CLARF1L_64 #define CLARFB CLARFB_64 +#define CLARFB0C2 CLARFB0C2_64 #define CLARFB_GETT CLARFB_GETT_64 #define CLARFG CLARFG_64 #define CLARFGP CLARFGP_64 @@ -805,6 +806,7 @@ #define DLARF1F DLARF1F_64 #define DLARF1L DLARF1L_64 #define DLARFB DLARFB_64 +#define DLARFB0C2 DLARFB0C2_64 #define DLARFB_GETT DLARFB_GETT_64 #define DLARFG DLARFG_64 #define DLARFGP DLARFGP_64 @@ -1400,6 +1402,7 @@ #define SLARF1F SLARF1F_64 #define SLARF1L SLARF1L_64 #define SLARFB SLARFB_64 +#define SLARFB0C2 SLARFB0C2_64 #define SLARFB_GETT SLARFB_GETT_64 #define SLARFG SLARFG_64 #define SLARFGP SLARFGP_64 @@ -2046,6 +2049,7 @@ #define ZLARF1F ZLARF1F_64 #define ZLARF1L ZLARF1L_64 #define ZLARFB ZLARFB_64 +#define ZLARFB0C2 ZLARFB0C2_64 #define ZLARFB_GETT ZLARFB_GETT_64 #define ZLARFG ZLARFG_64 #define ZLARFGP ZLARFGP_64 diff --git a/SRC/slarfb0c2.f b/SRC/slarfb0c2.f new file mode 100644 index 000000000..e3c222904 --- /dev/null +++ b/SRC/slarfb0c2.f @@ -0,0 +1,556 @@ +*> \brief \b SLARFB0C2 applies a block reflector or its transpose to a +* rectangular matrix with a 0 block while constructing the explicit Q factor +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* +* Definition: +* =========== +* +* SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, +* $ K, V, LDV, T, LDT, C, LDC) +* ! Scalar arguments +* INTEGER M, N, K, LDV, LDC, LDT +* CHARACTER SIDE, TRANS, DIRECT, STOREV +* ! True means that we are assuming C2 is the identity matrix +* ! and thus don't reference whatever is present in C2 +* ! at the beginning. +* LOGICAL C2I +* ! Array arguments +* REAL V(LDV,*), C(LDC,*), T(LDT,*) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLARFB0C2 applies a real block reflector H or its transpose H**T to a +*> real m by n matrix C with a 0 block, while computing the explicit Q factor +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] C2I +*> \verbatim +*> C2I is LOGICAL +*> = .TRUE.: Assume the nonzero block of C is the identity matrix +*> = .FALSE.: Use existing data in the nonzero block of C +*> \endverbatim +*> +*> \param[in] SIDE +*> \verbatim +*> SIDE is CHARACTER*1 +*> = 'L': apply H or H**T from the Left +*> = 'R': apply H or H**T from the Right +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': apply H (No transpose) +*> = 'T': apply H**T (Transpose) +*> \endverbatim +*> +*> \param[in] DIRECT +*> \verbatim +*> DIRECT is CHARACTER*1 +*> Indicates how H is formed from a product of elementary +*> reflectors +*> = 'F': H = H(1) H(2) . . . H(k) (Forward) +*> = 'B': H = H(k) . . . H(2) H(1) (Backward) +*> \endverbatim +*> +*> \param[in] STOREV +*> \verbatim +*> STOREV is CHARACTER*1 +*> Indicates how the vectors which define the elementary +*> reflectors are stored: +*> = 'C': Columnwise +*> = 'R': Rowwise +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix C. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix C. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The order of the matrix T (= the number of elementary +*> reflectors whose product defines the block reflector). +*> If SIDE = 'L', M >= K >= 0; +*> if SIDE = 'R', N >= K >= 0. +*> \endverbatim +*> +*> \param[in] V +*> \verbatim +*> V is REAL array, dimension +*> (LDV,K) if STOREV = 'C' +*> (LDV,M) if STOREV = 'R' and SIDE = 'L' +*> (LDV,N) if STOREV = 'R' and SIDE = 'R' +*> The matrix V. See Further Details. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V. +*> If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M); +*> if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N); +*> if STOREV = 'R', LDV >= K. +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is REAL array, dimension (LDT,K) +*> The triangular k by k matrix T in the representation of the +*> block reflector. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (LDC,N) +*> On entry, the m by n matrix C. +*> On exit, C is overwritten by H*C or H**T*C or C*H or C*H**T. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup larfb +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The shape of the matrix V and the storage of the vectors which define +*> the H(i) is best illustrated by the following example with n = 5 and +*> k = 3. The triangular part of V (including its diagonal) is not +*> referenced. +*> +*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': +*> +*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 ) +*> ( v1 1 ) ( 1 v2 v2 v2 ) +*> ( v1 v2 1 ) ( 1 v3 v3 ) +*> ( v1 v2 v3 ) +*> ( v1 v2 v3 ) +*> +*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': +*> +*> V = ( v1 v2 v3 ) V = ( v1 v1 1 ) +*> ( v1 v2 v3 ) ( v2 v2 v2 1 ) +*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 ) +*> ( 1 v3 ) +*> ( 1 ) +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, + $ K, V, LDV, T, LDT, C, LDC) + ! Scalar arguments + INTEGER M, N, K, LDV, LDC, LDT + CHARACTER SIDE, TRANS, DIRECT, STOREV + ! True means that we are assuming C2 is the identity matrix + ! and thus don't reference whatever is present in C2 + ! at the beginning. + LOGICAL C2I + + ! Array arguments + REAL V(LDV,*), C(LDC,*), T(LDT,*) + ! Local scalars + LOGICAL QR, LQ, QL, DIRF, COLV, SIDEL, SIDER, + $ TRANST + INTEGER I, J + ! External functions + LOGICAL LSAME + EXTERNAL LSAME + ! External Subroutines + EXTERNAL SGEMM, STRMM, XERBLA + ! Parameters + REAL ONE, ZERO, NEG_ONE + PARAMETER(ONE=1.0E+0, ZERO = 0.0E+0, NEG_ONE = -1.0E+0) + + ! Beginning of executable statements + ! Convert our character flags to logical values + DIRF = LSAME(DIRECT,'F') + COLV = LSAME(STOREV,'C') + SIDEL = LSAME(SIDE,'L') + SIDER = LSAME(SIDE,'R') + TRANST = LSAME(TRANS,'T') + + ! Determine which of the 4 modes are using. + ! QR is when we store the reflectors column by column and have the + ! 'first' reflector stored in the first column + QR = DIRF.AND.COLV + + ! LQ is when we store the reflectors row by row and have the + ! 'first' reflector stored in the first row + LQ = DIRF.AND.(.NOT.COLV) + + ! QL is when we store the reflectors column by column and have the + ! 'first' reflector stored in the last column + QL = (.NOT.DIRF).AND.COLV + + ! RQ is when we store the reflectors row by row and have the + ! 'first' reflector stored in the last row + ! RQ = (.NOT.DIRF).AND.(.NOT.COLV) + ! Since we have exactly one of these 4 modes, we don't need to actually + ! store the value of RQ, instead we assume this is the case if we fail + ! the above 3 checks. + + IF (QR) THEN + ! We are computing C = HC = (I - VTV')C + ! Where: V = [ V1 ] and C = [ C1 ] + ! [ V2 ] [ C2 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{M-K\times K} + ! C1=0\in\R^{K\times N} + ! C2\in\R^{M-K\times N} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0 ] + ! [ V2 ] [ C2 ] + ! = [ 0 ] - [ V1 ]T*V2'*C2 + ! [ C2 ] [ V2 ] + ! + ! = [ 0 ] - [ V1*T*V2'*C2 ] + ! [ C2 ] [ V2*T*V2'*C2 ] + ! + ! = [ V1*T*V2'*C2 ] + ! [ C2 - V2*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('SLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('SLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = V2'*C2 + ! + IF (C2I) THEN + DO J = 1, N + DO I = 1, K + C(I,J) = V(K+J,I) + END DO + END DO + ELSE + CALL SGEMM('Transpose', 'No Transpose', K, N, M - K, + $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO, + $ C, LDC) + END IF + ! + ! C1 = T*C1 + ! + CALL STRMM('Left', 'Upper', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C, LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! + IF (C2I) THEN + CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO, + $ C(K+1,1), LDC) + DO I = 1, N + C(K+I,I) = C(K+I,I) + ONE + END DO + ELSE + CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE, + $ C(K+1,1), LDC) + END IF + ! + ! C1 = -V1*C1 + ! + CALL STRMM('Left', 'Lower', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V, LDV, C, LDC) + ELSE IF (LQ) THEN + ! We are computing C = CH' = C(I-V'T'V) + ! Where: V = [ V1 V2 ] and C = [ C1 C2 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{K\times N-K} + ! C1=0\in\R^{M\times K} + ! C2\in\R^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ]) + ! [ V2' ] + ! + ! = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ] + ! [ V2' ] + ! + ! = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ] + ! + ! = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*T' + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('SLARFB0C2', 2) + RETURN + ELSE IF(.NOT.TRANST) THEN + CALL XERBLA('SLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = C2*V2' + ! + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,J) = V(J,K+I) + END DO + END DO + ELSE + CALL SGEMM('No Transpose', 'Transpose', M, K, N-K, + $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C, + $ LDC) + END IF + ! + ! C1 = C1*T' + ! + CALL STRMM('Right', 'Upper', 'Transpose', 'Non-unit', + $ M, K, ONE, T, LDT, C, LDC) + ! + ! C2 = C2 - C1*V2 = -C1*V2 + C2 + ! + IF( C2I ) THEN + CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1), + $ LDC) + DO I = 1, M + C(I,K+I) = C(I,K+I) + ONE + END DO + ELSE + CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1), + $ LDC) + END IF + ! + ! C1 = -C1*V1 + ! + CALL STRMM('Right', 'Upper', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V, LDV, C, LDC) + ELSE IF (QL) THEN + ! We are computing C = HC = (I - VTV')C + ! Where: V = [ V2 ] and C = [ C2 ] + ! [ V1 ] [ C1 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{M-K\times K} + ! C1=0\in\R^{K\times N} + ! C2\in\R^{M-K\times N} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ] + ! [ V1 ] [ 0 ] + ! + ! = [ C2 ] - [ V2 ]T*V2'*C2 + ! [ 0 ] [ V1 ] + ! + ! = [ C2 ] - [ V2*T*V2'*C2 ] + ! [ 0 ] [ V1*T*V2'*C2 ] + ! + ! = [ C2 - V2*T*V2'*C2 ] + ! [ - V1*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('SLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('SLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = V2'*C2 + ! + IF( C2I ) THEN + DO J = 1, N + DO I = 1, K + C(M-K+I,J) = V(J,I) + END DO + END DO + ELSE + CALL SGEMM('Transpose', 'No Transpose', K, N, M-K, + $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC) + END IF + ! + ! C1 = T*C1 + ! + CALL STRMM('Left', 'Lower', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C(M-K+1,1), LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! + IF( C2I ) THEN + CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC) + DO I = 1, N + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC) + END IF + ! + ! C1 = -V1*C1 + ! + CALL STRMM('Left', 'Upper', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC) + ELSE ! IF (RQ) THEN + ! We are computing C = CH' = C(I-V'T'V) + ! Where: V = [ V2 V1] and C = [ C2 C1 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{K\times N-K} + ! C1=0\in\R^{M\times K} + ! C2\in\R^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ] + ! + ! = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ] + ! + ! = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*T' + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('SLARFB0C2', 2) + RETURN + ELSE IF(.NOT.TRANST) THEN + CALL XERBLA('SLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = C2*V2' + ! + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,N-K+J) = V(J,I) + END DO + END DO + ELSE + CALL SGEMM('No Transpose', 'Transpose', M, K, N-K, + $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC) + END IF + ! + ! C1 = C1*T' + ! + CALL STRMM('Right', 'Lower', 'Transpose', 'Non-unit', + $ M, K, ONE, T, LDT, C(1, N-K+1), LDC) + ! + ! C2 = C2 - C1*V2 = -C1*V2 + C2 + ! + IF( C2I ) THEN + CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC) + DO I = 1, M + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC) + END IF + ! + ! C1 = -C1*V1 + ! + CALL STRMM('Right', 'Lower', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC) + END IF + END SUBROUTINE diff --git a/SRC/sorglq.f b/SRC/sorglq.f index fb5ef79cc..574b3ae2c 100644 --- a/SRC/sorglq.f +++ b/SRC/sorglq.f @@ -146,7 +146,7 @@ SUBROUTINE SORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) $ LWKOPT, NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL SLARFB, SLARFT, SORGL2, XERBLA + EXTERNAL SLARFB0C2, SLARFT, SORGL2, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -218,67 +218,90 @@ SUBROUTINE SORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Use blocked code after the last block. -* The first kk rows are handled by the block method. +* Handle the first block assuming we are applying to the +* identity, then resume regular blocking method after * - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) -* -* Set A(kk+1:m,1:kk) to zero. -* - DO 20 J = 1, KK - DO 10 I = KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE + KI = K - 2 * NB + KK = K - NB ELSE KK = 0 END IF * -* Use unblocked code for the last or only block. +* Potentially bail to the unblocked version * - IF( KK.LT.M ) - $ CALL SORGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) + IF( KK.EQ.0 ) THEN + CALL SORGL2( M, N, K, A, LDA, TAU, WORK, IINFO ) + END IF * IF( KK.GT.0 ) THEN + I = KK + 1 + IB = NB +* +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) +* + CALL SLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) +* +* Apply H**T to A(i+ib:m,i:n) from the right +* + CALL SLARFB0C2(.TRUE., 'Right', 'Transpose', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) +* +* Apply H**T to columns i:n of current block + + CALL SORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), + $ WORK, IINFO ) * * Use blocked code * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.M ) THEN + DO I = KI + 1, 1, -NB + IB = NB * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . H(i+ib-1) +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - CALL SLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, - $ I ), - $ LDA, TAU( I ), WORK, LDWORK ) + CALL SLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) * -* Apply H**T to A(i+ib:m,i:n) from the right +* Apply H**T to A(i+ib:m,i:n) from the right * - CALL SLARFB( 'Right', 'Transpose', 'Forward', - $ 'Rowwise', - $ M-I-IB+1, N-I+1, IB, A( I, I ), LDA, WORK, - $ LDWORK, A( I+IB, I ), LDA, WORK( IB+1 ), - $ LDWORK ) - END IF + CALL SLARFB0C2(.FALSE., 'Right', 'Transpose', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) * * Apply H**T to columns i:n of current block -* + CALL SORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) + $ WORK, IINFO ) + END DO +* +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary +* + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 +* +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) +* + CALL SLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) * -* Set columns 1:i-1 of current block to zero +* Apply H**T to A(i+ib:m,i:n) from the right * - DO 40 J = 1, I - 1 - DO 30 L = I, I + IB - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + CALL SLARFB0C2(.FALSE., 'Right', 'Transpose', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) +* +* Apply H**T to columns i:n of current block + + CALL SORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), + $ WORK, IINFO ) + END IF END IF * WORK( 1 ) = SROUNDUP_LWORK(IWS) diff --git a/SRC/sorgql.f b/SRC/sorgql.f index 7257ff2d9..c9b2918b7 100644 --- a/SRC/sorgql.f +++ b/SRC/sorgql.f @@ -95,8 +95,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -143,11 +141,11 @@ SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT, - $ NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, J, KK, LWKOPT, NB, + $ NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL SLARFB, SLARFT, SORG2L, XERBLA + EXTERNAL SLARFB0C2, SLARFT, SORG2L, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -178,7 +176,8 @@ SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) LWKOPT = 1 ELSE NB = ILAENV( 1, 'SORGQL', ' ', M, N, K, -1 ) - LWKOPT = N*NB + ! Only need a workspace for calls to dorg2l + LWKOPT = N END IF WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * @@ -201,88 +200,76 @@ SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * NBMIN = 2 - NX = 0 + NX = MAX(0, ILAENV(3, 'SORGQL', ' ', M, N, K, -1)) IWS = N - IF( NB.GT.1 .AND. NB.LT.K ) THEN * -* Determine when to cross over from blocked to unblocked code. -* - NX = MAX( 0, ILAENV( 3, 'SORGQL', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* We want to use the blocking method as long as our matrix is big enough * - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KK = K + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Possibly bail to the unblocked code. * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'SORGQL', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL SORG2L( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN * -* Use blocked code after the first block. -* The last kk columns are handled by the block method. +* Factor the first block assuming that our first application +* will be on the Identity matrix * - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) + I = 1 + IB = NB * -* Set A(m-kk+1:m,1:n-kk) to zero. +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - DO 20 J = 1, N - KK - DO 10 I = M - KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL SLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA) * -* Use unblocked code for the first or only block. +* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left * - CALL SORG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) + CALL SLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward', + $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), + $ LDA, A( M-K+I, N-K+I ), LDA, A, LDA) * - IF( KK.GT.0 ) THEN +* Apply H to rows 1:m-k+i+ib-1 of current block * -* Use blocked code + CALL SORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA, + $ TAU( I ), WORK, IINFO ) + +* Use blocked code on the remaining blocks if there are any. * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) - IF( N-K+I.GT.1 ) THEN + DO I = NB+1, K, NB * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) +* The last block may be less than size NB * - CALL SLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, - $ A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK ) + IB = MIN(NB, K-I+1) * -* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - CALL SLARFB( 'Left', 'No transpose', 'Backward', - $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, - $ A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA, - $ WORK( IB+1 ), LDWORK ) - END IF + CALL SLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA ) +* +* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left +* + CALL SLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, + $ A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA) * * Apply H to rows 1:m-k+i+ib-1 of current block * CALL SORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA, $ TAU( I ), WORK, IINFO ) -* -* Set rows m-k+i+ib:m of current block to zero -* - DO 40 J = N - K + I, N - K + I + IB - 1 - DO 30 L = M - K + I + IB, M - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + END DO END IF * WORK( 1 ) = SROUNDUP_LWORK(IWS) diff --git a/SRC/sorgqr.f b/SRC/sorgqr.f index 47d1fd248..070257c46 100644 --- a/SRC/sorgqr.f +++ b/SRC/sorgqr.f @@ -95,8 +95,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -147,7 +145,7 @@ SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) $ LWKOPT, NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL SLARFB, SLARFT, SORG2R, XERBLA + EXTERNAL SLARFB0C2, SLARFT, SORG2R, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -163,7 +161,7 @@ SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * INFO = 0 NB = ILAENV( 1, 'SORGQR', ' ', M, N, K, -1 ) - LWKOPT = MAX( 1, N )*NB + LWKOPT = MAX( 1, N ) WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN @@ -194,92 +192,92 @@ SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) NBMIN = 2 NX = 0 IWS = N - IF( NB.GT.1 .AND. NB.LT.K ) THEN * -* Determine when to cross over from blocked to unblocked code. + IF( NB.GE.NBMIN .AND. NB.LT.K ) THEN * - NX = MAX( 0, ILAENV( 3, 'SORGQR', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN +* Handle the first block assuming we are applying to the +* identity, then resume regular blocking method after * -* Determine if workspace is large enough for blocked code. -* - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KI = K - 2 * NB + KK = K - NB + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Potentially bail to the unblocked code. * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'SORGQR', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL SORG2R( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN + I = KK + 1 + IB = NB * -* Use blocked code after the last block. -* The first kk columns are handled by the block method. +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) + CALL SLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * -* Set A(1:kk,kk+1:n) to zero. +* Apply H to A(i:m,i+ib:n) from the left * - DO 20 J = KK + 1, N - DO 10 I = 1, KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL SLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward', + $ 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I), + $ LDA, A(I,I+IB), LDA) * -* Use unblocked code for the last or only block. +* Apply H to rows i:m of current block * - IF( KK.LT.N ) - $ CALL SORG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) + CALL SORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, IINFO) + DO I = KI + 1, 1, -NB + IB = NB * - IF( KK.GT.0 ) THEN +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * -* Use blocked code + CALL SLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.N ) THEN +* Apply H to A(i:m,i+ib:n) from the left * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . H(i+ib-1) + CALL SLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) + * - CALL SLARFT( 'Forward', 'Columnwise', M-I+1, IB, - $ A( I, I ), LDA, TAU( I ), WORK, LDWORK ) +* Apply H to rows i:m of current block * -* Apply H to A(i:m,i+ib:n) from the left + CALL SORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, + $ IINFO) + END DO * - CALL SLARFB( 'Left', 'No transpose', 'Forward', - $ 'Columnwise', M-I+1, N-I-IB+1, IB, - $ A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ), - $ LDA, WORK( IB+1 ), LDWORK ) - END IF +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary * -* Apply H to rows i:m of current block + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 * - CALL SORG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * -* Set rows 1:i-1 of current block to zero + CALL SLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * - DO 40 J = I, I + IB - 1 - DO 30 L = 1, I - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE - END IF +* Apply H to A(i:m,i+ib:n) from the left +* + CALL SLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) + * +* Apply H to rows i:m of current block +* + CALL SORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, + $ IINFO) + END IF + END IF WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * diff --git a/SRC/sorgrq.f b/SRC/sorgrq.f index 5fe685c1d..ea1a4e73d 100644 --- a/SRC/sorgrq.f +++ b/SRC/sorgrq.f @@ -147,7 +147,7 @@ SUBROUTINE SORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) $ LWKOPT, NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL SLARFB, SLARFT, SORGR2, XERBLA + EXTERNAL SLARFB0C2, SLARFT, SORGR2, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -228,63 +228,70 @@ SUBROUTINE SORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Use blocked code after the first block. -* The last kk rows are handled by the block method. +* We want to use the blocking method as long as our matrix is big enough +* and it's deemed worthwhile with the extra memory allocations * - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) -* -* Set A(1:m-kk,n-kk+1:n) to zero. -* - DO 20 J = N - KK + 1, N - DO 10 I = 1, M - KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE + KK = K ELSE KK = 0 END IF * -* Use unblocked code for the first or only block. +* Potentially bail to the unblocked code * - CALL SORGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) + IF( KK.EQ.0 ) THEN + CALL SORGR2( M, N, K, A, LDA, TAU, WORK, IINFO ) + END IF * IF( KK.GT.0 ) THEN * -* Use blocked code +* Factor the first block assuming that our first application +* will be on the Identity matrix * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) - II = M - K + I - IF( II.GT.1 ) THEN + I = 1 + IB = NB + II = M - K + I * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - CALL SLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, - $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) + CALL SLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) * -* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right +* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right * - CALL SLARFB( 'Right', 'Transpose', 'Backward', - $ 'Rowwise', - $ II-1, N-K+I+IB-1, IB, A( II, 1 ), LDA, WORK, - $ LDWORK, A, LDA, WORK( IB+1 ), LDWORK ) - END IF + CALL SLARFB0C2(.TRUE., 'Right', 'Transpose', 'Backward', + $ 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK, + $ LDWORK, A, LDA) * * Apply H**T to columns 1:n-k+i+ib-1 of current block * - CALL SORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, - $ TAU( I ), - $ WORK, IINFO ) + CALL SORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, + $ TAU( I ), WORK, IINFO ) + + DO I = NB + 1, K, NB * -* Set columns n-k+i+ib:n of current block to zero +* The last block may be less than size NB +* + IB = MIN(NB, K-I+1) + II = M - K + I * - DO 40 L = N - K + I + IB, N - DO 30 J = II, II + IB - 1 - A( J, L ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) +* + CALL SLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) +* +* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right +* + CALL SLARFB0C2(.FALSE., 'Right', 'Transpose', + $ 'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, + $ A(II,1), LDA, WORK, LDWORK, A, LDA) +* +* Apply H**T to columns 1:n-k+i+ib-1 of current block +* + CALL SORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, + $ TAU( I ), WORK, IINFO ) + END DO END IF * WORK( 1 ) = SROUNDUP_LWORK(IWS) diff --git a/SRC/zlarfb0c2.f b/SRC/zlarfb0c2.f new file mode 100644 index 000000000..1f43b4b14 --- /dev/null +++ b/SRC/zlarfb0c2.f @@ -0,0 +1,560 @@ +*> \brief \b ZLARFB0C2 applies a block reflector or its conjugate-transpose +* to a rectangular matrix with a 0 block while constructing the explicit Q +* factor +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* +* Definition: +* =========== +* +* SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, +* $ K, V, LDV, T, LDT, C, LDC) +* ! Scalar arguments +* INTEGER M, N, K, LDV, LDC, LDT +* CHARACTER SIDE, TRANS, DIRECT, STOREV +* ! True means that we are assuming C2 is the identity matrix +* ! and thus don't reference whatever is present in C2 +* ! at the beginning. +* LOGICAL C2I +* ! Array arguments +* COMPLEX*16 V(LDV,*), C(LDC,*), T(LDT,*) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLARFB0C2 applies a real block reflector H or its transpose H**H to a +*> complex m by n matrix C with a 0 block, while computing the explicit Q factor +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] C2I +*> \verbatim +*> C2I is LOGICAL +*> = .TRUE.: Assume the nonzero block of C is the identity matrix +*> = .FALSE.: Use existing data in the nonzero block of C +*> \endverbatim +*> +*> \param[in] SIDE +*> \verbatim +*> SIDE is CHARACTER*1 +*> = 'L': apply H or H**H from the Left +*> = 'R': apply H or H**H from the Right +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': apply H (No transpose) +*> = 'C': apply H**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] DIRECT +*> \verbatim +*> DIRECT is CHARACTER*1 +*> Indicates how H is formed from a product of elementary +*> reflectors +*> = 'F': H = H(1) H(2) . . . H(k) (Forward) +*> = 'B': H = H(k) . . . H(2) H(1) (Backward) +*> \endverbatim +*> +*> \param[in] STOREV +*> \verbatim +*> STOREV is CHARACTER*1 +*> Indicates how the vectors which define the elementary +*> reflectors are stored: +*> = 'C': Columnwise +*> = 'R': Rowwise +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix C. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix C. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The order of the matrix T (= the number of elementary +*> reflectors whose product defines the block reflector). +*> If SIDE = 'L', M >= K >= 0; +*> if SIDE = 'R', N >= K >= 0. +*> \endverbatim +*> +*> \param[in] V +*> \verbatim +*> V is COMPLEX*16 array, dimension +*> (LDV,K) if STOREV = 'C' +*> (LDV,M) if STOREV = 'R' and SIDE = 'L' +*> (LDV,N) if STOREV = 'R' and SIDE = 'R' +*> See Further Details. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V. +*> If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M); +*> if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N); +*> if STOREV = 'R', LDV >= K. +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX*16 array, dimension (LDT,K) +*> The triangular K-by-K matrix T in the representation of the +*> block reflector. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is COMPLEX*16 array, dimension (LDC,N) +*> On entry, the M-by-N matrix C. +*> On exit, C is overwritten by H*C or H**H*C or C*H or C*H**H. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup larfb +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The shape of the matrix V and the storage of the vectors which define +*> the H(i) is best illustrated by the following example with n = 5 and +*> k = 3. The triangular part of V (including its diagonal) is not +*> referenced. +*> +*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': +*> +*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 ) +*> ( v1 1 ) ( 1 v2 v2 v2 ) +*> ( v1 v2 1 ) ( 1 v3 v3 ) +*> ( v1 v2 v3 ) +*> ( v1 v2 v3 ) +*> +*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': +*> +*> V = ( v1 v2 v3 ) V = ( v1 v1 1 ) +*> ( v1 v2 v3 ) ( v2 v2 v2 1 ) +*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 ) +*> ( 1 v3 ) +*> ( 1 ) +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, + $ K, V, LDV, T, LDT, C, LDC) + ! Scalar arguments + INTEGER M, N, K, LDV, LDC, LDT + CHARACTER SIDE, TRANS, DIRECT, STOREV + ! True means that we are assuming C2 is the identity matrix + ! and thus don't reference whatever is present in C2 + ! at the beginning. + LOGICAL C2I + ! Array arguments + COMPLEX*16 V(LDV,*), C(LDC,*), T(LDT,*) + ! Local scalars + LOGICAL QR, LQ, QL, DIRF, COLV, SIDEL, SIDER, + $ TRANST + INTEGER I, J + ! Intrinsic Functions + INTRINSIC CONJG + ! External functions + LOGICAL LSAME + EXTERNAL LSAME + ! External Subroutines + EXTERNAL ZGEMM, ZTRMM + ! Parameters + COMPLEX*16 ONE, ZERO, NEG_ONE + PARAMETER(ONE=(1.0D+0, 0.0D+0), + $ ZERO = (0.0D+0, 0.0D+0), + $ NEG_ONE = (-1.0D+0, 0.0D+0)) + + ! Beginning of executable statements + ! Convert our character flags to logical values + DIRF = LSAME(DIRECT,'F') + COLV = LSAME(STOREV,'C') + SIDEL = LSAME(SIDE,'L') + SIDER = LSAME(SIDE,'R') + TRANST = LSAME(TRANS,'C') + + ! Determine which of the 4 modes are using. + ! QR is when we store the reflectors column by column and have the + ! 'first' reflector stored in the first column + QR = DIRF.AND.COLV + + ! LQ is when we store the reflectors row by row and have the + ! 'first' reflector stored in the first row + LQ = DIRF.AND.(.NOT.COLV) + + ! QL is when we store the reflectors column by column and have the + ! 'first' reflector stored in the last column + QL = (.NOT.DIRF).AND.COLV + + ! RQ is when we store the reflectors row by row and have the + ! 'first' reflector stored in the last row + ! RQ = (.NOT.DIRF).AND.(.NOT.COLV) + ! Since we have exactly one of these 4 modes, we don't need to actually + ! store the value of RQ, instead we assume this is the case if we fail + ! the above 3 checks. + + IF (QR) THEN + ! We are computing C = HC = (I - VTV')C + ! Where: V = [ V1 ] and C = [ C1 ] + ! [ V2 ] [ C2 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{M-K\times K} + ! C1=0\in\R^{K\times N} + ! C2\in\R^{M-K\times N} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0 ] + ! [ V2 ] [ C2 ] + ! = [ 0 ] - [ V1 ]T*V2'*C2 + ! [ C2 ] [ V2 ] + ! + ! = [ 0 ] - [ V1*T*V2'*C2 ] + ! [ C2 ] [ V2*T*V2'*C2 ] + ! + ! = [ V1*T*V2'*C2 ] + ! [ C2 - V2*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('ZLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('ZLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = V2'*C2 + ! + IF (C2I) THEN + DO J = 1, N + DO I = 1, K + C(I,J) = CONJG(V(K+J,I)) + END DO + END DO + ELSE + CALL ZGEMM('Conjugate', 'No Transpose', K, N, M - K, + $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO, + $ C, LDC) + END IF + ! + ! C1 = T*C1 + ! + CALL ZTRMM('Left', 'Upper', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C, LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! + IF (C2I) THEN + CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO, + $ C(K+1,1), LDC) + DO I = 1, N + C(K+I,I) = C(K+I,I) + ONE + END DO + ELSE + CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE, + $ C(K+1,1), LDC) + END IF + ! + ! C1 = -V1*C1 + ! + CALL ZTRMM('Left', 'Lower', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V, LDV, C, LDC) + ELSE IF (LQ) THEN + ! We are computing C = CH' = C(I-V'T'V) + ! Where: V = [ V1 V2 ] and C = [ C1 C2 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{K\times N-K} + ! C1=0\in\R^{M\times K} + ! C2\in\R^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ]) + ! [ V2' ] + ! + ! = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ] + ! [ V2' ] + ! + ! = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ] + ! + ! = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*T' + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('ZLARFB0C2', 2) + RETURN + ELSE IF(.NOT.TRANST) THEN + CALL XERBLA('ZLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = C2*V2' + ! + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,J) = CONJG(V(J,K+I)) + END DO + END DO + ELSE + CALL ZGEMM('No Transpose', 'Conjugate', M, K, N-K, + $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C, + $ LDC) + END IF + ! + ! C1 = C1*T' + ! + CALL ZTRMM('Right', 'Upper', 'Conjugate', 'Non-unit', + $ M, K, ONE, T, LDT, C, LDC) + ! + ! C2 = C2 - C1*V2 = -C1*V2 + C2 + ! + IF( C2I ) THEN + CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1), + $ LDC) + DO I = 1, M + C(I,K+I) = C(I,K+I) + ONE + END DO + ELSE + CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1), + $ LDC) + END IF + ! + ! C1 = -C1*V1 + ! + CALL ZTRMM('Right', 'Upper', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V, LDV, C, LDC) + ELSE IF (QL) THEN + ! We are computing C = HC = (I - VTV')C + ! Where: V = [ V2 ] and C = [ C2 ] + ! [ V1 ] [ C1 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{M-K\times K} + ! C1=0\in\R^{K\times N} + ! C2\in\R^{M-K\times N} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ] + ! [ V1 ] [ 0 ] + ! + ! = [ C2 ] - [ V2 ]T*V2'*C2 + ! [ 0 ] [ V1 ] + ! + ! = [ C2 ] - [ V2*T*V2'*C2 ] + ! [ 0 ] [ V1*T*V2'*C2 ] + ! + ! = [ C2 - V2*T*V2'*C2 ] + ! [ - V1*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('ZLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('ZLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = V2'*C2 + ! + IF( C2I ) THEN + DO J = 1, N + DO I = 1, K + C(M-K+I,J) = CONJG(V(J,I)) + END DO + END DO + ELSE + CALL ZGEMM('Conjugate', 'No Transpose', K, N, M-K, + $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC) + END IF + ! + ! C1 = T*C1 + ! + CALL ZTRMM('Left', 'Lower', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C(M-K+1,1), LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! + IF( C2I ) THEN + CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC) + DO I = 1, N + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC) + END IF + ! + ! C1 = -V1*C1 + ! + CALL ZTRMM('Left', 'Upper', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC) + ELSE ! IF (RQ) THEN + ! We are computing C = CH' = C(I-V'T'V) + ! Where: V = [ V2 V1] and C = [ C2 C1 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{K\times N-K} + ! C1=0\in\R^{M\times K} + ! C2\in\R^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ] + ! + ! = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ] + ! + ! = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*T' + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('ZLARFB0C2', 2) + RETURN + ELSE IF(.NOT.TRANST) THEN + CALL XERBLA('ZLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = C2*V2' + ! + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,N-K+J) = CONJG(V(J,I)) + END DO + END DO + ELSE + CALL ZGEMM('No Transpose', 'Conjugate', M, K, N-K, + $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC) + END IF + ! + ! C1 = C1*T' + ! + CALL ZTRMM('Right', 'Lower', 'Conjugate', 'Non-unit', + $ M, K, ONE, T, LDT, C(1, N-K+1), LDC) + ! + ! C2 = C2 - C1*V2 = -C1*V2 + C2 + ! + IF( C2I ) THEN + CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC) + DO I = 1, M + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC) + END IF + ! + ! C1 = -C1*V1 + ! + CALL ZTRMM('Right', 'Lower', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC) + END IF + END SUBROUTINE diff --git a/SRC/zunglq.f b/SRC/zunglq.f index 3cc107560..54aaf7686 100644 --- a/SRC/zunglq.f +++ b/SRC/zunglq.f @@ -136,17 +136,13 @@ SUBROUTINE ZUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - COMPLEX*16 ZERO - PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK, - $ LWKOPT, NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KI, KK, LWKOPT, LDWORK, + $ NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL XERBLA, ZLARFB, ZLARFT, ZUNGL2 + EXTERNAL XERBLA, ZLARFB0C2, ZLARFT, ZUNGL2 * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -217,67 +213,95 @@ SUBROUTINE ZUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Use blocked code after the last block. -* The first kk rows are handled by the block method. -* - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) -* -* Set A(kk+1:m,1:kk) to zero. +* Treat the last NB block starting at KK+1 specially then use our blocking +* method from the block starting at KI+1 to 1 * - DO 20 J = 1, KK - DO 10 I = KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE + KI = K - 2 * NB + KK = K - NB ELSE KK = 0 END IF * -* Use unblocked code for the last or only block. +* Potentially bail to the unblocked version * - IF( KK.LT.M ) - $ CALL ZUNGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) + IF( KK.EQ.0 ) THEN + CALL ZUNGL2( M, N, K, A, LDA, TAU, WORK, IINFO ) + END IF * IF( KK.GT.0 ) THEN * -* Use blocked code +* Factor the last block assuming that our first application +* will be on the Identity matrix * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.M ) THEN + I = KK + 1 + IB = NB * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . H(i+ib-1) +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - CALL ZLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, - $ I ), - $ LDA, TAU( I ), WORK, LDWORK ) + CALL ZLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) * -* Apply H**H to A(i+ib:m,i:n) from the right +* Apply H**T to A(i+ib:m,i:n) from the right +* Exploit the fact that we are applying to an identity * - CALL ZLARFB( 'Right', 'Conjugate transpose', - $ 'Forward', - $ 'Rowwise', M-I-IB+1, N-I+1, IB, A( I, I ), - $ LDA, WORK, LDWORK, A( I+IB, I ), LDA, - $ WORK( IB+1 ), LDWORK ) - END IF + CALL ZLARFB0C2(.TRUE., 'Right', 'Conjugate', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) +* +* Apply H**T to columns i:n of current block + + CALL ZUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), + $ WORK, IINFO ) +* +* Use our standard blocking method after the last block +* + DO I = KI + 1, 1, -NB + IB = NB +* +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * -* Apply H**H to columns i:n of current block + CALL ZLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) * +* Apply H**T to A(i+ib:m,i:n) from the right +* + CALL ZLARFB0C2(.FALSE., 'Right', 'Conjugate', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) +* +* Apply H**T to columns i:n of current block + CALL ZUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) + $ WORK, IINFO ) + END DO +* +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary +* + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 +* +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * -* Set columns 1:i-1 of current block to zero + CALL ZLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) * - DO 40 J = 1, I - 1 - DO 30 L = I, I + IB - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE +* Apply H**T to A(i+ib:m,i:n) from the right +* + CALL ZLARFB0C2(.FALSE., 'Right', 'Conjugate', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) +* +* Apply H**T to columns i:n of current block + + CALL ZUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), + $ WORK, IINFO ) + END IF END IF * WORK( 1 ) = IWS diff --git a/SRC/zungql.f b/SRC/zungql.f index a64de501d..74c5ec8e6 100644 --- a/SRC/zungql.f +++ b/SRC/zungql.f @@ -95,8 +95,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -136,18 +134,14 @@ SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * .. * * ===================================================================== -* -* .. Parameters .. - COMPLEX*16 ZERO - PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) -* .. +* * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT, - $ NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KK, LWKOPT, NB, NBMIN, + $ NX * .. * .. External Subroutines .. - EXTERNAL XERBLA, ZLARFB, ZLARFT, ZUNG2L + EXTERNAL XERBLA, ZLARFB0C2, ZLARFT, ZUNG2L * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -177,7 +171,11 @@ SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) LWKOPT = 1 ELSE NB = ILAENV( 1, 'ZUNGQL', ' ', M, N, K, -1 ) - LWKOPT = N*NB +* +* Only need a workspace for zung2l in case of bailout +* and for the panel factorization +* + LWKOPT = N END IF WORK( 1 ) = LWKOPT * @@ -200,88 +198,77 @@ SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * NBMIN = 2 - NX = 0 + NX = MAX( 0, ILAENV( 3, 'ZUNGQL', ' ', M, N, K, -1 )) IWS = N - IF( NB.GT.1 .AND. NB.LT.K ) THEN -* -* Determine when to cross over from blocked to unblocked code. * - NX = MAX( 0, ILAENV( 3, 'ZUNGQL', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* We use blocked code for the entire construction * - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KK = K + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Possibly bail to the unblocked code. * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'ZUNGQL', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL ZUNG2L( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN * -* Use blocked code after the first block. -* The last kk columns are handled by the block method. +* Factor the first block assuming that our first application +* will be on the Identity matrix * - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) + I = 1 + IB = NB * -* Set A(m-kk+1:m,1:n-kk) to zero. +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - DO 20 J = 1, N - KK - DO 10 I = M - KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL ZLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA) * -* Use unblocked code for the first or only block. +* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left +* Exploit the fact that we are applying to an identity * - CALL ZUNG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) + CALL ZLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward', + $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), + $ LDA, A( M-K+I, N-K+I ), LDA, A, LDA) * - IF( KK.GT.0 ) THEN +* Apply H to rows 1:m-k+i+ib-1 of current block +* + CALL ZUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA, + $ TAU( I ), WORK, IINFO ) + +* Use blocked code on the remaining blocks if there are any. +* + DO I = NB+1, K, NB * -* Use blocked code +* The last block may be less than size NB * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) - IF( N-K+I.GT.1 ) THEN + IB = MIN(NB, K-I+1) * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - CALL ZLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, - $ A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK ) + CALL ZLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA ) * -* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left +* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left * - CALL ZLARFB( 'Left', 'No transpose', 'Backward', - $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, - $ A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA, - $ WORK( IB+1 ), LDWORK ) - END IF + CALL ZLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, + $ A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA) * * Apply H to rows 1:m-k+i+ib-1 of current block * CALL ZUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA, $ TAU( I ), WORK, IINFO ) -* -* Set rows m-k+i+ib:m of current block to zero -* - DO 40 J = N - K + I, N - K + I + IB - 1 - DO 30 L = M - K + I + IB, M - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + END DO END IF * WORK( 1 ) = IWS diff --git a/SRC/zungqr.f b/SRC/zungqr.f index 6c9f2e3ff..50d5d91a2 100644 --- a/SRC/zungqr.f +++ b/SRC/zungqr.f @@ -95,8 +95,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -137,17 +135,13 @@ SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - COMPLEX*16 ZERO - PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK, - $ LWKOPT, NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KI, KK, LWKOPT, NB, + $ NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL XERBLA, ZLARFB, ZLARFT, ZUNG2R + EXTERNAL XERBLA, ZLARFB0C2, ZLARFT, ZUNG2R * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -162,7 +156,11 @@ SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * INFO = 0 NB = ILAENV( 1, 'ZUNGQR', ' ', M, N, K, -1 ) - LWKOPT = MAX( 1, N )*NB +* +* Only need a workspace for zung2r in case of bailout +* and for the panel factorization +* + LWKOPT = MAX( 1, N ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN @@ -191,92 +189,102 @@ SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * NBMIN = 2 - NX = 0 +* Determine when to cross over from unblocked to blocked + NX = MAX( 0, ILAENV( 3, 'ZUNGQR', ' ', M, N, K, -1 ) ) IWS = N - IF( NB.GT.1 .AND. NB.LT.K ) THEN * -* Determine when to cross over from blocked to unblocked code. -* - NX = MAX( 0, ILAENV( 3, 'ZUNGQR', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* Treat the last NB block starting at KK+1 specially then use our blocking +* method from the block starting at KI+1 to 1 * - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KI = K - 2 * NB + KK = K - NB + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Potentially bail to the unblocked code. * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'ZUNGQR', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL ZUNG2R( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN * -* Use blocked code after the last block. -* The first kk columns are handled by the block method. +* Factor the last block assuming that our first application +* will be on the Identity matrix * - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) + I = KK + 1 + IB = NB * -* Set A(1:kk,kk+1:n) to zero. +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - DO 20 J = KK + 1, N - DO 10 I = 1, KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL ZLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * -* Use unblocked code for the last or only block. +* Apply H to A(i:m,i+ib:n) from the left +* Exploit the fact that we are applying to an identity * - IF( KK.LT.N ) - $ CALL ZUNG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) + CALL ZLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward', + $ 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I), + $ LDA, A(I,I+IB), LDA) * - IF( KK.GT.0 ) THEN +* Apply H to rows i:m of current block * -* Use blocked code + CALL ZUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, IINFO) * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.N ) THEN +* Use our standard blocking method after the last block * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . H(i+ib-1) + DO I = KI + 1, 1, -NB + IB = NB * - CALL ZLARFT( 'Forward', 'Columnwise', M-I+1, IB, - $ A( I, I ), LDA, TAU( I ), WORK, LDWORK ) +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * -* Apply H to A(i:m,i+ib:n) from the left + CALL ZLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * - CALL ZLARFB( 'Left', 'No transpose', 'Forward', - $ 'Columnwise', M-I+1, N-I-IB+1, IB, - $ A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ), - $ LDA, WORK( IB+1 ), LDWORK ) - END IF +* Apply H to A(i:m,i+ib:n) from the left +* + CALL ZLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) + * * Apply H to rows i:m of current block * - CALL ZUNG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) + CALL ZUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, + $ IINFO) + END DO +* +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary +* + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 * -* Set rows 1:i-1 of current block to zero +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - DO 40 J = I, I + IB - 1 - DO 30 L = 1, I - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + CALL ZLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) +* +* Apply H to A(i:m,i+ib:n) from the left +* + CALL ZLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) + +* +* Apply H to rows i:m of current block +* + CALL ZUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, + $ IINFO) + END IF END IF * WORK( 1 ) = IWS diff --git a/SRC/zungrq.f b/SRC/zungrq.f index dcc2772d6..939fff3fc 100644 --- a/SRC/zungrq.f +++ b/SRC/zungrq.f @@ -137,13 +137,9 @@ SUBROUTINE ZUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - COMPLEX*16 ZERO - PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, II, IINFO, IWS, J, KK, L, LDWORK, + INTEGER I, IB, II, IINFO, IWS, KK, LDWORK, $ LWKOPT, NB, NBMIN, NX * .. * .. External Subroutines .. @@ -230,61 +226,67 @@ SUBROUTINE ZUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * Use blocked code after the first block. * The last kk rows are handled by the block method. * - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) -* -* Set A(1:m-kk,n-kk+1:n) to zero. -* - DO 20 J = N - KK + 1, N - DO 10 I = 1, M - KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE + KK = K ELSE KK = 0 END IF * -* Use unblocked code for the first or only block. +* Potentially bail to the unblocked code * - CALL ZUNGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) + IF( KK.EQ.0 ) THEN + CALL ZUNGR2( M, N, K, A, LDA, TAU, WORK, IINFO ) + END IF * IF( KK.GT.0 ) THEN * -* Use blocked code +* Factor the first block assuming that our first application +* will be on the Identity matrix * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) - II = M - K + I - IF( II.GT.1 ) THEN + I = 1 + IB = NB + II = M - K + I * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - CALL ZLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, - $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) + CALL ZLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) * -* Apply H**H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right +* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right * - CALL ZLARFB( 'Right', 'Conjugate transpose', - $ 'Backward', - $ 'Rowwise', II-1, N-K+I+IB-1, IB, A( II, 1 ), - $ LDA, WORK, LDWORK, A, LDA, WORK( IB+1 ), - $ LDWORK ) - END IF + CALL ZLARFB0C2(.TRUE., 'Right', 'Conjugate', 'Backward', + $ 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK, + $ LDWORK, A, LDA) * -* Apply H**H to columns 1:n-k+i+ib-1 of current block +* Apply H**T to columns 1:n-k+i+ib-1 of current block * - CALL ZUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, - $ TAU( I ), - $ WORK, IINFO ) + CALL ZUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, + $ TAU( I ), WORK, IINFO ) + + DO I = NB+1, K, NB * -* Set columns n-k+i+ib:n of current block to zero +* The last block may be less than size NB * - DO 40 L = N - K + I + IB, N - DO 30 J = II, II + IB - 1 - A( J, L ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + IB = MIN(NB, K-I+1) + II = M - K + I +* +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) +* + CALL ZLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) +* +* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right +* + CALL ZLARFB0C2(.FALSE., 'Right', 'Conjugate', + $ 'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, + $ A(II,1), LDA, WORK, LDWORK, A, LDA) +* +* Apply H**T to columns 1:n-k+i+ib-1 of current block +* + CALL ZUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, + $ TAU( I ), WORK, IINFO ) + END DO END IF * WORK( 1 ) = IWS